Compare commits

...

35 Commits

Author SHA1 Message Date
Ning Sun
4daf5adce5 feat: update rate limiter to use semaphore that will block without re… (#6853)
* feat: update rate limiter to use semaphore that will block without return error

Signed-off-by: Ning Sun <sunning@greptime.com>

* fix: remove unused error

Signed-off-by: Ning Sun <sunning@greptime.com>

---------

Signed-off-by: Ning Sun <sunning@greptime.com>
2025-08-30 18:17:08 +00:00
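A minimal sketch of the blocking-semaphore idea from the commit above, assuming a tokio runtime (the names here are illustrative, not GreptimeDB's actual rate limiter): instead of returning a rate-limit error when the budget is exhausted, callers simply wait for a permit.

```rust
use std::sync::Arc;
use tokio::sync::{OwnedSemaphorePermit, Semaphore};

/// Illustrative rate limiter: callers wait (asynchronously) for a semaphore
/// permit instead of receiving a "rate limited" error.
#[derive(Clone)]
struct BlockingRateLimiter {
    permits: Arc<Semaphore>,
}

impl BlockingRateLimiter {
    fn new(max_in_flight: usize) -> Self {
        Self {
            permits: Arc::new(Semaphore::new(max_in_flight)),
        }
    }

    /// Waits until a permit is available; the permit is released when the
    /// returned guard is dropped.
    async fn acquire(&self) -> OwnedSemaphorePermit {
        // `acquire_owned` only errors if the semaphore is closed, which this
        // sketch never does.
        self.permits
            .clone()
            .acquire_owned()
            .await
            .expect("semaphore is never closed")
    }
}
```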
Yingwen
575093f85f feat: Support more key types for the DictionaryVector (#6855)
* feat: support different key type for the dictionary vector

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: support more dictionary type in try_into_vector

Signed-off-by: evenyag <realevenyag@gmail.com>

* fix: use key array's type as key type

Signed-off-by: evenyag <realevenyag@gmail.com>

---------

Signed-off-by: evenyag <realevenyag@gmail.com>
2025-08-29 13:23:25 +00:00
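For context on the key types mentioned above, Arrow dictionary arrays are generic over the key width. A small sketch using the `arrow` crate (illustrative only, not the `DictionaryVector` implementation itself) encodes the same values with `Int8` and `UInt32` keys:

```rust
use std::sync::Arc;
use arrow::array::{Array, ArrayRef, DictionaryArray, Int8Array, StringArray, UInt32Array};
use arrow::datatypes::{Int8Type, UInt32Type};

fn main() {
    let values: ArrayRef = Arc::new(StringArray::from(vec!["host-a", "host-b"]));

    // The same logical data with two different dictionary key widths.
    let keys_i8 = Int8Array::from(vec![0i8, 1, 0, 0]);
    let dict_i8 = DictionaryArray::<Int8Type>::try_new(keys_i8, values.clone()).unwrap();

    let keys_u32 = UInt32Array::from(vec![0u32, 1, 0, 0]);
    let dict_u32 = DictionaryArray::<UInt32Type>::try_new(keys_u32, values).unwrap();

    assert_eq!(dict_i8.len(), dict_u32.len());
}
```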
jeremyhi
ac82ad4549 feat: make etcd store max codec size configurable (#6859)
* feat: make etcd store max codec size configurable

* feat: only decoding limit
2025-08-29 12:21:59 +00:00
discord9
367a25af06 feat: flow prom ql auto sink table is also promql-able (#6852)
* feat: flow prom ql auto sink table is also promql-able

Signed-off-by: discord9 <discord9@163.com>

* fix: gen create table expr without aggr/projection outermost

Signed-off-by: discord9 <discord9@163.com>

* test: update non-aggr testcase

Signed-off-by: discord9 <discord9@163.com>

---------

Signed-off-by: discord9 <discord9@163.com>
2025-08-29 12:16:03 +00:00
zyy17
d585c23ba5 refactor: add stop methods for LocalFilePurger and CompactionRegion (#6848)
* refactor: add `LocalFilePurger::stop(&self)` and `stop_file_purger()` of `CompactionRegion`

Signed-off-by: zyy17 <zyylsxm@gmail.com>

* chore: rename methods

Signed-off-by: zyy17 <zyylsxm@gmail.com>

---------

Signed-off-by: zyy17 <zyylsxm@gmail.com>
2025-08-29 09:23:59 +00:00
LFC
f55023f300 ci: install ssh for Android dev-builder (#6854)
Signed-off-by: luofucong <luofc@foxmail.com>
2025-08-29 08:42:35 +00:00
Weny Xu
9213315613 chore: add server-side error logging to improve observability in gRPC (#6846)
chore: print tonic code

Signed-off-by: WenyXu <wenymedia@gmail.com>
2025-08-29 07:47:24 +00:00
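A hedged sketch of the kind of server-side logging this refers to (the helper name is hypothetical and `tracing` stands in for GreptimeDB's logging facade): the tonic status code is logged on the server before the status is returned to the client.

```rust
/// Hypothetical helper: log a gRPC status on the server side before returning
/// it, so client-visible failures also appear in server logs with their code.
fn log_and_return(status: tonic::Status) -> tonic::Status {
    tracing::error!(code = ?status.code(), "gRPC request failed: {}", status.message());
    status
}
```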
zyy17
e77ad8e9dc chore: run pull-test-deps-images.sh before docker compose to avoid rate limit (#6851)
chore: run `pull-test-deps-images.sh` before docker compose

Signed-off-by: zyy17 <zyylsxm@gmail.com>
2025-08-29 05:03:45 +00:00
liyang
7bc669e991 chore: update bitnami config (#6847)
* chore: update bitnami config

Signed-off-by: liyang <daviderli614@gmail.com>

* update postgresql chart version

Signed-off-by: liyang <daviderli614@gmail.com>

* fix ci

Signed-off-by: liyang <daviderli614@gmail.com>

* refactor: add pull-test-deps-images.sh to pull images one by one to avoid rate limit

Signed-off-by: zyy17 <zyylsxm@gmail.com>

---------

Signed-off-by: liyang <daviderli614@gmail.com>
Signed-off-by: zyy17 <zyylsxm@gmail.com>
Co-authored-by: zyy17 <zyylsxm@gmail.com>
2025-08-29 02:45:14 +00:00
ZonaHe
b84cd19145 feat: update dashboard to v0.11.2 (#6843)
Co-authored-by: ZonaHex <ZonaHex@users.noreply.github.com>
2025-08-28 10:59:41 +00:00
Logic
cbcfdf9d65 feat: add optional schema for Postgres metadata tables (#6764)
* feat(meta): add optional schema for Postgres metadata tables

- Add `schema` option to specify a custom schema for metadata tables
- Update `PgStore` and `PgElection` to support optional schema
- Modify SQL templates to use schema when provided
- Add tests for schema support in Postgres backend

Signed-off-by: Logic <zqr10159@dromara.org>

* refactor(meta): remove unused `create_schema_statement` and simplify `PgSqlTemplateFactory`

- Remove `create_schema_statement` from `PgSqlTemplateSet` struct
- Simplify `PgSqlTemplateFactory` by removing `new` method and merging it with `with_schema`
- Update related tests to reflect these changes

Signed-off-by: Logic <zqr10159@dromara.org>

* refactor(meta-srv): remove unused imports

- Remove unused import of BoxedError from common_error::ext
- Remove unused import of TlsOption from servers::tls

Signed-off-by: Logic <zqr10159@dromara.org>

* build(meta): update Postgres version and add error handling imports

- Update Postgres version to 17 in docker-compose.yml
- Add BoxedError import for error handling in meta-srv

Signed-off-by: Logic <zqr10159@dromara.org>

* feat(postgres): add support for optional schema in PgElection and related components

Signed-off-by: Logic <zqr10159@dromara.org>

* feat(postgres): add support for optional schema in PgElection and related components

Signed-off-by: Logic <zqr10159@dromara.org>

* fix(develop): update Postgres schema commands to specify host

Signed-off-by: Logic <zqr10159@dromara.org>

* refactor(postgres): simplify plugin options handling and update SQL examples

Signed-off-by: Logic <zqr10159@dromara.org>

* refactor(postgres): simplify plugin options handling and update SQL examples

Signed-off-by: Logic <zqr10159@dromara.org>

* fix(postgres): update meta_election_lock_id description for optional schema support

Signed-off-by: Logic <zqr10159@dromara.org>

* fix(postgres): add health check and fallback wait for Postgres in CI setup

* fix(postgres): update Docker setup for Postgres and add support for Postgres 15

* fix(postgres): remove redundant Postgres setup step in CI configuration

* Update tests-integration/fixtures/postgres/init.sql

Co-authored-by: Weny Xu <wenymedia@gmail.com>

* Update .github/workflows/develop.yml

* Update tests-integration/fixtures/docker-compose.yml

* Update src/common/meta/src/kv_backend/rds/postgres.rs

Co-authored-by: Weny Xu <wenymedia@gmail.com>

* Update src/common/meta/src/kv_backend/rds/postgres.rs

Co-authored-by: Weny Xu <wenymedia@gmail.com>

* Update src/common/meta/src/kv_backend/rds/postgres.rs

Co-authored-by: Weny Xu <wenymedia@gmail.com>

* Update src/common/meta/src/kv_backend/rds/postgres.rs

Co-authored-by: Weny Xu <wenymedia@gmail.com>

* fix: Refactor PostgreSQL backend to support optional schema in PgStore and related SQL templates

* feat: Update PostgreSQL configuration and add PG15 specific integration tests

* feat: Update PostgreSQL configuration and add PG15 specific integration tests

* refactor(postgres): update test schemas from 'greptime_schema' to 'test_schema'

* Update .github/workflows/develop.yml

* refactor: minor factor

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: apply suggestions

Signed-off-by: WenyXu <wenymedia@gmail.com>

* fix: fix unit test

Signed-off-by: WenyXu <wenymedia@gmail.com>

---------

Signed-off-by: Logic <zqr10159@dromara.org>
Signed-off-by: WenyXu <wenymedia@gmail.com>
Co-authored-by: Weny Xu <wenymedia@gmail.com>
2025-08-28 09:24:14 +00:00
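A minimal sketch of the schema-qualification idea behind this change (illustrative; the actual `PgSqlTemplateFactory` and `PgStore` differ): when an optional schema is configured, table references in the SQL templates are qualified with it, otherwise the bare table name is used.

```rust
/// Build the table reference used in SQL templates. With the configured
/// `meta_schema_name` and `meta_table_name` this yields
/// `"greptime_schema"."greptime_metakv"`; with no schema, just the table name.
fn qualified_table_name(schema: Option<&str>, table: &str) -> String {
    match schema {
        Some(schema) => format!("\"{schema}\".\"{table}\""),
        None => format!("\"{table}\""),
    }
}

fn main() {
    assert_eq!(
        qualified_table_name(Some("greptime_schema"), "greptime_metakv"),
        r#""greptime_schema"."greptime_metakv""#
    );
    assert_eq!(
        qualified_table_name(None, "greptime_metakv"),
        r#""greptime_metakv""#
    );
}
```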
discord9
bacd9c7d15 feat: add event ts to region manifest (#6751)
* feat: add event ts to region manifest

Signed-off-by: discord9 <discord9@163.com>

delete files

Signed-off-by: discord9 <discord9@163.com>

chore: clippy

Signed-off-by: discord9 <discord9@163.com>

lower concurrency

Signed-off-by: discord9 <discord9@163.com>

feat: gc use delta manifest to get expel time

Signed-off-by: discord9 <discord9@163.com>

docs: terminology

Signed-off-by: discord9 <discord9@163.com>

refactor: some advice from review

Signed-off-by: discord9 <discord9@163.com>

feat: manifest add removed files field

Signed-off-by: discord9 <discord9@163.com>

feat(WIP): add remove time in manifest

Signed-off-by: discord9 <discord9@163.com>

wip: more config

Signed-off-by: discord9 <discord9@163.com>

feat: manifest update removed files

Signed-off-by: discord9 <discord9@163.com>

test: add remove file opts field

Signed-off-by: discord9 <discord9@163.com>

test: fix test

Signed-off-by: discord9 <discord9@163.com>

chore: delete gc.rs

Signed-off-by: discord9 <discord9@163.com>

* feat: proper option name

Signed-off-by: discord9 <discord9@163.com>

* refactor: per review

Signed-off-by: discord9 <discord9@163.com>

* test: update manifest size

Signed-off-by: discord9 <discord9@163.com>

* test: fix eq

Signed-off-by: discord9 <discord9@163.com>

* refactor: some per review

Signed-off-by: discord9 <discord9@163.com>

* refactor: per review

Signed-off-by: discord9 <discord9@163.com>

* refactor: more per review

Signed-off-by: discord9 <discord9@163.com>

* test: update manifest size

Signed-off-by: discord9 <discord9@163.com>

* test: update config

Signed-off-by: discord9 <discord9@163.com>

* feat: keep count 0 means only use ttl

Signed-off-by: discord9 <discord9@163.com>

---------

Signed-off-by: discord9 <discord9@163.com>
2025-08-28 09:08:18 +00:00
Weny Xu
f441598247 fix: correct config doc (#6836)
fix: fix typo

Signed-off-by: WenyXu <wenymedia@gmail.com>
2025-08-28 06:26:25 +00:00
Weny Xu
556c408e7b feat: rename region_statistics to region_statistics_history (#6837)
Signed-off-by: WenyXu <wenymedia@gmail.com>
2025-08-28 06:14:20 +00:00
shuiyisong
ec817f6877 fix: gRPC auth (#6827)
* fix: internal service

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

* refactor: gRPC auth

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

* chore: add permission check for bulk ingest

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

* chore: remove unused grpc auth middleware

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

* chore: extract header function

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

* refactor: extract common code and add auth to otel arrow api

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

* chore: rename utils to context_auth

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

* test: otel arrow auth

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

* chore: add support for old auth value

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

---------

Signed-off-by: shuiyisong <xixing.sys@gmail.com>
2025-08-28 04:00:45 +00:00
Yingwen
32e73dad12 fix: use actual buf size as cache page value size (#6829)
* feat: cache the cloned page bytes

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: cache the whole row group pages

The opendal reader may merge IO requests so the pages of different
columns can share the same Bytes.
When we use a per-column page cache, the page cache may still be referencing
the whole Bytes after eviction if there are other columns in the cache that
share the same Bytes.

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: check possible max byte range and copy pages if needed

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: always copy pages

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: returns the copied pages

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: compute cache size by MERGE_GAP

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: align to buf size

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: align to 2MB

Signed-off-by: evenyag <realevenyag@gmail.com>

* chore: remove unused code

Signed-off-by: evenyag <realevenyag@gmail.com>

* style: fix clippy

Signed-off-by: evenyag <realevenyag@gmail.com>

* chore: fix typo

Signed-off-by: evenyag <realevenyag@gmail.com>

* test: fix parquet read with cache test

Signed-off-by: evenyag <realevenyag@gmail.com>

---------

Signed-off-by: evenyag <realevenyag@gmail.com>
2025-08-28 03:37:11 +00:00
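The second commit message above is the key observation; a short sketch with the `bytes` crate shows why a per-column page cache can pin the whole merged buffer (illustrative sizes, not the actual reader):

```rust
use bytes::Bytes;

fn main() {
    // One merged IO read: a single 2 MiB buffer backing pages of several columns.
    let merged = Bytes::from(vec![0u8; 2 * 1024 * 1024]);

    // Slicing is zero-copy: both "pages" share the merged allocation, so
    // caching either page keeps the full 2 MiB alive even after the other
    // entries are evicted.
    let page_a = merged.slice(0..4096);
    let page_b = merged.slice(1_000_000..1_004_096);
    assert_eq!(page_b.len(), 4096);

    // Copying a page detaches it from the merged buffer; alternatively the
    // cache can account for the underlying buffer size, as this fix does.
    let copied = Bytes::copy_from_slice(&page_a);
    assert_eq!(copied.len(), page_a.len());
}
```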
Sicong Hu
bc20b17bc5 docs(rfc): async index build (#6757)
* docs(rfc): async index build

Signed-off-by: SNC123 <sinhco@outlook.com>

* update rfc for retaining sync build

Signed-off-by: SNC123 <sinhco@outlook.com>

* fix bug and update rfc for index resource management

Signed-off-by: SNC123 <sinhco@outlook.com>

* update rfc for manual rebuild

Signed-off-by: SNC123 <sinhco@outlook.com>

---------

Signed-off-by: SNC123 <sinhco@outlook.com>
2025-08-28 02:32:44 +00:00
Weny Xu
200422313f refactor(meta): refactor admin service to use modern axum handlers (#6833)
* refactor(meta): refactor admin service to use modern axum handlers

Signed-off-by: WenyXu <wenymedia@gmail.com>

* Update src/meta-srv/src/service/admin/health.rs

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

---------

Signed-off-by: WenyXu <wenymedia@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
2025-08-27 10:47:11 +00:00
discord9
8452a9d579 feat(flow): add eval interval option (#6623)
* feat: add flow eval interval

Signed-off-by: discord9 <discord9@163.com>

* feat: tql flow must have eval interval

Signed-off-by: discord9 <discord9@163.com>

* chore: clippy

Signed-off-by: discord9 <discord9@163.com>

* test: update sqlness

Signed-off-by: discord9 <discord9@163.com>

* wip

Signed-off-by: discord9 <discord9@163.com>

* wip

Signed-off-by: discord9 <discord9@163.com>

* feat: check for now func

Signed-off-by: discord9 <discord9@163.com>

* refactor: use ms instead

Signed-off-by: discord9 <discord9@163.com>

* fix: not panic&proper simplifier

Signed-off-by: discord9 <discord9@163.com>

* test: update to fix

Signed-off-by: discord9 <discord9@163.com>

* feat: not allow month in interval

Signed-off-by: discord9 <discord9@163.com>

* test: update, remove months

Signed-off-by: discord9 <discord9@163.com>

* refactor: per review

Signed-off-by: discord9 <discord9@163.com>

* chore: after rebase fix

Signed-off-by: discord9 <discord9@163.com>

* feat: use seconds and add to field instead

Signed-off-by: discord9 <discord9@163.com>

* chore: after rebase fix

Signed-off-by: discord9 <discord9@163.com>

* fix: add check for month

Signed-off-by: discord9 <discord9@163.com>

* chore: fmt

Signed-off-by: discord9 <discord9@163.com>

* refactor: per review

Signed-off-by: discord9 <discord9@163.com>

* refactor: rm clone per review

Signed-off-by: discord9 <discord9@163.com>

* chore: update proto

Signed-off-by: discord9 <discord9@163.com>

---------

Signed-off-by: discord9 <discord9@163.com>
2025-08-27 09:44:32 +00:00
discord9
5ef4dd1743 docs: add internal grpc ports (#6815)
* docs: add internal grpc ports

Signed-off-by: discord9 <discord9@163.com>

* fix: update example toml

Signed-off-by: discord9 <discord9@163.com>

* fix: grpc option use default for missing field

Signed-off-by: discord9 <discord9@163.com>

---------

Signed-off-by: discord9 <discord9@163.com>
2025-08-27 08:20:27 +00:00
Yan Tingwang
32a3ef36f9 feat(metasrv): support tls for etcd client (#6818)
* add TLS support for etcd client connections

Signed-off-by: codephage2020 <tingwangyan2020@163.com>

* locate correct certs

Signed-off-by: codephage2020 <tingwangyan2020@163.com>

* Updated certs

Signed-off-by: codephage2020 <tingwangyan2020@163.com>

* Updated CI

Signed-off-by: codephage2020 <tingwangyan2020@163.com>

* Updated CI

Signed-off-by: codephage2020 <tingwangyan2020@163.com>

* Update docker-compose.yml

* tests for TLS client creation

Signed-off-by: codephage2020 <tingwangyan2020@163.com>

* modify tests

Signed-off-by: codephage2020 <tingwangyan2020@163.com>

---------

Signed-off-by: codephage2020 <tingwangyan2020@163.com>
2025-08-27 07:41:05 +00:00
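A sketch of creating an etcd client with TLS using the `etcd-client` crate's `tls` feature (certificate paths are placeholders; the metasrv wiring and config plumbing differ). The `https://127.0.0.1:2378` endpoint matches the `etcd-tls` fixture added in this change.

```rust
use etcd_client::{Certificate, Client, ConnectOptions, Error, Identity, TlsOptions};

async fn connect_etcd_tls() -> Result<Client, Error> {
    // CA plus client certificate/key for mutual TLS; the paths are placeholders.
    let ca = std::fs::read("certs/ca.crt").expect("read CA certificate");
    let cert = std::fs::read("certs/client.crt").expect("read client certificate");
    let key = std::fs::read("certs/client-key.pem").expect("read client key");

    let tls = TlsOptions::new()
        .ca_certificate(Certificate::from_pem(ca))
        .identity(Identity::from_pem(cert, key));

    Client::connect(
        ["https://127.0.0.1:2378"],
        Some(ConnectOptions::new().with_tls(tls)),
    )
    .await
}
```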
Weny Xu
566a647ec7 feat: add replay checkpoint to reduce overhead for remote WAL (#6816)
* feat: introduce `TopicRegionValue`

Signed-off-by: WenyXu <wenymedia@gmail.com>

* feat: persist region replay checkpoint

Signed-off-by: WenyXu <wenymedia@gmail.com>

* feat: introduce checkpoint

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: update config.md

Signed-off-by: WenyXu <wenymedia@gmail.com>

* refactor: minor refactor

Signed-off-by: WenyXu <wenymedia@gmail.com>

* feat: send open region instructions with replay checkpoint

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: use usize

Signed-off-by: WenyXu <wenymedia@gmail.com>

* fix: fix unit tests

Signed-off-by: WenyXu <wenymedia@gmail.com>

* fix: fix unit tests

Signed-off-by: WenyXu <wenymedia@gmail.com>

* feat: add topic name pattern

Signed-off-by: WenyXu <wenymedia@gmail.com>

* feat: enable wal prune by default

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: apply suggestions

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: apply suggestions

Signed-off-by: WenyXu <wenymedia@gmail.com>

---------

Signed-off-by: WenyXu <wenymedia@gmail.com>
2025-08-27 07:24:33 +00:00
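Conceptually (an illustrative sketch only, not GreptimeDB's actual replay code), a replay checkpoint lets a region skip WAL entries at or before the checkpointed entry id instead of replaying the topic from the beginning:

```rust
/// Return the WAL entry ids a region still needs to replay, given an optional
/// replay checkpoint persisted by the metasrv. Illustrative only.
fn entries_to_replay(entry_ids: &[u64], replay_checkpoint: Option<u64>) -> Vec<u64> {
    let start = replay_checkpoint.map(|ckpt| ckpt + 1).unwrap_or(0);
    entry_ids.iter().copied().filter(|id| *id >= start).collect()
}

fn main() {
    let wal = [1u64, 2, 3, 4, 5, 6];
    // Without a checkpoint, everything is replayed; with one, replay starts
    // right after it, which is the overhead reduction this change targets.
    assert_eq!(entries_to_replay(&wal, None), vec![1, 2, 3, 4, 5, 6]);
    assert_eq!(entries_to_replay(&wal, Some(4)), vec![5, 6]);
}
```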
Yingwen
906e1ca0bf feat: functions and structs to scan flat format file and mem ranges (#6817)
* feat: implement function to scan flat memtable ranges

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: implement function to scan flat file ranges

Signed-off-by: evenyag <realevenyag@gmail.com>

* chore: compat batch in scan file range

Signed-off-by: evenyag <realevenyag@gmail.com>

* chore: scan other ranges

Signed-off-by: evenyag <realevenyag@gmail.com>

* chore: fix compiler errors

Signed-off-by: evenyag <realevenyag@gmail.com>

---------

Signed-off-by: evenyag <realevenyag@gmail.com>
2025-08-27 06:31:30 +00:00
Weny Xu
b921e41abf chore: remove unused deps (#6828)
Signed-off-by: WenyXu <wenymedia@gmail.com>
2025-08-26 12:29:17 +00:00
Weny Xu
6782bcddfa fix: prevent stale physical table route during procedure retries (#6825)
fix(meta): prevent stale physical table route during procedure retries

Signed-off-by: WenyXu <wenymedia@gmail.com>
2025-08-26 12:14:06 +00:00
Weny Xu
3d1a4b56a4 feat: add support for TWCS time window hints in insert operations (#6823)
* feat: Add support for TWCS time window hints in insert operations

Signed-off-by: WenyXu <wenymedia@gmail.com>

* feat: set system events table time window to 1d

Signed-off-by: WenyXu <wenymedia@gmail.com>

---------

Signed-off-by: WenyXu <wenymedia@gmail.com>
2025-08-26 10:52:00 +00:00
Arshdeep
8894cb5406 feat: resolve unused dependencies with cargo-udeps (#6578) (#6619)
* feat: resolve unused dependencies with cargo-udeps (#6578)

Signed-off-by: Arshdeep54 <balarsh535@gmail.com>

* Apply suggestion from @zyy17

Co-authored-by: zyy17 <zyylsxm@gmail.com>

* Apply suggestion from @zyy17

Co-authored-by: zyy17 <zyylsxm@gmail.com>

---------

Signed-off-by: Arshdeep54 <balarsh535@gmail.com>
Co-authored-by: Ning Sun <classicning@gmail.com>
Co-authored-by: zyy17 <zyylsxm@gmail.com>
2025-08-26 10:22:53 +00:00
ZonaHe
bb334e1594 feat: update dashboard to v0.11.1 (#6824)
Co-authored-by: ZonaHex <ZonaHex@users.noreply.github.com>
2025-08-26 08:24:05 +00:00
Weny Xu
ec8ff48473 fix: correct heartbeat stream handling logic (#6821)
* fix: correct heartbeat stream handling logic

Signed-off-by: WenyXu <wenymedia@gmail.com>

* Update src/meta-srv/src/service/heartbeat.rs

Co-authored-by: jeremyhi <jiachun_feng@proton.me>

---------

Signed-off-by: WenyXu <wenymedia@gmail.com>
Co-authored-by: jeremyhi <jiachun_feng@proton.me>
2025-08-26 07:39:59 +00:00
Lei, HUANG
d99734b97b perf: sparse encoder (#6809)
* perf/sparse-encoder:
 - **Update Dependencies**: Updated `criterion-plot` to version `0.5.0` and added `criterion` version `0.7.0` in `Cargo.lock`. Added `bytes` to `Cargo.toml` in `src/metric-engine`.
 - **Benchmarking**: Added a new benchmark for sparse encoding in `bench_sparse_encoding.rs` and updated `Cargo.toml` in `src/mito-codec` to include `criterion` as a dev-dependency.
 - **Sparse Encoding Enhancements**: Modified `SparsePrimaryKeyCodec` in `sparse.rs` to include new methods `encode_raw_tag_value` and `encode_internal`. Added public constants `RESERVED_COLUMN_ID_TSID` and `RESERVED_COLUMN_ID_TABLE_ID`.
 - **HTTP Server**: Made `try_decompress` function public in `prom_store.rs`.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* perf/sparse-encoder:
 Improve buffer handling in `sparse.rs`

 - Refactored buffer reservation logic to use `value_len` for clarity.
 - Optimized chunk processing by calculating `num_chunks` and `remainder` for efficient data handling.
 - Enhanced manual serialization of bytes to avoid byte-by-byte operations, improving performance.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* Update src/mito-codec/src/row_converter/sparse.rs

Co-authored-by: Yingwen <realevenyag@gmail.com>

---------

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>
Co-authored-by: Yingwen <realevenyag@gmail.com>
2025-08-26 04:10:11 +00:00
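A hedged sketch of the chunked-write optimization described in the second commit message (illustrative; the real encoder in `sparse.rs` is more involved): bytes are appended in fixed-size chunks plus a remainder instead of one byte at a time, after reserving capacity up front.

```rust
/// Append `value` to `buf` in fixed-size chunks plus a trailing remainder,
/// avoiding a byte-by-byte loop. Purely illustrative of the optimization.
fn append_chunked(buf: &mut Vec<u8>, value: &[u8], chunk_size: usize) {
    buf.reserve(value.len());
    let num_chunks = value.len() / chunk_size;
    let remainder = value.len() % chunk_size;

    for chunk in value.chunks_exact(chunk_size).take(num_chunks) {
        buf.extend_from_slice(chunk);
    }
    if remainder > 0 {
        buf.extend_from_slice(&value[value.len() - remainder..]);
    }
}

fn main() {
    let mut buf = Vec::new();
    append_chunked(&mut buf, b"sparse primary key bytes", 8);
    assert_eq!(buf, b"sparse primary key bytes".to_vec());
}
```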
Ruihang Xia
eb5e627ddd fix: follow promql rule for handling label of aggr (#6788)
* fix: follow promql rule for handling label of aggr

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* adopt more rules

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix clippy

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2025-08-26 03:37:08 +00:00
Ning Sun
69eed2c3fa feat: only show prometheus logical tables for __name__ values query (#6814)
feat: only show prometheus logical tables for __name__ query

Signed-off-by: Ning Sun <sunning@greptime.com>
2025-08-25 15:04:42 +00:00
Ning Sun
48572d18a8 feat: name label regex matcher in label values api (#6799)
* test: add failing test for #6791

* test: add support for = and =~

* fix: lint

* fix: code merge issue

Signed-off-by: Ning Sun <sunning@greptime.com>

---------

Signed-off-by: Ning Sun <sunning@greptime.com>
2025-08-25 08:48:53 +00:00
Yingwen
d5575d3fa4 feat: add FlatConvertFormat to convert record batches in old format to the flat format (#6786)
* feat: add convert format to FlatReadFormat

Signed-off-by: evenyag <realevenyag@gmail.com>

* test: test convert format

Signed-off-by: evenyag <realevenyag@gmail.com>

* fix: only convert string pks to dictionary

Signed-off-by: evenyag <realevenyag@gmail.com>

---------

Signed-off-by: evenyag <realevenyag@gmail.com>
2025-08-25 06:47:06 +00:00
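A sketch of the string-to-dictionary conversion mentioned in the last commit message, using the `arrow` crate's `cast` kernel (illustrative; `FlatConvertFormat` itself handles whole record batches and leaves non-string columns untouched):

```rust
use std::sync::Arc;
use arrow::array::{Array, ArrayRef, StringArray};
use arrow::compute::cast;
use arrow::datatypes::DataType;

fn main() {
    // A string primary-key column in the old format.
    let pk: ArrayRef = Arc::new(StringArray::from(vec!["host-a", "host-a", "host-b"]));

    // Dictionary-encode it (UInt32 keys chosen for illustration).
    let dict_type = DataType::Dictionary(Box::new(DataType::UInt32), Box::new(DataType::Utf8));
    let dict = cast(&pk, &dict_type).unwrap();

    assert_eq!(dict.data_type(), &dict_type);
    assert_eq!(dict.len(), 3);
}
```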
Ning Sun
83a65a81c0 feat: add cli option for internal grpc (#6806) 2025-08-25 02:12:53 +00:00
218 changed files with 6938 additions and 2186 deletions

View File

@@ -12,7 +12,7 @@ runs:
steps:
- name: Install Etcd cluster
shell: bash
run: |
run: |
helm upgrade \
--install etcd oci://registry-1.docker.io/bitnamicharts/etcd \
--set replicaCount=${{ inputs.etcd-replicas }} \
@@ -24,4 +24,9 @@ runs:
--set auth.rbac.token.enabled=false \
--set persistence.size=2Gi \
--create-namespace \
--set global.security.allowInsecureImages=true \
--set image.registry=public.ecr.aws/i8k6a5e1 \
--set image.repository=bitnami/etcd \
--set image.tag=3.6.1-debian-12-r3 \
--version 12.0.8 \
-n ${{ inputs.namespace }}

View File

@@ -12,7 +12,7 @@ runs:
steps:
- name: Install Kafka cluster
shell: bash
run: |
run: |
helm upgrade \
--install kafka oci://registry-1.docker.io/bitnamicharts/kafka \
--set controller.replicaCount=${{ inputs.controller-replicas }} \
@@ -23,4 +23,8 @@ runs:
--set listeners.controller.protocol=PLAINTEXT \
--set listeners.client.protocol=PLAINTEXT \
--create-namespace \
--set image.registry=public.ecr.aws/i8k6a5e1 \
--set image.repository=bitnami/kafka \
--set image.tag=3.9.0-debian-12-r1 \
--version 31.0.0 \
-n ${{ inputs.namespace }}

View File

@@ -6,9 +6,7 @@ inputs:
description: "Number of PostgreSQL replicas"
namespace:
default: "postgres-namespace"
postgres-version:
default: "14.2"
description: "PostgreSQL version"
description: "The PostgreSQL namespace"
storage-size:
default: "1Gi"
description: "Storage size for PostgreSQL"
@@ -22,7 +20,11 @@ runs:
helm upgrade \
--install postgresql oci://registry-1.docker.io/bitnamicharts/postgresql \
--set replicaCount=${{ inputs.postgres-replicas }} \
--set image.tag=${{ inputs.postgres-version }} \
--set global.security.allowInsecureImages=true \
--set image.registry=public.ecr.aws/i8k6a5e1 \
--set image.repository=bitnami/postgresql \
--set image.tag=17.5.0-debian-12-r3 \
--version 16.7.4 \
--set persistence.size=${{ inputs.storage-size }} \
--set postgresql.username=greptimedb \
--set postgresql.password=admin \

34
.github/scripts/pull-test-deps-images.sh vendored Executable file
View File

@@ -0,0 +1,34 @@
#!/bin/bash
# This script is used to pull the test dependency images that are stored in public ECR one by one to avoid rate limiting.
set -e
MAX_RETRIES=3
IMAGES=(
"public.ecr.aws/i8k6a5e1/bitnami/zookeeper:3.7"
"public.ecr.aws/i8k6a5e1/bitnami/kafka:3.9.0-debian-12-r1"
"public.ecr.aws/i8k6a5e1/bitnami/etcd:3.6.1-debian-12-r3"
"public.ecr.aws/i8k6a5e1/bitnami/minio:2024"
"public.ecr.aws/i8k6a5e1/bitnami/mysql:5.7"
)
for image in "${IMAGES[@]}"; do
for ((attempt=1; attempt<=MAX_RETRIES; attempt++)); do
if docker pull "$image"; then
# Successfully pulled the image.
break
else
# Use some simple exponential backoff to avoid rate limiting.
if [ $attempt -lt $MAX_RETRIES ]; then
sleep_seconds=$((attempt * 5))
echo "Attempt $attempt failed for $image, waiting $sleep_seconds seconds"
sleep $sleep_seconds # 5s, 10s delays
else
echo "Failed to pull $image after $MAX_RETRIES attempts"
exit 1
fi
fi
done
done

View File

@@ -618,10 +618,12 @@ jobs:
- uses: actions/checkout@v4
with:
persist-credentials: false
- if: matrix.mode.kafka
name: Setup kafka server
working-directory: tests-integration/fixtures
run: docker compose up -d --wait kafka
run: ../../.github/scripts/pull-test-deps-images.sh && docker compose up -d --wait kafka
- name: Download pre-built binaries
uses: actions/download-artifact@v4
with:
@@ -683,6 +685,30 @@ jobs:
- name: Run cargo clippy
run: make clippy
check-udeps:
if: ${{ github.repository == 'GreptimeTeam/greptimedb' }}
name: Check Unused Dependencies
runs-on: ubuntu-latest
timeout-minutes: 60
steps:
- uses: actions/checkout@v4
with:
persist-credentials: false
- uses: arduino/setup-protoc@v3
with:
repo-token: ${{ secrets.GITHUB_TOKEN }}
- uses: actions-rust-lang/setup-rust-toolchain@v1
- name: Rust Cache
uses: Swatinem/rust-cache@v2
with:
shared-key: "check-udeps"
cache-all-crates: "true"
save-if: ${{ github.ref == 'refs/heads/main' }}
- name: Install cargo-udeps
run: cargo install cargo-udeps --locked
- name: Check unused dependencies
run: make check-udeps
conflict-check:
if: ${{ github.repository == 'GreptimeTeam/greptimedb' }}
name: Check for conflict
@@ -698,7 +724,7 @@ jobs:
if: ${{ github.repository == 'GreptimeTeam/greptimedb' && github.event_name != 'merge_group' }}
runs-on: ubuntu-22.04-arm
timeout-minutes: 60
needs: [conflict-check, clippy, fmt]
needs: [conflict-check, clippy, fmt, check-udeps]
steps:
- uses: actions/checkout@v4
with:
@@ -720,9 +746,11 @@ jobs:
save-if: ${{ github.ref == 'refs/heads/main' }}
- name: Install latest nextest release
uses: taiki-e/install-action@nextest
- name: Setup external services
working-directory: tests-integration/fixtures
run: docker compose up -d --wait
run: ../../.github/scripts/pull-test-deps-images.sh && docker compose up -d --wait
- name: Run nextest cases
run: cargo nextest run --workspace -F dashboard -F pg_kvbackend -F mysql_kvbackend
env:
@@ -739,8 +767,11 @@ jobs:
GT_MINIO_ACCESS_KEY: superpower_password
GT_MINIO_REGION: us-west-2
GT_MINIO_ENDPOINT_URL: http://127.0.0.1:9000
GT_ETCD_TLS_ENDPOINTS: https://127.0.0.1:2378
GT_ETCD_ENDPOINTS: http://127.0.0.1:2379
GT_POSTGRES_ENDPOINTS: postgres://greptimedb:admin@127.0.0.1:5432/postgres
GT_POSTGRES15_ENDPOINTS: postgres://test_user:test_password@127.0.0.1:5433/postgres
GT_POSTGRES15_SCHEMA: test_schema
GT_MYSQL_ENDPOINTS: mysql://greptimedb:admin@127.0.0.1:3306/mysql
GT_KAFKA_ENDPOINTS: 127.0.0.1:9092
GT_KAFKA_SASL_ENDPOINTS: 127.0.0.1:9093
@@ -773,9 +804,11 @@ jobs:
uses: taiki-e/install-action@nextest
- name: Install cargo-llvm-cov
uses: taiki-e/install-action@cargo-llvm-cov
- name: Setup external services
working-directory: tests-integration/fixtures
run: docker compose up -d --wait
run: ../../.github/scripts/pull-test-deps-images.sh && docker compose up -d --wait
- name: Run nextest cases
run: cargo llvm-cov nextest --workspace --lcov --output-path lcov.info -F dashboard -F pg_kvbackend -F mysql_kvbackend
env:
@@ -791,8 +824,11 @@ jobs:
GT_MINIO_ACCESS_KEY: superpower_password
GT_MINIO_REGION: us-west-2
GT_MINIO_ENDPOINT_URL: http://127.0.0.1:9000
GT_ETCD_TLS_ENDPOINTS: https://127.0.0.1:2378
GT_ETCD_ENDPOINTS: http://127.0.0.1:2379
GT_POSTGRES_ENDPOINTS: postgres://greptimedb:admin@127.0.0.1:5432/postgres
GT_POSTGRES15_ENDPOINTS: postgres://test_user:test_password@127.0.0.1:5433/postgres
GT_POSTGRES15_SCHEMA: test_schema
GT_MYSQL_ENDPOINTS: mysql://greptimedb:admin@127.0.0.1:3306/mysql
GT_KAFKA_ENDPOINTS: 127.0.0.1:9092
GT_KAFKA_SASL_ENDPOINTS: 127.0.0.1:9093

View File

@@ -67,6 +67,12 @@ jobs:
steps:
- run: 'echo "No action required"'
check-udeps:
name: Unused Dependencies
runs-on: ubuntu-latest
steps:
- run: 'echo "No action required"'
coverage:
runs-on: ubuntu-latest
steps:

3
.gitignore vendored
View File

@@ -52,6 +52,9 @@ venv/
tests-fuzz/artifacts/
tests-fuzz/corpus/
# cargo-udeps reports
udeps-report.json
# Nix
.direnv
.envrc

View File

@@ -57,13 +57,16 @@ GreptimeDB uses the [Apache 2.0 license](https://github.com/GreptimeTeam/greptim
- Make sure all your codes are formatted and follow the [coding style](https://pingcap.github.io/style-guide/rust/) and [style guide](docs/style-guide.md).
- Make sure all unit tests are passed using [nextest](https://nexte.st/index.html) `cargo nextest run --workspace --features pg_kvbackend,mysql_kvbackend` or `make test`.
- Make sure all clippy warnings are fixed (you can check it locally by running `cargo clippy --workspace --all-targets -- -D warnings` or `make clippy`).
- Ensure there are no unused dependencies by running `make check-udeps` (clean them up with `make fix-udeps` if reported).
- If you must keep a target-specific dependency (e.g. under `[target.'cfg(...)'.dev-dependencies]`), add a cargo-udeps ignore entry in the same `Cargo.toml`, for example:
`[package.metadata.cargo-udeps.ignore]` with `development = ["rexpect"]` (or `dependencies`/`build` as appropriate).
- When modifying sample configuration files in `config/`, run `make config-docs` (which requires Docker to be installed) to update the configuration documentation and include it in your commit.
#### `pre-commit` Hooks
You could setup the [`pre-commit`](https://pre-commit.com/#plugins) hooks to run these checks on every commit automatically.
1. Install `pre-commit`
1. Install `pre-commit`
pip install pre-commit
@@ -71,7 +74,7 @@ You could setup the [`pre-commit`](https://pre-commit.com/#plugins) hooks to run
brew install pre-commit
2. Install the `pre-commit` hooks
2. Install the `pre-commit` hooks
$ pre-commit install
pre-commit installed at .git/hooks/pre-commit

125
Cargo.lock generated
View File

@@ -2189,7 +2189,7 @@ dependencies = [
"hyperloglogplus",
"jsonb",
"memchr",
"nalgebra 0.33.2",
"nalgebra",
"num",
"num-traits",
"once_cell",
@@ -2201,7 +2201,6 @@ dependencies = [
"session",
"snafu 0.8.6",
"sql",
"statrs",
"store-api",
"table",
"tokio",
@@ -2283,15 +2282,10 @@ dependencies = [
name = "common-macro"
version = "0.17.0"
dependencies = [
"arc-swap",
"common-query",
"datatypes",
"greptime-proto",
"once_cell",
"proc-macro2",
"quote",
"snafu 0.8.6",
"static_assertions",
"syn 2.0.104",
]
@@ -2352,7 +2346,6 @@ dependencies = [
"greptime-proto",
"hex",
"humantime-serde",
"hyper 0.14.32",
"itertools 0.14.0",
"lazy_static",
"moka",
@@ -2469,7 +2462,6 @@ dependencies = [
"snafu 0.8.6",
"sqlparser 0.55.0-greptime",
"sqlparser_derive 0.1.1",
"statrs",
"store-api",
"tokio",
]
@@ -2512,7 +2504,6 @@ dependencies = [
"paste",
"pin-project",
"prometheus",
"rand 0.9.1",
"ratelimit",
"serde",
"serde_json",
@@ -2538,7 +2529,6 @@ name = "common-sql"
version = "0.17.0"
dependencies = [
"common-base",
"common-datasource",
"common-decimal",
"common-error",
"common-macro",
@@ -2650,7 +2640,6 @@ dependencies = [
name = "common-workload"
version = "0.17.0"
dependencies = [
"api",
"common-telemetry",
"serde",
]
@@ -2912,7 +2901,7 @@ dependencies = [
"cast",
"ciborium",
"clap 3.2.25",
"criterion-plot",
"criterion-plot 0.5.0",
"futures",
"itertools 0.10.5",
"lazy_static",
@@ -2939,7 +2928,7 @@ dependencies = [
"cast",
"ciborium",
"clap 4.5.40",
"criterion-plot",
"criterion-plot 0.5.0",
"is-terminal",
"itertools 0.10.5",
"num-traits",
@@ -2955,6 +2944,29 @@ dependencies = [
"walkdir",
]
[[package]]
name = "criterion"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e1c047a62b0cc3e145fa84415a3191f628e980b194c2755aa12300a4e6cbd928"
dependencies = [
"anes",
"cast",
"ciborium",
"clap 4.5.40",
"criterion-plot 0.6.0",
"itertools 0.13.0",
"num-traits",
"oorandom",
"plotters",
"rayon",
"regex",
"serde",
"serde_json",
"tinytemplate",
"walkdir",
]
[[package]]
name = "criterion-plot"
version = "0.5.0"
@@ -2965,6 +2977,16 @@ dependencies = [
"itertools 0.10.5",
]
[[package]]
name = "criterion-plot"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9b1bcc0dc7dfae599d84ad0b1a55f80cde8af3725da8313b528da95ef783e338"
dependencies = [
"cast",
"itertools 0.13.0",
]
[[package]]
name = "crossbeam"
version = "0.8.4"
@@ -5277,7 +5299,7 @@ dependencies = [
[[package]]
name = "greptime-proto"
version = "0.1.0"
source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=df2bb74b5990c159dfd5b7a344eecf8f4307af64#df2bb74b5990c159dfd5b7a344eecf8f4307af64"
source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=66eb089afa6baaa3ddfafabd0a4abbe317d012c3#66eb089afa6baaa3ddfafabd0a4abbe317d012c3"
dependencies = [
"prost 0.13.5",
"prost-types 0.13.5",
@@ -7021,7 +7043,6 @@ dependencies = [
"protobuf-build",
"raft-engine",
"rand 0.9.1",
"rand_distr",
"rskafka",
"serde",
"serde_json",
@@ -7369,12 +7390,10 @@ dependencies = [
"http-body-util",
"humantime",
"humantime-serde",
"hyper 0.14.32",
"hyper-util",
"itertools 0.14.0",
"lazy_static",
"local-ip-address",
"log-store",
"once_cell",
"parking_lot 0.12.4",
"prometheus",
@@ -7433,6 +7452,7 @@ dependencies = [
"async-stream",
"async-trait",
"base64 0.22.1",
"bytes",
"common-base",
"common-error",
"common-macro",
@@ -7529,6 +7549,7 @@ dependencies = [
"common-recordbatch",
"common-telemetry",
"common-time",
"criterion 0.7.0",
"datafusion-common",
"datafusion-expr",
"datatypes",
@@ -7550,6 +7571,7 @@ dependencies = [
"async-trait",
"bytemuck",
"bytes",
"chrono",
"common-base",
"common-config",
"common-datasource",
@@ -7867,24 +7889,6 @@ dependencies = [
"zstd 0.13.3",
]
[[package]]
name = "nalgebra"
version = "0.29.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d506eb7e08d6329505faa8a3a00a5dcc6de9f76e0c77e4b75763ae3c770831ff"
dependencies = [
"approx 0.5.1",
"matrixmultiply",
"nalgebra-macros 0.1.0",
"num-complex",
"num-rational",
"num-traits",
"rand 0.8.5",
"rand_distr",
"simba 0.6.0",
"typenum",
]
[[package]]
name = "nalgebra"
version = "0.33.2"
@@ -7893,25 +7897,14 @@ checksum = "26aecdf64b707efd1310e3544d709c5c0ac61c13756046aaaba41be5c4f66a3b"
dependencies = [
"approx 0.5.1",
"matrixmultiply",
"nalgebra-macros 0.2.2",
"nalgebra-macros",
"num-complex",
"num-rational",
"num-traits",
"simba 0.9.0",
"simba",
"typenum",
]
[[package]]
name = "nalgebra-macros"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "01fcc0b8149b4632adc89ac3b7b31a12fb6099a0317a4eb2ebff574ef7de7218"
dependencies = [
"proc-macro2",
"quote",
"syn 1.0.109",
]
[[package]]
name = "nalgebra-macros"
version = "0.2.2"
@@ -9347,7 +9340,6 @@ dependencies = [
"serde",
"serde_json",
"session",
"simd-json",
"snafu 0.8.6",
"sql",
"table",
@@ -10122,7 +10114,7 @@ dependencies = [
"log-query",
"meter-core",
"meter-macros",
"nalgebra 0.33.2",
"nalgebra",
"num",
"num-traits",
"object-store",
@@ -10142,7 +10134,6 @@ dependencies = [
"snafu 0.8.6",
"sql",
"sqlparser 0.55.0-greptime",
"statrs",
"store-api",
"substrait 0.17.0",
"table",
@@ -10787,7 +10778,7 @@ dependencies = [
[[package]]
name = "rskafka"
version = "0.6.0"
source = "git+https://github.com/influxdata/rskafka.git?rev=a62120b6c74d68953464b256f858dc1c41a903b4#a62120b6c74d68953464b256f858dc1c41a903b4"
source = "git+https://github.com/WenyXu/rskafka.git?rev=7b0f31ed39db049b4ee2e5f1e95b5a30be9baf76#7b0f31ed39db049b4ee2e5f1e95b5a30be9baf76"
dependencies = [
"bytes",
"chrono",
@@ -11528,7 +11519,6 @@ dependencies = [
"client",
"common-base",
"common-catalog",
"common-config",
"common-error",
"common-frontend",
"common-grpc",
@@ -11740,19 +11730,6 @@ dependencies = [
"rand_core 0.6.4",
]
[[package]]
name = "simba"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f0b7840f121a46d63066ee7a99fc81dcabbc6105e437cae43528cea199b5a05f"
dependencies = [
"approx 0.5.1",
"num-complex",
"num-traits",
"paste",
"wide",
]
[[package]]
name = "simba"
version = "0.9.0"
@@ -12341,19 +12318,6 @@ version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
[[package]]
name = "statrs"
version = "0.16.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b35a062dbadac17a42e0fc64c27f419b25d6fae98572eb43c8814c9e873d7721"
dependencies = [
"approx 0.5.1",
"lazy_static",
"nalgebra 0.29.0",
"num-traits",
"rand 0.8.5",
]
[[package]]
name = "store-api"
version = "0.17.0"
@@ -13523,6 +13487,7 @@ dependencies = [
"percent-encoding",
"pin-project",
"prost 0.13.5",
"rustls-native-certs 0.8.1",
"socket2 0.5.10",
"tokio",
"tokio-rustls",

View File

@@ -138,11 +138,14 @@ deadpool-postgres = "0.14"
derive_builder = "0.20"
dotenv = "0.15"
either = "1.15"
etcd-client = { git = "https://github.com/GreptimeTeam/etcd-client", rev = "f62df834f0cffda355eba96691fe1a9a332b75a7" }
etcd-client = { git = "https://github.com/GreptimeTeam/etcd-client", rev = "f62df834f0cffda355eba96691fe1a9a332b75a7", features = [
"tls",
"tls-roots",
] }
fst = "0.4.7"
futures = "0.3"
futures-util = "0.3"
greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "df2bb74b5990c159dfd5b7a344eecf8f4307af64" }
greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "66eb089afa6baaa3ddfafabd0a4abbe317d012c3" }
hex = "0.4"
http = "1"
humantime = "2.1"
@@ -191,7 +194,7 @@ reqwest = { version = "0.12", default-features = false, features = [
"stream",
"multipart",
] }
rskafka = { git = "https://github.com/influxdata/rskafka.git", rev = "a62120b6c74d68953464b256f858dc1c41a903b4", features = [
rskafka = { git = "https://github.com/WenyXu/rskafka.git", rev = "7b0f31ed39db049b4ee2e5f1e95b5a30be9baf76", features = [
"transport-tls",
] }
rstest = "0.25"

View File

@@ -193,6 +193,17 @@ clippy: ## Check clippy rules.
fix-clippy: ## Fix clippy violations.
cargo clippy --workspace --all-targets --all-features --fix
.PHONY: check-udeps
check-udeps: ## Check unused dependencies.
cargo udeps --workspace --all-targets
.PHONY: fix-udeps
fix-udeps: ## Remove unused dependencies automatically.
@echo "Running cargo-udeps to find unused dependencies..."
@cargo udeps --workspace --all-targets --output json > udeps-report.json || true
@echo "Removing unused dependencies..."
@python3 scripts/fix-udeps.py udeps-report.json
.PHONY: fmt-check
fmt-check: ## Check code format.
cargo fmt --all -- --check

View File

@@ -245,6 +245,16 @@
| `grpc.tls.cert_path` | String | Unset | Certificate file path. |
| `grpc.tls.key_path` | String | Unset | Private key file path. |
| `grpc.tls.watch` | Bool | `false` | Watch for Certificate and key file change and auto reload.<br/>For now, gRPC tls config does not support auto reload. |
| `internal_grpc` | -- | -- | The internal gRPC server options. Internal gRPC port for nodes inside cluster to access frontend. |
| `internal_grpc.bind_addr` | String | `127.0.0.1:4010` | The address to bind the gRPC server. |
| `internal_grpc.server_addr` | String | `127.0.0.1:4010` | The address advertised to the metasrv, and used for connections from outside the host.<br/>If left empty or unset, the server will automatically use the IP address of the first network interface<br/>on the host, with the same port number as the one specified in `grpc.bind_addr`. |
| `internal_grpc.runtime_size` | Integer | `8` | The number of server worker threads. |
| `internal_grpc.flight_compression` | String | `arrow_ipc` | Compression mode for frontend side Arrow IPC service. Available options:<br/>- `none`: disable all compression<br/>- `transport`: only enable gRPC transport compression (zstd)<br/>- `arrow_ipc`: only enable Arrow IPC compression (lz4)<br/>- `all`: enable all compression.<br/>Default to `none` |
| `internal_grpc.tls` | -- | -- | internal gRPC server TLS options, see `mysql.tls` section. |
| `internal_grpc.tls.mode` | String | `disable` | TLS mode. |
| `internal_grpc.tls.cert_path` | String | Unset | Certificate file path. |
| `internal_grpc.tls.key_path` | String | Unset | Private key file path. |
| `internal_grpc.tls.watch` | Bool | `false` | Watch for Certificate and key file change and auto reload.<br/>For now, gRPC tls config does not support auto reload. |
| `mysql` | -- | -- | MySQL server options. |
| `mysql.enable` | Bool | `true` | Whether to enable. |
| `mysql.addr` | String | `127.0.0.1:4002` | The addr to bind the MySQL server. |
@@ -333,6 +343,7 @@
| `store_key_prefix` | String | `""` | If it's not empty, the metasrv will store all data with this key prefix. |
| `backend` | String | `etcd_store` | The datastore for meta server.<br/>Available values:<br/>- `etcd_store` (default value)<br/>- `memory_store`<br/>- `postgres_store`<br/>- `mysql_store` |
| `meta_table_name` | String | `greptime_metakv` | Table name in RDS to store metadata. Effect when using a RDS kvbackend.<br/>**Only used when backend is `postgres_store`.** |
| `meta_schema_name` | String | `greptime_schema` | Optional PostgreSQL schema for metadata table and election table name qualification.<br/>When PostgreSQL public schema is not writable (e.g., PostgreSQL 15+ with restricted public),<br/>set this to a writable schema. GreptimeDB will use `meta_schema_name`.`meta_table_name`.<br/>GreptimeDB will NOT create the schema automatically; please ensure it exists or the user has permission.<br/>**Only used when backend is `postgres_store`.** |
| `meta_election_lock_id` | Integer | `1` | Advisory lock id in PostgreSQL for election. Effect when using PostgreSQL as kvbackend<br/>Only used when backend is `postgres_store`. |
| `selector` | String | `round_robin` | Datanode selector type.<br/>- `round_robin` (default value)<br/>- `lease_based`<br/>- `load_based`<br/>For details, please see "https://docs.greptime.com/developer-guide/metasrv/selector". |
| `use_memory_store` | Bool | `false` | Store data in memory. |
@@ -344,7 +355,7 @@
| `runtime` | -- | -- | The runtime options. |
| `runtime.global_rt_size` | Integer | `8` | The number of threads to execute the runtime for global read operations. |
| `runtime.compact_rt_size` | Integer | `4` | The number of threads to execute the runtime for global write operations. |
| `backend_tls` | -- | -- | TLS configuration for kv store backend (only applicable for PostgreSQL/MySQL backends)<br/>When using PostgreSQL or MySQL as metadata store, you can configure TLS here |
| `backend_tls` | -- | -- | TLS configuration for kv store backend (applicable for etcd, PostgreSQL, and MySQL backends)<br/>When using etcd, PostgreSQL, or MySQL as metadata store, you can configure TLS here |
| `backend_tls.mode` | String | `prefer` | TLS mode, refer to https://www.postgresql.org/docs/current/libpq-ssl.html<br/>- "disable" - No TLS<br/>- "prefer" (default) - Try TLS, fallback to plain<br/>- "require" - Require TLS<br/>- "verify_ca" - Require TLS and verify CA<br/>- "verify_full" - Require TLS and verify hostname |
| `backend_tls.cert_path` | String | `""` | Path to client certificate file (for client authentication)<br/>Like "/path/to/client.crt" |
| `backend_tls.key_path` | String | `""` | Path to client private key file (for client authentication)<br/>Like "/path/to/client.key" |
@@ -379,8 +390,9 @@
| `wal.provider` | String | `raft_engine` | -- |
| `wal.broker_endpoints` | Array | -- | The broker endpoints of the Kafka cluster.<br/><br/>**It's only used when the provider is `kafka`**. |
| `wal.auto_create_topics` | Bool | `true` | Automatically create topics for WAL.<br/>Set to `true` to automatically create topics for WAL.<br/>Otherwise, use topics named `topic_name_prefix_[0..num_topics)`<br/>**It's only used when the provider is `kafka`**. |
| `wal.auto_prune_interval` | String | `10m` | Interval of automatically WAL pruning.<br/>Set to `0s` to disable automatically WAL pruning which delete unused remote WAL entries periodically.<br/>**It's only used when the provider is `kafka`**. |
| `wal.auto_prune_interval` | String | `30m` | Interval of automatically WAL pruning.<br/>Set to `0s` to disable automatically WAL pruning which delete unused remote WAL entries periodically.<br/>**It's only used when the provider is `kafka`**. |
| `wal.flush_trigger_size` | String | `512MB` | Estimated size threshold to trigger a flush when using Kafka remote WAL.<br/>Since multiple regions may share a Kafka topic, the estimated size is calculated as:<br/> (latest_entry_id - flushed_entry_id) * avg_record_size<br/>MetaSrv triggers a flush for a region when this estimated size exceeds `flush_trigger_size`.<br/>- `latest_entry_id`: The latest entry ID in the topic.<br/>- `flushed_entry_id`: The last flushed entry ID for the region.<br/>Set to "0" to let the system decide the flush trigger size.<br/>**It's only used when the provider is `kafka`**. |
| `wal.checkpoint_trigger_size` | String | `128MB` | Estimated size threshold to trigger a checkpoint when using Kafka remote WAL.<br/>The estimated size is calculated as:<br/> (latest_entry_id - last_checkpoint_entry_id) * avg_record_size<br/>MetaSrv triggers a checkpoint for a region when this estimated size exceeds `checkpoint_trigger_size`.<br/>Set to "0" to let the system decide the checkpoint trigger size.<br/>**It's only used when the provider is `kafka`**. |
| `wal.auto_prune_parallelism` | Integer | `10` | Concurrent task limit for automatically WAL pruning.<br/>**It's only used when the provider is `kafka`**. |
| `wal.num_topics` | Integer | `64` | Number of topics used for remote WAL.<br/>**It's only used when the provider is `kafka`**. |
| `wal.selector_type` | String | `round_robin` | Topic selector type.<br/>Available selector types:<br/>- `round_robin` (default)<br/>**It's only used when the provider is `kafka`**. |
@@ -509,6 +521,8 @@
| `region_engine.mito.worker_channel_size` | Integer | `128` | Request channel size of each worker. |
| `region_engine.mito.worker_request_batch_size` | Integer | `64` | Max batch size for a worker to handle requests. |
| `region_engine.mito.manifest_checkpoint_distance` | Integer | `10` | Number of meta action updated to trigger a new checkpoint for the manifest. |
| `region_engine.mito.experimental_manifest_keep_removed_file_count` | Integer | `256` | Number of removed files to keep in manifest's `removed_files` field before also<br/>remove them from `removed_files`. Mostly for debugging purpose.<br/>If set to 0, it will only use `keep_removed_file_ttl` to decide when to remove files<br/>from `removed_files` field. |
| `region_engine.mito.experimental_manifest_keep_removed_file_ttl` | String | `1h` | How long to keep removed files in the `removed_files` field of manifest<br/>after they are removed from manifest.<br/>files will only be removed from `removed_files` field<br/>if both `keep_removed_file_count` and `keep_removed_file_ttl` is reached. |
| `region_engine.mito.compress_manifest` | Bool | `false` | Whether to compress manifest and checkpoint file by gzip (default false). |
| `region_engine.mito.max_background_flushes` | Integer | Auto | Max number of running background flush jobs (default: 1/2 of cpu cores). |
| `region_engine.mito.max_background_compactions` | Integer | Auto | Max number of running background compaction jobs (default: 1/4 of cpu cores). |

View File

@@ -409,6 +409,19 @@ worker_request_batch_size = 64
## Number of meta action updated to trigger a new checkpoint for the manifest.
manifest_checkpoint_distance = 10
## Number of removed files to keep in manifest's `removed_files` field before also
## remove them from `removed_files`. Mostly for debugging purpose.
## If set to 0, it will only use `keep_removed_file_ttl` to decide when to remove files
## from `removed_files` field.
experimental_manifest_keep_removed_file_count = 256
## How long to keep removed files in the `removed_files` field of manifest
## after they are removed from manifest.
## files will only be removed from `removed_files` field
## if both `keep_removed_file_count` and `keep_removed_file_ttl` is reached.
experimental_manifest_keep_removed_file_ttl = "1h"
## Whether to compress manifest and checkpoint file by gzip (default false).
compress_manifest = false

View File

@@ -79,6 +79,42 @@ key_path = ""
## For now, gRPC tls config does not support auto reload.
watch = false
## The internal gRPC server options. Internal gRPC port for nodes inside cluster to access frontend.
[internal_grpc]
## The address to bind the gRPC server.
bind_addr = "127.0.0.1:4010"
## The address advertised to the metasrv, and used for connections from outside the host.
## If left empty or unset, the server will automatically use the IP address of the first network interface
## on the host, with the same port number as the one specified in `grpc.bind_addr`.
server_addr = "127.0.0.1:4010"
## The number of server worker threads.
runtime_size = 8
## Compression mode for frontend side Arrow IPC service. Available options:
## - `none`: disable all compression
## - `transport`: only enable gRPC transport compression (zstd)
## - `arrow_ipc`: only enable Arrow IPC compression (lz4)
## - `all`: enable all compression.
## Default to `none`
flight_compression = "arrow_ipc"
## internal gRPC server TLS options, see `mysql.tls` section.
[internal_grpc.tls]
## TLS mode.
mode = "disable"
## Certificate file path.
## @toml2docs:none-default
cert_path = ""
## Private key file path.
## @toml2docs:none-default
key_path = ""
## Watch for Certificate and key file change and auto reload.
## For now, gRPC tls config does not support auto reload.
watch = false
## MySQL server options.
[mysql]
## Whether to enable.

View File

@@ -23,6 +23,14 @@ backend = "etcd_store"
## **Only used when backend is `postgres_store`.**
meta_table_name = "greptime_metakv"
## Optional PostgreSQL schema for metadata table and election table name qualification.
## When PostgreSQL public schema is not writable (e.g., PostgreSQL 15+ with restricted public),
## set this to a writable schema. GreptimeDB will use `meta_schema_name`.`meta_table_name`.
## GreptimeDB will NOT create the schema automatically; please ensure it exists or the user has permission.
## **Only used when backend is `postgres_store`.**
meta_schema_name = "greptime_schema"
## Advisory lock id in PostgreSQL for election. Effect when using PostgreSQL as kvbackend
## Only used when backend is `postgres_store`.
meta_election_lock_id = 1
@@ -65,8 +73,8 @@ node_max_idle_time = "24hours"
## The number of threads to execute the runtime for global write operations.
#+ compact_rt_size = 4
## TLS configuration for kv store backend (only applicable for PostgreSQL/MySQL backends)
## When using PostgreSQL or MySQL as metadata store, you can configure TLS here
## TLS configuration for kv store backend (applicable for etcd, PostgreSQL, and MySQL backends)
## When using etcd, PostgreSQL, or MySQL as metadata store, you can configure TLS here
[backend_tls]
## TLS mode, refer to https://www.postgresql.org/docs/current/libpq-ssl.html
## - "disable" - No TLS
@@ -190,7 +198,7 @@ auto_create_topics = true
## Interval of automatically WAL pruning.
## Set to `0s` to disable automatically WAL pruning which delete unused remote WAL entries periodically.
## **It's only used when the provider is `kafka`**.
auto_prune_interval = "10m"
auto_prune_interval = "30m"
## Estimated size threshold to trigger a flush when using Kafka remote WAL.
@@ -203,6 +211,14 @@ auto_prune_interval = "10m"
## **It's only used when the provider is `kafka`**.
flush_trigger_size = "512MB"
## Estimated size threshold to trigger a checkpoint when using Kafka remote WAL.
## The estimated size is calculated as:
## (latest_entry_id - last_checkpoint_entry_id) * avg_record_size
## MetaSrv triggers a checkpoint for a region when this estimated size exceeds `checkpoint_trigger_size`.
## Set to "0" to let the system decide the checkpoint trigger size.
## **It's only used when the provider is `kafka`**.
checkpoint_trigger_size = "128MB"
## Concurrent task limit for automatically WAL pruning.
## **It's only used when the provider is `kafka`**.
auto_prune_parallelism = 10

View File

@@ -13,7 +13,8 @@ RUN apt-get update && apt-get install -y \
git \
unzip \
build-essential \
pkg-config
pkg-config \
openssh-client
# Install protoc
ARG PROTOBUF_VERSION=29.3

View File

@@ -34,6 +34,48 @@ services:
networks:
- greptimedb
etcd-tls:
<<: *etcd_common_settings
container_name: etcd-tls
ports:
- 2378:2378
- 2381:2381
command:
- --name=etcd-tls
- --data-dir=/var/lib/etcd
- --initial-advertise-peer-urls=https://etcd-tls:2381
- --listen-peer-urls=https://0.0.0.0:2381
- --listen-client-urls=https://0.0.0.0:2378
- --advertise-client-urls=https://etcd-tls:2378
- --heartbeat-interval=250
- --election-timeout=1250
- --initial-cluster=etcd-tls=https://etcd-tls:2381
- --initial-cluster-state=new
- --initial-cluster-token=etcd-tls-cluster
- --cert-file=/certs/server.crt
- --key-file=/certs/server-key.pem
- --peer-cert-file=/certs/server.crt
- --peer-key-file=/certs/server-key.pem
- --trusted-ca-file=/certs/ca.crt
- --peer-trusted-ca-file=/certs/ca.crt
- --client-cert-auth
- --peer-client-cert-auth
volumes:
- ./greptimedb-cluster-docker-compose/etcd-tls:/var/lib/etcd
- ./greptimedb-cluster-docker-compose/certs:/certs:ro
environment:
- ETCDCTL_API=3
- ETCDCTL_CACERT=/certs/ca.crt
- ETCDCTL_CERT=/certs/server.crt
- ETCDCTL_KEY=/certs/server-key.pem
healthcheck:
test: [ "CMD", "etcdctl", "--endpoints=https://etcd-tls:2378", "--cacert=/certs/ca.crt", "--cert=/certs/server.crt", "--key=/certs/server-key.pem", "endpoint", "health" ]
interval: 10s
timeout: 5s
retries: 5
networks:
- greptimedb
metasrv:
image: *greptimedb_image
container_name: metasrv

View File

@@ -0,0 +1,112 @@
---
Feature Name: Async Index Build
Tracking Issue: https://github.com/GreptimeTeam/greptimedb/issues/6756
Date: 2025-08-16
Author: "SNC123 <sinhco@outlook.com>"
---
# Summary
This RFC proposes an asynchronous index build mechanism in the database, with a configuration option to choose between synchronous and asynchronous modes, aiming to improve flexibility and adapt to different workload requirements.
# Motivation
Currently, index creation is performed synchronously, which may lead to prolonged write suspension and impact business continuity. As data volume grows, the time required for index building increases significantly. An asynchronous solution is urgently needed to enhance user experience and system throughput.
# Details
## Overview
The following table highlights the difference between async and sync index approach:
| Approach | Trigger | Data Source | Additional Index Metadata Installation | Fine-grained `FileMeta` Index |
| :--- | :--- | :--- | :--- | :--- |
| Sync Index | On `write_sst` | Memory (on flush) / Disk (on compact) | Not required (already installed synchronously) | Not required |
| Async Index | 4 trigger types | Disk | Required | Required |
The index build mode (synchronous or asynchronous) can be selected via configuration file.
### Four Trigger Types
This RFC introduces four `IndexBuildType`s to trigger index building:
- **Manual Rebuild**: Triggered by the user via `ADMIN build_index("table_name")`, for scenarios like recovering from failed builds or migrating data. SST files whose `ColumnIndexMetadata` (see below) is already consistent with the `RegionMetadata` will be skipped.
- **Schema Change**: Automatically triggered when the schema of an indexed column is altered.
- **Flush**: Automatically builds indexes for new SST files created by a flush.
- **Compact**: Automatically builds indexes for new SST files created by a compaction.
### Additional Index Metadata Installation
Previously, index information in the in-memory `FileMeta` was updated synchronously. The async approach requires an explicit installation step.
A race condition can occur when compaction and index building run concurrently, leading to:
1. Building an index for a file that is about to be deleted by compaction.
2. Creating an unnecessary index file and an incorrect manifest record.
3. On restart, replaying the manifest could load metadata for a non-existent file.
To prevent this, the system checks if a file's `FileMeta` is in a `compacting` state before updating the manifest. If it is, the installation is aborted.
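A hedged sketch of that guard (the names come from the RFC text; this is not the actual implementation):

```rust
/// Decide whether index metadata may be installed for a file: the file must
/// still exist in the current version and must not be undergoing compaction.
fn should_install_index(file_in_version: bool, compacting: bool) -> bool {
    file_in_version && !compacting
}

fn handle_index_build_finished(file_in_version: bool, compacting: bool) {
    if !should_install_index(file_in_version, compacting) {
        // Abort the installation: writing a manifest record now could point at
        // a file that compaction is about to delete.
        return;
    }
    // Otherwise apply the RegionEdit carrying the new index information.
}
```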
### Fine-grained `FileMeta` Index
The original `FileMeta` only stored file-level index information. However, manual rebuilds require column-level details to identify files inconsistent with the current DDL. Therefore, the `indexes` field in `FileMeta` is updated as follows:
```rust
struct FileMeta {
...
// From file-level:
// available_indexes: SmallVec<[IndexType; 4]>
// To column-level:
indexes: Vec<ColumnIndexMetadata>,
...
}
pub struct ColumnIndexMetadata {
pub column_id: ColumnId,
pub created_indexes: IndexTypes,
}
```
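For illustration, a manual rebuild could compare each column's expected indexes (derived from the current `RegionMetadata`) against a file's recorded `ColumnIndexMetadata` and skip files that already match. The helper below is a sketch with assumed shapes, not the actual implementation:
```rust
use std::collections::HashSet;

/// Stand-ins for the real metadata types (illustrative only).
#[derive(Clone, Copy, PartialEq, Eq, Hash)]
struct ColumnId(u32);
#[derive(Clone, Copy, PartialEq, Eq, Hash)]
enum IndexType {
    Inverted,
    Fulltext,
    Bloom,
}
struct ColumnIndexMetadata {
    column_id: ColumnId,
    created_indexes: HashSet<IndexType>,
}

/// A file needs a rebuild only if some column's expected indexes are not yet
/// present in its per-column metadata.
fn needs_rebuild(
    expected: &[(ColumnId, HashSet<IndexType>)],
    file_indexes: &[ColumnIndexMetadata],
) -> bool {
    expected.iter().any(|(col, wanted)| {
        file_indexes
            .iter()
            .find(|m| m.column_id == *col)
            .map_or(true, |m| !wanted.is_subset(&m.created_indexes))
    })
}
```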
## Process
The index building process is similar to a flush and is illustrated below:
```mermaid
sequenceDiagram
Region0->>Region0: Triggered by one of 4 conditions, targets specific files
loop For each target file
Region0->>IndexBuildScheduler: Submits an index build task
end
IndexBuildScheduler->>IndexBuildTask: Executes the task
IndexBuildTask->>Storage Interfaces: Reads SST data from disk
IndexBuildTask->>IndexBuildTask: Builds the index file
alt Index file size > 0
IndexBuildTask->>Region0: Sends IndexBuildFinished notification
end
alt File exists in Version and is not compacting
Region0->>Storage Interfaces: Updates manifest and Version
end
```
### Task Triggering and Scheduling
The process starts with one of the four `IndexBuildType` triggers. In `handle_rebuild_index`, the `RegionWorkerLoop` identifies target SSTs from the request or the current region version. It then creates an `IndexBuildTask` for each file and submits it to the `index_build_scheduler`.
Like flush and compaction operations, index build tasks are ultimately dispatched to the `LocalScheduler`, and their resource usage can be adjusted via the configuration file. Since asynchronous index tasks are both memory- and IO-intensive but lower priority, it is recommended to allocate fewer resources to them than to compaction and flush tasks, for example limiting them to 1/8 of the CPU cores.
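As a rough illustration of that sizing rule (the function and its parameters are assumptions, not an existing API):
```rust
/// Derive the number of concurrent index-build jobs from the core count and a
/// configured divisor (1/8 of the cores in the example below), keeping at
/// least one slot so builds can still make progress on small machines.
fn index_build_permits(num_cores: usize, share_divisor: usize) -> usize {
    (num_cores / share_divisor).max(1)
}

fn main() {
    // 32 cores with a 1/8 share -> 4 concurrent index build tasks.
    assert_eq!(index_build_permits(32, 8), 4);
    // Never drops below one permit.
    assert_eq!(index_build_permits(4, 8), 1);
}
```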
### Index Building and Notification
The scheduled `IndexBuildTask` executes its `index_build` method. It uses an `indexer_builder` to create an `Indexer` that reads SST data and builds the index. If a new index file is created (`IndexOutput.file_size > 0`), the task sends an `IndexBuildFinished` notification back to the `RegionWorkerLoop`.
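A minimal sketch of the notification step, assuming a channel back to the region worker loop; the message and field names are illustrative:
```rust
use tokio::sync::mpsc;

/// Illustrative output of an index build (not the real type).
struct IndexOutput {
    file_size: u64,
}

/// Illustrative notification sent back to the region worker loop.
enum WorkerRequest {
    IndexBuildFinished { file_size: u64 },
}

/// Only notify the worker when an index file was actually produced.
async fn finish_build(output: IndexOutput, sender: mpsc::Sender<WorkerRequest>) {
    if output.file_size > 0 {
        let _ = sender
            .send(WorkerRequest::IndexBuildFinished {
                file_size: output.file_size,
            })
            .await;
    }
}
```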
### Index Metadata Installation
Upon receiving the `IndexBuildFinished` notification in `handle_index_build_finished`, the `RegionWorkerLoop` verifies that the file still exists in the current `version` and is not being compacted. If the check passes, it calls `manifest_ctx.update_manifest` to apply a `RegionEdit` with the new index information, completing the installation.
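For illustration, the installation itself could be modeled as turning the build result into a single manifest edit; the types below are stand-ins, not the real `RegionEdit`:
```rust
/// Stand-in for the per-column index metadata carried by the edit.
#[derive(Debug, Clone)]
struct ColumnIndexInfo {
    column_id: u32,
    index_kinds: Vec<String>,
}

/// Stand-in for a manifest edit that installs index metadata for one file.
#[derive(Debug, Clone)]
struct IndexInstallEdit {
    file_id: u64,
    indexes: Vec<ColumnIndexInfo>,
}

/// Once the checks pass, wrap the new index metadata into an edit; persisting
/// it through the manifest lets the index survive restarts and become visible to queries.
fn make_install_edit(file_id: u64, indexes: Vec<ColumnIndexInfo>) -> IndexInstallEdit {
    IndexInstallEdit { file_id, indexes }
}
```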
# Drawbacks
Asynchronous index building may consume extra system resources, potentially affecting overall performance during peak periods.
There may be a delay before the new index becomes available for queries, which could impact certain use cases.
# Unresolved Questions and Future Work
**Resource Management and Throttling**: The resource consumption (CPU, I/O) of background index building can be managed and limited to some extent by configuring a dedicated background thread pool. However, this approach cannot fully eliminate resource contention, especially under heavy workloads or when I/O is highly competitive. Additional throttling mechanisms or dynamic prioritization may still be necessary to avoid impacting foreground operations.
# Alternatives
Instead of being triggered by events like Flush or Compact, index building could be performed in batches during scheduled maintenance windows. This offers predictable resource usage but delays index availability.

scripts/fix-udeps.py Executable file
View File

@@ -0,0 +1,265 @@
# Copyright 2023 Greptime Team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
import re
import sys
def load_udeps_report(report_path):
try:
with open(report_path, "r") as f:
return json.load(f)
except FileNotFoundError:
print(f"Error: Report file '{report_path}' not found.")
return None
except json.JSONDecodeError as e:
print(f"Error: Invalid JSON in report file: {e}")
return None
def extract_unused_dependencies(report):
"""
Extract and organize unused dependencies from the cargo-udeps JSON report.
The cargo-udeps report has this structure:
{
"unused_deps": {
"package_name v0.1.0 (/path/to/package)": {
"normal": ["dep1", "dep2"],
"development": ["dev_dep1"],
"build": ["build_dep1"],
"manifest_path": "/path/to/Cargo.toml"
}
}
}
Args:
report (dict): The parsed JSON report from cargo-udeps
Returns:
dict: Organized unused dependencies by package name:
{
"package_name": {
"dependencies": [("dep1", "normal"), ("dev_dep1", "dev")],
"manifest_path": "/path/to/Cargo.toml"
}
}
"""
if not report or "unused_deps" not in report:
return {}
unused_deps = {}
for package_full_name, deps_info in report["unused_deps"].items():
package_name = package_full_name.split(" ")[0]
all_unused = []
if deps_info.get("normal"):
all_unused.extend([(dep, "normal") for dep in deps_info["normal"]])
if deps_info.get("development"):
all_unused.extend([(dep, "dev") for dep in deps_info["development"]])
if deps_info.get("build"):
all_unused.extend([(dep, "build") for dep in deps_info["build"]])
if all_unused:
unused_deps[package_name] = {
"dependencies": all_unused,
"manifest_path": deps_info.get("manifest_path", "unknown"),
}
return unused_deps
def get_section_pattern(dep_type):
"""
Get regex patterns to identify different dependency sections in Cargo.toml.
Args:
dep_type (str): Type of dependency ("normal", "dev", or "build")
Returns:
list: List of regex patterns to match the appropriate section headers
"""
patterns = {
"normal": [r"\[dependencies\]", r"\[dependencies\..*?\]"],
"dev": [r"\[dev-dependencies\]", r"\[dev-dependencies\..*?\]"],
"build": [r"\[build-dependencies\]", r"\[build-dependencies\..*?\]"],
}
return patterns.get(dep_type, [])
def remove_dependency_line(content, dep_name, section_start, section_end):
"""
Remove a dependency line from a specific section of a Cargo.toml file.
Args:
content (str): The entire content of the Cargo.toml file
dep_name (str): Name of the dependency to remove (e.g., "serde", "tokio")
section_start (int): Starting position of the section in the content
section_end (int): Ending position of the section in the content
Returns:
tuple: (new_content, removed) where:
- new_content (str): The modified content with dependency removed
- removed (bool): True if dependency was found and removed, False otherwise
Example input content format:
content = '''
[package]
name = "my-crate"
version = "0.1.0"
[dependencies]
serde = "1.0"
tokio = { version = "1.0", features = ["full"] }
serde_json.workspace = true
[dev-dependencies]
tempfile = "3.0"
'''
# If dep_name = "serde", section_start = start of [dependencies],
# section_end = start of [dev-dependencies], this function will:
# 1. Extract the section: "serde = "1.0"\ntokio = { version = "1.0", features = ["full"] }\nserde_json.workspace = true\n"
# 2. Find and remove the line: "serde = "1.0""
# 3. Return the modified content with that line removed
"""
section_content = content[section_start:section_end]
dep_patterns = [
rf"^{re.escape(dep_name)}\s*=.*$", # e.g., "serde = "1.0""
rf"^{re.escape(dep_name)}\.workspace\s*=.*$", # e.g., "serde_json.workspace = true"
]
for pattern in dep_patterns:
match = re.search(pattern, section_content, re.MULTILINE)
if match:
line_start = section_start + match.start() # Start of the matched line
line_end = section_start + match.end() # End of the matched line
if line_end < len(content) and content[line_end] == "\n":
line_end += 1
return content[:line_start] + content[line_end:], True
return content, False
def remove_dependency_from_toml(file_path, dep_name, dep_type):
"""
Remove a specific dependency from a Cargo.toml file.
Args:
file_path (str): Path to the Cargo.toml file
dep_name (str): Name of the dependency to remove
dep_type (str): Type of dependency ("normal", "dev", or "build")
Returns:
bool: True if dependency was successfully removed, False otherwise
"""
try:
with open(file_path, "r") as f:
content = f.read()
section_patterns = get_section_pattern(dep_type)
if not section_patterns:
return False
for pattern in section_patterns:
section_match = re.search(pattern, content, re.IGNORECASE)
if not section_match:
continue
section_start = section_match.end()
next_section = re.search(r"\n\s*\[", content[section_start:])
section_end = (
section_start + next_section.start() if next_section else len(content)
)
new_content, removed = remove_dependency_line(
content, dep_name, section_start, section_end
)
if removed:
with open(file_path, "w") as f:
f.write(new_content)
return True
return False
except Exception as e:
print(f"Error processing {file_path}: {e}")
return False
def process_unused_dependencies(unused_deps):
"""
Process and remove all unused dependencies from their respective Cargo.toml files.
Args:
unused_deps (dict): Dictionary of unused dependencies organized by package:
{
"package_name": {
"dependencies": [("dep1", "normal"), ("dev_dep1", "dev")],
"manifest_path": "/path/to/Cargo.toml"
}
}
"""
if not unused_deps:
print("No unused dependencies found.")
return
total_removed = 0
total_failed = 0
for package, info in unused_deps.items():
deps = info["dependencies"]
manifest_path = info["manifest_path"]
if not os.path.exists(manifest_path):
print(f"Manifest file not found: {manifest_path}")
total_failed += len(deps)
continue
for dep, dep_type in deps:
if remove_dependency_from_toml(manifest_path, dep, dep_type):
print(f"Removed {dep} from {package}")
total_removed += 1
else:
print(f"Failed to remove {dep} from {package}")
total_failed += 1
print(f"Removed {total_removed} dependencies")
if total_failed > 0:
print(f"Failed to remove {total_failed} dependencies")
def main():
if len(sys.argv) > 1:
report_path = sys.argv[1]
else:
report_path = "udeps-report.json"
report = load_udeps_report(report_path)
if report is None:
sys.exit(1)
unused_deps = extract_unused_dependencies(report)
process_unused_dependencies(unused_deps)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,71 @@
#!/bin/bash
# Generate TLS certificates for etcd testing
# This script creates certificates for TLS-enabled etcd in testing environments
set -euo pipefail
CERT_DIR="${1:-$(dirname "$0")/../tests-integration/fixtures/etcd-tls-certs}"
DAYS="${2:-365}"
echo "Generating TLS certificates for etcd in ${CERT_DIR}..."
mkdir -p "${CERT_DIR}"
cd "${CERT_DIR}"
echo "Generating CA private key..."
openssl genrsa -out ca-key.pem 2048
echo "Generating CA certificate..."
openssl req -new -x509 -key ca-key.pem -out ca.crt -days "${DAYS}" \
-subj "/C=US/ST=CA/L=SF/O=Greptime/CN=etcd-ca"
# Create server certificate config with Subject Alternative Names
echo "Creating server certificate configuration..."
cat > server.conf << 'EOF'
[req]
distinguished_name = req
[v3_req]
basicConstraints = CA:FALSE
keyUsage = keyEncipherment, dataEncipherment
subjectAltName = @alt_names
[alt_names]
DNS.1 = localhost
DNS.2 = etcd-tls
DNS.3 = 127.0.0.1
IP.1 = 127.0.0.1
IP.2 = ::1
EOF
echo "Generating server private key..."
openssl genrsa -out server-key.pem 2048
echo "Generating server certificate signing request..."
openssl req -new -key server-key.pem -out server.csr \
-subj "/CN=etcd-tls"
echo "Generating server certificate..."
openssl x509 -req -in server.csr -CA ca.crt \
-CAkey ca-key.pem -CAcreateserial -out server.crt \
-days "${DAYS}" -extensions v3_req -extfile server.conf
echo "Generating client private key..."
openssl genrsa -out client-key.pem 2048
echo "Generating client certificate signing request..."
openssl req -new -key client-key.pem -out client.csr \
-subj "/CN=etcd-client"
echo "Generating client certificate..."
openssl x509 -req -in client.csr -CA ca.crt \
-CAkey ca-key.pem -CAcreateserial -out client.crt \
-days "${DAYS}"
echo "Setting proper file permissions..."
chmod 644 ca.crt server.crt client.crt
chmod 600 ca-key.pem server-key.pem client-key.pem
# Clean up intermediate files
rm -f server.csr client.csr server.conf
echo "TLS certificates generated successfully in ${CERT_DIR}"

View File

@@ -32,6 +32,7 @@ pub enum PermissionReq<'a> {
PromStoreRead,
Otlp,
LogWrite,
BulkInsert,
}
#[derive(Debug)]

View File

@@ -66,6 +66,9 @@ pub struct BenchTableMetadataCommand {
#[cfg(feature = "pg_kvbackend")]
#[clap(long)]
postgres_addr: Option<String>,
#[cfg(feature = "pg_kvbackend")]
#[clap(long)]
postgres_schema: Option<String>,
#[cfg(feature = "mysql_kvbackend")]
#[clap(long)]
mysql_addr: Option<String>,

View File

@@ -19,8 +19,9 @@ use common_error::ext::BoxedError;
use common_meta::kv_backend::chroot::ChrootKvBackend;
use common_meta::kv_backend::etcd::EtcdStore;
use common_meta::kv_backend::KvBackendRef;
use meta_srv::bootstrap::create_etcd_client;
use meta_srv::bootstrap::create_etcd_client_with_tls;
use meta_srv::metasrv::BackendImpl;
use servers::tls::{TlsMode, TlsOption};
use crate::error::{EmptyStoreAddrsSnafu, UnsupportedMemoryBackendSnafu};
@@ -55,6 +56,30 @@ pub(crate) struct StoreConfig {
#[cfg(any(feature = "pg_kvbackend", feature = "mysql_kvbackend"))]
#[clap(long, default_value = common_meta::kv_backend::DEFAULT_META_TABLE_NAME)]
meta_table_name: String,
/// Optional PostgreSQL schema for metadata table (defaults to current search_path if unset).
#[cfg(feature = "pg_kvbackend")]
#[clap(long)]
meta_schema_name: Option<String>,
/// TLS mode for backend store connections (etcd, PostgreSQL, MySQL)
#[clap(long = "backend-tls-mode", value_enum, default_value = "disable")]
backend_tls_mode: TlsMode,
/// Path to TLS certificate file for backend store connections
#[clap(long = "backend-tls-cert-path", default_value = "")]
backend_tls_cert_path: String,
/// Path to TLS private key file for backend store connections
#[clap(long = "backend-tls-key-path", default_value = "")]
backend_tls_key_path: String,
/// Path to TLS CA certificate file for backend store connections
#[clap(long = "backend-tls-ca-cert-path", default_value = "")]
backend_tls_ca_cert_path: String,
/// Enable watching TLS certificate files for changes
#[clap(long = "backend-tls-watch")]
backend_tls_watch: bool,
}
impl StoreConfig {
@@ -67,7 +92,18 @@ impl StoreConfig {
} else {
let kvbackend = match self.backend {
BackendImpl::EtcdStore => {
let etcd_client = create_etcd_client(store_addrs)
let tls_config = if self.backend_tls_mode != TlsMode::Disable {
Some(TlsOption {
mode: self.backend_tls_mode.clone(),
cert_path: self.backend_tls_cert_path.clone(),
key_path: self.backend_tls_key_path.clone(),
ca_cert_path: self.backend_tls_ca_cert_path.clone(),
watch: self.backend_tls_watch,
})
} else {
None
};
let etcd_client = create_etcd_client_with_tls(store_addrs, tls_config.as_ref())
.await
.map_err(BoxedError::new)?;
Ok(EtcdStore::with_etcd_client(etcd_client, max_txn_ops))
@@ -78,8 +114,10 @@ impl StoreConfig {
let pool = meta_srv::bootstrap::create_postgres_pool(store_addrs, None)
.await
.map_err(BoxedError::new)?;
let schema_name = self.meta_schema_name.as_deref();
Ok(common_meta::kv_backend::rds::PgStore::with_pg_pool(
pool,
schema_name,
table_name,
max_txn_ops,
)

View File

@@ -473,8 +473,8 @@ impl Database {
}) = &self.ctx.auth_header
{
let encoded = BASE64_STANDARD.encode(format!("{username}:{password}"));
let value =
MetadataValue::from_str(&encoded).context(InvalidTonicMetadataValueSnafu)?;
let value = MetadataValue::from_str(&format!("Basic {encoded}"))
.context(InvalidTonicMetadataValueSnafu)?;
request.metadata_mut().insert("x-greptime-auth", value);
}

View File

@@ -16,7 +16,7 @@ use std::time::Duration;
use api::v1::RowInsertRequests;
use humantime::format_duration;
use store_api::mito_engine_options::{APPEND_MODE_KEY, TTL_KEY};
use store_api::mito_engine_options::{APPEND_MODE_KEY, TTL_KEY, TWCS_TIME_WINDOW};
use crate::error::Result;
@@ -35,15 +35,23 @@ pub struct InsertOptions {
pub ttl: Duration,
/// Whether to use append mode for the insert.
pub append_mode: bool,
/// Time window for twcs compaction.
pub twcs_compaction_time_window: Option<Duration>,
}
impl InsertOptions {
/// Converts the insert options to a list of key-value string hints.
pub fn to_hints(&self) -> Vec<(&'static str, String)> {
vec![
let mut hints = vec![
(TTL_KEY, format_duration(self.ttl).to_string()),
(APPEND_MODE_KEY, self.append_mode.to_string()),
]
];
if let Some(time_window) = self.twcs_compaction_time_window {
hints.push((TWCS_TIME_WINDOW, format_duration(time_window).to_string()));
}
hints
}
}

View File

@@ -103,3 +103,6 @@ tempfile.workspace = true
[target.'cfg(not(windows))'.dev-dependencies]
rexpect = "0.5"
[package.metadata.cargo-udeps.ignore]
development = ["rexpect"]

View File

@@ -41,6 +41,7 @@ use frontend::server::Services;
use meta_client::{MetaClientOptions, MetaClientType};
use servers::addrs;
use servers::export_metrics::ExportMetricsTask;
use servers::grpc::GrpcOptions;
use servers::tls::{TlsMode, TlsOption};
use snafu::{OptionExt, ResultExt};
use tracing_appender::non_blocking::WorkerGuard;
@@ -144,6 +145,14 @@ pub struct StartCommand {
/// on the host, with the same port number as the one specified in `rpc_bind_addr`.
#[clap(long, alias = "rpc-hostname")]
rpc_server_addr: Option<String>,
/// The address to bind the internal gRPC server.
#[clap(long, alias = "internal-rpc-addr")]
internal_rpc_bind_addr: Option<String>,
/// The address advertised to the metasrv, and used for connections from outside the host.
/// If left empty or unset, the server will automatically use the IP address of the first network interface
/// on the host, with the same port number as the one specified in `internal_rpc_bind_addr`.
#[clap(long, alias = "internal-rpc-hostname")]
internal_rpc_server_addr: Option<String>,
#[clap(long)]
http_addr: Option<String>,
#[clap(long)]
@@ -241,6 +250,31 @@ impl StartCommand {
opts.grpc.server_addr.clone_from(addr);
}
if let Some(addr) = &self.internal_rpc_bind_addr {
if let Some(internal_grpc) = &mut opts.internal_grpc {
internal_grpc.bind_addr = addr.to_string();
} else {
let grpc_options = GrpcOptions {
bind_addr: addr.to_string(),
..Default::default()
};
opts.internal_grpc = Some(grpc_options);
}
}
if let Some(addr) = &self.internal_rpc_server_addr {
if let Some(internal_grpc) = &mut opts.internal_grpc {
internal_grpc.server_addr = addr.to_string();
} else {
let grpc_options = GrpcOptions {
server_addr: addr.to_string(),
..Default::default()
};
opts.internal_grpc = Some(grpc_options);
}
}
if let Some(addr) = &self.mysql_addr {
opts.mysql.enable = true;
opts.mysql.addr.clone_from(addr);
@@ -448,6 +482,8 @@ mod tests {
http_addr: Some("127.0.0.1:1234".to_string()),
mysql_addr: Some("127.0.0.1:5678".to_string()),
postgres_addr: Some("127.0.0.1:5432".to_string()),
internal_rpc_bind_addr: Some("127.0.0.1:4010".to_string()),
internal_rpc_server_addr: Some("10.0.0.24:4010".to_string()),
influxdb_enable: Some(false),
disable_dashboard: Some(false),
..Default::default()
@@ -460,6 +496,10 @@ mod tests {
assert_eq!(opts.mysql.addr, "127.0.0.1:5678");
assert_eq!(opts.postgres.addr, "127.0.0.1:5432");
let internal_grpc = opts.internal_grpc.as_ref().unwrap();
assert_eq!(internal_grpc.bind_addr, "127.0.0.1:4010");
assert_eq!(internal_grpc.server_addr, "10.0.0.24:4010");
let default_opts = FrontendOptions::default().component;
assert_eq!(opts.grpc.bind_addr, default_opts.grpc.bind_addr);

View File

@@ -146,6 +146,7 @@ fn test_load_frontend_example_config() {
grpc: GrpcOptions::default()
.with_bind_addr("127.0.0.1:4001")
.with_server_addr("127.0.0.1:4001"),
internal_grpc: Some(GrpcOptions::internal_default()),
http: HttpOptions {
cors_allowed_origins: vec!["https://example.com".to_string()],
..Default::default()
@@ -198,6 +199,7 @@ fn test_load_metasrv_example_config() {
ca_cert_path: String::new(),
watch: false,
}),
meta_schema_name: Some("greptime_schema".to_string()),
..Default::default()
},
..Default::default()

View File

@@ -290,6 +290,8 @@ macro_rules! define_into_tonic_status {
use tonic::metadata::MetadataMap;
use $crate::GREPTIME_DB_HEADER_ERROR_CODE;
common_telemetry::error!(err; "Failed to handle request");
let mut headers = HeaderMap::<HeaderValue>::with_capacity(2);
// If either of the status_code or error msg cannot convert to valid HTTP header value

View File

@@ -56,6 +56,8 @@ pub type EventRecorderRef = Arc<dyn EventRecorder>;
pub const DEFAULT_FLUSH_INTERVAL_SECONDS: Duration = Duration::from_secs(5);
/// The default TTL(90 days) for the events table.
const DEFAULT_EVENTS_TABLE_TTL: Duration = Duration::from_days(90);
/// The default compaction time window for the events table.
pub const DEFAULT_COMPACTION_TIME_WINDOW: Duration = Duration::from_days(1);
// The capacity of the tokio channel for transmitting events to background processor.
const DEFAULT_CHANNEL_SIZE: usize = 2048;
// The size of the buffer for batching events before flushing to event handler.

View File

@@ -57,7 +57,6 @@ serde_json.workspace = true
session.workspace = true
snafu.workspace = true
sql.workspace = true
statrs = "0.16"
store-api.workspace = true
table.workspace = true
uddsketch = { git = "https://github.com/GreptimeTeam/timescaledb-toolkit.git", rev = "84828fe8fb494a6a61412a3da96517fc80f7bb20" }

View File

@@ -21,8 +21,3 @@ syn = { version = "2.0", features = [
] }
[dev-dependencies]
arc-swap = "1.0"
common-query.workspace = true
datatypes.workspace = true
snafu.workspace = true
static_assertions = "1.1.0"

View File

@@ -94,5 +94,4 @@ common-procedure = { workspace = true, features = ["testing"] }
common-test-util.workspace = true
common-wal = { workspace = true, features = ["testing"] }
datatypes.workspace = true
hyper = { version = "0.14", features = ["full"] }
uuid.workspace = true

View File

@@ -242,6 +242,7 @@ mod tests {
flow_name: "my_flow".to_string(),
raw_sql: "sql".to_string(),
expire_after: Some(300),
eval_interval_secs: None,
comment: "comment".to_string(),
options: Default::default(),
created_time: chrono::Utc::now(),

View File

@@ -50,6 +50,8 @@ use crate::rpc::router::RegionRoute;
pub struct AlterLogicalTablesProcedure {
pub context: DdlContext,
pub data: AlterTablesData,
/// Physical table route cache.
pub physical_table_route: Option<PhysicalTableRouteValue>,
}
/// Builds the validator from the [`AlterTablesData`].
@@ -93,16 +95,20 @@ impl AlterLogicalTablesProcedure {
table_info_values: vec![],
physical_table_id,
physical_table_info: None,
physical_table_route: None,
physical_columns: vec![],
table_cache_keys_to_invalidate: vec![],
},
physical_table_route: None,
}
}
pub fn from_json(json: &str, context: DdlContext) -> ProcedureResult<Self> {
let data = serde_json::from_str(json).context(FromJsonSnafu)?;
Ok(Self { context, data })
Ok(Self {
context,
data,
physical_table_route: None,
})
}
pub(crate) async fn on_prepare(&mut self) -> Result<Status> {
@@ -141,21 +147,24 @@ impl AlterLogicalTablesProcedure {
// Updates the procedure state.
retain_unskipped(&mut self.data.tasks, &skip_alter);
self.data.physical_table_info = Some(physical_table_info);
self.data.physical_table_route = Some(physical_table_route);
self.data.table_info_values = table_info_values;
debug_assert_eq!(self.data.tasks.len(), self.data.table_info_values.len());
self.physical_table_route = Some(physical_table_route);
self.data.state = AlterTablesState::SubmitAlterRegionRequests;
Ok(Status::executing(true))
}
pub(crate) async fn on_submit_alter_region_requests(&mut self) -> Result<Status> {
// Safety: we have checked the state in on_prepare
let physical_table_route = &self.data.physical_table_route.as_ref().unwrap();
self.fetch_physical_table_route_if_non_exist().await?;
// Safety: fetched in `fetch_physical_table_route_if_non_exist`.
let region_routes = &self.physical_table_route.as_ref().unwrap().region_routes;
let executor = build_executor_from_alter_expr(&self.data);
let mut results = executor
.on_alter_regions(
&self.context.node_manager,
&physical_table_route.region_routes,
// Avoid double-borrowing self by extracting the region_routes first
region_routes,
)
.await?;
@@ -166,7 +175,7 @@ impl AlterLogicalTablesProcedure {
} else {
warn!("altering logical table result doesn't contains extension key `{ALTER_PHYSICAL_EXTENSION_KEY}`,leaving the physical table's schema unchanged");
}
self.submit_sync_region_requests(results, &physical_table_route.region_routes)
self.submit_sync_region_requests(results, region_routes)
.await;
self.data.state = AlterTablesState::UpdateMetadata;
Ok(Status::executing(true))
@@ -232,6 +241,21 @@ impl AlterLogicalTablesProcedure {
.await?;
Ok(Status::done())
}
/// Fetches the physical table route if it is not already fetched.
async fn fetch_physical_table_route_if_non_exist(&mut self) -> Result<()> {
if self.physical_table_route.is_none() {
let (_, physical_table_route) = self
.context
.table_metadata_manager
.table_route_manager()
.get_physical_table_route(self.data.physical_table_id)
.await?;
self.physical_table_route = Some(physical_table_route);
}
Ok(())
}
}
#[async_trait]
@@ -261,6 +285,10 @@ impl Procedure for AlterLogicalTablesProcedure {
AlterTablesState::UpdateMetadata => self.on_update_metadata().await,
AlterTablesState::InvalidateTableCache => self.on_invalidate_table_cache().await,
}
.inspect_err(|_| {
// Reset the physical table route cache.
self.physical_table_route = None;
})
.map_err(map_to_procedure_error)
}
@@ -298,7 +326,6 @@ pub struct AlterTablesData {
/// Physical table info
physical_table_id: TableId,
physical_table_info: Option<DeserializedValueWithBytes<TableInfoValue>>,
physical_table_route: Option<PhysicalTableRouteValue>,
physical_columns: Vec<ColumnMetadata>,
table_cache_keys_to_invalidate: Vec<CacheIdent>,
}
@@ -311,7 +338,6 @@ impl AlterTablesData {
self.table_info_values.clear();
self.physical_table_id = 0;
self.physical_table_info = None;
self.physical_table_route = None;
self.physical_columns.clear();
}
}

View File

@@ -28,9 +28,11 @@ use crate::rpc::router::region_distribution;
impl AlterLogicalTablesProcedure {
pub(crate) async fn update_physical_table_metadata(&mut self) -> Result<()> {
self.fetch_physical_table_route_if_non_exist().await?;
// Safety: must exist.
let physical_table_info = self.data.physical_table_info.as_ref().unwrap();
let physical_table_route = self.data.physical_table_route.as_ref().unwrap();
// Safety: fetched in `fetch_physical_table_route_if_non_exist`.
let physical_table_route = self.physical_table_route.as_ref().unwrap();
let region_distribution = region_distribution(&physical_table_route.region_routes);
// Updates physical table's metadata.

View File

@@ -445,6 +445,10 @@ impl From<&CreateFlowData> for CreateRequest {
create_if_not_exists: true,
or_replace: value.task.or_replace,
expire_after: value.task.expire_after.map(|value| ExpireAfter { value }),
eval_interval: value
.task
.eval_interval_secs
.map(|seconds| api::v1::EvalInterval { seconds }),
comment: value.task.comment.clone(),
sql: value.task.sql.clone(),
flow_options: value.task.flow_options.clone(),
@@ -464,6 +468,7 @@ impl From<&CreateFlowData> for (FlowInfoValue, Vec<(FlowPartitionId, FlowRouteVa
flow_name,
sink_table_name,
expire_after,
eval_interval_secs: eval_interval,
comment,
sql,
flow_options: mut options,
@@ -503,6 +508,7 @@ impl From<&CreateFlowData> for (FlowInfoValue, Vec<(FlowPartitionId, FlowRouteVa
flow_name,
raw_sql: sql,
expire_after,
eval_interval_secs: eval_interval,
comment,
options,
created_time: create_time,

View File

@@ -45,6 +45,7 @@ pub(crate) fn test_create_flow_task(
or_replace: false,
create_if_not_exists,
expire_after: Some(300),
eval_interval_secs: None,
comment: "".to_string(),
sql: "select 1".to_string(),
flow_options: Default::default(),
@@ -189,6 +190,7 @@ fn create_test_flow_task_for_serialization() -> CreateFlowTask {
or_replace: false,
create_if_not_exists: false,
expire_after: None,
eval_interval_secs: None,
comment: "test comment".to_string(),
sql: "SELECT * FROM source_table".to_string(),
flow_options: HashMap::new(),

View File

@@ -108,6 +108,10 @@ pub struct OpenRegion {
pub region_wal_options: HashMap<RegionNumber, String>,
#[serde(default)]
pub skip_wal_replay: bool,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub replay_entry_id: Option<u64>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub metadata_replay_entry_id: Option<u64>,
}
impl OpenRegion {
@@ -124,8 +128,22 @@ impl OpenRegion {
region_options,
region_wal_options,
skip_wal_replay,
replay_entry_id: None,
metadata_replay_entry_id: None,
}
}
/// Sets the replay entry id.
pub fn with_replay_entry_id(mut self, replay_entry_id: Option<u64>) -> Self {
self.replay_entry_id = replay_entry_id;
self
}
/// Sets the metadata replay entry id.
pub fn with_metadata_replay_entry_id(mut self, metadata_replay_entry_id: Option<u64>) -> Self {
self.metadata_replay_entry_id = metadata_replay_entry_id;
self
}
}
/// The instruction of downgrading leader region.
@@ -352,6 +370,8 @@ mod tests {
region_options,
region_wal_options: HashMap::new(),
skip_wal_replay: false,
replay_entry_id: None,
metadata_replay_entry_id: None,
};
assert_eq!(expected, deserialized);
}

View File

@@ -155,6 +155,7 @@ use crate::error::{self, Result, SerdeJsonSnafu};
use crate::key::flow::flow_state::FlowStateValue;
use crate::key::node_address::NodeAddressValue;
use crate::key::table_route::TableRouteKey;
use crate::key::topic_region::TopicRegionValue;
use crate::key::txn_helper::TxnOpGetResponseSet;
use crate::kv_backend::txn::{Txn, TxnOp};
use crate::kv_backend::KvBackendRef;
@@ -164,6 +165,7 @@ use crate::state_store::PoisonValue;
use crate::DatanodeId;
pub const NAME_PATTERN: &str = r"[a-zA-Z_:-][a-zA-Z0-9_:\-\.@#]*";
pub const TOPIC_NAME_PATTERN: &str = r"[a-zA-Z0-9_:-][a-zA-Z0-9_:\-\.@#]*";
pub const LEGACY_MAINTENANCE_KEY: &str = "__maintenance";
pub const MAINTENANCE_KEY: &str = "__switches/maintenance";
pub const PAUSE_PROCEDURE_KEY: &str = "__switches/pause_procedure";
@@ -271,6 +273,10 @@ lazy_static! {
pub static ref NAME_PATTERN_REGEX: Regex = Regex::new(NAME_PATTERN).unwrap();
}
lazy_static! {
pub static ref TOPIC_NAME_PATTERN_REGEX: Regex = Regex::new(TOPIC_NAME_PATTERN).unwrap();
}
lazy_static! {
static ref TABLE_INFO_KEY_PATTERN: Regex =
Regex::new(&format!("^{TABLE_INFO_KEY_PREFIX}/([0-9]+)$")).unwrap();
@@ -326,7 +332,7 @@ lazy_static! {
lazy_static! {
pub static ref TOPIC_REGION_PATTERN: Regex = Regex::new(&format!(
"^{TOPIC_REGION_PREFIX}/({NAME_PATTERN})/([0-9]+)$"
"^{TOPIC_REGION_PREFIX}/({TOPIC_NAME_PATTERN})/([0-9]+)$"
))
.unwrap();
}
@@ -622,7 +628,6 @@ impl TableMetadataManager {
&self.topic_region_manager
}
#[cfg(feature = "testing")]
pub fn kv_backend(&self) -> &KvBackendRef {
&self.kv_backend
}
@@ -1434,7 +1439,8 @@ impl_metadata_value! {
NodeAddressValue,
SchemaNameValue,
FlowStateValue,
PoisonValue
PoisonValue,
TopicRegionValue
}
impl_optional_metadata_value! {
@@ -1676,9 +1682,11 @@ mod tests {
.topic_region_manager
.regions(&topic)
.await
.unwrap();
.unwrap()
.into_keys()
.collect::<Vec<_>>();
assert_eq!(regions.len(), 8);
assert_eq!(regions[0], region_id);
assert!(regions.contains(&region_id));
}
}

View File

@@ -464,6 +464,7 @@ mod tests {
flownode_ids,
raw_sql: "raw".to_string(),
expire_after: Some(300),
eval_interval_secs: None,
comment: "hi".to_string(),
options: Default::default(),
created_time: chrono::Utc::now(),
@@ -638,6 +639,7 @@ mod tests {
flownode_ids: [(0, 1u64)].into(),
raw_sql: "raw".to_string(),
expire_after: Some(300),
eval_interval_secs: None,
comment: "hi".to_string(),
options: Default::default(),
created_time: chrono::Utc::now(),
@@ -1013,6 +1015,7 @@ mod tests {
flownode_ids: [(0, 1u64)].into(),
raw_sql: "raw".to_string(),
expire_after: Some(300),
eval_interval_secs: None,
comment: "hi".to_string(),
options: Default::default(),
created_time: chrono::Utc::now(),

View File

@@ -135,6 +135,12 @@ pub struct FlowInfoValue {
/// The expr of expire.
/// Duration in seconds as `i64`.
pub expire_after: Option<i64>,
/// The eval interval.
/// Duration in seconds as `i64`.
/// If `None`, will automatically decide when to evaluate the flow.
/// If `Some`, it will be evaluated every `eval_interval` seconds.
#[serde(default)]
pub eval_interval_secs: Option<i64>,
/// The comment.
pub comment: String,
/// The options.
@@ -191,6 +197,10 @@ impl FlowInfoValue {
self.expire_after
}
pub fn eval_interval(&self) -> Option<i64> {
self.eval_interval_secs
}
pub fn comment(&self) -> &String {
&self.comment
}

View File

@@ -12,20 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::HashMap;
use std::fmt::{self, Display};
@@ -37,10 +23,12 @@ use table::metadata::TableId;
use crate::ddl::utils::parse_region_wal_options;
use crate::error::{Error, InvalidMetadataSnafu, Result};
use crate::key::{MetadataKey, TOPIC_REGION_PATTERN, TOPIC_REGION_PREFIX};
use crate::key::{MetadataKey, MetadataValue, TOPIC_REGION_PATTERN, TOPIC_REGION_PREFIX};
use crate::kv_backend::txn::{Txn, TxnOp};
use crate::kv_backend::KvBackendRef;
use crate::rpc::store::{BatchDeleteRequest, BatchPutRequest, PutRequest, RangeRequest};
use crate::rpc::store::{
BatchDeleteRequest, BatchGetRequest, BatchPutRequest, PutRequest, RangeRequest,
};
use crate::rpc::KeyValue;
// The TopicRegionKey is a key for the topic-region mapping in the kvbackend.
@@ -51,8 +39,20 @@ pub struct TopicRegionKey<'a> {
pub topic: &'a str,
}
#[derive(Debug, Serialize, Deserialize)]
pub struct TopicRegionValue;
/// Represents additional information for a region when using a shared WAL.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, Default)]
pub struct TopicRegionValue {
#[serde(default, skip_serializing_if = "Option::is_none")]
pub checkpoint: Option<ReplayCheckpoint>,
}
#[derive(Debug, Clone, Copy, Serialize, Deserialize, Default)]
pub struct ReplayCheckpoint {
#[serde(default)]
pub entry_id: u64,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub metadata_entry_id: Option<u64>,
}
impl<'a> TopicRegionKey<'a> {
pub fn new(region_id: RegionId, topic: &'a str) -> Self {
@@ -118,9 +118,47 @@ impl<'a> TryFrom<&'a str> for TopicRegionKey<'a> {
}
}
fn topic_region_decoder(value: &KeyValue) -> Result<TopicRegionKey<'_>> {
impl ReplayCheckpoint {
/// Creates a new [`ReplayCheckpoint`] with the given entry id and metadata entry id.
pub fn new(entry_id: u64, metadata_entry_id: Option<u64>) -> Self {
Self {
entry_id,
metadata_entry_id,
}
}
}
impl TopicRegionValue {
/// Creates a new [`TopicRegionValue`] with the given checkpoint.
pub fn new(checkpoint: Option<ReplayCheckpoint>) -> Self {
Self { checkpoint }
}
/// Returns the minimum entry id of the region.
///
/// If the metadata entry id is not set, it returns the entry id.
pub fn min_entry_id(&self) -> Option<u64> {
match self.checkpoint {
Some(ReplayCheckpoint {
entry_id,
metadata_entry_id,
}) => match metadata_entry_id {
Some(metadata_entry_id) => Some(entry_id.min(metadata_entry_id)),
None => Some(entry_id),
},
None => None,
}
}
}
fn topic_region_decoder(value: &KeyValue) -> Result<(TopicRegionKey<'_>, TopicRegionValue)> {
let key = TopicRegionKey::from_bytes(&value.key)?;
Ok(key)
let value = if value.value.is_empty() {
TopicRegionValue::default()
} else {
TopicRegionValue::try_from_raw_value(&value.value)?
};
Ok((key, value))
}
/// Manages map of topics and regions in kvbackend.
@@ -143,21 +181,59 @@ impl TopicRegionManager {
Ok(())
}
pub async fn batch_put(&self, keys: Vec<TopicRegionKey<'_>>) -> Result<()> {
pub async fn batch_get(
&self,
keys: Vec<TopicRegionKey<'_>>,
) -> Result<HashMap<RegionId, TopicRegionValue>> {
let raw_keys = keys.iter().map(|key| key.to_bytes()).collect::<Vec<_>>();
let req = BatchGetRequest { keys: raw_keys };
let resp = self.kv_backend.batch_get(req).await?;
let v = resp
.kvs
.into_iter()
.map(|kv| topic_region_decoder(&kv).map(|(key, value)| (key.region_id, value)))
.collect::<Result<HashMap<_, _>>>()?;
Ok(v)
}
pub async fn get(&self, key: TopicRegionKey<'_>) -> Result<Option<TopicRegionValue>> {
let key_bytes = key.to_bytes();
let resp = self.kv_backend.get(&key_bytes).await?;
let value = resp
.map(|kv| topic_region_decoder(&kv).map(|(_, value)| value))
.transpose()?;
Ok(value)
}
pub async fn batch_put(
&self,
keys: &[(TopicRegionKey<'_>, Option<TopicRegionValue>)],
) -> Result<()> {
let req = BatchPutRequest {
kvs: keys
.into_iter()
.map(|key| KeyValue {
key: key.to_bytes(),
value: vec![],
.iter()
.map(|(key, value)| {
let value = value
.map(|v| v.try_as_raw_value())
.transpose()?
.unwrap_or_default();
Ok(KeyValue {
key: key.to_bytes(),
value,
})
})
.collect(),
.collect::<Result<Vec<_>>>()?,
prev_kv: false,
};
self.kv_backend.batch_put(req).await?;
Ok(())
}
/// Builds a transaction that creates the topic-region mappings. It only executes when the primary key comparisons succeed.
pub fn build_create_txn(
&self,
table_id: TableId,
@@ -176,8 +252,8 @@ impl TopicRegionManager {
Ok(Txn::new().and_then(operations))
}
/// Returns the list of region ids using specified topic.
pub async fn regions(&self, topic: &str) -> Result<Vec<RegionId>> {
/// Returns the map of [`RegionId`] to their corresponding topic [`TopicRegionValue`].
pub async fn regions(&self, topic: &str) -> Result<HashMap<RegionId, TopicRegionValue>> {
let prefix = TopicRegionKey::range_topic_key(topic);
let req = RangeRequest::new().with_prefix(prefix.as_bytes());
let resp = self.kv_backend.range(req).await?;
@@ -186,7 +262,10 @@ impl TopicRegionManager {
.iter()
.map(topic_region_decoder)
.collect::<Result<Vec<_>>>()?;
Ok(region_ids.iter().map(|key| key.region_id).collect())
Ok(region_ids
.into_iter()
.map(|(key, value)| (key.region_id, value))
.collect())
}
pub async fn delete(&self, key: TopicRegionKey<'_>) -> Result<()> {
@@ -248,15 +327,24 @@ mod tests {
let topics = (0..16).map(|i| format!("topic_{}", i)).collect::<Vec<_>>();
let keys = (0..64)
.map(|i| TopicRegionKey::new(RegionId::from_u64(i), &topics[(i % 16) as usize]))
.map(|i| {
(
TopicRegionKey::new(RegionId::from_u64(i), &topics[(i % 16) as usize]),
None,
)
})
.collect::<Vec<_>>();
manager.batch_put(keys.clone()).await.unwrap();
let mut key_values = manager.regions(&topics[0]).await.unwrap();
manager.batch_put(&keys).await.unwrap();
let mut key_values = manager
.regions(&topics[0])
.await
.unwrap()
.into_keys()
.collect::<Vec<_>>();
let expected = keys
.iter()
.filter_map(|key| {
.filter_map(|(key, _)| {
if key.topic == topics[0] {
Some(key.region_id)
} else {
@@ -269,10 +357,15 @@ mod tests {
let key = TopicRegionKey::new(RegionId::from_u64(0), "topic_0");
manager.delete(key.clone()).await.unwrap();
let mut key_values = manager.regions(&topics[0]).await.unwrap();
let mut key_values = manager
.regions(&topics[0])
.await
.unwrap()
.into_keys()
.collect::<Vec<_>>();
let expected = keys
.iter()
.filter_map(|key| {
.filter_map(|(key, _)| {
if key.topic == topics[0] && key.region_id != RegionId::from_u64(0) {
Some(key.region_id)
} else {
@@ -324,4 +417,18 @@ mod tests {
expected.sort_by_key(|(region_id, _)| region_id.as_u64());
assert_eq!(topic_region_map, expected);
}
#[test]
fn test_topic_region_key_is_match() {
let key = "__topic_region/6f153a64-7fac-4cf6-8b0b-a7967dd73879_2/4410931412992";
let topic_region_key = TopicRegionKey::try_from(key).unwrap();
assert_eq!(
topic_region_key.topic,
"6f153a64-7fac-4cf6-8b0b-a7967dd73879_2"
);
assert_eq!(
topic_region_key.region_id,
RegionId::from_u64(4410931412992)
);
}
}

View File

@@ -32,6 +32,8 @@ use crate::rpc::store::{
};
use crate::rpc::KeyValue;
const DEFAULT_MAX_DECODING_SIZE: usize = 32 * 1024 * 1024; // 32MB
pub struct EtcdStore {
client: Client,
// Maximum number of operations permitted in a transaction.
@@ -39,6 +41,8 @@ pub struct EtcdStore {
//
// For more detail, see: https://etcd.io/docs/v3.5/op-guide/configuration/
max_txn_ops: usize,
// Maximum decoding message size in bytes. Default 32MB.
max_decoding_size: usize,
}
impl EtcdStore {
@@ -59,9 +63,20 @@ impl EtcdStore {
Arc::new(Self {
client,
max_txn_ops,
max_decoding_size: DEFAULT_MAX_DECODING_SIZE,
})
}
pub fn set_max_decoding_size(&mut self, max_decoding_size: usize) {
self.max_decoding_size = max_decoding_size;
}
fn kv_client(&self) -> etcd_client::KvClient {
self.client
.kv_client()
.max_decoding_message_size(self.max_decoding_size)
}
async fn do_multi_txn(&self, txn_ops: Vec<TxnOp>) -> Result<Vec<TxnResponse>> {
let max_txn_ops = self.max_txn_ops();
if txn_ops.len() < max_txn_ops {
@@ -71,7 +86,6 @@ impl EtcdStore {
.start_timer();
let txn = Txn::new().and_then(txn_ops);
let txn_res = self
.client
.kv_client()
.txn(txn)
.await
@@ -110,7 +124,6 @@ impl KvBackend for EtcdStore {
let Get { key, options } = req.try_into()?;
let mut res = self
.client
.kv_client()
.get(key, options)
.await
@@ -136,7 +149,6 @@ impl KvBackend for EtcdStore {
} = req.try_into()?;
let mut res = self
.client
.kv_client()
.put(key, value, options)
.await
@@ -201,7 +213,6 @@ impl KvBackend for EtcdStore {
let Delete { key, options } = req.try_into()?;
let mut res = self
.client
.kv_client()
.delete(key, options)
.await
@@ -265,7 +276,6 @@ impl TxnService for EtcdStore {
let etcd_txn: Txn = txn.into();
let txn_res = self
.client
.kv_client()
.txn(etcd_txn)
.await
@@ -564,6 +574,7 @@ mod tests {
Some(EtcdStore {
client,
max_txn_ops: 128,
max_decoding_size: DEFAULT_MAX_DECODING_SIZE,
})
}

View File

@@ -192,50 +192,61 @@ fn pg_generate_in_placeholders(from: usize, to: usize) -> Vec<String> {
/// Factory for building sql templates.
struct PgSqlTemplateFactory<'a> {
schema_name: Option<&'a str>,
table_name: &'a str,
}
impl<'a> PgSqlTemplateFactory<'a> {
/// Creates a new [`SqlTemplateFactory`] with the given table name.
fn new(table_name: &'a str) -> Self {
Self { table_name }
/// Creates a new factory with optional schema.
fn new(schema_name: Option<&'a str>, table_name: &'a str) -> Self {
Self {
schema_name,
table_name,
}
}
/// Builds the template set for the given table name.
fn build(&self) -> PgSqlTemplateSet {
let table_name = self.table_name;
let table_ident = Self::format_table_ident(self.schema_name, self.table_name);
// Some queries don't end with `;`, because we need to add a `LIMIT` clause.
PgSqlTemplateSet {
table_name: table_name.to_string(),
table_ident: table_ident.clone(),
// Do not attempt to create schema implicitly to avoid extra privileges requirement.
create_table_statement: format!(
"CREATE TABLE IF NOT EXISTS \"{table_name}\"(k bytea PRIMARY KEY, v bytea)",
"CREATE TABLE IF NOT EXISTS {table_ident}(k bytea PRIMARY KEY, v bytea)",
),
range_template: RangeTemplate {
point: format!("SELECT k, v FROM \"{table_name}\" WHERE k = $1"),
point: format!("SELECT k, v FROM {table_ident} WHERE k = $1"),
range: format!(
"SELECT k, v FROM \"{table_name}\" WHERE k >= $1 AND k < $2 ORDER BY k"
"SELECT k, v FROM {table_ident} WHERE k >= $1 AND k < $2 ORDER BY k"
),
full: format!("SELECT k, v FROM \"{table_name}\" ORDER BY k"),
left_bounded: format!("SELECT k, v FROM \"{table_name}\" WHERE k >= $1 ORDER BY k"),
prefix: format!("SELECT k, v FROM \"{table_name}\" WHERE k LIKE $1 ORDER BY k"),
full: format!("SELECT k, v FROM {table_ident} ORDER BY k"),
left_bounded: format!("SELECT k, v FROM {table_ident} WHERE k >= $1 ORDER BY k"),
prefix: format!("SELECT k, v FROM {table_ident} WHERE k LIKE $1 ORDER BY k"),
},
delete_template: RangeTemplate {
point: format!("DELETE FROM \"{table_name}\" WHERE k = $1 RETURNING k,v;"),
range: format!(
"DELETE FROM \"{table_name}\" WHERE k >= $1 AND k < $2 RETURNING k,v;"
),
full: format!("DELETE FROM \"{table_name}\" RETURNING k,v"),
left_bounded: format!("DELETE FROM \"{table_name}\" WHERE k >= $1 RETURNING k,v;"),
prefix: format!("DELETE FROM \"{table_name}\" WHERE k LIKE $1 RETURNING k,v;"),
point: format!("DELETE FROM {table_ident} WHERE k = $1 RETURNING k,v;"),
range: format!("DELETE FROM {table_ident} WHERE k >= $1 AND k < $2 RETURNING k,v;"),
full: format!("DELETE FROM {table_ident} RETURNING k,v"),
left_bounded: format!("DELETE FROM {table_ident} WHERE k >= $1 RETURNING k,v;"),
prefix: format!("DELETE FROM {table_ident} WHERE k LIKE $1 RETURNING k,v;"),
},
}
}
/// Formats the table reference with schema if provided.
fn format_table_ident(schema_name: Option<&str>, table_name: &str) -> String {
match schema_name {
Some(s) if !s.is_empty() => format!("\"{}\".\"{}\"", s, table_name),
_ => format!("\"{}\"", table_name),
}
}
}
/// Templates for the given table name.
#[derive(Debug, Clone)]
pub struct PgSqlTemplateSet {
table_name: String,
table_ident: String,
create_table_statement: String,
range_template: RangeTemplate,
delete_template: RangeTemplate,
@@ -244,27 +255,24 @@ pub struct PgSqlTemplateSet {
impl PgSqlTemplateSet {
/// Generates the sql for batch get.
fn generate_batch_get_query(&self, key_len: usize) -> String {
let table_name = &self.table_name;
let in_clause = pg_generate_in_placeholders(1, key_len).join(", ");
format!(
"SELECT k, v FROM \"{table_name}\" WHERE k in ({});",
in_clause
"SELECT k, v FROM {} WHERE k in ({});",
self.table_ident, in_clause
)
}
/// Generates the sql for batch delete.
fn generate_batch_delete_query(&self, key_len: usize) -> String {
let table_name = &self.table_name;
let in_clause = pg_generate_in_placeholders(1, key_len).join(", ");
format!(
"DELETE FROM \"{table_name}\" WHERE k in ({}) RETURNING k,v;",
in_clause
"DELETE FROM {} WHERE k in ({}) RETURNING k,v;",
self.table_ident, in_clause
)
}
/// Generates the sql for batch upsert.
fn generate_batch_upsert_query(&self, kv_len: usize) -> String {
let table_name = &self.table_name;
let in_placeholders: Vec<String> = (1..=kv_len).map(|i| format!("${}", i)).collect();
let in_clause = in_placeholders.join(", ");
let mut param_index = kv_len + 1;
@@ -278,9 +286,9 @@ impl PgSqlTemplateSet {
format!(
r#"
WITH prev AS (
SELECT k,v FROM "{table_name}" WHERE k IN ({in_clause})
SELECT k,v FROM {table} WHERE k IN ({in_clause})
), update AS (
INSERT INTO "{table_name}" (k, v) VALUES
INSERT INTO {table} (k, v) VALUES
{values_clause}
ON CONFLICT (
k
@@ -289,7 +297,10 @@ impl PgSqlTemplateSet {
)
SELECT k, v FROM prev;
"#
"#,
table = self.table_ident,
in_clause = in_clause,
values_clause = values_clause
)
}
}
@@ -835,7 +846,7 @@ impl PgStore {
.context(CreatePostgresPoolSnafu)?,
};
Self::with_pg_pool(pool, table_name, max_txn_ops).await
Self::with_pg_pool(pool, None, table_name, max_txn_ops).await
}
/// Create [PgStore] impl of [KvBackendRef] from url (backward compatibility).
@@ -843,15 +854,14 @@ impl PgStore {
Self::with_url_and_tls(url, table_name, max_txn_ops, None).await
}
/// Create [PgStore] impl of [KvBackendRef] from [deadpool_postgres::Pool].
/// Create [PgStore] impl of [KvBackendRef] from [deadpool_postgres::Pool] with optional schema.
pub async fn with_pg_pool(
pool: Pool,
schema_name: Option<&str>,
table_name: &str,
max_txn_ops: usize,
) -> Result<KvBackendRef> {
// This step ensures the postgres metadata backend is ready to use.
// We check if greptime_metakv table exists, and we will create a new table
// if it does not exist.
// Ensure the postgres metadata backend is ready to use.
let client = match pool.get().await {
Ok(client) => client,
Err(e) => {
@@ -861,8 +871,9 @@ impl PgStore {
.fail();
}
};
let template_factory = PgSqlTemplateFactory::new(table_name);
let template_factory = PgSqlTemplateFactory::new(schema_name, table_name);
let sql_template_set = template_factory.build();
// Do not attempt to create schema implicitly.
client
.execute(&sql_template_set.create_table_statement, &[])
.await
@@ -890,7 +901,7 @@ mod tests {
test_txn_compare_less, test_txn_compare_not_equal, test_txn_one_compare_op,
text_txn_multi_compare_op, unprepare_kv,
};
use crate::maybe_skip_postgres_integration_test;
use crate::{maybe_skip_postgres15_integration_test, maybe_skip_postgres_integration_test};
async fn build_pg_kv_backend(table_name: &str) -> Option<PgStore> {
let endpoints = std::env::var("GT_POSTGRES_ENDPOINTS").unwrap_or_default();
@@ -905,8 +916,10 @@ mod tests {
.context(CreatePostgresPoolSnafu)
.unwrap();
let client = pool.get().await.unwrap();
let template_factory = PgSqlTemplateFactory::new(table_name);
// use the default schema (i.e., public)
let template_factory = PgSqlTemplateFactory::new(None, table_name);
let sql_templates = template_factory.build();
// Do not attempt to create schema implicitly.
client
.execute(&sql_templates.create_table_statement, &[])
.await
@@ -923,6 +936,61 @@ mod tests {
})
}
async fn build_pg15_pool() -> Option<Pool> {
let url = std::env::var("GT_POSTGRES15_ENDPOINTS").unwrap_or_default();
if url.is_empty() {
return None;
}
let mut cfg = Config::new();
cfg.url = Some(url);
let pool = cfg
.create_pool(Some(Runtime::Tokio1), NoTls)
.context(CreatePostgresPoolSnafu)
.ok()?;
Some(pool)
}
#[tokio::test]
async fn test_pg15_create_table_in_public_should_fail() {
maybe_skip_postgres15_integration_test!();
let Some(pool) = build_pg15_pool().await else {
return;
};
let res = PgStore::with_pg_pool(pool, None, "pg15_public_should_fail", 128).await;
assert!(
res.is_err(),
"creating table in public should fail for test_user"
);
}
#[tokio::test]
async fn test_pg15_create_table_in_test_schema_and_crud_should_succeed() {
maybe_skip_postgres15_integration_test!();
let Some(pool) = build_pg15_pool().await else {
return;
};
let schema_name = std::env::var("GT_POSTGRES15_SCHEMA").unwrap();
let client = pool.get().await.unwrap();
let factory = PgSqlTemplateFactory::new(Some(&schema_name), "pg15_ok");
let templates = factory.build();
client
.execute(&templates.create_table_statement, &[])
.await
.unwrap();
let kv = PgStore {
max_txn_ops: 128,
sql_template_set: templates,
txn_retry_count: RDS_STORE_TXN_RETRY_COUNT,
executor_factory: PgExecutorFactory { pool },
_phantom: PhantomData,
};
let prefix = b"pg15_crud/";
prepare_kv_with_prefix(&kv, prefix.to_vec()).await;
test_kv_put_with_prefix(&kv, prefix.to_vec()).await;
test_kv_batch_get_with_prefix(&kv, prefix.to_vec()).await;
unprepare_kv(&kv, prefix).await;
}
#[tokio::test]
async fn test_pg_put() {
maybe_skip_postgres_integration_test!();
@@ -1024,4 +1092,31 @@ mod tests {
test_txn_compare_less(&kv_backend).await;
test_txn_compare_not_equal(&kv_backend).await;
}
#[test]
fn test_pg_template_with_schema() {
let factory = PgSqlTemplateFactory::new(Some("test_schema"), "greptime_metakv");
let t = factory.build();
assert!(t
.create_table_statement
.contains("\"test_schema\".\"greptime_metakv\""));
let upsert = t.generate_batch_upsert_query(1);
assert!(upsert.contains("\"test_schema\".\"greptime_metakv\""));
let get = t.generate_batch_get_query(1);
assert!(get.contains("\"test_schema\".\"greptime_metakv\""));
let del = t.generate_batch_delete_query(1);
assert!(del.contains("\"test_schema\".\"greptime_metakv\""));
}
#[test]
fn test_format_table_ident() {
let t = PgSqlTemplateFactory::format_table_ident(None, "test_table");
assert_eq!(t, "\"test_table\"");
let t = PgSqlTemplateFactory::format_table_ident(Some("test_schema"), "test_table");
assert_eq!(t, "\"test_schema\".\"test_table\"");
let t = PgSqlTemplateFactory::format_table_ident(Some(""), "test_table");
assert_eq!(t, "\"test_table\"");
}
}

View File

@@ -133,6 +133,34 @@ impl LeaderRegionManifestInfo {
}
}
/// Returns the replay entry id of the data region.
pub fn replay_entry_id(&self) -> u64 {
match self {
LeaderRegionManifestInfo::Mito {
flushed_entry_id,
topic_latest_entry_id,
..
} => (*flushed_entry_id).max(*topic_latest_entry_id),
LeaderRegionManifestInfo::Metric {
data_flushed_entry_id,
data_topic_latest_entry_id,
..
} => (*data_flushed_entry_id).max(*data_topic_latest_entry_id),
}
}
/// Returns the replay entry id of the metadata region.
pub fn metadata_replay_entry_id(&self) -> Option<u64> {
match self {
LeaderRegionManifestInfo::Metric {
metadata_flushed_entry_id,
metadata_topic_latest_entry_id,
..
} => Some((*metadata_flushed_entry_id).max(*metadata_topic_latest_entry_id)),
_ => None,
}
}
/// A region is considered inactive if the flushed entry id is less than the topic's latest entry id.
///
/// The `topic_latest_entry_id` of a region is updated only when its memtable is empty during a flush.

View File

@@ -34,8 +34,8 @@ use api::v1::meta::{
};
use api::v1::{
AlterDatabaseExpr, AlterTableExpr, CreateDatabaseExpr, CreateFlowExpr, CreateTableExpr,
CreateViewExpr, DropDatabaseExpr, DropFlowExpr, DropTableExpr, DropViewExpr, ExpireAfter,
Option as PbOption, QueryContext as PbQueryContext, TruncateTableExpr,
CreateViewExpr, DropDatabaseExpr, DropFlowExpr, DropTableExpr, DropViewExpr, EvalInterval,
ExpireAfter, Option as PbOption, QueryContext as PbQueryContext, TruncateTableExpr,
};
use base64::engine::general_purpose;
use base64::Engine as _;
@@ -1125,6 +1125,7 @@ pub struct CreateFlowTask {
pub create_if_not_exists: bool,
/// Duration in seconds. Data older than this duration will not be used.
pub expire_after: Option<i64>,
pub eval_interval_secs: Option<i64>,
pub comment: String,
pub sql: String,
pub flow_options: HashMap<String, String>,
@@ -1142,6 +1143,7 @@ impl TryFrom<PbCreateFlowTask> for CreateFlowTask {
or_replace,
create_if_not_exists,
expire_after,
eval_interval,
comment,
sql,
flow_options,
@@ -1161,6 +1163,7 @@ impl TryFrom<PbCreateFlowTask> for CreateFlowTask {
or_replace,
create_if_not_exists,
expire_after: expire_after.map(|e| e.value),
eval_interval_secs: eval_interval.map(|e| e.seconds),
comment,
sql,
flow_options,
@@ -1178,6 +1181,7 @@ impl From<CreateFlowTask> for PbCreateFlowTask {
or_replace,
create_if_not_exists,
expire_after,
eval_interval_secs: eval_interval,
comment,
sql,
flow_options,
@@ -1192,6 +1196,7 @@ impl From<CreateFlowTask> for PbCreateFlowTask {
or_replace,
create_if_not_exists,
expire_after: expire_after.map(|value| ExpireAfter { value }),
eval_interval: eval_interval.map(|seconds| EvalInterval { seconds }),
comment,
sql,
flow_options,

View File

@@ -260,7 +260,7 @@ pub async fn test_kafka_topic_pool(
/// Skip the test if the environment variable `GT_POSTGRES_ENDPOINTS` is not set.
///
/// The format of the environment variable is:
/// ```
/// ```text
/// GT_POSTGRES_ENDPOINTS=localhost:9092,localhost:9093
/// ```
macro_rules! maybe_skip_postgres_integration_test {
@@ -276,7 +276,7 @@ macro_rules! maybe_skip_postgres_integration_test {
/// Skip the test if the environment variable `GT_MYSQL_ENDPOINTS` is not set.
///
/// The format of the environment variable is:
/// ```
/// ```text
/// GT_MYSQL_ENDPOINTS=localhost:9092,localhost:9093
/// ```
macro_rules! maybe_skip_mysql_integration_test {
@@ -287,3 +287,19 @@ macro_rules! maybe_skip_mysql_integration_test {
}
};
}
#[macro_export]
/// Skip the test if the environment variable `GT_POSTGRES15_ENDPOINTS` is not set.
///
/// The format of the environment variable is:
/// ```text
/// GT_POSTGRES15_ENDPOINTS=postgres://user:password@127.0.0.1:5433/postgres
/// ```
macro_rules! maybe_skip_postgres15_integration_test {
() => {
if std::env::var("GT_POSTGRES15_ENDPOINTS").is_err() {
common_telemetry::warn!("The PG15 endpoints is empty, skipping the test");
return;
}
};
}

View File

@@ -27,7 +27,7 @@ use snafu::{ensure, ResultExt};
use store_api::storage::{RegionId, RegionNumber};
use crate::error::{EncodeWalOptionsSnafu, InvalidTopicNamePrefixSnafu, Result};
use crate::key::NAME_PATTERN_REGEX;
use crate::key::TOPIC_NAME_PATTERN_REGEX;
use crate::kv_backend::KvBackendRef;
use crate::leadership_notifier::LeadershipChangeListener;
pub use crate::wal_options_allocator::topic_creator::{
@@ -109,7 +109,7 @@ pub async fn build_wal_options_allocator(
MetasrvWalConfig::Kafka(kafka_config) => {
let prefix = &kafka_config.kafka_topic.topic_name_prefix;
ensure!(
NAME_PATTERN_REGEX.is_match(prefix),
TOPIC_NAME_PATTERN_REGEX.is_match(prefix),
InvalidTopicNamePrefixSnafu { prefix }
);
let topic_creator =
@@ -149,6 +149,26 @@ pub fn prepare_wal_options(
}
}
/// Extracts the topic from the wal options.
pub fn extract_topic_from_wal_options(
region_id: RegionId,
region_options: &HashMap<RegionNumber, String>,
) -> Option<String> {
region_options
.get(&region_id.region_number())
.and_then(|wal_options| {
serde_json::from_str::<WalOptions>(wal_options)
.ok()
.and_then(|wal_options| {
if let WalOptions::Kafka(kafka_wal_option) = wal_options {
Some(kafka_wal_option.topic)
} else {
None
}
})
})
}
#[cfg(test)]
mod tests {
use std::assert_matches::assert_matches;

View File

@@ -26,7 +26,6 @@ serde.workspace = true
snafu.workspace = true
sqlparser.workspace = true
sqlparser_derive = "0.1"
statrs = "0.16"
store-api.workspace = true
[dev-dependencies]

View File

@@ -28,7 +28,6 @@ parking_lot.workspace = true
paste.workspace = true
pin-project.workspace = true
prometheus.workspace = true
rand.workspace = true
ratelimit.workspace = true
serde.workspace = true
serde_json.workspace = true

View File

@@ -6,7 +6,6 @@ license.workspace = true
[dependencies]
common-base.workspace = true
common-datasource.workspace = true
common-decimal.workspace = true
common-error.workspace = true
common-macro.workspace = true

View File

@@ -20,7 +20,8 @@ use std::time::Duration;
use serde::{Deserialize, Serialize};
use crate::config::kafka::common::{
DEFAULT_AUTO_PRUNE_INTERVAL, DEFAULT_AUTO_PRUNE_PARALLELISM, DEFAULT_FLUSH_TRIGGER_SIZE,
DEFAULT_AUTO_PRUNE_INTERVAL, DEFAULT_AUTO_PRUNE_PARALLELISM, DEFAULT_CHECKPOINT_TRIGGER_SIZE,
DEFAULT_FLUSH_TRIGGER_SIZE,
};
use crate::config::kafka::{DatanodeKafkaConfig, MetasrvKafkaConfig};
use crate::config::raft_engine::RaftEngineConfig;
@@ -64,6 +65,8 @@ impl From<DatanodeWalConfig> for MetasrvWalConfig {
auto_prune_parallelism: DEFAULT_AUTO_PRUNE_PARALLELISM,
// This field won't be used in standalone mode
flush_trigger_size: DEFAULT_FLUSH_TRIGGER_SIZE,
// This field won't be used in standalone mode
checkpoint_trigger_size: DEFAULT_CHECKPOINT_TRIGGER_SIZE,
}),
}
}
@@ -205,9 +208,10 @@ mod tests {
create_topic_timeout: Duration::from_secs(30),
},
auto_create_topics: true,
auto_prune_interval: Duration::from_secs(0),
auto_prune_interval: Duration::from_mins(30),
auto_prune_parallelism: 10,
flush_trigger_size: ReadableSize::mb(512),
checkpoint_trigger_size: ReadableSize::mb(128),
};
assert_eq!(metasrv_wal_config, MetasrvWalConfig::Kafka(expected));

View File

@@ -37,11 +37,13 @@ pub const DEFAULT_BACKOFF_CONFIG: BackoffConfig = BackoffConfig {
};
/// Default interval for auto WAL pruning.
pub const DEFAULT_AUTO_PRUNE_INTERVAL: Duration = Duration::ZERO;
pub const DEFAULT_AUTO_PRUNE_INTERVAL: Duration = Duration::from_mins(30);
/// Default limit for concurrent auto pruning tasks.
pub const DEFAULT_AUTO_PRUNE_PARALLELISM: usize = 10;
/// Default size of WAL to trigger flush.
pub const DEFAULT_FLUSH_TRIGGER_SIZE: ReadableSize = ReadableSize::mb(512);
/// Default checkpoint trigger size.
pub const DEFAULT_CHECKPOINT_TRIGGER_SIZE: ReadableSize = ReadableSize::mb(128);
use crate::error::{self, Result};
use crate::{TopicSelectorType, BROKER_ENDPOINT, TOPIC_NAME_PREFIX};

View File

@@ -19,7 +19,7 @@ use serde::{Deserialize, Serialize};
use crate::config::kafka::common::{
KafkaConnectionConfig, KafkaTopicConfig, DEFAULT_AUTO_PRUNE_INTERVAL,
DEFAULT_AUTO_PRUNE_PARALLELISM, DEFAULT_FLUSH_TRIGGER_SIZE,
DEFAULT_AUTO_PRUNE_PARALLELISM, DEFAULT_CHECKPOINT_TRIGGER_SIZE, DEFAULT_FLUSH_TRIGGER_SIZE,
};
/// Kafka wal configurations for metasrv.
@@ -41,6 +41,8 @@ pub struct MetasrvKafkaConfig {
pub auto_prune_parallelism: usize,
// The size of WAL to trigger flush.
pub flush_trigger_size: ReadableSize,
// The checkpoint trigger size.
pub checkpoint_trigger_size: ReadableSize,
}
impl Default for MetasrvKafkaConfig {
@@ -52,6 +54,7 @@ impl Default for MetasrvKafkaConfig {
auto_prune_interval: DEFAULT_AUTO_PRUNE_INTERVAL,
auto_prune_parallelism: DEFAULT_AUTO_PRUNE_PARALLELISM,
flush_trigger_size: DEFAULT_FLUSH_TRIGGER_SIZE,
checkpoint_trigger_size: DEFAULT_CHECKPOINT_TRIGGER_SIZE,
}
}
}
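A small sketch of how the new field composes with the existing defaults; the 64MB override is arbitrary and only meant to show struct-update syntax.

```rust
// Sketch: override only the checkpoint trigger, keep the remaining defaults.
let config = MetasrvKafkaConfig {
    checkpoint_trigger_size: ReadableSize::mb(64),
    ..Default::default()
};
// With the documented defaults (flush at 512MB, checkpoint at 128MB),
// the checkpoint trigger stays below the flush trigger.
assert!(config.checkpoint_trigger_size.as_bytes() < config.flush_trigger_size.as_bytes());
```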

View File

@@ -13,6 +13,7 @@
// limitations under the License.
#![feature(assert_matches)]
#![feature(duration_constructors_lite)]
use std::net::SocketAddr;

View File

@@ -8,6 +8,5 @@ license.workspace = true
workspace = true
[dependencies]
api.workspace = true
common-telemetry.workspace = true
serde.workspace = true

View File

@@ -23,18 +23,15 @@ use common_error::ext::BoxedError;
use common_greptimedb_telemetry::GreptimeDBTelemetryTask;
use common_meta::cache::{LayeredCacheRegistry, SchemaCacheRef, TableSchemaCacheRef};
use common_meta::datanode::TopicStatsReporter;
use common_meta::key::datanode_table::{DatanodeTableManager, DatanodeTableValue};
use common_meta::key::runtime_switch::RuntimeSwitchManager;
use common_meta::key::{SchemaMetadataManager, SchemaMetadataManagerRef};
use common_meta::kv_backend::KvBackendRef;
use common_meta::wal_options_allocator::prepare_wal_options;
pub use common_procedure::options::ProcedureConfig;
use common_telemetry::{error, info, warn};
use common_wal::config::kafka::DatanodeKafkaConfig;
use common_wal::config::raft_engine::RaftEngineConfig;
use common_wal::config::DatanodeWalConfig;
use file_engine::engine::FileRegionEngine;
use futures_util::TryStreamExt;
use log_store::kafka::log_store::KafkaLogStore;
use log_store::kafka::{default_index_file, GlobalIndexCollector};
use log_store::raft_engine::log_store::RaftEngineLogStore;
@@ -49,10 +46,8 @@ use query::QueryEngineFactory;
use servers::export_metrics::ExportMetricsTask;
use servers::server::ServerHandlers;
use snafu::{ensure, OptionExt, ResultExt};
use store_api::path_utils::{table_dir, WAL_DIR};
use store_api::path_utils::WAL_DIR;
use store_api::region_engine::{RegionEngineRef, RegionRole};
use store_api::region_request::{PathType, RegionOpenRequest};
use store_api::storage::RegionId;
use tokio::fs;
use tokio::sync::Notify;
@@ -70,6 +65,7 @@ use crate::greptimedb_telemetry::get_greptimedb_telemetry_task;
use crate::heartbeat::HeartbeatTask;
use crate::region_server::{DummyTableProviderFactory, RegionServer};
use crate::store::{self, new_object_store_without_cache};
use crate::utils::{build_region_open_requests, RegionOpenRequests};
/// Datanode service.
pub struct Datanode {
@@ -252,16 +248,12 @@ impl DatanodeBuilder {
.recovery_mode()
.await
.context(GetMetadataSnafu)?;
let datanode_table_manager = DatanodeTableManager::new(self.kv_backend.clone());
let table_values = datanode_table_manager
.tables(node_id)
.try_collect::<Vec<_>>()
.await
.context(GetMetadataSnafu)?;
let region_open_requests =
build_region_open_requests(node_id, self.kv_backend.clone()).await?;
let open_all_regions = open_all_regions(
region_server.clone(),
table_values,
region_open_requests,
!controlled_by_metasrv,
self.opts.init_regions_parallelism,
// Ignore nonexistent regions in recovery mode.
@@ -342,27 +334,22 @@ impl DatanodeBuilder {
async fn initialize_region_server(
&self,
region_server: &RegionServer,
kv_backend: KvBackendRef,
open_with_writable: bool,
) -> Result<()> {
let node_id = self.opts.node_id.context(MissingNodeIdSnafu)?;
let runtime_switch_manager = RuntimeSwitchManager::new(kv_backend.clone());
// TODO(weny): Consider introducing a readonly kv_backend trait.
let runtime_switch_manager = RuntimeSwitchManager::new(self.kv_backend.clone());
let is_recovery_mode = runtime_switch_manager
.recovery_mode()
.await
.context(GetMetadataSnafu)?;
let datanode_table_manager = DatanodeTableManager::new(kv_backend.clone());
let table_values = datanode_table_manager
.tables(node_id)
.try_collect::<Vec<_>>()
.await
.context(GetMetadataSnafu)?;
let region_open_requests =
build_region_open_requests(node_id, self.kv_backend.clone()).await?;
open_all_regions(
region_server.clone(),
table_values,
region_open_requests,
open_with_writable,
self.opts.init_regions_parallelism,
is_recovery_mode,
@@ -609,73 +596,24 @@ impl DatanodeBuilder {
/// Open all regions belonging to this datanode.
async fn open_all_regions(
region_server: RegionServer,
table_values: Vec<DatanodeTableValue>,
region_open_requests: RegionOpenRequests,
open_with_writable: bool,
init_regions_parallelism: usize,
ignore_nonexistent_region: bool,
) -> Result<()> {
let mut regions = vec![];
#[cfg(feature = "enterprise")]
let mut follower_regions = vec![];
for table_value in table_values {
for region_number in table_value.regions {
// Augments region options with wal options if a wal options is provided.
let mut region_options = table_value.region_info.region_options.clone();
prepare_wal_options(
&mut region_options,
RegionId::new(table_value.table_id, region_number),
&table_value.region_info.region_wal_options,
);
regions.push((
RegionId::new(table_value.table_id, region_number),
table_value.region_info.engine.clone(),
table_value.region_info.region_storage_path.clone(),
region_options,
));
}
let RegionOpenRequests {
leader_regions,
#[cfg(feature = "enterprise")]
for region_number in table_value.follower_regions {
// Augments region options with wal options if a wal options is provided.
let mut region_options = table_value.region_info.region_options.clone();
prepare_wal_options(
&mut region_options,
RegionId::new(table_value.table_id, region_number),
&table_value.region_info.region_wal_options,
);
follower_regions.push((
RegionId::new(table_value.table_id, region_number),
table_value.region_info.engine.clone(),
table_value.region_info.region_storage_path.clone(),
region_options,
));
}
}
let num_regions = regions.len();
info!("going to open {} region(s)", num_regions);
let mut region_requests = Vec::with_capacity(regions.len());
for (region_id, engine, store_path, options) in regions {
let table_dir = table_dir(&store_path, region_id.table_id());
region_requests.push((
region_id,
RegionOpenRequest {
engine,
table_dir,
path_type: PathType::Bare,
options,
skip_wal_replay: false,
},
));
}
follower_regions,
} = region_open_requests;
let leader_region_num = leader_regions.len();
info!("going to open {} region(s)", leader_region_num);
let now = Instant::now();
let open_regions = region_server
.handle_batch_open_requests(
init_regions_parallelism,
region_requests,
leader_regions,
ignore_nonexistent_region,
)
.await?;
@@ -686,19 +624,19 @@ async fn open_all_regions(
);
if !ignore_nonexistent_region {
ensure!(
open_regions.len() == num_regions,
open_regions.len() == leader_region_num,
error::UnexpectedSnafu {
violated: format!(
"Expected to open {} of regions, only {} of regions has opened",
num_regions,
leader_region_num,
open_regions.len()
)
}
);
} else if open_regions.len() != num_regions {
} else if open_regions.len() != leader_region_num {
warn!(
"ignore nonexistent region, expected to open {} of regions, only {} of regions has opened",
num_regions,
leader_region_num,
open_regions.len()
);
}
@@ -717,31 +655,14 @@ async fn open_all_regions(
if !follower_regions.is_empty() {
use tokio::time::Instant;
info!(
"going to open {} follower region(s)",
follower_regions.len()
);
let mut region_requests = Vec::with_capacity(follower_regions.len());
let num_regions = follower_regions.len();
for (region_id, engine, store_path, options) in follower_regions {
let table_dir = table_dir(&store_path, region_id.table_id());
region_requests.push((
region_id,
RegionOpenRequest {
engine,
table_dir,
path_type: PathType::Bare,
options,
skip_wal_replay: true,
},
));
}
let follower_region_num = follower_regions.len();
info!("going to open {} follower region(s)", follower_region_num);
let now = Instant::now();
let open_regions = region_server
.handle_batch_open_requests(
init_regions_parallelism,
region_requests,
follower_regions,
ignore_nonexistent_region,
)
.await?;
@@ -753,19 +674,19 @@ async fn open_all_regions(
if !ignore_nonexistent_region {
ensure!(
open_regions.len() == num_regions,
open_regions.len() == follower_region_num,
error::UnexpectedSnafu {
violated: format!(
"Expected to open {} of follower regions, only {} of regions has opened",
num_regions,
follower_region_num,
open_regions.len()
)
}
);
} else if open_regions.len() != num_regions {
} else if open_regions.len() != follower_region_num {
warn!(
"ignore nonexistent region, expected to open {} of follower regions, only {} of regions has opened",
num_regions,
follower_region_num,
open_regions.len()
);
}
@@ -835,15 +756,13 @@ mod tests {
..Default::default()
},
Plugins::default(),
kv_backend,
kv_backend.clone(),
);
builder.with_cache_registry(layered_cache_registry);
let kv = Arc::new(MemoryKvBackend::default()) as _;
setup_table_datanode(&kv).await;
setup_table_datanode(&(kv_backend as _)).await;
builder
.initialize_region_server(&mock_region_server, kv.clone(), false)
.initialize_region_server(&mock_region_server, false)
.await
.unwrap();

View File

@@ -16,7 +16,7 @@ use common_meta::instruction::{InstructionReply, OpenRegion, SimpleReply};
use common_meta::wal_options_allocator::prepare_wal_options;
use futures_util::future::BoxFuture;
use store_api::path_utils::table_dir;
use store_api::region_request::{PathType, RegionOpenRequest, RegionRequest};
use store_api::region_request::{PathType, RegionOpenRequest, RegionRequest, ReplayCheckpoint};
use crate::heartbeat::handler::HandlerContext;
@@ -29,17 +29,31 @@ impl HandlerContext {
mut region_options,
region_wal_options,
skip_wal_replay,
replay_entry_id,
metadata_replay_entry_id,
}: OpenRegion,
) -> BoxFuture<'static, Option<InstructionReply>> {
Box::pin(async move {
let region_id = Self::region_ident_to_region_id(&region_ident);
prepare_wal_options(&mut region_options, region_id, &region_wal_options);
let checkpoint = match (replay_entry_id, metadata_replay_entry_id) {
(Some(replay_entry_id), Some(metadata_replay_entry_id)) => Some(ReplayCheckpoint {
entry_id: replay_entry_id,
metadata_entry_id: Some(metadata_replay_entry_id),
}),
(Some(replay_entry_id), None) => Some(ReplayCheckpoint {
entry_id: replay_entry_id,
metadata_entry_id: None,
}),
_ => None,
};
let request = RegionRequest::Open(RegionOpenRequest {
engine: region_ident.engine,
table_dir: table_dir(&region_storage_path, region_id.table_id()),
path_type: PathType::Bare,
options: region_options,
skip_wal_replay,
checkpoint,
});
let result = self.region_server.handle_request(region_id, request).await;
let success = result.is_ok();
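Not a behavior change, just an observation on the match above: the two `Some(replay_entry_id)` arms differ only in the metadata entry id, so the whole match is equivalent to a single `map`, as in this sketch.

```rust
// Equivalent to the match above: a checkpoint exists iff replay_entry_id is Some.
let checkpoint = replay_entry_id.map(|entry_id| ReplayCheckpoint {
    entry_id,
    metadata_entry_id: metadata_replay_entry_id,
});
```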

View File

@@ -28,3 +28,4 @@ pub mod service;
pub mod store;
#[cfg(any(test, feature = "testing"))]
pub mod tests;
pub mod utils;

View File

@@ -1410,6 +1410,7 @@ mod tests {
path_type: PathType::Bare,
options: Default::default(),
skip_wal_replay: false,
checkpoint: None,
}),
)
.await
@@ -1579,6 +1580,7 @@ mod tests {
path_type: PathType::Bare,
options: Default::default(),
skip_wal_replay: false,
checkpoint: None,
},
),
(
@@ -1589,6 +1591,7 @@ mod tests {
path_type: PathType::Bare,
options: Default::default(),
skip_wal_replay: false,
checkpoint: None,
},
),
],
@@ -1610,6 +1613,7 @@ mod tests {
path_type: PathType::Bare,
options: Default::default(),
skip_wal_replay: false,
checkpoint: None,
},
),
(
@@ -1620,6 +1624,7 @@ mod tests {
path_type: PathType::Bare,
options: Default::default(),
skip_wal_replay: false,
checkpoint: None,
},
),
],

src/datanode/src/utils.rs (new file, 188 lines)
View File

@@ -0,0 +1,188 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::HashMap;
use common_meta::key::datanode_table::DatanodeTableManager;
use common_meta::key::topic_region::{TopicRegionKey, TopicRegionManager, TopicRegionValue};
use common_meta::kv_backend::KvBackendRef;
use common_meta::wal_options_allocator::{extract_topic_from_wal_options, prepare_wal_options};
use common_meta::DatanodeId;
use futures::TryStreamExt;
use snafu::ResultExt;
use store_api::path_utils::table_dir;
use store_api::region_request::{PathType, RegionOpenRequest, ReplayCheckpoint};
use store_api::storage::{RegionId, RegionNumber};
use tracing::info;
use crate::error::{GetMetadataSnafu, Result};
/// The requests to open regions.
pub(crate) struct RegionOpenRequests {
pub leader_regions: Vec<(RegionId, RegionOpenRequest)>,
#[cfg(feature = "enterprise")]
pub follower_regions: Vec<(RegionId, RegionOpenRequest)>,
}
fn group_region_by_topic(
region_id: RegionId,
region_options: &HashMap<RegionNumber, String>,
topic_regions: &mut HashMap<String, Vec<RegionId>>,
) {
if let Some(topic) = extract_topic_from_wal_options(region_id, region_options) {
topic_regions.entry(topic).or_default().push(region_id);
}
}
fn get_replay_checkpoint(
region_id: RegionId,
topic_region_values: &Option<HashMap<RegionId, TopicRegionValue>>,
) -> Option<ReplayCheckpoint> {
let topic_region_values = topic_region_values.as_ref()?;
let topic_region_value = topic_region_values.get(&region_id);
let replay_checkpoint = topic_region_value.and_then(|value| value.checkpoint);
replay_checkpoint.map(|checkpoint| ReplayCheckpoint {
entry_id: checkpoint.entry_id,
metadata_entry_id: checkpoint.metadata_entry_id,
})
}
pub(crate) async fn build_region_open_requests(
node_id: DatanodeId,
kv_backend: KvBackendRef,
) -> Result<RegionOpenRequests> {
let datanode_table_manager = DatanodeTableManager::new(kv_backend.clone());
let table_values = datanode_table_manager
.tables(node_id)
.try_collect::<Vec<_>>()
.await
.context(GetMetadataSnafu)?;
let topic_region_manager = TopicRegionManager::new(kv_backend);
let mut topic_regions = HashMap::<String, Vec<RegionId>>::new();
let mut regions = vec![];
#[cfg(feature = "enterprise")]
let mut follower_regions = vec![];
for table_value in table_values {
for region_number in table_value.regions {
let region_id = RegionId::new(table_value.table_id, region_number);
// Augments region options with wal options if wal options are provided.
let mut region_options = table_value.region_info.region_options.clone();
prepare_wal_options(
&mut region_options,
region_id,
&table_value.region_info.region_wal_options,
);
group_region_by_topic(
region_id,
&table_value.region_info.region_wal_options,
&mut topic_regions,
);
regions.push((
region_id,
table_value.region_info.engine.clone(),
table_value.region_info.region_storage_path.clone(),
region_options,
));
}
#[cfg(feature = "enterprise")]
for region_number in table_value.follower_regions {
let region_id = RegionId::new(table_value.table_id, region_number);
// Augments region options with wal options if wal options are provided.
let mut region_options = table_value.region_info.region_options.clone();
prepare_wal_options(
&mut region_options,
RegionId::new(table_value.table_id, region_number),
&table_value.region_info.region_wal_options,
);
group_region_by_topic(
region_id,
&table_value.region_info.region_wal_options,
&mut topic_regions,
);
follower_regions.push((
RegionId::new(table_value.table_id, region_number),
table_value.region_info.engine.clone(),
table_value.region_info.region_storage_path.clone(),
region_options,
));
}
}
let topic_region_values = if !topic_regions.is_empty() {
let keys = topic_regions
.iter()
.flat_map(|(topic, regions)| {
regions
.iter()
.map(|region_id| TopicRegionKey::new(*region_id, topic))
})
.collect::<Vec<_>>();
let topic_region_manager = topic_region_manager
.batch_get(keys)
.await
.context(GetMetadataSnafu)?;
Some(topic_region_manager)
} else {
None
};
let mut leader_region_requests = Vec::with_capacity(regions.len());
for (region_id, engine, store_path, options) in regions {
let table_dir = table_dir(&store_path, region_id.table_id());
let checkpoint = get_replay_checkpoint(region_id, &topic_region_values);
info!("region_id: {}, checkpoint: {:?}", region_id, checkpoint);
leader_region_requests.push((
region_id,
RegionOpenRequest {
engine,
table_dir,
path_type: PathType::Bare,
options,
skip_wal_replay: false,
checkpoint,
},
));
}
#[cfg(feature = "enterprise")]
let follower_region_requests = {
let mut follower_region_requests = Vec::with_capacity(follower_regions.len());
for (region_id, engine, store_path, options) in follower_regions {
let table_dir = table_dir(&store_path, region_id.table_id());
follower_region_requests.push((
region_id,
RegionOpenRequest {
engine,
table_dir,
path_type: PathType::Bare,
options,
skip_wal_replay: true,
checkpoint: None,
},
));
}
follower_region_requests
};
Ok(RegionOpenRequests {
leader_regions: leader_region_requests,
#[cfg(feature = "enterprise")]
follower_regions: follower_region_requests,
})
}

View File

@@ -13,11 +13,11 @@
// limitations under the License.
use std::any::Any;
use std::fmt;
use std::sync::Arc;
use arrow::array::Array;
use arrow::datatypes::Int64Type;
use arrow_array::{ArrayRef, DictionaryArray, Int64Array};
use arrow::array::{Array, ArrayRef, DictionaryArray, PrimitiveArray, PrimitiveBuilder};
use arrow::datatypes::{ArrowDictionaryKeyType, ArrowNativeType};
use serde_json::Value as JsonValue;
use snafu::ResultExt;
@@ -30,34 +30,55 @@ use crate::vectors::operations::VectorOp;
use crate::vectors::{self, Helper, Validity, Vector, VectorRef};
/// Vector of dictionaries, backed by Arrow's `DictionaryArray`.
#[derive(Debug, PartialEq)]
pub struct DictionaryVector {
array: DictionaryArray<Int64Type>,
pub struct DictionaryVector<K: ArrowDictionaryKeyType> {
array: DictionaryArray<K>,
/// The datatype of the keys in the dictionary.
key_type: ConcreteDataType,
/// The datatype of the items in the dictionary.
item_type: ConcreteDataType,
/// The vector of items in the dictionary.
item_vector: VectorRef,
}
impl DictionaryVector {
impl<K: ArrowDictionaryKeyType> fmt::Debug for DictionaryVector<K> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
f.debug_struct("DictionaryVector")
.field("array", &self.array)
.field("key_type", &self.key_type)
.field("item_type", &self.item_type)
.finish()
}
}
impl<K: ArrowDictionaryKeyType> PartialEq for DictionaryVector<K> {
fn eq(&self, other: &DictionaryVector<K>) -> bool {
self.array == other.array
&& self.key_type == other.key_type
&& self.item_type == other.item_type
}
}
impl<K: ArrowDictionaryKeyType> DictionaryVector<K> {
/// Create a new instance of `DictionaryVector` from a dictionary array and item type
pub fn new(array: DictionaryArray<Int64Type>, item_type: ConcreteDataType) -> Result<Self> {
pub fn new(array: DictionaryArray<K>, item_type: ConcreteDataType) -> Result<Self> {
let key_type = ConcreteDataType::try_from(&K::DATA_TYPE)?;
let item_vector = Helper::try_into_vector(array.values())?;
Ok(Self {
array,
key_type,
item_type,
item_vector,
})
}
/// Returns the underlying Arrow dictionary array
pub fn array(&self) -> &DictionaryArray<Int64Type> {
pub fn array(&self) -> &DictionaryArray<K> {
&self.array
}
/// Returns the keys array of this dictionary
pub fn keys(&self) -> &arrow_array::PrimitiveArray<Int64Type> {
pub fn keys(&self) -> &arrow_array::PrimitiveArray<K> {
self.array.keys()
}
@@ -71,10 +92,10 @@ impl DictionaryVector {
}
}
impl Vector for DictionaryVector {
impl<K: ArrowDictionaryKeyType> Vector for DictionaryVector<K> {
fn data_type(&self) -> ConcreteDataType {
ConcreteDataType::Dictionary(DictionaryType::new(
ConcreteDataType::int64_datatype(),
self.key_type.clone(),
self.item_type.clone(),
))
}
@@ -118,6 +139,7 @@ impl Vector for DictionaryVector {
fn slice(&self, offset: usize, length: usize) -> VectorRef {
Arc::new(Self {
array: self.array.slice(offset, length),
key_type: self.key_type.clone(),
item_type: self.item_type.clone(),
item_vector: self.item_vector.clone(),
})
@@ -129,7 +151,7 @@ impl Vector for DictionaryVector {
}
let key = self.array.keys().value(index);
self.item_vector.get(key as usize)
self.item_vector.get(key.as_usize())
}
fn get_ref(&self, index: usize) -> ValueRef {
@@ -138,11 +160,11 @@ impl Vector for DictionaryVector {
}
let key = self.array.keys().value(index);
self.item_vector.get_ref(key as usize)
self.item_vector.get_ref(key.as_usize())
}
}
impl Serializable for DictionaryVector {
impl<K: ArrowDictionaryKeyType> Serializable for DictionaryVector<K> {
fn serialize_to_json(&self) -> Result<Vec<JsonValue>> {
// Convert the dictionary array to JSON, where each element is either null or
// the value it refers to in the dictionary
@@ -153,7 +175,7 @@ impl Serializable for DictionaryVector {
result.push(JsonValue::Null);
} else {
let key = self.array.keys().value(i);
let value = self.item_vector.get(key as usize);
let value = self.item_vector.get(key.as_usize());
let json_value = serde_json::to_value(value).context(error::SerializeSnafu)?;
result.push(json_value);
}
@@ -163,33 +185,35 @@ impl Serializable for DictionaryVector {
}
}
impl TryFrom<DictionaryArray<Int64Type>> for DictionaryVector {
impl<K: ArrowDictionaryKeyType> TryFrom<DictionaryArray<K>> for DictionaryVector<K> {
type Error = crate::error::Error;
fn try_from(array: DictionaryArray<Int64Type>) -> Result<Self> {
let item_type = ConcreteDataType::from_arrow_type(array.values().data_type());
fn try_from(array: DictionaryArray<K>) -> Result<Self> {
let key_type = ConcreteDataType::try_from(array.keys().data_type())?;
let item_type = ConcreteDataType::try_from(array.values().data_type())?;
let item_vector = Helper::try_into_vector(array.values())?;
Ok(Self {
array,
key_type,
item_type,
item_vector,
})
}
}
pub struct DictionaryIter<'a> {
vector: &'a DictionaryVector,
pub struct DictionaryIter<'a, K: ArrowDictionaryKeyType> {
vector: &'a DictionaryVector<K>,
idx: usize,
}
impl<'a> DictionaryIter<'a> {
pub fn new(vector: &'a DictionaryVector) -> DictionaryIter<'a> {
impl<'a, K: ArrowDictionaryKeyType> DictionaryIter<'a, K> {
pub fn new(vector: &'a DictionaryVector<K>) -> DictionaryIter<'a, K> {
DictionaryIter { vector, idx: 0 }
}
}
impl<'a> Iterator for DictionaryIter<'a> {
impl<'a, K: ArrowDictionaryKeyType> Iterator for DictionaryIter<'a, K> {
type Item = Option<ValueRef<'a>>;
#[inline]
@@ -205,7 +229,7 @@ impl<'a> Iterator for DictionaryIter<'a> {
return Some(None);
}
Some(Some(self.vector.item_vector.get_ref(self.idx)))
Some(Some(self.vector.get_ref(idx)))
}
#[inline]
@@ -217,10 +241,10 @@ impl<'a> Iterator for DictionaryIter<'a> {
}
}
impl VectorOp for DictionaryVector {
impl<K: ArrowDictionaryKeyType> VectorOp for DictionaryVector<K> {
fn replicate(&self, offsets: &[usize]) -> VectorRef {
let keys = self.array.keys();
let mut replicated_keys = Vec::with_capacity(offsets.len());
let mut replicated_keys = PrimitiveBuilder::new();
let mut previous_offset = 0;
for (i, &offset) in offsets.iter().enumerate() {
@@ -236,19 +260,20 @@ impl VectorOp for DictionaryVector {
// repeat this key (offset - previous_offset) times
let repeat_count = offset - previous_offset;
if repeat_count > 0 {
replicated_keys.resize(replicated_keys.len() + repeat_count, key);
for _ in 0..repeat_count {
replicated_keys.append_option(key);
}
previous_offset = offset;
}
let new_keys = Int64Array::from(replicated_keys);
let new_keys = replicated_keys.finish();
let new_array = DictionaryArray::try_new(new_keys, self.values().clone())
.expect("Failed to create replicated dictionary array");
Arc::new(Self {
array: new_array,
key_type: self.key_type.clone(),
item_type: self.item_type.clone(),
item_vector: self.item_vector.clone(),
})
@@ -261,7 +286,7 @@ impl VectorOp for DictionaryVector {
let filtered_key_array = filtered_key_vector.to_arrow_array();
let filtered_key_array = filtered_key_array
.as_any()
.downcast_ref::<Int64Array>()
.downcast_ref::<PrimitiveArray<K>>()
.unwrap();
let new_array = DictionaryArray::try_new(filtered_key_array.clone(), self.values().clone())
@@ -269,6 +294,7 @@ impl VectorOp for DictionaryVector {
Ok(Arc::new(Self {
array: new_array,
key_type: self.key_type.clone(),
item_type: self.item_type.clone(),
item_vector: self.item_vector.clone(),
}))
@@ -281,6 +307,7 @@ impl VectorOp for DictionaryVector {
.expect("Failed to create casted dictionary array");
Ok(Arc::new(Self {
array: new_array,
key_type: self.key_type.clone(),
item_type: to_type.clone(),
item_vector: self.item_vector.clone(),
}))
@@ -291,13 +318,17 @@ impl VectorOp for DictionaryVector {
let key_vector = Helper::try_into_vector(&key_array)?;
let new_key_vector = key_vector.take(indices)?;
let new_key_array = new_key_vector.to_arrow_array();
let new_key_array = new_key_array.as_any().downcast_ref::<Int64Array>().unwrap();
let new_key_array = new_key_array
.as_any()
.downcast_ref::<PrimitiveArray<K>>()
.unwrap();
let new_array = DictionaryArray::try_new(new_key_array.clone(), self.values().clone())
.expect("Failed to create filtered dictionary array");
Ok(Arc::new(Self {
array: new_array,
key_type: self.key_type.clone(),
item_type: self.item_type.clone(),
item_vector: self.item_vector.clone(),
}))
@@ -308,19 +339,20 @@ impl VectorOp for DictionaryVector {
mod tests {
use std::sync::Arc;
use arrow_array::StringArray;
use arrow::array::{Int64Array, StringArray, UInt32Array};
use arrow::datatypes::{Int64Type, UInt32Type};
use super::*;
// Helper function to create a test dictionary vector with string values
fn create_test_dictionary() -> DictionaryVector {
fn create_test_dictionary() -> DictionaryVector<Int64Type> {
// Dictionary values: ["a", "b", "c", "d"]
// Keys: [0, 1, 2, null, 1, 3]
// Resulting in: ["a", "b", "c", null, "b", "d"]
let values = StringArray::from(vec!["a", "b", "c", "d"]);
let keys = Int64Array::from(vec![Some(0), Some(1), Some(2), None, Some(1), Some(3)]);
let dict_array = DictionaryArray::new(keys, Arc::new(values));
DictionaryVector::try_from(dict_array).unwrap()
DictionaryVector::<Int64Type>::try_from(dict_array).unwrap()
}
#[test]
@@ -435,4 +467,19 @@ mod tests {
assert_eq!(taken.get(1), Value::String("a".to_string().into()));
assert_eq!(taken.get(2), Value::String("b".to_string().into()));
}
#[test]
fn test_other_type() {
let values = StringArray::from(vec!["a", "b", "c", "d"]);
let keys = UInt32Array::from(vec![Some(0), Some(1), Some(2), None, Some(1), Some(3)]);
let dict_array = DictionaryArray::new(keys, Arc::new(values));
let dict_vec = DictionaryVector::<UInt32Type>::try_from(dict_array).unwrap();
assert_eq!(
ConcreteDataType::dictionary_datatype(
ConcreteDataType::uint32_datatype(),
ConcreteDataType::string_datatype()
),
dict_vec.data_type()
);
}
}

View File

@@ -20,7 +20,10 @@ use std::sync::Arc;
use arrow::array::{Array, ArrayRef, StringArray};
use arrow::compute;
use arrow::compute::kernels::comparison;
use arrow::datatypes::{DataType as ArrowDataType, Int64Type, TimeUnit};
use arrow::datatypes::{
DataType as ArrowDataType, Int16Type, Int32Type, Int64Type, Int8Type, TimeUnit, UInt16Type,
UInt32Type, UInt64Type, UInt8Type,
};
use arrow_array::{DictionaryArray, StructArray};
use arrow_schema::IntervalUnit;
use datafusion_common::ScalarValue;
@@ -351,16 +354,37 @@ impl Helper {
ArrowDataType::Decimal128(_, _) => {
Arc::new(Decimal128Vector::try_from_arrow_array(array)?)
}
ArrowDataType::Dictionary(key, value) if matches!(&**key, ArrowDataType::Int64) => {
let array = array
.as_ref()
.as_any()
.downcast_ref::<DictionaryArray<Int64Type>>()
.unwrap(); // Safety: the type is guarded by match arm condition
Arc::new(DictionaryVector::new(
array.clone(),
ConcreteDataType::try_from(value.as_ref())?,
)?)
ArrowDataType::Dictionary(key, value) => {
macro_rules! handle_dictionary_key_type {
($key_type:ident) => {{
let array = array
.as_ref()
.as_any()
.downcast_ref::<DictionaryArray<$key_type>>()
.unwrap(); // Safety: the type is guarded by match arm condition
Arc::new(DictionaryVector::new(
array.clone(),
ConcreteDataType::try_from(value.as_ref())?,
)?)
}};
}
match key.as_ref() {
ArrowDataType::Int8 => handle_dictionary_key_type!(Int8Type),
ArrowDataType::Int16 => handle_dictionary_key_type!(Int16Type),
ArrowDataType::Int32 => handle_dictionary_key_type!(Int32Type),
ArrowDataType::Int64 => handle_dictionary_key_type!(Int64Type),
ArrowDataType::UInt8 => handle_dictionary_key_type!(UInt8Type),
ArrowDataType::UInt16 => handle_dictionary_key_type!(UInt16Type),
ArrowDataType::UInt32 => handle_dictionary_key_type!(UInt32Type),
ArrowDataType::UInt64 => handle_dictionary_key_type!(UInt64Type),
_ => {
return error::UnsupportedArrowTypeSnafu {
arrow_type: array.as_ref().data_type().clone(),
}
.fail()
}
}
}
ArrowDataType::Struct(_fields) => {
@@ -375,7 +399,6 @@ impl Helper {
| ArrowDataType::LargeList(_)
| ArrowDataType::FixedSizeList(_, _)
| ArrowDataType::Union(_, _)
| ArrowDataType::Dictionary(_, _)
| ArrowDataType::Decimal256(_, _)
| ArrowDataType::Map(_, _)
| ArrowDataType::RunEndEncoded(_, _)
@@ -629,10 +652,55 @@ mod tests {
check_try_into_vector(Time64MicrosecondArray::from(vec![1, 2, 3]));
check_try_into_vector(Time64NanosecondArray::from(vec![1, 2, 3]));
// Test dictionary arrays with different key types
let values = StringArray::from_iter_values(["a", "b", "c"]);
// Test Int8 keys
let keys = Int8Array::from_iter_values([0, 0, 1, 2]);
let array: ArrayRef =
Arc::new(DictionaryArray::try_new(keys, Arc::new(values.clone())).unwrap());
Helper::try_into_vector(array).unwrap();
// Test Int16 keys
let keys = Int16Array::from_iter_values([0, 0, 1, 2]);
let array: ArrayRef =
Arc::new(DictionaryArray::try_new(keys, Arc::new(values.clone())).unwrap());
Helper::try_into_vector(array).unwrap();
// Test Int32 keys
let keys = Int32Array::from_iter_values([0, 0, 1, 2]);
let array: ArrayRef =
Arc::new(DictionaryArray::try_new(keys, Arc::new(values.clone())).unwrap());
Helper::try_into_vector(array).unwrap();
// Test Int64 keys
let keys = Int64Array::from_iter_values([0, 0, 1, 2]);
let array: ArrayRef =
Arc::new(DictionaryArray::try_new(keys, Arc::new(values.clone())).unwrap());
Helper::try_into_vector(array).unwrap();
// Test UInt8 keys
let keys = UInt8Array::from_iter_values([0, 0, 1, 2]);
let array: ArrayRef =
Arc::new(DictionaryArray::try_new(keys, Arc::new(values.clone())).unwrap());
Helper::try_into_vector(array).unwrap();
// Test UInt16 keys
let keys = UInt16Array::from_iter_values([0, 0, 1, 2]);
let array: ArrayRef =
Arc::new(DictionaryArray::try_new(keys, Arc::new(values.clone())).unwrap());
Helper::try_into_vector(array).unwrap();
// Test UInt32 keys
let keys = UInt32Array::from_iter_values([0, 0, 1, 2]);
let array: ArrayRef =
Arc::new(DictionaryArray::try_new(keys, Arc::new(values.clone())).unwrap());
Helper::try_into_vector(array).unwrap();
// Test UInt64 keys
let keys = UInt64Array::from_iter_values([0, 0, 1, 2]);
let array: ArrayRef = Arc::new(DictionaryArray::try_new(keys, Arc::new(values)).unwrap());
Helper::try_into_vector(array).unwrap_err();
Helper::try_into_vector(array).unwrap();
}
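The eight per-key-type blocks above are structurally identical; if desired they could be generated by a small local macro. A sketch using the same array types as the test (the macro itself is hypothetical, not existing API):

```rust
// Sketch: one invocation per dictionary key type instead of eight copies.
macro_rules! check_dictionary_keys {
    ($($key_array:ty),+ $(,)?) => {$(
        let values = StringArray::from_iter_values(["a", "b", "c"]);
        let keys = <$key_array>::from_iter_values([0, 0, 1, 2]);
        let array: ArrayRef =
            Arc::new(DictionaryArray::try_new(keys, Arc::new(values)).unwrap());
        Helper::try_into_vector(array).unwrap();
    )+};
}
check_dictionary_keys!(
    Int8Array, Int16Array, Int32Array, Int64Array,
    UInt8Array, UInt16Array, UInt32Array, UInt64Array,
);
```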
#[test]

View File

@@ -178,6 +178,7 @@ mod tests {
path_type: PathType::Bare,
options: HashMap::default(),
skip_wal_replay: false,
checkpoint: None,
};
let region = FileRegion::open(region_id, request, &object_store)
@@ -230,6 +231,7 @@ mod tests {
path_type: PathType::Bare,
options: HashMap::default(),
skip_wal_replay: false,
checkpoint: None,
};
let err = FileRegion::open(region_id, request, &object_store)
.await

View File

@@ -63,6 +63,7 @@ prost.workspace = true
query.workspace = true
rand.workspace = true
serde.workspace = true
serde_json.workspace = true
servers.workspace = true
session.workspace = true
smallvec.workspace = true
@@ -81,6 +82,5 @@ common-catalog.workspace = true
pretty_assertions.workspace = true
prost.workspace = true
query.workspace = true
serde_json = "1.0"
session.workspace = true
table.workspace = true

View File

@@ -773,6 +773,7 @@ impl StreamingEngine {
create_if_not_exists,
or_replace,
expire_after,
eval_interval: _,
comment,
sql,
flow_options,

View File

@@ -318,6 +318,7 @@ impl FlowDualEngine {
create_if_not_exists: true,
or_replace: true,
expire_after: info.expire_after(),
eval_interval: info.eval_interval(),
comment: Some(info.comment().clone()),
sql: info.raw_sql().clone(),
flow_options: info.options().clone(),
@@ -770,6 +771,7 @@ impl common_meta::node_manager::Flownode for FlowDualEngine {
sink_table_name: Some(sink_table_name),
create_if_not_exists,
expire_after,
eval_interval,
comment,
sql,
flow_options,
@@ -789,6 +791,7 @@ impl common_meta::node_manager::Flownode for FlowDualEngine {
create_if_not_exists,
or_replace,
expire_after,
eval_interval: eval_interval.map(|e| e.seconds),
comment: Some(comment),
sql: sql.clone(),
flow_options,

View File

@@ -16,6 +16,7 @@
use std::collections::{BTreeMap, HashMap, HashSet};
use std::sync::Arc;
use std::time::Duration;
use api::v1::flow::{DirtyWindowRequests, FlowResponse};
use catalog::CatalogManagerRef;
@@ -30,6 +31,7 @@ use common_telemetry::{debug, info};
use common_time::TimeToLive;
use query::QueryEngineRef;
use snafu::{ensure, OptionExt, ResultExt};
use sql::parsers::utils::is_tql;
use store_api::storage::{RegionId, TableId};
use tokio::sync::{oneshot, RwLock};
@@ -40,8 +42,8 @@ use crate::batching_mode::utils::sql_to_df_plan;
use crate::batching_mode::BatchingModeOptions;
use crate::engine::FlowEngine;
use crate::error::{
ExternalSnafu, FlowAlreadyExistSnafu, FlowNotFoundSnafu, TableNotFoundMetaSnafu,
UnexpectedSnafu, UnsupportedSnafu,
CreateFlowSnafu, ExternalSnafu, FlowAlreadyExistSnafu, FlowNotFoundSnafu, InvalidQuerySnafu,
TableNotFoundMetaSnafu, UnexpectedSnafu, UnsupportedSnafu,
};
use crate::metrics::METRIC_FLOW_BATCHING_ENGINE_BULK_MARK_TIME_WINDOW;
use crate::{CreateFlowArgs, Error, FlowId, TableName};
@@ -335,6 +337,7 @@ impl BatchingEngine {
create_if_not_exists,
or_replace,
expire_after,
eval_interval,
comment: _,
sql,
flow_options,
@@ -361,6 +364,25 @@ impl BatchingEngine {
}
}
let query_ctx = query_ctx.context({
UnexpectedSnafu {
reason: "Query context is None".to_string(),
}
})?;
let query_ctx = Arc::new(query_ctx);
// optionally set an eval interval for the flow
if eval_interval.is_none()
&& is_tql(query_ctx.sql_dialect(), &sql)
.map_err(BoxedError::new)
.context(CreateFlowSnafu { sql: &sql })?
{
InvalidQuerySnafu {
reason: "TQL query requires EVAL INTERVAL to be set".to_string(),
}
.fail()?;
}
let flow_type = flow_options.get(FlowType::FLOW_TYPE_KEY);
ensure!(
@@ -374,13 +396,6 @@ impl BatchingEngine {
}
);
let Some(query_ctx) = query_ctx else {
UnexpectedSnafu {
reason: "Query context is None".to_string(),
}
.fail()?
};
let query_ctx = Arc::new(query_ctx);
let mut source_table_names = Vec::with_capacity(2);
for src_id in source_table_ids {
// also check table option to see if ttl!=instant
@@ -442,6 +457,7 @@ impl BatchingEngine {
catalog_manager: self.catalog_manager.clone(),
shutdown_rx: rx,
batch_opts: self.batch_opts.clone(),
flow_eval_interval: eval_interval.map(|secs| Duration::from_secs(secs as u64)),
};
let task = BatchingTask::try_new(task_args)?;

View File

@@ -41,8 +41,8 @@ use snafu::{OptionExt, ResultExt};
use crate::batching_mode::BatchingModeOptions;
use crate::error::{
ExternalSnafu, InvalidClientConfigSnafu, InvalidRequestSnafu, NoAvailableFrontendSnafu,
UnexpectedSnafu,
CreateSinkTableSnafu, ExternalSnafu, InvalidClientConfigSnafu, InvalidRequestSnafu,
NoAvailableFrontendSnafu, UnexpectedSnafu,
};
use crate::{Error, FlowAuthHeader};
@@ -290,13 +290,17 @@ impl FrontendClient {
) -> Result<u32, Error> {
self.handle(
Request::Ddl(api::v1::DdlRequest {
expr: Some(api::v1::ddl_request::Expr::CreateTable(create)),
expr: Some(api::v1::ddl_request::Expr::CreateTable(create.clone())),
}),
catalog,
schema,
&mut None,
)
.await
.map_err(BoxedError::new)
.with_context(|_| CreateSinkTableSnafu {
create: create.clone(),
})
}
/// Execute a SQL statement on the frontend.

View File

@@ -17,7 +17,6 @@ use std::sync::{Arc, RwLock};
use std::time::{Duration, SystemTime, UNIX_EPOCH};
use api::v1::CreateTableExpr;
use arrow_schema::Fields;
use catalog::CatalogManagerRef;
use common_error::ext::BoxedError;
use common_query::logical_plan::breakup_insert_plan;
@@ -75,11 +74,12 @@ pub struct TaskConfig {
pub time_window_expr: Option<TimeWindowExpr>,
/// in seconds
pub expire_after: Option<i64>,
sink_table_name: [String; 3],
pub sink_table_name: [String; 3],
pub source_table_names: HashSet<[String; 3]>,
catalog_manager: CatalogManagerRef,
query_type: QueryType,
batch_opts: Arc<BatchingModeOptions>,
pub catalog_manager: CatalogManagerRef,
pub query_type: QueryType,
pub batch_opts: Arc<BatchingModeOptions>,
pub flow_eval_interval: Option<Duration>,
}
fn determine_query_type(query: &str, query_ctx: &QueryContextRef) -> Result<QueryType, Error> {
@@ -101,8 +101,8 @@ fn determine_query_type(query: &str, query_ctx: &QueryContextRef) -> Result<Quer
}
}
#[derive(Debug, Clone)]
enum QueryType {
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum QueryType {
/// query is a tql query
Tql,
/// query is a sql query
@@ -128,6 +128,7 @@ pub struct TaskArgs<'a> {
pub catalog_manager: CatalogManagerRef,
pub shutdown_rx: oneshot::Receiver<()>,
pub batch_opts: Arc<BatchingModeOptions>,
pub flow_eval_interval: Option<Duration>,
}
pub struct PlanInfo {
@@ -150,6 +151,7 @@ impl BatchingTask {
catalog_manager,
shutdown_rx,
batch_opts,
flow_eval_interval,
}: TaskArgs<'_>,
) -> Result<Self, Error> {
Ok(Self {
@@ -164,6 +166,7 @@ impl BatchingTask {
output_schema: plan.schema().clone(),
query_type: determine_query_type(query, &query_ctx)?,
batch_opts,
flow_eval_interval,
}),
state: Arc::new(RwLock::new(TaskState::new(query_ctx, shutdown_rx))),
})
@@ -452,6 +455,13 @@ impl BatchingTask {
) {
let flow_id_str = self.config.flow_id.to_string();
let mut max_window_cnt = None;
let mut interval = self
.config
.flow_eval_interval
.map(|d| tokio::time::interval(d));
if let Some(tick) = &mut interval {
tick.tick().await; // consume the first tick, which completes immediately
}
loop {
// first check if shutdown signal is received
// if so, break the loop
@@ -499,24 +509,33 @@ impl BatchingTask {
max_window_cnt = max_window_cnt.map(|cnt| {
(cnt + 1).min(self.config.batch_opts.experimental_max_filter_num_per_query)
});
let sleep_until = {
let state = self.state.write().unwrap();
let time_window_size = self
.config
.time_window_expr
.as_ref()
.and_then(|t| *t.time_window_size());
// use interval-based ticking when an eval interval is set
if let Some(eval_interval) = &mut interval {
eval_interval.tick().await;
} else {
// if not explicitly set, calculate the next start time automatically
// from the time window size and other parameters
let sleep_until = {
let state = self.state.write().unwrap();
state.get_next_start_query_time(
self.config.flow_id,
&time_window_size,
min_refresh,
Some(self.config.batch_opts.query_timeout),
self.config.batch_opts.experimental_max_filter_num_per_query,
)
let time_window_size = self
.config
.time_window_expr
.as_ref()
.and_then(|t| *t.time_window_size());
state.get_next_start_query_time(
self.config.flow_id,
&time_window_size,
min_refresh,
Some(self.config.batch_opts.query_timeout),
self.config.batch_opts.experimental_max_filter_num_per_query,
)
};
tokio::time::sleep_until(sleep_until).await;
};
tokio::time::sleep_until(sleep_until).await;
}
// no new data, sleep for some time before checking for new data
Ok(None) => {
@@ -569,7 +588,7 @@ impl BatchingTask {
let query_ctx = self.state.read().unwrap().query_ctx.clone();
let plan =
sql_to_df_plan(query_ctx.clone(), engine.clone(), &self.config.query, true).await?;
create_table_with_expr(&plan, &self.config.sink_table_name)
create_table_with_expr(&plan, &self.config.sink_table_name, &self.config.query_type)
}
/// Will merge and use the first ten time windows in the query.
@@ -711,12 +730,12 @@ impl BatchingTask {
fn create_table_with_expr(
plan: &LogicalPlan,
sink_table_name: &[String; 3],
query_type: &QueryType,
) -> Result<CreateTableExpr, Error> {
let fields = plan.schema().fields();
let (first_time_stamp, primary_keys) = build_primary_key_constraint(plan, fields)?;
let (first_time_stamp, primary_keys) = build_primary_key_constraint(plan)?;
let mut column_schemas = Vec::new();
for field in fields {
for field in plan.schema().fields() {
let name = field.name();
let ty = ConcreteDataType::from_arrow_type(field.data_type());
let col_schema = if first_time_stamp == Some(name.clone()) {
@@ -724,15 +743,40 @@ fn create_table_with_expr(
} else {
ColumnSchema::new(name, ty, true)
};
column_schemas.push(col_schema);
match query_type {
QueryType::Sql => {
column_schemas.push(col_schema);
}
QueryType::Tql => {
// value columns are renamed to `val` with type DOUBLE NULL
// tag columns are cast to type STRING NULL
let is_tag_column = primary_keys.contains(name);
let is_val_column = !is_tag_column && first_time_stamp.as_ref() != Some(name);
if is_val_column {
let col_schema =
ColumnSchema::new("val", ConcreteDataType::float64_datatype(), true);
column_schemas.push(col_schema);
} else if is_tag_column {
let col_schema =
ColumnSchema::new(name, ConcreteDataType::string_datatype(), true);
column_schemas.push(col_schema);
} else {
// time index column
column_schemas.push(col_schema);
}
}
}
}
let update_at_schema = ColumnSchema::new(
AUTO_CREATED_UPDATE_AT_TS_COL,
ConcreteDataType::timestamp_millisecond_datatype(),
true,
);
column_schemas.push(update_at_schema);
if query_type == &QueryType::Sql {
let update_at_schema = ColumnSchema::new(
AUTO_CREATED_UPDATE_AT_TS_COL,
ConcreteDataType::timestamp_millisecond_datatype(),
true,
);
column_schemas.push(update_at_schema);
}
let time_index = if let Some(time_index) = first_time_stamp {
time_index
@@ -773,8 +817,8 @@ fn create_table_with_expr(
/// * `Vec<String>` - other columns which are also in group by clause
fn build_primary_key_constraint(
plan: &LogicalPlan,
schema: &Fields,
) -> Result<(Option<String>, Vec<String>), Error> {
let fields = plan.schema().fields();
let mut pk_names = FindGroupByFinalName::default();
plan.visit(&mut pk_names)
@@ -782,19 +826,23 @@ fn build_primary_key_constraint(
context: format!("Can't find aggr expr in plan {plan:?}"),
})?;
// if no group by clause, return empty
// if there is no group by clause, return no primary keys and use the first timestamp column found in the output schema
let pk_final_names = pk_names.get_group_expr_names().unwrap_or_default();
if pk_final_names.is_empty() {
return Ok((None, Vec::new()));
let first_ts_col = fields
.iter()
.find(|f| ConcreteDataType::from_arrow_type(f.data_type()).is_timestamp())
.map(|f| f.name().clone());
return Ok((first_ts_col, Vec::new()));
}
let all_pk_cols: Vec<_> = schema
let all_pk_cols: Vec<_> = fields
.iter()
.filter(|f| pk_final_names.contains(f.name()))
.map(|f| f.name().clone())
.collect();
// the auto-created table uses the first timestamp column in the group by clause as the time index
let first_time_stamp = schema
let first_time_stamp = fields
.iter()
.find(|f| {
all_pk_cols.contains(&f.name().clone())
@@ -853,13 +901,13 @@ mod test {
ColumnSchema::new(
"ts",
ConcreteDataType::timestamp_millisecond_datatype(),
true,
),
false,
)
.with_time_index(true),
update_at_schema.clone(),
ts_placeholder_schema.clone(),
],
primary_keys: vec![],
time_index: AUTO_CREATED_PLACEHOLDER_TS_COL.to_string(),
time_index: "ts".to_string(),
},
TestCase {
sql: "SELECT number, max(ts) FROM numbers_with_ts GROUP BY number".to_string(),
@@ -926,6 +974,7 @@ mod test {
"public".to_string(),
tc.sink_table_name.clone(),
],
&QueryType::Sql,
)
.unwrap();
// TODO(discord9): assert expr
@@ -934,9 +983,9 @@ mod test {
.iter()
.map(|c| try_as_column_schema(c).unwrap())
.collect::<Vec<_>>();
assert_eq!(tc.column_schemas, column_schemas);
assert_eq!(tc.primary_keys, expr.primary_keys);
assert_eq!(tc.time_index, expr.time_index);
assert_eq!(tc.column_schemas, column_schemas, "{:?}", tc.sql);
assert_eq!(tc.primary_keys, expr.primary_keys, "{:?}", tc.sql);
assert_eq!(tc.time_index, expr.time_index, "{:?}", tc.sql);
}
}
}
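On the eval-interval ticking added to `BatchingTask::start` above: `tokio::time::interval` completes its first tick immediately, which is why one tick is consumed before entering the loop. A standalone sketch of the same pattern, with illustrative names and durations:

```rust
use std::time::Duration;

#[tokio::main]
async fn main() {
    // Optional fixed eval interval, mirroring `flow_eval_interval`.
    let eval_interval: Option<Duration> = Some(Duration::from_secs(60));

    let mut ticker = eval_interval.map(tokio::time::interval);
    if let Some(tick) = &mut ticker {
        // The first tick resolves immediately; consuming it here makes the
        // loop below wait a full period before the first scheduled run.
        tick.tick().await;
    }

    for _round in 0..3 {
        // ... evaluate the flow query here ...
        if let Some(tick) = &mut ticker {
            tick.tick().await; // fixed-interval scheduling
        } else {
            // fallback: derive the next start time from the time window size
            tokio::time::sleep(Duration::from_secs(5)).await;
        }
    }
}
```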

View File

@@ -258,56 +258,9 @@ impl AddAutoColumnRewriter {
is_rewritten: false,
}
}
}
impl TreeNodeRewriter for AddAutoColumnRewriter {
type Node = LogicalPlan;
fn f_down(&mut self, mut node: Self::Node) -> DfResult<Transformed<Self::Node>> {
if self.is_rewritten {
return Ok(Transformed::no(node));
}
// if is distinct all, wrap it in a projection
if let LogicalPlan::Distinct(Distinct::All(_)) = &node {
let mut exprs = vec![];
for field in node.schema().fields().iter() {
exprs.push(Expr::Column(datafusion::common::Column::new_unqualified(
field.name(),
)));
}
let projection =
LogicalPlan::Projection(Projection::try_new(exprs, Arc::new(node.clone()))?);
node = projection;
}
// handle table_scan by wrap it in a projection
else if let LogicalPlan::TableScan(table_scan) = node {
let mut exprs = vec![];
for field in table_scan.projected_schema.fields().iter() {
exprs.push(Expr::Column(datafusion::common::Column::new(
Some(table_scan.table_name.clone()),
field.name(),
)));
}
let projection = LogicalPlan::Projection(Projection::try_new(
exprs,
Arc::new(LogicalPlan::TableScan(table_scan)),
)?);
node = projection;
}
// only do rewrite if found the outermost projection
let mut exprs = if let LogicalPlan::Projection(project) = &node {
project.expr.clone()
} else {
return Ok(Transformed::no(node));
};
/// Modify the exprs so that they match the schema, adding auto columns where needed.
fn modify_project_exprs(&mut self, mut exprs: Vec<Expr>) -> DfResult<Vec<Expr>> {
let all_names = self
.schema
.column_schemas()
@@ -391,10 +344,76 @@ impl TreeNodeRewriter for AddAutoColumnRewriter {
query_col_cnt, exprs, table_col_cnt, self.schema.column_schemas()
)));
}
Ok(exprs)
}
}
self.is_rewritten = true;
let new_plan = node.with_new_exprs(exprs, node.inputs().into_iter().cloned().collect())?;
Ok(Transformed::yes(new_plan))
impl TreeNodeRewriter for AddAutoColumnRewriter {
type Node = LogicalPlan;
fn f_down(&mut self, mut node: Self::Node) -> DfResult<Transformed<Self::Node>> {
if self.is_rewritten {
return Ok(Transformed::no(node));
}
// if is distinct all, wrap it in a projection
if let LogicalPlan::Distinct(Distinct::All(_)) = &node {
let mut exprs = vec![];
for field in node.schema().fields().iter() {
exprs.push(Expr::Column(datafusion::common::Column::new_unqualified(
field.name(),
)));
}
let projection =
LogicalPlan::Projection(Projection::try_new(exprs, Arc::new(node.clone()))?);
node = projection;
}
// handle a table scan by wrapping it in a projection
else if let LogicalPlan::TableScan(table_scan) = node {
let mut exprs = vec![];
for field in table_scan.projected_schema.fields().iter() {
exprs.push(Expr::Column(datafusion::common::Column::new(
Some(table_scan.table_name.clone()),
field.name(),
)));
}
let projection = LogicalPlan::Projection(Projection::try_new(
exprs,
Arc::new(LogicalPlan::TableScan(table_scan)),
)?);
node = projection;
}
// only rewrite once the outermost projection is found:
// if the outermost node is a projection, rewrite its exprs;
// otherwise, wrap the plan in a projection first
if let LogicalPlan::Projection(project) = &node {
let exprs = project.expr.clone();
let exprs = self.modify_project_exprs(exprs)?;
self.is_rewritten = true;
let new_plan =
node.with_new_exprs(exprs, node.inputs().into_iter().cloned().collect())?;
Ok(Transformed::yes(new_plan))
} else {
// wrap the logical plan in a projection
let mut exprs = vec![];
for field in node.schema().fields().iter() {
exprs.push(Expr::Column(datafusion::common::Column::new_unqualified(
field.name(),
)));
}
let exprs = self.modify_project_exprs(exprs)?;
self.is_rewritten = true;
let new_plan =
LogicalPlan::Projection(Projection::try_new(exprs, Arc::new(node.clone()))?);
Ok(Transformed::yes(new_plan))
}
}
/// We might add new columns, so we need to recompute the schema

View File

@@ -70,6 +70,7 @@ pub struct CreateFlowArgs {
pub create_if_not_exists: bool,
pub or_replace: bool,
pub expire_after: Option<i64>,
pub eval_interval: Option<i64>,
pub comment: Option<String>,
pub sql: String,
pub flow_options: HashMap<String, String>,

View File

@@ -16,6 +16,7 @@
use std::any::Any;
use api::v1::CreateTableExpr;
use arrow_schema::ArrowError;
use common_error::ext::BoxedError;
use common_error::{define_into_tonic_status, from_err_code_msg_to_header};
@@ -60,6 +61,14 @@ pub enum Error {
location: Location,
},
#[snafu(display("Error encountered while creating sink table for flow: {create:?}"))]
CreateSinkTable {
create: CreateTableExpr,
source: BoxedError,
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Time error"))]
Time {
source: common_time::error::Error,
@@ -331,9 +340,10 @@ impl ErrorExt for Error {
| Self::ListFlows { .. } => StatusCode::TableNotFound,
Self::FlowNotFound { .. } => StatusCode::FlowNotFound,
Self::Plan { .. } | Self::Datatypes { .. } => StatusCode::PlanQuery,
Self::CreateFlow { .. } | Self::Arrow { .. } | Self::Time { .. } => {
StatusCode::EngineExecuteQuery
}
Self::CreateFlow { .. }
| Self::CreateSinkTable { .. }
| Self::Arrow { .. }
| Self::Time { .. } => StatusCode::EngineExecuteQuery,
Self::Unexpected { .. }
| Self::SyncCheckTask { .. }
| Self::IllegalCheckTaskState { .. } => StatusCode::Unexpected,

View File

@@ -218,6 +218,7 @@ impl HeartbeatTask {
if let Some(message) = message {
Self::new_heartbeat_request(&heartbeat_request, Some(message), &latest_report)
} else {
warn!("Sender has been dropped, exiting the heartbeat loop");
// Receiving None means the sender was dropped; break out of the current loop
break
}
@@ -259,7 +260,11 @@ impl HeartbeatTask {
error!(e; "Error while handling heartbeat response");
}
}
Ok(None) => break,
Ok(None) => {
warn!("Heartbeat response stream closed");
capture_self.start_with_retry(retry_interval).await;
break;
}
Err(e) => {
error!(e; "Occur error while reading heartbeat response");
capture_self.start_with_retry(retry_interval).await;

View File

@@ -337,12 +337,6 @@ pub enum Error {
source: BoxedError,
},
#[snafu(display("In-flight write bytes exceeded the maximum limit"))]
InFlightWriteBytesExceeded {
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Failed to decode logical plan from substrait"))]
SubstraitDecodeLogicalPlan {
#[snafu(implicit)]
@@ -369,6 +363,14 @@ pub enum Error {
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Failed to acquire more permits from limiter"))]
AcquireLimiter {
#[snafu(source)]
error: tokio::sync::AcquireError,
#[snafu(implicit)]
location: Location,
},
}
pub type Result<T> = std::result::Result<T, Error>;
@@ -444,13 +446,13 @@ impl ErrorExt for Error {
Error::TableOperation { source, .. } => source.status_code(),
Error::InFlightWriteBytesExceeded { .. } => StatusCode::RateLimited,
Error::DataFusion { error, .. } => datafusion_status_code::<Self>(error, None),
Error::Cancelled { .. } => StatusCode::Cancelled,
Error::StatementTimeout { .. } => StatusCode::Cancelled,
Error::AcquireLimiter { .. } => StatusCode::Internal,
}
}

View File

@@ -19,7 +19,10 @@ use client::inserter::{Context, InsertOptions, Inserter};
use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_PRIVATE_SCHEMA_NAME};
use common_error::ext::BoxedError;
use common_event_recorder::error::{InsertEventsSnafu, Result};
use common_event_recorder::{build_row_inserts_request, group_events_by_type, Event, EventHandler};
use common_event_recorder::{
build_row_inserts_request, group_events_by_type, Event, EventHandler,
DEFAULT_COMPACTION_TIME_WINDOW,
};
use common_frontend::slow_query_event::SLOW_QUERY_EVENT_TYPE;
use datafusion::common::HashMap;
use operator::statement::{InserterImpl, StatementExecutorRef};
@@ -47,6 +50,7 @@ impl EventHandlerImpl {
Some(InsertOptions {
ttl: slow_query_ttl,
append_mode: true,
twcs_compaction_time_window: Some(DEFAULT_COMPACTION_TIME_WINDOW),
}),
)) as _,
)]),
@@ -55,6 +59,7 @@ impl EventHandlerImpl {
Some(InsertOptions {
ttl: global_ttl,
append_mode: true,
twcs_compaction_time_window: Some(DEFAULT_COMPACTION_TIME_WINDOW),
}),
)),
}

View File

@@ -23,7 +23,7 @@ use common_meta::heartbeat::handler::{
};
use common_meta::heartbeat::mailbox::{HeartbeatMailbox, MailboxRef, OutgoingMessage};
use common_meta::heartbeat::utils::outgoing_message_to_mailbox_message;
use common_telemetry::{debug, error, info};
use common_telemetry::{debug, error, info, warn};
use meta_client::client::{HeartbeatSender, HeartbeatStream, MetaClient};
use servers::addrs;
use servers::heartbeat_options::HeartbeatOptions;
@@ -42,8 +42,8 @@ use crate::metrics::{HEARTBEAT_RECV_COUNT, HEARTBEAT_SENT_COUNT};
pub struct HeartbeatTask {
peer_addr: String,
meta_client: Arc<MetaClient>,
report_interval: u64,
retry_interval: u64,
report_interval: Duration,
retry_interval: Duration,
resp_handler_executor: HeartbeatResponseHandlerExecutorRef,
start_time_ms: u64,
}
@@ -65,8 +65,8 @@ impl HeartbeatTask {
addrs::resolve_addr(&opts.grpc.bind_addr, Some(&opts.grpc.server_addr))
},
meta_client,
report_interval: heartbeat_opts.interval.as_millis() as u64,
retry_interval: heartbeat_opts.retry_interval.as_millis() as u64,
report_interval: heartbeat_opts.interval,
retry_interval: heartbeat_opts.retry_interval,
resp_handler_executor,
start_time_ms: common_time::util::current_time_millis() as u64,
}
@@ -110,13 +110,15 @@ impl HeartbeatTask {
HEARTBEAT_RECV_COUNT.with_label_values(&["success"]).inc();
}
}
Ok(None) => break,
Ok(None) => {
warn!("Heartbeat response stream closed");
capture_self.start_with_retry(retry_interval).await;
break;
}
Err(e) => {
HEARTBEAT_RECV_COUNT.with_label_values(&["error"]).inc();
error!(e; "Occur error while reading heartbeat response");
capture_self
.start_with_retry(Duration::from_millis(retry_interval))
.await;
capture_self.start_with_retry(retry_interval).await;
break;
}
@@ -184,12 +186,13 @@ impl HeartbeatTask {
if let Some(message) = message {
Self::new_heartbeat_request(&heartbeat_request, Some(message))
} else {
warn!("Sender has been dropped, exiting the heartbeat loop");
// Receiving None means the sender has been dropped, so break out of the heartbeat loop
break
}
}
_ = &mut sleep => {
sleep.as_mut().reset(Instant::now() + Duration::from_millis(report_interval));
sleep.as_mut().reset(Instant::now() + report_interval);
Self::new_heartbeat_request(&heartbeat_request, None)
}
};
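
Storing the intervals as Duration removes the repeated Duration::from_millis conversions around the sleep timer and the retry path. A rough, self-contained sketch of the resulting report-loop shape, simplified to the timer only (report_loop and its arguments are illustrative, not the crate's API):

use std::time::Duration;
use tokio::time::{sleep, Instant};

// Illustrative skeleton of a periodic report loop with Duration-typed intervals.
async fn report_loop(report_interval: Duration, mut ticks: u32) {
    let sleep = sleep(report_interval);
    tokio::pin!(sleep);

    loop {
        // Wait for the next tick, then push the deadline forward by one interval.
        sleep.as_mut().await;
        sleep.as_mut().reset(Instant::now() + report_interval);

        // ... build and send a heartbeat request here ...
        ticks -= 1;
        if ticks == 0 {
            break;
        }
    }
}

#[tokio::main]
async fn main() {
    report_loop(Duration::from_millis(10), 3).await;
}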

View File

@@ -207,7 +207,7 @@ impl FrontendBuilder {
.options
.max_in_flight_write_bytes
.map(|max_in_flight_write_bytes| {
Arc::new(Limiter::new(max_in_flight_write_bytes.as_bytes()))
Arc::new(Limiter::new(max_in_flight_write_bytes.as_bytes() as usize))
});
Ok(Instance {

View File

@@ -43,9 +43,9 @@ use table::table_name::TableName;
use table::TableRef;
use crate::error::{
CatalogSnafu, Error, ExternalSnafu, InFlightWriteBytesExceededSnafu,
IncompleteGrpcRequestSnafu, NotSupportedSnafu, PermissionSnafu, PlanStatementSnafu, Result,
SubstraitDecodeLogicalPlanSnafu, TableNotFoundSnafu, TableOperationSnafu,
CatalogSnafu, Error, ExternalSnafu, IncompleteGrpcRequestSnafu, NotSupportedSnafu,
PermissionSnafu, PlanStatementSnafu, Result, SubstraitDecodeLogicalPlanSnafu,
TableNotFoundSnafu, TableOperationSnafu,
};
use crate::instance::{attach_timer, Instance};
use crate::metrics::{
@@ -68,11 +68,7 @@ impl GrpcQueryHandler for Instance {
.context(PermissionSnafu)?;
let _guard = if let Some(limiter) = &self.limiter {
let result = limiter.limit_request(&request);
if result.is_none() {
return InFlightWriteBytesExceededSnafu.fail();
}
result
Some(limiter.limit_request(&request).await?)
} else {
None
};
@@ -247,6 +243,7 @@ impl GrpcQueryHandler for Instance {
table_ref: &mut Option<TableRef>,
decoder: &mut FlightDecoder,
data: FlightData,
ctx: QueryContextRef,
) -> Result<AffectedRows> {
let table = if let Some(table) = table_ref {
table.clone()
@@ -268,6 +265,18 @@ impl GrpcQueryHandler for Instance {
table
};
let interceptor_ref = self.plugins.get::<GrpcQueryInterceptorRef<Error>>();
let interceptor = interceptor_ref.as_ref();
interceptor.pre_bulk_insert(table.clone(), ctx.clone())?;
self.plugins
.get::<PermissionCheckerRef>()
.as_ref()
.check_permission(ctx.current_user(), PermissionReq::BulkInsert)
.context(PermissionSnafu)?;
// TODO: should the in-flight write limiter also apply to bulk inserts?
self.inserter
.handle_bulk_insert(table, decoder, data)
.await

View File

@@ -16,7 +16,7 @@ use async_trait::async_trait;
use auth::{PermissionChecker, PermissionCheckerRef, PermissionReq};
use client::Output;
use common_error::ext::BoxedError;
use servers::error::{AuthSnafu, Error, InFlightWriteBytesExceededSnafu};
use servers::error::{AuthSnafu, Error, OtherSnafu};
use servers::influxdb::InfluxdbRequest;
use servers::interceptor::{LineProtocolInterceptor, LineProtocolInterceptorRef};
use servers::query_handler::InfluxdbLineProtocolHandler;
@@ -47,11 +47,13 @@ impl InfluxdbLineProtocolHandler for Instance {
.await?;
let _guard = if let Some(limiter) = &self.limiter {
let result = limiter.limit_row_inserts(&requests);
if result.is_none() {
return InFlightWriteBytesExceededSnafu.fail();
}
result
Some(
limiter
.limit_row_inserts(&requests)
.await
.map_err(BoxedError::new)
.context(OtherSnafu)?,
)
} else {
None
};
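
The same guard pattern repeats across the protocol handlers: the acquired permit is bound to a variable that lives for the rest of the handler, so the reserved bytes are released automatically when the insert finishes or the handler returns early. A small, self-contained illustration of why the explicit binding matters (tokio only; names are illustrative):

use std::sync::Arc;
use tokio::sync::Semaphore;

#[tokio::main]
async fn main() {
    let permits = Arc::new(Semaphore::new(8));

    {
        // Holding the permit in a named guard keeps the bytes reserved
        // for the whole scope of the request handler.
        let _guard = permits.clone().acquire_many_owned(5).await.unwrap();
        assert_eq!(permits.available_permits(), 3);
    } // `_guard` dropped here: the permits are returned.

    assert_eq!(permits.available_permits(), 8);

    // Binding to `_` instead of `_guard` drops the permit at the end of the
    // statement, which would defeat the limiter entirely.
    let _ = permits.clone().acquire_many_owned(5).await.unwrap();
    assert_eq!(permits.available_permits(), 8);
}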

View File

@@ -23,8 +23,8 @@ use datatypes::timestamp::TimestampNanosecond;
use pipeline::pipeline_operator::PipelineOperator;
use pipeline::{Pipeline, PipelineInfo, PipelineVersion};
use servers::error::{
AuthSnafu, Error as ServerError, ExecuteGrpcRequestSnafu, InFlightWriteBytesExceededSnafu,
PipelineSnafu, Result as ServerResult,
AuthSnafu, Error as ServerError, ExecuteGrpcRequestSnafu, OtherSnafu, PipelineSnafu,
Result as ServerResult,
};
use servers::interceptor::{LogIngestInterceptor, LogIngestInterceptorRef};
use servers::query_handler::PipelineHandler;
@@ -125,11 +125,13 @@ impl Instance {
ctx: QueryContextRef,
) -> ServerResult<Output> {
let _guard = if let Some(limiter) = &self.limiter {
let result = limiter.limit_row_inserts(&log);
if result.is_none() {
return InFlightWriteBytesExceededSnafu.fail();
}
result
Some(
limiter
.limit_row_inserts(&log)
.await
.map_err(BoxedError::new)
.context(OtherSnafu)?,
)
} else {
None
};
@@ -147,11 +149,13 @@ impl Instance {
ctx: QueryContextRef,
) -> ServerResult<Output> {
let _guard = if let Some(limiter) = &self.limiter {
let result = limiter.limit_row_inserts(&rows);
if result.is_none() {
return InFlightWriteBytesExceededSnafu.fail();
}
result
Some(
limiter
.limit_row_inserts(&rows)
.await
.map_err(BoxedError::new)
.context(OtherSnafu)?,
)
} else {
None
};

View File

@@ -16,8 +16,7 @@ use async_trait::async_trait;
use auth::{PermissionChecker, PermissionCheckerRef, PermissionReq};
use common_error::ext::BoxedError;
use common_telemetry::tracing;
use servers::error as server_error;
use servers::error::{AuthSnafu, InFlightWriteBytesExceededSnafu};
use servers::error::{self as server_error, AuthSnafu, ExecuteGrpcQuerySnafu, OtherSnafu};
use servers::opentsdb::codec::DataPoint;
use servers::opentsdb::data_point_to_grpc_row_insert_requests;
use servers::query_handler::OpentsdbProtocolHandler;
@@ -43,11 +42,13 @@ impl OpentsdbProtocolHandler for Instance {
let (requests, _) = data_point_to_grpc_row_insert_requests(data_points)?;
let _guard = if let Some(limiter) = &self.limiter {
let result = limiter.limit_row_inserts(&requests);
if result.is_none() {
return InFlightWriteBytesExceededSnafu.fail();
}
result
Some(
limiter
.limit_row_inserts(&requests)
.await
.map_err(BoxedError::new)
.context(OtherSnafu)?,
)
} else {
None
};
@@ -57,7 +58,7 @@ impl OpentsdbProtocolHandler for Instance {
.handle_row_inserts(requests, ctx, true, true)
.await
.map_err(BoxedError::new)
.context(servers::error::ExecuteGrpcQuerySnafu)?;
.context(ExecuteGrpcQuerySnafu)?;
Ok(match output.data {
common_query::OutputData::AffectedRows(rows) => rows,

View File

@@ -24,7 +24,7 @@ use opentelemetry_proto::tonic::collector::logs::v1::ExportLogsServiceRequest;
use opentelemetry_proto::tonic::collector::trace::v1::ExportTraceServiceRequest;
use otel_arrow_rust::proto::opentelemetry::collector::metrics::v1::ExportMetricsServiceRequest;
use pipeline::{GreptimePipelineParams, PipelineWay};
use servers::error::{self, AuthSnafu, InFlightWriteBytesExceededSnafu, Result as ServerResult};
use servers::error::{self, AuthSnafu, OtherSnafu, Result as ServerResult};
use servers::http::prom_store::PHYSICAL_TABLE_PARAM;
use servers::interceptor::{OpenTelemetryProtocolInterceptor, OpenTelemetryProtocolInterceptorRef};
use servers::otlp;
@@ -84,11 +84,13 @@ impl OpenTelemetryProtocolHandler for Instance {
};
let _guard = if let Some(limiter) = &self.limiter {
let result = limiter.limit_row_inserts(&requests);
if result.is_none() {
return InFlightWriteBytesExceededSnafu.fail();
}
result
Some(
limiter
.limit_row_inserts(&requests)
.await
.map_err(BoxedError::new)
.context(OtherSnafu)?,
)
} else {
None
};
@@ -190,11 +192,13 @@ impl OpenTelemetryProtocolHandler for Instance {
.await?;
let _guard = if let Some(limiter) = &self.limiter {
let result = limiter.limit_ctx_req(&opt_req);
if result.is_none() {
return InFlightWriteBytesExceededSnafu.fail();
}
result
Some(
limiter
.limit_ctx_req(&opt_req)
.await
.map_err(BoxedError::new)
.context(OtherSnafu)?,
)
} else {
None
};

View File

@@ -30,7 +30,7 @@ use common_telemetry::{debug, tracing};
use operator::insert::InserterRef;
use operator::statement::StatementExecutor;
use prost::Message;
use servers::error::{self, AuthSnafu, InFlightWriteBytesExceededSnafu, Result as ServerResult};
use servers::error::{self, AuthSnafu, Result as ServerResult};
use servers::http::header::{collect_plan_metrics, CONTENT_ENCODING_SNAPPY, CONTENT_TYPE_PROTOBUF};
use servers::http::prom_store::PHYSICAL_TABLE_PARAM;
use servers::interceptor::{PromStoreProtocolInterceptor, PromStoreProtocolInterceptorRef};
@@ -176,11 +176,13 @@ impl PromStoreProtocolHandler for Instance {
interceptor_ref.pre_write(&request, ctx.clone())?;
let _guard = if let Some(limiter) = &self.limiter {
let result = limiter.limit_row_inserts(&request);
if result.is_none() {
return InFlightWriteBytesExceededSnafu.fail();
}
result
Some(
limiter
.limit_row_inserts(&request)
.await
.map_err(BoxedError::new)
.context(error::OtherSnafu)?,
)
} else {
None
};

View File

@@ -12,7 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Arc;
use api::v1::column::Values;
@@ -21,61 +20,30 @@ use api::v1::value::ValueData;
use api::v1::{
Decimal128, InsertRequests, IntervalMonthDayNano, RowInsertRequest, RowInsertRequests,
};
use common_telemetry::{debug, warn};
use pipeline::ContextReq;
use snafu::ResultExt;
use tokio::sync::{OwnedSemaphorePermit, Semaphore};
use crate::error::{AcquireLimiterSnafu, Result};
pub(crate) type LimiterRef = Arc<Limiter>;
/// A frontend request limiter that controls the total size of in-flight write requests.
/// A frontend request limiter that controls the total size of in-flight write
/// requests.
pub(crate) struct Limiter {
// The maximum number of bytes that can be in flight.
max_in_flight_write_bytes: u64,
// The current in-flight write bytes.
in_flight_write_bytes: Arc<AtomicU64>,
}
/// A counter for the in-flight write bytes.
pub(crate) struct InFlightWriteBytesCounter {
// The current in-flight write bytes.
in_flight_write_bytes: Arc<AtomicU64>,
// The write bytes that are being processed.
processing_write_bytes: u64,
}
impl InFlightWriteBytesCounter {
/// Creates a new InFlightWriteBytesCounter. It will decrease the in-flight write bytes when dropped.
pub fn new(in_flight_write_bytes: Arc<AtomicU64>, processing_write_bytes: u64) -> Self {
debug!(
"processing write bytes: {}, current in-flight write bytes: {}",
processing_write_bytes,
in_flight_write_bytes.load(Ordering::Relaxed)
);
Self {
in_flight_write_bytes,
processing_write_bytes,
}
}
}
impl Drop for InFlightWriteBytesCounter {
// When the request is finished, the in-flight write bytes should be decreased.
fn drop(&mut self) {
self.in_flight_write_bytes
.fetch_sub(self.processing_write_bytes, Ordering::Relaxed);
}
max_in_flight_write_bytes: usize,
byte_counter: Arc<Semaphore>,
}
impl Limiter {
pub fn new(max_in_flight_write_bytes: u64) -> Self {
pub fn new(max_in_flight_write_bytes: usize) -> Self {
Self {
byte_counter: Arc::new(Semaphore::new(max_in_flight_write_bytes)),
max_in_flight_write_bytes,
in_flight_write_bytes: Arc::new(AtomicU64::new(0)),
}
}
pub fn limit_request(&self, request: &Request) -> Option<InFlightWriteBytesCounter> {
pub async fn limit_request(&self, request: &Request) -> Result<OwnedSemaphorePermit> {
let size = match request {
Request::Inserts(requests) => self.insert_requests_data_size(requests),
Request::RowInserts(requests) => {
@@ -83,56 +51,35 @@ impl Limiter {
}
_ => 0,
};
self.limit_in_flight_write_bytes(size as u64)
self.limit_in_flight_write_bytes(size).await
}
pub fn limit_row_inserts(
pub async fn limit_row_inserts(
&self,
requests: &RowInsertRequests,
) -> Option<InFlightWriteBytesCounter> {
) -> Result<OwnedSemaphorePermit> {
let size = self.rows_insert_requests_data_size(requests.inserts.iter());
self.limit_in_flight_write_bytes(size as u64)
self.limit_in_flight_write_bytes(size).await
}
pub fn limit_ctx_req(&self, opt_req: &ContextReq) -> Option<InFlightWriteBytesCounter> {
pub async fn limit_ctx_req(&self, opt_req: &ContextReq) -> Result<OwnedSemaphorePermit> {
let size = self.rows_insert_requests_data_size(opt_req.ref_all_req());
self.limit_in_flight_write_bytes(size as u64)
self.limit_in_flight_write_bytes(size).await
}
/// Returns None if the in-flight write bytes exceed the maximum limit.
/// Otherwise, returns Some(InFlightWriteBytesCounter) and the in-flight write bytes will be increased.
pub fn limit_in_flight_write_bytes(&self, bytes: u64) -> Option<InFlightWriteBytesCounter> {
let result = self.in_flight_write_bytes.fetch_update(
Ordering::Relaxed,
Ordering::Relaxed,
|current| {
if current + bytes > self.max_in_flight_write_bytes {
warn!(
"in-flight write bytes exceed the maximum limit {}, request with {} bytes will be limited",
self.max_in_flight_write_bytes,
bytes
);
return None;
}
Some(current + bytes)
},
);
match result {
// Update the in-flight write bytes successfully.
Ok(_) => Some(InFlightWriteBytesCounter::new(
self.in_flight_write_bytes.clone(),
bytes,
)),
// It means the in-flight write bytes exceed the maximum limit.
Err(_) => None,
}
/// Waits until enough in-flight write bytes are available, returning a permit that releases them on drop.
pub async fn limit_in_flight_write_bytes(&self, bytes: usize) -> Result<OwnedSemaphorePermit> {
self.byte_counter
.clone()
.acquire_many_owned(bytes as u32)
.await
.context(AcquireLimiterSnafu)
}
/// Returns the current in-flight write bytes.
#[allow(dead_code)]
pub fn in_flight_write_bytes(&self) -> u64 {
self.in_flight_write_bytes.load(Ordering::Relaxed)
pub fn in_flight_write_bytes(&self) -> usize {
self.max_in_flight_write_bytes - self.byte_counter.available_permits()
}
fn insert_requests_data_size(&self, request: &InsertRequests) -> usize {
@@ -270,8 +217,10 @@ mod tests {
for _ in 0..tasks_count {
let limiter = limiter_ref.clone();
let handle = tokio::spawn(async move {
let result = limiter.limit_request(&generate_request(request_data_size));
assert!(result.is_some());
let result = limiter
.limit_request(&generate_request(request_data_size))
.await;
assert!(result.is_ok());
});
handles.push(handle);
}
@@ -282,23 +231,27 @@ mod tests {
}
}
#[test]
fn test_in_flight_write_bytes() {
#[tokio::test]
async fn test_in_flight_write_bytes() {
let limiter_ref: LimiterRef = Arc::new(Limiter::new(1024));
let req1 = generate_request(100);
let result1 = limiter_ref.limit_request(&req1);
assert!(result1.is_some());
let result1 = limiter_ref
.limit_request(&req1)
.await
.expect("failed to acquire permits");
assert_eq!(limiter_ref.in_flight_write_bytes(), 100);
let req2 = generate_request(200);
let result2 = limiter_ref.limit_request(&req2);
assert!(result2.is_some());
let result2 = limiter_ref
.limit_request(&req2)
.await
.expect("failed to acquire permits");
assert_eq!(limiter_ref.in_flight_write_bytes(), 300);
drop(result1.unwrap());
drop(result1);
assert_eq!(limiter_ref.in_flight_write_bytes(), 200);
drop(result2.unwrap());
drop(result2);
assert_eq!(limiter_ref.in_flight_write_bytes(), 0);
}
}
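
The rewritten limiter replaces the manual AtomicU64 bookkeeping with a tokio semaphore whose permit count equals the byte budget: a request acquires one permit per byte and the permits flow back when the returned OwnedSemaphorePermit guard is dropped. A minimal, self-contained sketch of the same idea using only tokio (ByteLimiter and the main function below are illustrative, not part of the crate):

use std::sync::Arc;
use tokio::sync::{AcquireError, OwnedSemaphorePermit, Semaphore};

/// Illustrative byte limiter: one semaphore permit per in-flight byte.
struct ByteLimiter {
    max_bytes: usize,
    permits: Arc<Semaphore>,
}

impl ByteLimiter {
    fn new(max_bytes: usize) -> Self {
        Self {
            max_bytes,
            permits: Arc::new(Semaphore::new(max_bytes)),
        }
    }

    /// Waits (asynchronously) until `bytes` permits are available.
    /// The permits are returned to the semaphore when the guard is dropped.
    async fn acquire(&self, bytes: usize) -> Result<OwnedSemaphorePermit, AcquireError> {
        self.permits.clone().acquire_many_owned(bytes as u32).await
    }

    /// Bytes currently held by in-flight requests.
    fn in_flight(&self) -> usize {
        self.max_bytes - self.permits.available_permits()
    }
}

#[tokio::main]
async fn main() {
    let limiter = ByteLimiter::new(1024);
    let guard = limiter.acquire(100).await.expect("semaphore closed");
    assert_eq!(limiter.in_flight(), 100);
    drop(guard); // permits are released here
    assert_eq!(limiter.in_flight(), 0);
}

One property worth keeping in mind with this approach: acquire_many_owned can never succeed if a single request asks for more permits than the semaphore was created with, so the configured max_in_flight_write_bytes presumably has to exceed the largest expected request.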

View File

@@ -170,10 +170,13 @@ where
.name(name)
.database_handler(greptime_request_handler.clone())
.prometheus_handler(self.instance.clone(), user_provider.clone())
.otel_arrow_handler(OtelArrowServiceHandler::new(self.instance.clone()))
.otel_arrow_handler(OtelArrowServiceHandler::new(
self.instance.clone(),
user_provider.clone(),
))
.flight_handler(Arc::new(greptime_request_handler));
let grpc_server = if external {
let grpc_server = if !external {
let frontend_grpc_handler =
FrontendGrpcHandler::new(self.instance.process_manager().clone());
grpc_server.frontend_grpc_handler(frontend_grpc_handler)

View File

@@ -51,6 +51,5 @@ common-test-util.workspace = true
common-wal = { workspace = true, features = ["testing"] }
itertools.workspace = true
rand.workspace = true
rand_distr = "0.4"
rskafka = { workspace = true, features = ["unstable-fuzzing"] }
uuid.workspace = true

View File

@@ -28,6 +28,17 @@ use rskafka::record::RecordAndOffset;
use crate::kafka::index::{NextBatchHint, RegionWalIndexIterator};
pub struct FetchResult {
/// The fetched records along with their offsets.
pub records: Vec<RecordAndOffset>,
/// The high watermark of the partition.
pub high_watermark: i64,
/// The encoded size of the response, in bytes.
pub encoded_response_size: usize,
}
#[async_trait::async_trait]
pub trait FetchClient: std::fmt::Debug + Send + Sync {
/// Fetch records.
@@ -38,7 +49,9 @@ pub trait FetchClient: std::fmt::Debug + Send + Sync {
offset: i64,
bytes: Range<i32>,
max_wait_ms: i32,
) -> rskafka::client::error::Result<(Vec<RecordAndOffset>, i64)>;
) -> rskafka::client::error::Result<FetchResult>;
fn topic(&self) -> &str;
}
#[async_trait::async_trait]
@@ -48,15 +61,25 @@ impl FetchClient for PartitionClient {
offset: i64,
bytes: Range<i32>,
max_wait_ms: i32,
) -> rskafka::client::error::Result<(Vec<RecordAndOffset>, i64)> {
self.fetch_records(offset, bytes, max_wait_ms).await
) -> rskafka::client::error::Result<FetchResult> {
self.fetch_records(offset, bytes, max_wait_ms)
.await
.map(|r| FetchResult {
records: r.records,
high_watermark: r.high_watermark,
encoded_response_size: r.encoded_response_size,
})
}
fn topic(&self) -> &str {
self.topic()
}
}
struct FetchResult {
struct FetchResultInner {
records_and_offsets: Vec<RecordAndOffset>,
batch_size: usize,
fetch_bytes: i32,
fetch_bytes: usize,
watermark: i64,
used_offset: i64,
}
@@ -97,7 +120,23 @@ pub struct Consumer {
/// The fetch future.
#[builder(default = "Fuse::terminated()")]
fetch_fut: Fuse<BoxFuture<'static, rskafka::client::error::Result<FetchResult>>>,
fetch_fut: Fuse<BoxFuture<'static, rskafka::client::error::Result<FetchResultInner>>>,
/// Total fetched bytes.
#[builder(default = "0")]
total_fetched_bytes: u64,
}
impl Consumer {
/// Returns the total fetched bytes.
pub fn total_fetched_bytes(&self) -> u64 {
self.total_fetched_bytes
}
/// Returns the topic name.
pub fn topic(&self) -> &str {
self.client.topic()
}
}
pub(crate) struct RecordsBuffer {
@@ -184,15 +223,20 @@ impl Stream for Consumer {
let fetch_range =
1i32..(bytes.saturating_add(1).min(*this.max_batch_size) as i32);
*this.fetch_fut = FutureExt::fuse(Box::pin(async move {
let (records_and_offsets, watermark) = client
let FetchResult {
records: records_and_offsets,
high_watermark: watermark,
encoded_response_size,
..
} = client
.fetch_records(offset, fetch_range, max_wait_ms)
.await?;
Ok(FetchResult {
Ok(FetchResultInner {
records_and_offsets,
watermark,
used_offset: offset,
fetch_bytes: bytes as i32,
fetch_bytes: encoded_response_size,
batch_size: len,
})
}));
@@ -206,7 +250,7 @@ impl Stream for Consumer {
let data = futures::ready!(this.fetch_fut.poll_unpin(cx));
match data {
Ok(FetchResult {
Ok(FetchResultInner {
mut records_and_offsets,
watermark,
used_offset,
@@ -217,9 +261,10 @@ impl Stream for Consumer {
records_and_offsets.sort_unstable_by_key(|x| x.offset);
*this.last_high_watermark = watermark;
if !records_and_offsets.is_empty() {
*this.avg_record_size = fetch_bytes as usize / records_and_offsets.len();
*this.avg_record_size = fetch_bytes / records_and_offsets.len();
debug!("set avg_record_size: {}", *this.avg_record_size);
}
*this.total_fetched_bytes += fetch_bytes as u64;
debug!(
"Fetch result: {:?}, used_offset: {used_offset}, max_batch_size: {fetch_bytes}, expected batch_num: {batch_size}, actual batch_num: {}",
@@ -254,7 +299,7 @@ mod tests {
use futures::TryStreamExt;
use rskafka::record::{Record, RecordAndOffset};
use super::FetchClient;
use super::*;
use crate::kafka::consumer::{Consumer, RecordsBuffer};
use crate::kafka::index::{MultipleRegionWalIndexIterator, RegionWalRange, RegionWalVecIndex};
@@ -270,7 +315,7 @@ mod tests {
offset: i64,
bytes: Range<i32>,
_max_wait_ms: i32,
) -> rskafka::client::error::Result<(Vec<RecordAndOffset>, i64)> {
) -> rskafka::client::error::Result<FetchResult> {
let record_size = self.record.approximate_size();
let num = (bytes.end.unsigned_abs() as usize / record_size).max(1);
@@ -280,8 +325,18 @@ mod tests {
offset: offset + idx as i64,
})
.collect::<Vec<_>>();
let max_offset = offset + records.len() as i64;
Ok((records, max_offset))
let encoded_response_size = records.iter().map(|r| r.record.approximate_size()).sum();
Ok(FetchResult {
records,
high_watermark: max_offset,
encoded_response_size,
})
}
fn topic(&self) -> &str {
"test"
}
}
@@ -315,6 +370,7 @@ mod tests {
index: Box::new(index),
},
fetch_fut: Fuse::terminated(),
total_fetched_bytes: 0,
};
let records = consumer.try_collect::<Vec<_>>().await.unwrap();
@@ -347,6 +403,7 @@ mod tests {
index: Box::new(index),
},
fetch_fut: Fuse::terminated(),
total_fetched_bytes: 0,
};
let records = consumer.try_collect::<Vec<_>>().await.unwrap();
@@ -388,6 +445,7 @@ mod tests {
index: Box::new(iter),
},
fetch_fut: Fuse::terminated(),
total_fetched_bytes: 0,
};
let records = consumer.try_collect::<Vec<_>>().await.unwrap();
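
With the encoded response size carried in the fetch result, the consumer can derive the average record size from bytes actually received rather than from the requested range, and it can sum an exact total for the read-bytes metric. A small, self-contained sketch of that bookkeeping (FetchStats is an illustrative name, not the crate's type):

/// Illustrative fetch bookkeeping: track total bytes and a running
/// average record size from the encoded response size.
#[derive(Default)]
struct FetchStats {
    total_fetched_bytes: u64,
    avg_record_size: usize,
}

impl FetchStats {
    fn observe(&mut self, encoded_response_size: usize, record_count: usize) {
        if record_count > 0 {
            self.avg_record_size = encoded_response_size / record_count;
        }
        self.total_fetched_bytes += encoded_response_size as u64;
    }
}

fn main() {
    let mut stats = FetchStats::default();
    stats.observe(4096, 16);
    assert_eq!(stats.avg_record_size, 256);
    stats.observe(2048, 0); // empty fetches still count toward the total
    assert_eq!(stats.total_fetched_bytes, 6144);
}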

View File

@@ -14,11 +14,12 @@
use std::collections::{HashMap, HashSet};
use std::sync::Arc;
use std::time::Duration;
use std::time::{Duration, Instant};
use common_base::readable_size::ReadableSize;
use common_meta::datanode::TopicStatsReporter;
use common_meta::distributed_time_constants::TOPIC_STATS_REPORT_INTERVAL_SECS;
use common_telemetry::{debug, warn};
use common_telemetry::{debug, info, warn};
use common_time::util::current_time_millis;
use common_wal::config::kafka::DatanodeKafkaConfig;
use dashmap::DashMap;
@@ -400,6 +401,7 @@ impl LogStore for KafkaLogStore {
let mut entry_records: HashMap<RegionId, Vec<Record>> = HashMap::new();
let provider = provider.clone();
let stream = async_stream::stream!({
let now = Instant::now();
while let Some(consume_result) = stream_consumer.next().await {
// Each next on the stream consumer produces a `RecordAndOffset` and a high watermark offset.
// The `RecordAndOffset` contains the record data and its start offset.
@@ -410,9 +412,6 @@ impl LogStore for KafkaLogStore {
})?;
let (kafka_record, offset) = (record_and_offset.record, record_and_offset.offset);
metrics::METRIC_KAFKA_READ_BYTES_TOTAL
.inc_by(kafka_record.approximate_size() as u64);
debug!(
"Read a record at offset {} for topic {}, high watermark: {}",
offset, provider.topic, high_watermark
@@ -446,6 +445,17 @@ impl LogStore for KafkaLogStore {
break;
}
}
metrics::METRIC_KAFKA_READ_BYTES_TOTAL.inc_by(stream_consumer.total_fetched_bytes());
info!(
"Fetched {} bytes from topic: {}, start_entry_id: {}, end_offset: {}, elapsed: {:?}",
ReadableSize(stream_consumer.total_fetched_bytes()),
stream_consumer.topic(),
entry_id,
end_offset,
now.elapsed()
);
});
Ok(Box::pin(stream))
}

View File

@@ -237,10 +237,17 @@ impl Inner {
.get(addr)
.context(error::CreateChannelSnafu)?;
let max_decoding_message_size = self
.channel_manager
.config()
.max_recv_message_size
.as_bytes() as usize;
Ok(StoreClient::new(channel)
.accept_compressed(CompressionEncoding::Gzip)
.accept_compressed(CompressionEncoding::Zstd)
.send_compressed(CompressionEncoding::Zstd))
.send_compressed(CompressionEncoding::Zstd)
.max_decoding_message_size(max_decoding_message_size))
}
#[inline]

View File

@@ -59,7 +59,6 @@ humantime-serde.workspace = true
hyper-util = { workspace = true, features = ["tokio"] }
itertools.workspace = true
lazy_static.workspace = true
log-store.workspace = true
once_cell.workspace = true
parking_lot.workspace = true
prometheus.workspace = true
@@ -92,7 +91,6 @@ client = { workspace = true, features = ["testing"] }
common-meta = { workspace = true, features = ["testing"] }
common-procedure-test.workspace = true
common-wal = { workspace = true, features = ["testing"] }
hyper = "0.14"
session.workspace = true
tracing = "0.1"
tracing-subscriber.workspace = true

View File

@@ -21,6 +21,7 @@ use api::v1::meta::procedure_service_server::ProcedureServiceServer;
use api::v1::meta::store_server::StoreServer;
use common_base::Plugins;
use common_config::Configurable;
#[cfg(feature = "pg_kvbackend")]
use common_error::ext::BoxedError;
#[cfg(any(feature = "pg_kvbackend", feature = "mysql_kvbackend"))]
use common_meta::distributed_time_constants::META_LEASE_SECS;
@@ -40,7 +41,7 @@ use common_telemetry::info;
#[cfg(feature = "pg_kvbackend")]
use deadpool_postgres::{Config, Runtime};
use either::Either;
use etcd_client::Client;
use etcd_client::{Client, ConnectOptions};
use servers::configurator::ConfiguratorRef;
use servers::export_metrics::ExportMetricsTask;
use servers::http::{HttpServer, HttpServerBuilder};
@@ -286,7 +287,8 @@ pub async fn metasrv_builder(
(Some(kv_backend), _) => (kv_backend, None),
(None, BackendImpl::MemoryStore) => (Arc::new(MemoryKvBackend::new()) as _, None),
(None, BackendImpl::EtcdStore) => {
let etcd_client = create_etcd_client(&opts.store_addrs).await?;
let etcd_client =
create_etcd_client_with_tls(&opts.store_addrs, opts.backend_tls.as_ref()).await?;
let kv_backend = EtcdStore::with_etcd_client(etcd_client.clone(), opts.max_txn_ops);
let election = EtcdElection::with_etcd_client(
&opts.grpc.server_addr,
@@ -330,15 +332,21 @@ pub async fn metasrv_builder(
opts.store_key_prefix.clone(),
candidate_lease_ttl,
meta_lease_ttl,
opts.meta_schema_name.as_deref(),
&opts.meta_table_name,
opts.meta_election_lock_id,
)
.await?;
let pool = create_postgres_pool(&opts.store_addrs, opts.backend_tls.clone()).await?;
let kv_backend = PgStore::with_pg_pool(pool, &opts.meta_table_name, opts.max_txn_ops)
.await
.context(error::KvBackendSnafu)?;
let kv_backend = PgStore::with_pg_pool(
pool,
opts.meta_schema_name.as_deref(),
&opts.meta_table_name,
opts.max_txn_ops,
)
.await
.context(error::KvBackendSnafu)?;
(kv_backend, Some(election))
}
@@ -435,12 +443,67 @@ pub async fn metasrv_builder(
}
pub async fn create_etcd_client(store_addrs: &[String]) -> Result<Client> {
create_etcd_client_with_tls(store_addrs, None).await
}
fn build_connection_options(tls_config: Option<&TlsOption>) -> Result<Option<ConnectOptions>> {
use std::fs;
use common_telemetry::debug;
use etcd_client::{Certificate, ConnectOptions, Identity, TlsOptions};
use servers::tls::TlsMode;
// If TLS options are not provided, return None
let Some(tls_config) = tls_config else {
return Ok(None);
};
// If TLS is disabled, return None
if matches!(tls_config.mode, TlsMode::Disable) {
return Ok(None);
}
let mut etcd_tls_opts = TlsOptions::new();
// Set CA certificate if provided
if !tls_config.ca_cert_path.is_empty() {
debug!("Using CA certificate from {}", tls_config.ca_cert_path);
let ca_cert_pem = fs::read(&tls_config.ca_cert_path).context(error::FileIoSnafu {
path: &tls_config.ca_cert_path,
})?;
let ca_cert = Certificate::from_pem(ca_cert_pem);
etcd_tls_opts = etcd_tls_opts.ca_certificate(ca_cert);
}
// Set client identity (cert + key) if both are provided
if !tls_config.cert_path.is_empty() && !tls_config.key_path.is_empty() {
debug!(
"Using client certificate from {} and key from {}",
tls_config.cert_path, tls_config.key_path
);
let cert_pem = fs::read(&tls_config.cert_path).context(error::FileIoSnafu {
path: &tls_config.cert_path,
})?;
let key_pem = fs::read(&tls_config.key_path).context(error::FileIoSnafu {
path: &tls_config.key_path,
})?;
let identity = Identity::from_pem(cert_pem, key_pem);
etcd_tls_opts = etcd_tls_opts.identity(identity);
}
// Enable native TLS roots for additional trust anchors
etcd_tls_opts = etcd_tls_opts.with_native_roots();
Ok(Some(ConnectOptions::new().with_tls(etcd_tls_opts)))
}
pub async fn create_etcd_client_with_tls(
store_addrs: &[String],
tls_config: Option<&TlsOption>,
) -> Result<Client> {
let etcd_endpoints = store_addrs
.iter()
.map(|x| x.trim())
.filter(|x| !x.is_empty())
.collect::<Vec<_>>();
Client::connect(&etcd_endpoints, None)
let connect_options = build_connection_options(tls_config)?;
Client::connect(&etcd_endpoints, connect_options)
.await
.context(error::ConnectEtcdSnafu)
}
@@ -533,3 +596,104 @@ pub async fn create_mysql_pool(store_addrs: &[String]) -> Result<MySqlPool> {
Ok(pool)
}
#[cfg(test)]
mod tests {
use servers::tls::TlsMode;
use super::*;
#[tokio::test]
async fn test_create_etcd_client_tls_without_certs() {
let endpoints: Vec<String> = match std::env::var("GT_ETCD_TLS_ENDPOINTS") {
Ok(endpoints_str) => endpoints_str
.split(',')
.map(|s| s.trim().to_string())
.collect(),
Err(_) => return,
};
let tls_config = TlsOption {
mode: TlsMode::Require,
ca_cert_path: String::new(),
cert_path: String::new(),
key_path: String::new(),
watch: false,
};
let _client = create_etcd_client_with_tls(&endpoints, Some(&tls_config))
.await
.unwrap();
}
#[tokio::test]
async fn test_create_etcd_client_tls_with_client_certs() {
let endpoints: Vec<String> = match std::env::var("GT_ETCD_TLS_ENDPOINTS") {
Ok(endpoints_str) => endpoints_str
.split(',')
.map(|s| s.trim().to_string())
.collect(),
Err(_) => return,
};
let cert_dir = std::env::current_dir()
.unwrap()
.join("tests-integration")
.join("fixtures")
.join("etcd-tls-certs");
if cert_dir.join("client.crt").exists() && cert_dir.join("client-key.pem").exists() {
let tls_config = TlsOption {
mode: TlsMode::Require,
ca_cert_path: String::new(),
cert_path: cert_dir.join("client.crt").to_string_lossy().to_string(),
key_path: cert_dir
.join("client-key.pem")
.to_string_lossy()
.to_string(),
watch: false,
};
let _client = create_etcd_client_with_tls(&endpoints, Some(&tls_config))
.await
.unwrap();
}
}
#[tokio::test]
async fn test_create_etcd_client_tls_with_full_certs() {
let endpoints: Vec<String> = match std::env::var("GT_ETCD_TLS_ENDPOINTS") {
Ok(endpoints_str) => endpoints_str
.split(',')
.map(|s| s.trim().to_string())
.collect(),
Err(_) => return,
};
let cert_dir = std::env::current_dir()
.unwrap()
.join("tests-integration")
.join("fixtures")
.join("etcd-tls-certs");
if cert_dir.join("ca.crt").exists()
&& cert_dir.join("client.crt").exists()
&& cert_dir.join("client-key.pem").exists()
{
let tls_config = TlsOption {
mode: TlsMode::Require,
ca_cert_path: cert_dir.join("ca.crt").to_string_lossy().to_string(),
cert_path: cert_dir.join("client.crt").to_string_lossy().to_string(),
key_path: cert_dir
.join("client-key.pem")
.to_string_lossy()
.to_string(),
watch: false,
};
let _client = create_etcd_client_with_tls(&endpoints, Some(&tls_config))
.await
.unwrap();
}
}
}
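
For reference, the TLS wiring built above maps onto the etcd-client crate roughly as follows. This is a hedged, standalone sketch: it assumes the crate's tls feature is enabled, and the endpoint and certificate paths are placeholders rather than values from the metasrv configuration.

use etcd_client::{Certificate, Client, ConnectOptions, Identity, TlsOptions};

// Hypothetical paths; substitute your own CA and client certificate files.
const CA: &str = "/etc/etcd/ca.crt";
const CERT: &str = "/etc/etcd/client.crt";
const KEY: &str = "/etc/etcd/client-key.pem";

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    let tls = TlsOptions::new()
        .ca_certificate(Certificate::from_pem(std::fs::read(CA)?))
        .identity(Identity::from_pem(std::fs::read(CERT)?, std::fs::read(KEY)?));

    let options = ConnectOptions::new().with_tls(tls);
    let mut client = Client::connect(["https://etcd-0:2379"], Some(options)).await?;

    // Simple smoke test: write and read back one key.
    client.put("greptime/ping", "pong", None).await?;
    let resp = client.get("greptime/ping", None).await?;
    println!("kvs: {}", resp.kvs().len());
    Ok(())
}

The production path above additionally enables native trust roots via with_native_roots() and derives the certificate paths from TlsOption, as shown in build_connection_options.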

View File

@@ -38,6 +38,7 @@ use crate::metasrv::{ElectionRef, LeaderValue, MetasrvNodeInfo};
struct ElectionSqlFactory<'a> {
lock_id: u64,
schema_name: Option<&'a str>,
table_name: &'a str,
}
@@ -88,13 +89,21 @@ struct ElectionSqlSet {
}
impl<'a> ElectionSqlFactory<'a> {
fn new(lock_id: u64, table_name: &'a str) -> Self {
fn new(lock_id: u64, schema_name: Option<&'a str>, table_name: &'a str) -> Self {
Self {
lock_id,
schema_name,
table_name,
}
}
fn table_ident(&self) -> String {
match self.schema_name {
Some(s) if !s.is_empty() => format!("\"{}\".\"{}\"", s, self.table_name),
_ => format!("\"{}\"", self.table_name),
}
}
fn build(self) -> ElectionSqlSet {
ElectionSqlSet {
campaign: self.campaign_sql(),
@@ -116,47 +125,54 @@ impl<'a> ElectionSqlFactory<'a> {
}
fn put_value_with_lease_sql(&self) -> String {
let table = self.table_ident();
format!(
r#"WITH prev AS (
SELECT k, v FROM "{}" WHERE k = $1
SELECT k, v FROM {table} WHERE k = $1
), insert AS (
INSERT INTO "{}"
VALUES($1, convert_to($2 || '{}' || TO_CHAR(CURRENT_TIMESTAMP + INTERVAL '1 second' * $3, 'YYYY-MM-DD HH24:MI:SS.MS'), 'UTF8'))
INSERT INTO {table}
VALUES($1, convert_to($2 || '{lease_sep}' || TO_CHAR(CURRENT_TIMESTAMP + INTERVAL '1 second' * $3, 'YYYY-MM-DD HH24:MI:SS.MS'), 'UTF8'))
ON CONFLICT (k) DO NOTHING
)
SELECT k, v FROM prev;
"#,
self.table_name, self.table_name, LEASE_SEP
table = table,
lease_sep = LEASE_SEP
)
}
fn update_value_with_lease_sql(&self) -> String {
let table = self.table_ident();
format!(
r#"UPDATE "{}"
SET v = convert_to($3 || '{}' || TO_CHAR(CURRENT_TIMESTAMP + INTERVAL '1 second' * $4, 'YYYY-MM-DD HH24:MI:SS.MS'), 'UTF8')
r#"UPDATE {table}
SET v = convert_to($3 || '{lease_sep}' || TO_CHAR(CURRENT_TIMESTAMP + INTERVAL '1 second' * $4, 'YYYY-MM-DD HH24:MI:SS.MS'), 'UTF8')
WHERE k = $1 AND v = $2"#,
self.table_name, LEASE_SEP
table = table,
lease_sep = LEASE_SEP
)
}
fn get_value_with_lease_sql(&self) -> String {
let table = self.table_ident();
format!(
r#"SELECT v, TO_CHAR(CURRENT_TIMESTAMP, 'YYYY-MM-DD HH24:MI:SS.MS') FROM "{}" WHERE k = $1"#,
self.table_name
r#"SELECT v, TO_CHAR(CURRENT_TIMESTAMP, 'YYYY-MM-DD HH24:MI:SS.MS') FROM {table} WHERE k = $1"#,
table = table
)
}
fn get_value_with_lease_by_prefix_sql(&self) -> String {
let table = self.table_ident();
format!(
r#"SELECT v, TO_CHAR(CURRENT_TIMESTAMP, 'YYYY-MM-DD HH24:MI:SS.MS') FROM "{}" WHERE k LIKE $1"#,
self.table_name
r#"SELECT v, TO_CHAR(CURRENT_TIMESTAMP, 'YYYY-MM-DD HH24:MI:SS.MS') FROM {table} WHERE k LIKE $1"#,
table = table
)
}
fn delete_value_sql(&self) -> String {
let table = self.table_ident();
format!(
"DELETE FROM \"{}\" WHERE k = $1 RETURNING k,v;",
self.table_name
"DELETE FROM {table} WHERE k = $1 RETURNING k,v;",
table = table
)
}
}
@@ -299,16 +315,23 @@ impl PgElection {
Ok(())
}
#[allow(clippy::too_many_arguments)]
pub async fn with_pg_client(
leader_value: String,
pg_client: ElectionPgClient,
store_key_prefix: String,
candidate_lease_ttl: Duration,
meta_lease_ttl: Duration,
schema_name: Option<&str>,
table_name: &str,
lock_id: u64,
) -> Result<ElectionRef> {
let sql_factory = ElectionSqlFactory::new(lock_id, table_name);
if let Some(s) = schema_name {
common_telemetry::info!("PgElection uses schema: {}", s);
} else {
common_telemetry::info!("PgElection uses default search_path (no schema provided)");
}
let sql_factory = ElectionSqlFactory::new(lock_id, schema_name, table_name);
let tx = listen_leader_change(leader_value.clone());
Ok(Arc::new(Self {
@@ -638,8 +661,8 @@ impl PgElection {
/// after a period of time during which other leaders have been elected and stepped down.
/// - **Case 1.4**: If no lease information is found, it also steps down and re-initiates the campaign.
///
/// - **Case 2**: If the current instance is not leader previously, it calls the
/// `elected` method as a newly elected leader.
/// - **Case 2**: If the current instance is not leader previously, it calls the `elected` method
/// as a newly elected leader.
async fn leader_action(&self) -> Result<()> {
let key = self.election_key();
// Case 1
@@ -881,7 +904,7 @@ mod tests {
store_key_prefix: uuid,
candidate_lease_ttl,
meta_lease_ttl,
sql_set: ElectionSqlFactory::new(28319, table_name).build(),
sql_set: ElectionSqlFactory::new(28319, None, table_name).build(),
};
let res = pg_election
@@ -969,7 +992,7 @@ mod tests {
store_key_prefix,
candidate_lease_ttl,
meta_lease_ttl,
sql_set: ElectionSqlFactory::new(28319, &table_name).build(),
sql_set: ElectionSqlFactory::new(28319, None, &table_name).build(),
};
let node_info = MetasrvNodeInfo {
@@ -1026,7 +1049,7 @@ mod tests {
store_key_prefix: uuid.clone(),
candidate_lease_ttl,
meta_lease_ttl,
sql_set: ElectionSqlFactory::new(28319, table_name).build(),
sql_set: ElectionSqlFactory::new(28319, None, table_name).build(),
};
let candidates = pg_election.all_candidates().await.unwrap();
@@ -1081,7 +1104,7 @@ mod tests {
store_key_prefix: uuid,
candidate_lease_ttl,
meta_lease_ttl,
sql_set: ElectionSqlFactory::new(28320, table_name).build(),
sql_set: ElectionSqlFactory::new(28320, None, table_name).build(),
};
leader_pg_election.elected().await.unwrap();
@@ -1206,7 +1229,7 @@ mod tests {
store_key_prefix: uuid,
candidate_lease_ttl,
meta_lease_ttl,
sql_set: ElectionSqlFactory::new(28321, table_name).build(),
sql_set: ElectionSqlFactory::new(28321, None, table_name).build(),
};
// Step 1: No leader exists, campaign and elected.
@@ -1473,7 +1496,7 @@ mod tests {
store_key_prefix: uuid.clone(),
candidate_lease_ttl,
meta_lease_ttl,
sql_set: ElectionSqlFactory::new(28322, table_name).build(),
sql_set: ElectionSqlFactory::new(28322, None, table_name).build(),
};
let leader_client = create_postgres_client(
@@ -1494,7 +1517,7 @@ mod tests {
store_key_prefix: uuid,
candidate_lease_ttl,
meta_lease_ttl,
sql_set: ElectionSqlFactory::new(28322, table_name).build(),
sql_set: ElectionSqlFactory::new(28322, None, table_name).build(),
};
leader_pg_election
@@ -1578,4 +1601,26 @@ mod tests {
client.reset_client().await.unwrap();
let _ = client.query("SELECT 1", &[]).await.unwrap();
}
#[test]
fn test_election_sql_with_schema() {
let f = ElectionSqlFactory::new(42, Some("test_schema"), "greptime_metakv");
let s = f.build();
assert!(s.campaign.contains("pg_try_advisory_lock"));
assert!(s
.put_value_with_lease
.contains("\"test_schema\".\"greptime_metakv\""));
assert!(s
.update_value_with_lease
.contains("\"test_schema\".\"greptime_metakv\""));
assert!(s
.get_value_with_lease
.contains("\"test_schema\".\"greptime_metakv\""));
assert!(s
.get_value_with_lease_by_prefix
.contains("\"test_schema\".\"greptime_metakv\""));
assert!(s
.delete_value
.contains("\"test_schema\".\"greptime_metakv\""));
}
}
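
The quoting rule itself is small enough to show in isolation: with a schema configured, the factory emits a "schema"."table" reference; otherwise only the quoted table name, leaving resolution to the session's search_path. A self-contained sketch mirroring table_ident above (the free function here is illustrative):

// Illustrative reimplementation of the identifier quoting used above.
fn table_ident(schema_name: Option<&str>, table_name: &str) -> String {
    match schema_name {
        Some(s) if !s.is_empty() => format!("\"{}\".\"{}\"", s, table_name),
        _ => format!("\"{}\"", table_name),
    }
}

fn main() {
    assert_eq!(
        table_ident(Some("test_schema"), "greptime_metakv"),
        r#""test_schema"."greptime_metakv""#
    );
    // No schema (or an empty one) falls back to the unqualified, quoted table name.
    assert_eq!(table_ident(None, "greptime_metakv"), r#""greptime_metakv""#);
    assert_eq!(table_ident(Some(""), "greptime_metakv"), r#""greptime_metakv""#);
}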

View File

@@ -257,6 +257,15 @@ pub enum Error {
location: Location,
},
#[snafu(display("Failed to read file: {}", path))]
FileIo {
#[snafu(source)]
error: std::io::Error,
#[snafu(implicit)]
location: Location,
path: String,
},
#[snafu(display("Failed to bind address {}", addr))]
TcpBind {
addr: String,
@@ -970,6 +979,7 @@ impl ErrorExt for Error {
match self {
Error::EtcdFailed { .. }
| Error::ConnectEtcd { .. }
| Error::FileIo { .. }
| Error::TcpBind { .. }
| Error::SerializeToJson { .. }
| Error::DeserializeFromJson { .. }

View File

@@ -41,7 +41,7 @@ pub struct PersistStatsHandler {
}
/// The name of the table to persist region stats.
const META_REGION_STATS_TABLE_NAME: &str = "region_statistics";
const META_REGION_STATS_HISTORY_TABLE_NAME: &str = "region_statistics_history";
/// The default context to persist region stats.
const DEFAULT_CONTEXT: InserterContext = InserterContext {
catalog: DEFAULT_CATALOG_NAME,
@@ -207,7 +207,7 @@ impl PersistStatsHandler {
&DEFAULT_CONTEXT,
RowInsertRequests {
inserts: vec![RowInsertRequest {
table_name: META_REGION_STATS_TABLE_NAME.to_string(),
table_name: META_REGION_STATS_HISTORY_TABLE_NAME.to_string(),
rows: Some(Rows {
schema: PersistRegionStat::schema(),
rows,
@@ -532,7 +532,10 @@ mod tests {
assert_eq!(requests.len(), 1);
requests.pop().unwrap()
};
assert_eq!(request.table_name, META_REGION_STATS_TABLE_NAME.to_string());
assert_eq!(
request.table_name,
META_REGION_STATS_HISTORY_TABLE_NAME.to_string()
);
assert_eq!(request.rows.unwrap().rows, vec![expected_row]);
// Check last persisted time

View File

@@ -200,6 +200,9 @@ pub struct MetasrvOptions {
#[cfg(feature = "pg_kvbackend")]
/// Lock id for meta kv election. Only effect when using pg_kvbackend.
pub meta_election_lock_id: u64,
#[cfg(feature = "pg_kvbackend")]
/// Optional PostgreSQL schema for the metadata table (falls back to the current search_path when unset or empty).
pub meta_schema_name: Option<String>,
#[serde(with = "humantime_serde")]
pub node_max_idle_time: Duration,
/// The event recorder options.
@@ -244,6 +247,8 @@ impl fmt::Debug for MetasrvOptions {
#[cfg(feature = "pg_kvbackend")]
debug_struct.field("meta_election_lock_id", &self.meta_election_lock_id);
#[cfg(feature = "pg_kvbackend")]
debug_struct.field("meta_schema_name", &self.meta_schema_name);
debug_struct
.field("node_max_idle_time", &self.node_max_idle_time)
@@ -297,6 +302,8 @@ impl Default for MetasrvOptions {
meta_table_name: common_meta::kv_backend::DEFAULT_META_TABLE_NAME.to_string(),
#[cfg(feature = "pg_kvbackend")]
meta_election_lock_id: common_meta::kv_backend::DEFAULT_META_ELECTION_LOCK_ID,
#[cfg(feature = "pg_kvbackend")]
meta_schema_name: None,
node_max_idle_time: Duration::from_secs(24 * 60 * 60),
event_recorder: EventRecorderOptions::default(),
stats_persistence: StatsPersistenceOptions::default(),

View File

@@ -15,12 +15,13 @@
use std::path::Path;
use std::sync::atomic::AtomicBool;
use std::sync::{Arc, Mutex, RwLock};
use std::time::Duration;
use client::client_manager::NodeClients;
use client::inserter::InsertOptions;
use common_base::Plugins;
use common_catalog::consts::{MIN_USER_FLOW_ID, MIN_USER_TABLE_ID};
use common_event_recorder::{EventRecorderImpl, EventRecorderRef};
use common_event_recorder::{EventRecorderImpl, EventRecorderRef, DEFAULT_COMPACTION_TIME_WINDOW};
use common_grpc::channel_manager::ChannelConfig;
use common_meta::ddl::flow_meta::FlowMetadataAllocator;
use common_meta::ddl::table_meta::{TableMetadataAllocator, TableMetadataAllocatorRef};
@@ -81,6 +82,9 @@ use crate::state::State;
use crate::table_meta_alloc::MetasrvPeerAllocator;
use crate::utils::insert_forwarder::InsertForwarder;
/// The time window for twcs compaction of the region stats table.
const REGION_STATS_TABLE_TWCS_COMPACTION_TIME_WINDOW: Duration = Duration::from_days(1);
// TODO(fys): try use derive_builder macro
pub struct MetasrvBuilder {
options: Option<MetasrvOptions>,
@@ -205,6 +209,7 @@ impl MetasrvBuilder {
Some(InsertOptions {
ttl: options.event_recorder.ttl,
append_mode: true,
twcs_compaction_time_window: Some(DEFAULT_COMPACTION_TIME_WINDOW),
}),
));
// Builds the event recorder to record important events and persist them as the system table.
@@ -417,6 +422,7 @@ impl MetasrvBuilder {
mailbox.clone(),
options.grpc.server_addr.clone(),
remote_wal_options.flush_trigger_size,
remote_wal_options.checkpoint_trigger_size,
);
region_flush_trigger.try_start()?;
@@ -465,6 +471,9 @@ impl MetasrvBuilder {
Some(InsertOptions {
ttl: options.stats_persistence.ttl,
append_mode: true,
twcs_compaction_time_window: Some(
REGION_STATS_TABLE_TWCS_COMPACTION_TIME_WINDOW,
),
}),
));

View File

@@ -82,5 +82,12 @@ lazy_static! {
.unwrap();
/// The triggered region flush total counter.
pub static ref METRIC_META_TRIGGERED_REGION_FLUSH_TOTAL: IntCounterVec =
register_int_counter_vec!("meta_triggered_region_flushes_total", "meta triggered region flush total", &["topic_name", "region_type"]).unwrap();
register_int_counter_vec!("meta_triggered_region_flush_total", "meta triggered region flush total", &["topic_name", "region_type"]).unwrap();
/// The triggered region checkpoint total counter.
pub static ref METRIC_META_TRIGGERED_REGION_CHECKPOINT_TOTAL: IntCounterVec =
register_int_counter_vec!("meta_triggered_region_checkpoint_total", "meta triggered region checkpoint total", &["topic_name"]).unwrap();
/// The topic estimated replay size.
pub static ref METRIC_META_TOPIC_ESTIMATED_REPLAY_SIZE: IntGaugeVec =
register_int_gauge_vec!("meta_topic_estimated_replay_size", "meta topic estimated replay size", &["topic_name"]).unwrap();
}
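
For completeness, the renamed counter and the new gauge are used like any other labeled Prometheus metric. A minimal, self-contained sketch with the prometheus and lazy_static crates (metric names and label values here are illustrative, not the ones registered above):

use lazy_static::lazy_static;
use prometheus::{register_int_counter_vec, register_int_gauge_vec, IntCounterVec, IntGaugeVec};

lazy_static! {
    static ref EXAMPLE_CHECKPOINT_TOTAL: IntCounterVec = register_int_counter_vec!(
        "example_triggered_region_checkpoint_total",
        "example triggered region checkpoint total",
        &["topic_name"]
    )
    .unwrap();
    static ref EXAMPLE_REPLAY_SIZE: IntGaugeVec = register_int_gauge_vec!(
        "example_topic_estimated_replay_size",
        "example topic estimated replay size",
        &["topic_name"]
    )
    .unwrap();
}

fn main() {
    // Counters only go up; the gauge is set to the latest estimate.
    EXAMPLE_CHECKPOINT_TOTAL
        .with_label_values(&["greptimedb_wal_topic_0"])
        .inc();
    EXAMPLE_REPLAY_SIZE
        .with_label_values(&["greptimedb_wal_topic_0"])
        .set(64 * 1024 * 1024);
}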

View File

@@ -38,6 +38,7 @@ use common_meta::instruction::CacheIdent;
use common_meta::key::datanode_table::{DatanodeTableKey, DatanodeTableValue};
use common_meta::key::table_info::TableInfoValue;
use common_meta::key::table_route::TableRouteValue;
use common_meta::key::topic_region::{ReplayCheckpoint, TopicRegionKey};
use common_meta::key::{DeserializedValueWithBytes, TableMetadataManagerRef};
use common_meta::kv_backend::ResettableKvBackendRef;
use common_meta::lock_key::{CatalogLock, RegionLock, SchemaLock};
@@ -534,6 +535,20 @@ impl Context {
Ok(datanode_value.as_ref().unwrap())
}
/// Fetches the replay checkpoint for the given topic.
pub async fn fetch_replay_checkpoint(&self, topic: &str) -> Result<Option<ReplayCheckpoint>> {
let region_id = self.region_id();
let topic_region_key = TopicRegionKey::new(region_id, topic);
let value = self
.table_metadata_manager
.topic_region_manager()
.get(topic_region_key)
.await
.context(error::TableMetadataManagerSnafu)?;
Ok(value.and_then(|value| value.checkpoint))
}
/// Returns the [RegionId].
pub fn region_id(&self) -> RegionId {
self.persistent_ctx.region_id

Some files were not shown because too many files have changed in this diff Show More