Compare commits

...

98 Commits

Author SHA1 Message Date
discord9
7f7b974e8a fix: conn pool leak & placeholder feature so ci can compile 2025-04-10 15:01:07 +08:00
discord9
4875ace0d0 fix: placeholder feature so ci can compile 2025-04-08 14:37:55 +08:00
discord9
a847d96649 fix: time window filter expr use OR 2025-04-07 16:50:17 +08:00
discord9
23a0a54e18 fix: convert timestamp unit too 2025-04-07 16:50:17 +08:00
discord9
78eb8b53f6 fix: quote&more info when time window too many
chore: even more warning

fix: filter first warn later
2025-04-07 16:50:17 +08:00
discord9
2455f39e8e fix: subquery&cte time window expr 2025-04-07 16:46:53 +08:00
discord9
7fe0074202 refactor: even finer&limit time window num 2025-04-07 16:46:53 +08:00
discord9
e16bc203d0 feat: basic time window aware 2025-04-07 16:46:53 +08:00
discord9
9a3c26bb0a metrics: better bucket&longer timeout 2025-04-07 16:46:53 +08:00
discord9
e1ff398c32 fix: timeout 2025-04-07 16:46:53 +08:00
discord9
780e3000de fix: heartbeat&expire_after unit 2025-04-07 16:46:53 +08:00
discord9
2b5ddf8427 feat: time window in df plan
WIP

test: found out time window expr

chore: pub

tests: also unparsed

tests: rm dup code

feat: frontend client for recording rule

fix: bound edgecase

WIP

WIP

feat: rule engine

feat: add init options & tmp reroute to rule

fix: dist client get

fix: also not handle mirror write in flownode

chore: clippy
2025-04-07 16:46:47 +08:00
Weny Xu
904d560175 feat(promql-planner): introduce vector matching binary operation (#5578)
* feat(promql-planner): support vector matching for binary operation

* test: add sqlness tests
2025-02-27 07:39:19 +00:00
Lei, HUANG
765d1277ee fix(metasrv): clean expired nodes in memory (#5592)
* fix/frontend-node-state: Refactor NodeInfoKey and Context Handling in Meta Server

 • Removed unused cluster_id from NodeInfoKey struct.
 • Updated HeartbeatHandlerGroup to return Context alongside HeartbeatResponse.
 • Added current_node_info to Context for tracking node information.
 • Implemented on_node_disconnect in Context to handle node disconnection events, specifically for Frontend roles.
 • Adjusted register_pusher function to return PusherId directly.
 • Updated tests to accommodate changes in Context structure.

* fix/frontend-node-state: Refactor Heartbeat Handler Context Management

Refactored the HeartbeatHandlerGroup::handle method to use a mutable reference for Context instead of passing it by value. This change simplifies the
context management by eliminating the need to return the context with the response. Updated the Metasrv implementation to align with this new context
handling approach, improving code clarity and reducing unnecessary context cloning.

* revert: clean cluster info on disconnect

* fix/frontend-node-state: Add Frontend Expiry Listener and Update NodeInfoKey Conversion

 • Introduced FrontendExpiryListener to manage the expiration of frontend nodes, including its integration with leadership change notifications.
 • Modified NodeInfoKey conversion to use references, enhancing efficiency and consistency across the codebase.
 • Updated collect_cluster_info_handler and metasrv to incorporate the new listener and conversion changes.
 • Added frontend_expiry module to the project structure for better organization and maintainability.

* chore: add config for node expiry

* add some doc

* fix: clippy

* fix/frontend-node-state:
 ### Refactor Node Expiry Handling
 - **Configuration Update**: Removed `node_expiry_tick` from `metasrv.example.toml` and `MetasrvOptions` in `metasrv.rs`.
 - **Module Renaming**: Renamed `frontend_expiry.rs` to `node_expiry_listener.rs` and updated references in `lib.rs`.
 - **Code Refactoring**: Replaced `FrontendExpiryListener` with `NodeExpiryListener` in `node_expiry_listener.rs` and `metasrv.rs`, removing the tick interval and adjusting logic to use a fixed 60-second interval for node expiry checks.

* fix/frontend-node-state:
 Improve logging in `node_expiry_listener.rs`

 - Enhanced warning message to include peer information when an unrecognized node info key is encountered in `node_expiry_listener.rs`.

* docs: update config docs

* fix/frontend-node-state:
 **Refactor Context Handling in Heartbeat Services**

 - Updated `HeartbeatHandlerGroup` in `handler.rs` to pass `Context` by value instead of by mutable reference, allowing for more flexible context
 management.
 - Modified `Metasrv` implementation in `heartbeat.rs` to clone `Context` when passing to `handle` method, ensuring thread safety and consistency in
 asynchronous operations.
2025-02-27 06:16:36 +00:00
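
The node-expiry behavior this commit describes boils down to a periodic sweep over the in-memory node registry. Below is a minimal, hedged Rust sketch of that idea: the NodeInfo type, map layout, and helper are invented for illustration (the real NodeExpiryListener in node_expiry_listener.rs is async and tied to metasrv leadership); only the fixed 60-second interval comes from the commit message.

```rust
// Illustrative sketch only: the real NodeExpiryListener in node_expiry_listener.rs
// is async and driven by metasrv leadership changes; types here are invented.
use std::collections::HashMap;
use std::time::{Duration, Instant};

struct NodeInfo {
    last_heartbeat: Instant,
}

/// Drop in-memory node records whose last heartbeat is older than `expiry`.
fn evict_expired(nodes: &mut HashMap<u64, NodeInfo>, expiry: Duration) {
    let now = Instant::now();
    nodes.retain(|id, info| {
        let alive = now.duration_since(info.last_heartbeat) < expiry;
        if !alive {
            // The follow-up commit enriches this warning with peer information.
            eprintln!("warn: evicting expired node {id}");
        }
        alive
    });
}

fn main() {
    let mut nodes = HashMap::new();
    nodes.insert(1u64, NodeInfo { last_heartbeat: Instant::now() });
    // The listener runs a sweep like this on a fixed 60-second interval.
    evict_expired(&mut nodes, Duration::from_secs(60));
    println!("{} node(s) still registered", nodes.len());
}
```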
discord9
ccf42a9d97 fix: flow heartbeat retry (#5600)
* fix: flow heartbeat retry

* fix?: not sure if fixed

* chore: per review
2025-02-27 03:58:21 +00:00
Weny Xu
71e2fb895f feat: introduce prom_round fn (#5604)
* feat: introduce `prom_round` fn

* test: add sqlness tests
2025-02-27 03:30:15 +00:00
Ruihang Xia
c9671fd669 feat(promql): implement subquery (#5606)
* feat: initial implement for promql subquery

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* impl and test

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* refactor

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix clippy

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2025-02-27 03:28:04 +00:00
Ruihang Xia
b5efc75aab feat(promql): ignore invalid input in histogram plan (#5607)
Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2025-02-27 03:18:20 +00:00
Weny Xu
c1d18d9980 fix(prom): preserve the order of series in PromQueryResult (#5601)
fix(prom): keep the order of tags
2025-02-26 13:40:09 +00:00
Lei, HUANG
5d9faaaf39 fix(metasrv): reject ddl when metasrv is follower (#5599)
* fix/reject-ddl-in-follower-metasrv:
 Add leader check and logging for gRPC requests in `procedure.rs`

 - Implemented leader verification for `query_procedure_state`, `ddl`, and `procedure_details` gRPC requests in `procedure.rs`.
 - Added logging with `warn` for requests reaching a non-leader node.
 - Introduced `ResponseHeader` and `Error::is_not_leader()` to handle non-leader responses.

* fix/reject-ddl-in-follower-metasrv:
 Improve leader address handling in `heartbeat.rs`

 - Refactor leader address retrieval by renaming `leader` to `leader_addr` for clarity.
 - Update `make_client` function to use a reference to `leader_addr`.
 - Enhance logging to include the leader address in the success message for creating a heartbeat stream.

* fmt

* fix/reject-ddl-in-follower-metasrv:
 **Enhance Leader Check in `procedure.rs`**

 - Updated the leader verification logic in `procedure.rs` to return a failed `MigrateRegionResponse` when the server is not the leader.
 - Added logging to warn when a migrate request is received by a non-leader server.
2025-02-26 08:10:40 +00:00
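
The leader guard added by this commit can be pictured as a small check performed before serving DDL and procedure RPCs. The sketch below is illustrative only: ResponseHeader and check_leader are stand-ins rather than the actual metasrv types, and the real code builds a typed Error::is_not_leader() response instead of a string.

```rust
// Stand-in types for illustration; not the actual metasrv ResponseHeader/Error.
#[derive(Debug)]
struct ResponseHeader {
    error: Option<String>,
}

/// Reject requests that reach a follower instead of the elected leader.
fn check_leader(is_leader: bool, request_name: &str) -> Result<(), ResponseHeader> {
    if is_leader {
        return Ok(());
    }
    // Log a warning and hand back a failed header, as the commit describes.
    eprintln!("warn: {request_name} request reached a non-leader metasrv node");
    Err(ResponseHeader {
        error: Some("not leader".to_string()),
    })
}

fn main() {
    // A DDL request arriving at a follower gets a failed response header back.
    if let Err(header) = check_leader(false, "ddl") {
        println!("rejected: {header:?}");
    }
    assert!(check_leader(true, "query_procedure_state").is_ok());
}
```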
ZonaHe
538875abee feat: update dashboard to v0.7.11 (#5597)
Co-authored-by: sunchanglong <sunchanglong@users.noreply.github.com>
2025-02-26 07:57:59 +00:00
jeremyhi
5ed09c4584 fix: all heartbeat channel need to check leader (#5593) 2025-02-25 10:45:30 +00:00
Yingwen
3f6a41eac5 fix: update show create table output for fulltext index (#5591)
* fix: update full index syntax in show create table

* test: update fulltext sqlness result
2025-02-25 09:36:27 +00:00
yihong
ff0dcf12c5 perf: close issue 4974 by do not delete columns when drop logical region about 100 times faster (#5561)
* perf: do not delete columns when drop logical region in drop database

Signed-off-by: yihong0618 <zouzou0208@gmail.com>

* fix: make ci happy

Signed-off-by: yihong0618 <zouzou0208@gmail.com>

* fix: address review comments

Signed-off-by: yihong0618 <zouzou0208@gmail.com>

* fix: address some comments

Signed-off-by: yihong0618 <zouzou0208@gmail.com>

* fix: drop stupid comments by copilot

Signed-off-by: yihong0618 <zouzou0208@gmail.com>

* chore: minor refactor

* chore: minor refactor

* chore: update greptime-proto

---------

Signed-off-by: yihong0618 <zouzou0208@gmail.com>
Co-authored-by: WenyXu <wenymedia@gmail.com>
2025-02-25 09:00:49 +00:00
Yingwen
5b1fca825a fix: remove cached and uploaded files on failure (#5590) 2025-02-25 08:51:37 +00:00
Ruihang Xia
7bd108e2be feat: impl hll_state, hll_merge and hll_calc for incremental distinct counting (#5579)
* basic impl

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* more tests

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* sqlness test

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix clippy

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* update with more test and logs

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* impl

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* impl merge fn

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* rename function names

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2025-02-24 19:07:37 +00:00
Weny Xu
286f225e50 fix: correct inverted_indexed_column_ids behavior (#5586)
* fix: correct `inverted_indexed_column_ids`

* fix: fix unit tests
2025-02-23 07:17:38 +00:00
Ruihang Xia
4f988b5ba9 feat: remove default inverted index for physical table (#5583)
* feat: remove default inverted index for physical table

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* update sqlness result

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2025-02-22 06:48:05 +00:00
Ruihang Xia
500d0852eb fix: avoid run labeler job concurrently (#5584)
Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2025-02-22 05:18:26 +00:00
Zhenchi
8d05fb3503 feat: unify puffin name passed to stager (#5564)
* feat: purge a given puffin file in staging area

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* polish log

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* ttl set to 2d

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* feat: expose staging_ttl to index config

* feat: unify puffin name passed to stager

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* fix test

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* address comments

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* fallback to remote index

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* fix

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* refactor

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

---------

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>
Co-authored-by: evenyag <realevenyag@gmail.com>
2025-02-21 09:27:03 +00:00
Ruihang Xia
d7b6718be0 feat: run sqlness in parallel (#5499)
* define server mode

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* bump sqlness

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* all good

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* clean up

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* refactor: Move config generation logic from Env to ServerMode

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* finalize

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* change license header

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* rename variables

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* override parallelism

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* rename more variables

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2025-02-21 07:05:19 +00:00
Ruihang Xia
6f0783e17e fix: broken link in AUTHOR.md (#5581) 2025-02-21 07:01:41 +00:00
Ruihang Xia
d69e93b91a feat: support to generate json output for explain analyze in http api (#5567)
* impl

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* integration test

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* Update src/servers/src/http/hints.rs

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* refactor: with FORMAT option for explain format

* lift some well-known metrics

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Ning Sun <sunning@greptime.com>
2025-02-21 05:13:09 +00:00
Ruihang Xia
76083892cd feat: support UNNEST (#5580)
* feat: support UNNEST

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix clippy and sqlness

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2025-02-21 04:53:56 +00:00
Ruihang Xia
7981c06989 feat: implement uddsketch function to calculate percentile (#5574)
* basic impl

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* more tests

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* sqlness test

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix clippy

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* update with more test and logs

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2025-02-20 18:59:20 +00:00
beryl678
97bb1519f8 docs: revise the author list (#5575) 2025-02-20 18:04:23 +00:00
Weny Xu
1d8c9c1843 feat: enable gzip for prometheus query handlers and ignore NaN values in prometheus response (#5576)
* feat: enable gzip for prometheus query handlers and ignore nan values in prometheus response

* Apply suggestions from code review

Co-authored-by: shuiyisong <113876041+shuiyisong@users.noreply.github.com>

---------

Co-authored-by: shuiyisong <113876041+shuiyisong@users.noreply.github.com>
2025-02-20 11:34:32 +00:00
jeremyhi
71007e200c feat: remap flow route address (#5565)
* feat: remap flow peers

* refactor: not stream

* feat: remap flownode addr on FlowRoute and TableFlow

* fix: unit test

* Update src/meta-srv/src/handler/remap_flow_peer_handler.rs

Co-authored-by: Lei, HUANG <6406592+v0y4g3r@users.noreply.github.com>

* chore: by comment

* Update src/meta-srv/src/handler/remap_flow_peer_handler.rs

* Update src/common/meta/src/key/flow/table_flow.rs

* Update src/common/meta/src/key/flow/flow_route.rs

* chore: remove duplicate field

---------

Co-authored-by: Lei, HUANG <6406592+v0y4g3r@users.noreply.github.com>
2025-02-20 08:21:32 +00:00
jeremyhi
a0ff9e751e feat: flow type on creating procedure (#5572)
feat: flow type on creating
2025-02-20 08:12:02 +00:00
LFC
f6f617d667 feat: submit node's cpu cores number to metasrv in heartbeat (#5571)
* feat: submit node's cpu cores number to metasrv in heartbeat

* update greptime-proto dep
2025-02-20 03:55:18 +00:00
Ruihang Xia
e8788088a8 feat(log-query): implement the first part of log query expr (#5548)
* feat(log-query): implement the first part of log query expr

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix clippy

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2025-02-19 18:25:41 +00:00
shuiyisong
53b25c04a2 chore: support Loki's structured metadata for ingestion (#5541)
* chore: support loki's structured metadata

* test: update test

* chore: revert some code change

* chore: address CR comment
2025-02-19 16:44:26 +00:00
dennis zhuang
62a8b8b9dc feat(promql): supports sort, sort_desc etc. functions (#5542)
* feat(promql): supports sort, sort_desc etc. functions

* chore: fix toml format and tests

* chore: update deps

Co-authored-by: Weny Xu <wenymedia@gmail.com>

* chore: remove fixme

* fix: cargo lock

* chore: style

---------

Co-authored-by: Weny Xu <wenymedia@gmail.com>
2025-02-19 13:13:49 +00:00
Weny Xu
c8bdeaaa6a fix(promql-planner): update ctx field columns of OR operator (#5556)
* fix(promql-planner): update ctx field columns of OR operator

* test: add sqlness test
2025-02-19 11:18:58 +00:00
Ning Sun
81da18e5df refactor: use global type alias for pipeline input (#5568)
* refactor: use global type alias for pipeline input

* fmt: reformat
2025-02-19 10:41:33 +00:00
Weny Xu
7c65fddb30 fix(promql-planner): correct AND/UNLESS operator behavior (#5557)
* fix(promql-planner): keep field column in left input for AND operator

* test: add sqlness test

* fix: fix unless operator
2025-02-19 09:07:39 +00:00
Zhenchi
421e38c481 feat: allow purging a given puffin file in staging area (#5558)
* feat: purge a given puffin file in staging area

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* polish log

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* ttl set to 2d

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* feat: expose staging_ttl to index config

* fix test

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* use `invalidate_entries_if` instead of maintaining map

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* run_pending_tasks after purging

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

---------

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>
Co-authored-by: evenyag <realevenyag@gmail.com>
2025-02-19 08:58:30 +00:00
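
The two review notes above ("use `invalidate_entries_if` instead of maintaining map" and "run_pending_tasks after purging") map directly onto the moka cache API. A hedged sketch, assuming moka 0.12 with the "sync" feature and a made-up key layout; this is not the actual stager code.

```rust
// Hedged sketch using the moka crate (assumed moka = "0.12", feature "sync");
// the "puffin-file/blob" key layout is made up, not the actual stager keys.
use moka::sync::Cache;

fn main() {
    let stager: Cache<String, u64> = Cache::builder()
        .max_capacity(10_000)
        // Required so that invalidate_entries_if can be called below.
        .support_invalidation_closures()
        .build();

    stager.insert("puffin-a/blob-1".to_string(), 100);
    stager.insert("puffin-a/blob-2".to_string(), 200);
    stager.insert("puffin-b/blob-1".to_string(), 300);

    // Purge everything staged for one puffin file with a predicate, instead of
    // maintaining a separate file -> entries map.
    stager
        .invalidate_entries_if(|key, _| key.starts_with("puffin-a/"))
        .expect("invalidation closures are enabled");

    // Invalidation is applied by housekeeping, hence run_pending_tasks after purging.
    stager.run_pending_tasks();
    println!("remaining staged entries: {}", stager.entry_count());
}
```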
Weny Xu
aada5c1706 fix(promql-planner): remove le tag in ctx (#5560)
* fix(promql-planner): remove le tag in ctx

* test: add sqlness test

* chore: apply suggestions from CR
2025-02-19 03:51:27 +00:00
yihong
aa8f119bbb chore: format all toml files (#5529)
fix: format some cargo files

Signed-off-by: yihong0618 <zouzou0208@gmail.com>
2025-02-18 12:09:01 +00:00
ZonaHe
19a6d15849 feat: update dashboard to v0.7.10 (#5562)
Co-authored-by: ZonaHex <ZonaHex@users.noreply.github.com>
2025-02-18 12:06:22 +00:00
liyang
073aaefe65 chore: improve grafana dashboard (#5559) 2025-02-18 11:36:27 +00:00
Yingwen
77223a0f3e fix: window sort support alias time index (#5543)
* fix: use alias expr to check commutativity

* chore: debug sort

* feat: consider alias in window sort optimizer

* test: sqlness test

* test: update sqlness result
2025-02-18 10:35:43 +00:00
Ruihang Xia
4ef038d098 fix: correct promql behavior on nonexistent columns (#5547)
* Revert "fix(promql): ignore filters for non-existent labels (#5519)"

This reverts commit 33a2485f54.

* reimplement

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* state safety

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2025-02-17 18:43:50 +00:00
jeremyhi
deb9520970 fix: information_schema.cluster_info be covered by the same id (#5555)
* fix: information_schema.cluster_info be covered by the same id

* chore: by comment
2025-02-17 11:51:02 +00:00
Yingwen
6bba5e0afa feat: collect stager metrics (#5553)
* feat: collect stager metrics

* Apply suggestions from code review

Co-authored-by: Zhenchi <zhongzc_arch@outlook.com>

* Update src/mito2/src/metrics.rs

---------

Co-authored-by: Weny Xu <wenymedia@gmail.com>
Co-authored-by: Zhenchi <zhongzc_arch@outlook.com>
2025-02-17 07:09:15 +00:00
Ruihang Xia
f359eeb667 feat(log-query): support specifying exclusive/inclusive for between filter (#5546)
Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2025-02-17 04:40:47 +00:00
liyang
009dbad581 ci: don't push nightly latest image (#5551)
* ci: don't push nightly latest image

* add push release latest image
2025-02-17 04:34:49 +00:00
liyang
a2047b096c ci: use s5cmd upload artifacts (#5550) 2025-02-17 02:57:13 +00:00
Ruihang Xia
6e8b1ba004 feat: drop noneffective regex filter (#5544)
Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2025-02-15 04:20:26 +00:00
Ruihang Xia
7fc935c61c feat!: support alter skipping index (#5538)
* feat: support alter skipping index

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* update test results

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* cargo fmt

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* update sqlness result

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* finalize

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2025-02-14 18:43:21 +00:00
discord9
1e6d2fb1fa feat: add snapshot seqs field to query context (#5477)
* TODO: snapshot read

* feat: RegionEngine get last seq

* feat: query context snapshot

* chore: use new proto

* feat: get_region_seqs in region engine

* chore: typo

* chore: toml

* feat: make snapshots modifiable

* feat: add hint for snapshot read

* chore: some typo

* refactor: remove hint as not used

* fix: use committed seqs

* refactor: remove sequences variant on RegionRequest

* refactor: per review

* chore: rebase solve conflict

* refactor: rm unused key

* chore: per review

* chore: per review
2025-02-14 09:07:48 +00:00
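
The idea of carrying snapshot sequences in the query context can be illustrated with a toy structure. This is a hypothetical shape, not GreptimeDB's actual QueryContext; region ids and sequence numbers are simplified to u64.

```rust
// Hypothetical illustration only; GreptimeDB's real QueryContext has a different
// shape, and region ids / sequence numbers are simplified to u64 here.
use std::collections::HashMap;

type RegionId = u64;
type SequenceNumber = u64;

#[derive(Default)]
struct QueryContext {
    /// Region id -> committed sequence to read at; absent means "read latest".
    snapshot_seqs: HashMap<RegionId, SequenceNumber>,
}

impl QueryContext {
    fn set_snapshot(&mut self, region: RegionId, seq: SequenceNumber) {
        self.snapshot_seqs.insert(region, seq);
    }

    fn snapshot_of(&self, region: RegionId) -> Option<SequenceNumber> {
        self.snapshot_seqs.get(&region).copied()
    }
}

fn main() {
    let mut ctx = QueryContext::default();
    // The region engine reports its last committed sequence, which the query
    // layer pins here so later scans see a consistent snapshot.
    ctx.set_snapshot(42, 1000);
    assert_eq!(ctx.snapshot_of(42), Some(1000));
    assert_eq!(ctx.snapshot_of(7), None); // falls back to reading the latest data
}
```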
Ruihang Xia
0d19e8f089 fix: promql join operation won't consider time index (#5535)
Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
Co-authored-by: Weny Xu <wenymedia@gmail.com>
2025-02-14 08:21:05 +00:00
Weny Xu
c56106b883 perf: optimize table alteration speed in metric engine (#5526)
* feat(metric-engine): introduce batch alter request handling

* refactor: minor refactor

* refactor: push down filter to mito

* chore: apply suggestions from CR
2025-02-14 08:11:48 +00:00
Yohan Wal
edb040dea3 refactor: refactor pg kvbackend impl in preparation for other rds kvbackend (#5494)
* refactor: unify rds kvbackend impl

* fix: licence header

* refactor: use unique sql template set

* fix: fix deps

* chore: apply optimization patch

* chore: apply optimization patch(2)

* chore: follow review comments
2025-02-14 08:10:09 +00:00
Ruihang Xia
7bbc87b3c0 feat(promql): add series count metrics (#5534)
Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
Co-authored-by: Weny Xu <wenymedia@gmail.com>
2025-02-14 07:49:28 +00:00
Zhenchi
858dae7b23 feat: add stager notifier to collect metrics (#5530)
* feat: add stager notifier to collect metrics

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* apply prev commit

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* remove dup size

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* add load cost

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

---------

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>
2025-02-14 07:49:26 +00:00
Weny Xu
33a2485f54 fix(promql): ignore filters for non-existent labels (#5519)
* fix(promql): ignore filters for non-existent labels

* chore: add comments

* test: add sqlness test
2025-02-14 06:40:15 +00:00
zyy17
8ebf454bc1 fix(jaeger): return error when no tracing table (#5539)
fix: return error when no tracing table
2025-02-14 06:20:56 +00:00
Ning Sun
f5b9ade6df chore: add section marker for external dependencies (#5536)
* chore: add section marker for external dependencies

* chore: update cargo.lock

* Update Cargo.toml

Co-authored-by: shuiyisong <113876041+shuiyisong@users.noreply.github.com>

* chore: update meter-core

---------

Co-authored-by: shuiyisong <113876041+shuiyisong@users.noreply.github.com>
2025-02-14 06:16:57 +00:00
Ruihang Xia
9c1834accd fix: old typo (#5532)
Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2025-02-14 02:18:43 +00:00
Yingwen
918517d221 feat: window sort supports where on fields and time index (#5527)
* feat: handle filter for window sort

* test: sqlness filter test for window sort

* test: add test on tag column filter

* test: test for filter on ts

* test: update sqlness test
2025-02-14 01:38:15 +00:00
liyang
92d9e81a9f ci: use the repository variable to pass to image-name (#5517)
Co-authored-by: Yingwen <realevenyag@gmail.com>
2025-02-13 18:14:49 +00:00
yihong
224b1d15cd chore: use the same version of chrono-tz (#5523)
* fix: use the same version of chrono-tz

Signed-off-by: yihong0618 <zouzou0208@gmail.com>

* fix: address comments

Signed-off-by: yihong0618 <zouzou0208@gmail.com>

---------

Signed-off-by: yihong0618 <zouzou0208@gmail.com>
2025-02-13 17:23:29 +00:00
Yingwen
b4d5393080 feat: speed up read/write cache and stager eviction (#5531)
* feat: change cache policy for file cache

* feat: file cache run pending task after put

* feat: run pending task in put_dir

* feat: run pending task after stager recovered

* feat: purge recycle bin periodically

* feat: use lru policy for read cache
2025-02-13 17:13:24 +00:00
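
Two of the bullets above (running pending tasks after a put and switching the read cache to an LRU policy) correspond to knobs on the moka cache builder. A hedged sketch assuming moka 0.12+ with the "sync" feature; capacity and key names are illustrative.

```rust
// Hedged sketch assuming moka 0.12+ with the "sync" feature; capacity and keys
// are illustrative, and the real read cache weighs entries by size.
use moka::policy::EvictionPolicy;
use moka::sync::Cache;

fn main() {
    // "use lru policy for read cache": switch from the default TinyLFU to LRU.
    let read_cache: Cache<String, Vec<u8>> = Cache::builder()
        .max_capacity(10_000)
        .eviction_policy(EvictionPolicy::lru())
        .build();

    read_cache.insert("region/1/file.parquet".to_string(), vec![0u8; 16]);

    // "file cache run pending task after put": flush housekeeping right away so
    // eviction bookkeeping does not wait for a later read.
    read_cache.run_pending_tasks();
    println!("cached entries: {}", read_cache.entry_count());
}
```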
Weny Xu
73c29bb482 fix(promql): unescape matcher values (#5521)
* fix(promql): unescape matcher values

* test: add sqlness tests

* chore: apply suggestions from CR

* feat: use unescaper
2025-02-13 09:42:25 +00:00
Ning Sun
198ee87675 feat: alias database matcher for promql (#5522)
* feat: provide an alias db matcher for promql

* refactor: rename __db__ to __database__

* chore: fix sqlness test
2025-02-13 08:37:37 +00:00
jeremyhi
02af9dd21a refactor!: remove datetime type (#5506)
* feat remove datetime type

* chore: fix unit test

* chore: add column test

* refactor: move create and alter validation to one place

* chore: minor refactor ut

* refactor: rename expr_factory to expr_helper

* chore: remove unnecessary args
2025-02-13 08:01:16 +00:00
Weny Xu
bb97f1bf16 perf: optimize table creation speed in metric engine (#5503)
* feat(metric-engine): introduce batch create request handling

* chore: remove unused code

* test: add more tests

* chore: remove unused error

* chore: apply suggestions from CR
2025-02-13 07:39:04 +00:00
yihong
fbd5316fdb perf: better performance for LastNonNullIter close #5229 about 10x times faster (#5518)
* fix: better performance for LastNonNullIter, close #5229

Signed-off-by: yihong0618 <zouzou0208@gmail.com>

* fix: address comments

Signed-off-by: yihong0618 <zouzou0208@gmail.com>

* fix: add Safety comments for the unwrap

Signed-off-by: yihong0618 <zouzou0208@gmail.com>

---------

Signed-off-by: yihong0618 <zouzou0208@gmail.com>
2025-02-13 05:14:39 +00:00
Weny Xu
63d5a69a31 fix(query_range): skip data field on errors (#5520)
* fix: skip serializing PrometheusResponse when None

* fix: fix unit test

* chore: clippy
2025-02-13 04:32:24 +00:00
zyy17
954310f917 feat: implement Jaeger query APIs (#5452)
* feat: implement jaeger query api

* test: add some unit tests

* test: add integration tests for jaeger query APIs

* refactor: parse tags from url parameters

* refactor: support to query traces by tags

* refactor: add limit parameter

* refactor: add jaeger query api metrics

* chore: add some comment docs and default limit value

* test: add more unit tests

* docs: add jaeger options in config docs

* refactor: code review

* wip

* refactor: use datafusion's dataframe APIs to query traces

* refactor: code review

* chore: format test cases

* refactor: add check_schema()

* chore: fix clippy errors and rename function name

* refactor: throw error when convert start_time and duration error

* chore: modify incorrect request type name

* chore: remove unnecessary serde rename

* refactor: add some important comments

* refactor: add SPAN_KIND_PREFIX

* refactor: code review
2025-02-12 23:36:38 +00:00
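
One review note above is "use datafusion's dataframe APIs to query traces". The sketch below shows that DataFrame style against an in-memory stand-in table; the schema, column names, and filter are invented, and the real Jaeger handler queries the configured tracing table instead.

```rust
// Hedged sketch of DataFrame-style trace querying with DataFusion; the schema,
// column names, and filter below are invented stand-ins for the tracing table.
use std::sync::Arc;

use datafusion::arrow::array::StringArray;
use datafusion::arrow::datatypes::{DataType, Field, Schema};
use datafusion::arrow::record_batch::RecordBatch;
use datafusion::datasource::MemTable;
use datafusion::prelude::*;

#[tokio::main]
async fn main() -> datafusion::error::Result<()> {
    let schema = Arc::new(Schema::new(vec![
        Field::new("service_name", DataType::Utf8, false),
        Field::new("operation_name", DataType::Utf8, false),
    ]));
    let batch = RecordBatch::try_new(
        schema.clone(),
        vec![
            Arc::new(StringArray::from(vec!["frontend", "datanode"])),
            Arc::new(StringArray::from(vec!["GET /v1/sql", "region scan"])),
        ],
    )?;

    let ctx = SessionContext::new();
    ctx.register_table("traces", Arc::new(MemTable::try_new(schema, vec![vec![batch]])?))?;

    // Filter and limit via the DataFrame API instead of hand-written SQL.
    let df = ctx
        .table("traces")
        .await?
        .filter(col("service_name").eq(lit("frontend")))?
        .limit(0, Some(100))?;
    df.show().await?;
    Ok(())
}
```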
zyy17
58c6274bf6 fix: use fixed tonistiigi/binfmt:qemu-v7.0.0-28 image version instead of latest version to avoid segmentation fault (#5516)
fix: use fixed tonistiigi/binfmt:qemu-v7.0.0-28 image version instead of latest version to avoid segmentation fault

Co-authored-by: Yingwen <realevenyag@gmail.com>
2025-02-12 19:29:49 +00:00
Ning Sun
46947fd1de ci: docbot requires pull_request_target (#5514) 2025-02-12 09:46:04 +00:00
Weny Xu
44fffdec8b refactor: refactor region server request handling (#5504)
* refactor: refactor region server requests handling

* chore: apply suggestions from CR
2025-02-12 08:34:42 +00:00
Ruihang Xia
8026b1d72c feat!: unify all index creation grammars (#5486)
* column options

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* handle table constrain

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* update test assertions

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* change inverted index table constrain usage

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* update sqlness result

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* don't create inverted index for pk on alter table

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* remove remaining pk-as-inverted-index

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* more inverted index magic

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* update sqlness result again

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix clippy

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* Update src/sql/src/statements.rs

Co-authored-by: jeremyhi <jiachun_feng@proton.me>

* drop support for index def in table constrain

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
Co-authored-by: jeremyhi <jiachun_feng@proton.me>
2025-02-12 06:54:09 +00:00
Ruihang Xia
e22aa819be feat: support server-side keep-alive for mysql and pg protocols (#5496)
* feat: support server-side keep-alive for mysql and pg protocols

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* update config.md

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* update config to use humantime for keep-alive configuration

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* chore: Update socket2 dependency

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2025-02-11 19:22:10 +00:00
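
Server-side keep-alive as described here is typically applied through the socket2 crate mentioned in the last commit of the list above. A minimal sketch; the 30-second value is a placeholder for the humantime-formatted keep-alive options, and accepted connections would be configured the same way in a real server loop.

```rust
// Minimal sketch with the socket2 crate; the 30-second value is a placeholder
// for the humantime-formatted keep-alive config options mentioned above.
use std::net::TcpListener;
use std::time::Duration;

use socket2::{SockRef, TcpKeepalive};

fn main() -> std::io::Result<()> {
    let listener = TcpListener::bind("127.0.0.1:0")?;

    // Enable TCP keep-alive probes on the server socket; accepted connections
    // would be configured the same way in a real MySQL/Postgres server loop.
    let keepalive = TcpKeepalive::new().with_time(Duration::from_secs(30));
    SockRef::from(&listener).set_tcp_keepalive(&keepalive)?;

    println!("listening on {}", listener.local_addr()?);
    Ok(())
}
```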
localhost
beb9c0a797 chore: set now as timestamp field default value (#5502)
* chore: set now as timestamp field default value

* chore: import pipeline default value
2025-02-11 17:41:44 +00:00
ZonaHe
5f6f5e980a feat: update dashboard to v0.7.10-rc (#5512)
Co-authored-by: ZonaHex <ZonaHex@users.noreply.github.com>
2025-02-11 11:00:10 +00:00
LFC
ccfa40dc41 ci: run nightly jobs only on greptimedb repo (#5505)
ci: skip nightly ci jobs (#9)

(cherry picked from commit 345b4c30474f47a0477263bfba9894d7b4acda2d)
(cherry picked from commit dcd779cd668802fb1ea12fefb4dc3f83f34e30a2)
2025-02-11 10:57:43 +00:00
Zhenchi
336b941113 feat: change puffin stager eviction policy (#5511)
Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>
2025-02-11 08:16:27 +00:00
yihong
de3f817596 fix: drop useless clone and for loop second (#5507)
Signed-off-by: yihong0618 <zouzou0208@gmail.com>
2025-02-11 06:23:49 +00:00
ZonaHe
d094f48822 feat: update dashboard to v0.7.9 (#5508)
Co-authored-by: ZonaHex <ZonaHex@users.noreply.github.com>
2025-02-11 06:19:58 +00:00
yihong
342883e922 ci: safe ci using zizmor check (#5491)
* ci: safe ci using zizmor check

Signed-off-by: yihong0618 <zouzou0208@gmail.com>

* fix: lines empty

Signed-off-by: yihong0618 <zouzou0208@gmail.com>

* fix: delete useless code

Signed-off-by: yihong0618 <zouzou0208@gmail.com>

---------

Signed-off-by: yihong0618 <zouzou0208@gmail.com>
2025-02-11 02:38:14 +00:00
Zhenchi
5be81abba3 feat: add metadata method to puffin reader (#5501)
Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>
2025-02-10 09:14:54 +00:00
Zhenchi
c19ecd7ea2 refactor: change traversal order during index construction (#5498)
* refactor: change traversal order during index construction

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* chain

Co-authored-by: jeremyhi <jiachun_feng@proton.me>

---------

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>
Co-authored-by: jeremyhi <jiachun_feng@proton.me>
2025-02-10 06:31:35 +00:00
Ning Sun
15f4b10065 chore: revert "docs: add TM to logos" (#5495)
* Revert "docs: add TM to logos (#4789)"

This reverts commit caf5f2c7a5.

* chore: transparent
2025-02-10 04:00:59 +00:00
yihong
c100a2d1a6 fix: refactor pgkv using prepare_cache about 10% better (#5497)
fix: refactor pgkv using prepare_cache about 15% better

Signed-off-by: yihong0618 <zouzou0208@gmail.com>
2025-02-10 03:59:18 +00:00
yihong
ccb1978c98 fix: close issue #5466 by do not shortcut the drop command (#5467)
fix: close issue #5466 by do not shortcut by back it to READY when fail

Signed-off-by: yihong0618 <zouzou0208@gmail.com>
2025-02-10 03:28:34 +00:00
357 changed files with 19274 additions and 7018 deletions

View File

@@ -34,8 +34,8 @@ inputs:
required: true
push-latest-tag:
description: Whether to push the latest tag
required: false
default: 'true'
required: true
default: 'false'
runs:
using: composite
steps:
@@ -47,7 +47,11 @@ runs:
password: ${{ inputs.image-registry-password }}
- name: Set up qemu for multi-platform builds
uses: docker/setup-qemu-action@v2
uses: docker/setup-qemu-action@v3
with:
platforms: linux/amd64,linux/arm64
# The latest version will lead to segmentation fault.
image: tonistiigi/binfmt:qemu-v7.0.0-28
- name: Set up buildx
uses: docker/setup-buildx-action@v2

View File

@@ -22,8 +22,8 @@ inputs:
required: true
push-latest-tag:
description: Whether to push the latest tag
required: false
default: 'true'
required: true
default: 'false'
dev-mode:
description: Enable dev mode, only build standard greptime
required: false

View File

@@ -51,8 +51,8 @@ inputs:
required: true
upload-to-s3:
description: Upload to S3
required: false
default: 'true'
required: true
default: 'false'
artifacts-dir:
description: Directory to store artifacts
required: false
@@ -77,13 +77,21 @@ runs:
with:
path: ${{ inputs.artifacts-dir }}
- name: Install s5cmd
shell: bash
run: |
wget https://github.com/peak/s5cmd/releases/download/v2.3.0/s5cmd_2.3.0_Linux-64bit.tar.gz
tar -xzf s5cmd_2.3.0_Linux-64bit.tar.gz
sudo mv s5cmd /usr/local/bin/
sudo chmod +x /usr/local/bin/s5cmd
- name: Release artifacts to cn region
uses: nick-invision/retry@v2
if: ${{ inputs.upload-to-s3 == 'true' }}
env:
AWS_ACCESS_KEY_ID: ${{ inputs.aws-cn-access-key-id }}
AWS_SECRET_ACCESS_KEY: ${{ inputs.aws-cn-secret-access-key }}
AWS_DEFAULT_REGION: ${{ inputs.aws-cn-region }}
AWS_REGION: ${{ inputs.aws-cn-region }}
UPDATE_VERSION_INFO: ${{ inputs.update-version-info }}
with:
max_attempts: ${{ inputs.upload-max-retry-times }}

View File

@@ -33,7 +33,7 @@ function upload_artifacts() {
# ├── greptime-darwin-amd64-v0.2.0.sha256sum
# └── greptime-darwin-amd64-v0.2.0.tar.gz
find "$ARTIFACTS_DIR" -type f \( -name "*.tar.gz" -o -name "*.sha256sum" \) | while IFS= read -r file; do
aws s3 cp \
s5cmd cp \
"$file" "s3://$AWS_S3_BUCKET/$RELEASE_DIRS/$VERSION/$(basename "$file")"
done
}
@@ -45,7 +45,7 @@ function update_version_info() {
if [[ "$VERSION" =~ ^v[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
echo "Updating latest-version.txt"
echo "$VERSION" > latest-version.txt
aws s3 cp \
s5cmd cp \
latest-version.txt "s3://$AWS_S3_BUCKET/$RELEASE_DIRS/latest-version.txt"
fi
@@ -53,7 +53,7 @@ function update_version_info() {
if [[ "$VERSION" == *"nightly"* ]]; then
echo "Updating latest-nightly-version.txt"
echo "$VERSION" > latest-nightly-version.txt
aws s3 cp \
s5cmd cp \
latest-nightly-version.txt "s3://$AWS_S3_BUCKET/$RELEASE_DIRS/latest-nightly-version.txt"
fi
fi

View File

@@ -17,6 +17,8 @@ jobs:
runs-on: ubuntu-20.04
steps:
- uses: actions/checkout@v4
with:
persist-credentials: false
- uses: arduino/setup-protoc@v3
with:
repo-token: ${{ secrets.GITHUB_TOKEN }}

View File

@@ -12,6 +12,8 @@ jobs:
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
persist-credentials: false
- name: Set up Rust
uses: actions-rust-lang/setup-rust-toolchain@v1

View File

@@ -76,15 +76,9 @@ env:
NIGHTLY_RELEASE_PREFIX: nightly
# Use the different image name to avoid conflict with the release images.
IMAGE_NAME: greptimedb-dev
# The source code will check out in the following path: '${WORKING_DIR}/dev/greptime'.
CHECKOUT_GREPTIMEDB_PATH: dev/greptimedb
permissions:
issues: write
jobs:
allocate-runners:
name: Allocate runners
@@ -107,6 +101,7 @@ jobs:
uses: actions/checkout@v4
with:
fetch-depth: 0
persist-credentials: false
- name: Create version
id: create-version
@@ -161,6 +156,7 @@ jobs:
uses: actions/checkout@v4
with:
fetch-depth: 0
persist-credentials: false
- name: Checkout greptimedb
uses: actions/checkout@v4
@@ -168,6 +164,7 @@ jobs:
repository: ${{ inputs.repository }}
ref: ${{ inputs.commit }}
path: ${{ env.CHECKOUT_GREPTIMEDB_PATH }}
persist-credentials: true
- uses: ./.github/actions/build-linux-artifacts
with:
@@ -192,6 +189,7 @@ jobs:
uses: actions/checkout@v4
with:
fetch-depth: 0
persist-credentials: false
- name: Checkout greptimedb
uses: actions/checkout@v4
@@ -199,6 +197,7 @@ jobs:
repository: ${{ inputs.repository }}
ref: ${{ inputs.commit }}
path: ${{ env.CHECKOUT_GREPTIMEDB_PATH }}
persist-credentials: true
- uses: ./.github/actions/build-linux-artifacts
with:
@@ -226,13 +225,14 @@ jobs:
- uses: actions/checkout@v4
with:
fetch-depth: 0
persist-credentials: false
- name: Build and push images to dockerhub
uses: ./.github/actions/build-images
with:
image-registry: docker.io
image-namespace: ${{ vars.IMAGE_NAMESPACE }}
image-name: ${{ env.IMAGE_NAME }}
image-name: ${{ vars.DEV_BUILD_IMAGE_NAME }}
image-registry-username: ${{ secrets.DOCKERHUB_USERNAME }}
image-registry-password: ${{ secrets.DOCKERHUB_TOKEN }}
version: ${{ needs.allocate-runners.outputs.version }}
@@ -257,13 +257,14 @@ jobs:
- uses: actions/checkout@v4
with:
fetch-depth: 0
persist-credentials: false
- name: Release artifacts to CN region
uses: ./.github/actions/release-cn-artifacts
with:
src-image-registry: docker.io
src-image-namespace: ${{ vars.IMAGE_NAMESPACE }}
src-image-name: ${{ env.IMAGE_NAME }}
src-image-name: ${{ vars.DEV_BUILD_IMAGE_NAME }}
dst-image-registry-username: ${{ secrets.ALICLOUD_USERNAME }}
dst-image-registry-password: ${{ secrets.ALICLOUD_PASSWORD }}
dst-image-registry: ${{ vars.ACR_IMAGE_REGISTRY }}
@@ -273,6 +274,7 @@ jobs:
aws-cn-access-key-id: ${{ secrets.AWS_CN_ACCESS_KEY_ID }}
aws-cn-secret-access-key: ${{ secrets.AWS_CN_SECRET_ACCESS_KEY }}
aws-cn-region: ${{ vars.AWS_RELEASE_BUCKET_REGION }}
upload-to-s3: false
dev-mode: true # Only build the standard images(exclude centos images).
push-latest-tag: false # Don't push the latest tag to registry.
update-version-info: false # Don't update the version info in S3.
@@ -291,6 +293,7 @@ jobs:
uses: actions/checkout@v4
with:
fetch-depth: 0
persist-credentials: false
- name: Stop EC2 runner
uses: ./.github/actions/stop-runner
@@ -316,6 +319,7 @@ jobs:
uses: actions/checkout@v4
with:
fetch-depth: 0
persist-credentials: false
- name: Stop EC2 runner
uses: ./.github/actions/stop-runner
@@ -334,10 +338,16 @@ jobs:
release-images-to-dockerhub
]
runs-on: ubuntu-20.04
permissions:
issues: write
env:
SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL_DEVELOP_CHANNEL }}
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
persist-credentials: false
- uses: ./.github/actions/setup-cyborg
- name: Report CI status
id: report-ci-status

View File

@@ -26,6 +26,8 @@ jobs:
runs-on: ubuntu-20.04
steps:
- uses: actions/checkout@v4
with:
persist-credentials: false
- uses: crate-ci/typos@master
- name: Check the config docs
run: |
@@ -38,6 +40,8 @@ jobs:
name: Check License Header
steps:
- uses: actions/checkout@v4
with:
persist-credentials: false
- uses: korandoru/hawkeye@v5
check:
@@ -49,6 +53,8 @@ jobs:
timeout-minutes: 60
steps:
- uses: actions/checkout@v4
with:
persist-credentials: false
- uses: arduino/setup-protoc@v3
with:
repo-token: ${{ secrets.GITHUB_TOKEN }}
@@ -70,6 +76,8 @@ jobs:
timeout-minutes: 60
steps:
- uses: actions/checkout@v4
with:
persist-credentials: false
- uses: actions-rust-lang/setup-rust-toolchain@v1
- name: Install taplo
run: cargo +stable install taplo-cli --version ^0.9 --locked --force
@@ -85,6 +93,8 @@ jobs:
timeout-minutes: 60
steps:
- uses: actions/checkout@v4
with:
persist-credentials: false
- uses: arduino/setup-protoc@v3
with:
repo-token: ${{ secrets.GITHUB_TOKEN }}
@@ -139,6 +149,8 @@ jobs:
echo "Disk space after:"
df -h
- uses: actions/checkout@v4
with:
persist-credentials: false
- uses: arduino/setup-protoc@v3
with:
repo-token: ${{ secrets.GITHUB_TOKEN }}
@@ -192,6 +204,8 @@ jobs:
echo "Disk space after:"
df -h
- uses: actions/checkout@v4
with:
persist-credentials: false
- uses: arduino/setup-protoc@v3
with:
repo-token: ${{ secrets.GITHUB_TOKEN }}
@@ -238,6 +252,8 @@ jobs:
timeout-minutes: 60
steps:
- uses: actions/checkout@v4
with:
persist-credentials: false
- uses: arduino/setup-protoc@v3
with:
repo-token: ${{ secrets.GITHUB_TOKEN }}
@@ -295,6 +311,8 @@ jobs:
echo "Disk space after:"
df -h
- uses: actions/checkout@v4
with:
persist-credentials: false
- name: Setup Kind
uses: ./.github/actions/setup-kind
- if: matrix.mode.minio
@@ -437,6 +455,8 @@ jobs:
echo "Disk space after:"
df -h
- uses: actions/checkout@v4
with:
persist-credentials: false
- name: Setup Kind
uses: ./.github/actions/setup-kind
- name: Setup Chaos Mesh
@@ -562,6 +582,8 @@ jobs:
timeout-minutes: 60
steps:
- uses: actions/checkout@v4
with:
persist-credentials: false
- if: matrix.mode.kafka
name: Setup kafka server
working-directory: tests-integration/fixtures
@@ -589,6 +611,8 @@ jobs:
timeout-minutes: 60
steps:
- uses: actions/checkout@v4
with:
persist-credentials: false
- uses: arduino/setup-protoc@v3
with:
repo-token: ${{ secrets.GITHUB_TOKEN }}
@@ -604,6 +628,8 @@ jobs:
timeout-minutes: 60
steps:
- uses: actions/checkout@v4
with:
persist-credentials: false
- uses: arduino/setup-protoc@v3
with:
repo-token: ${{ secrets.GITHUB_TOKEN }}
@@ -626,6 +652,8 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
persist-credentials: false
- name: Merge Conflict Finder
uses: olivernybroe/action-conflict-finder@v4.0
@@ -636,6 +664,8 @@ jobs:
needs: [conflict-check, clippy, fmt]
steps:
- uses: actions/checkout@v4
with:
persist-credentials: false
- uses: arduino/setup-protoc@v3
with:
repo-token: ${{ secrets.GITHUB_TOKEN }}
@@ -684,6 +714,8 @@ jobs:
timeout-minutes: 60
steps:
- uses: actions/checkout@v4
with:
persist-credentials: false
- uses: arduino/setup-protoc@v3
with:
repo-token: ${{ secrets.GITHUB_TOKEN }}

View File

@@ -3,16 +3,21 @@ on:
pull_request_target:
types: [opened, edited]
permissions:
pull-requests: write
contents: read
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
jobs:
docbot:
runs-on: ubuntu-20.04
permissions:
pull-requests: write
contents: read
timeout-minutes: 10
steps:
- uses: actions/checkout@v4
with:
persist-credentials: false
- uses: ./.github/actions/setup-cyborg
- name: Maybe Follow Up Docs Issue
working-directory: cyborg

View File

@@ -34,6 +34,8 @@ jobs:
runs-on: ubuntu-20.04
steps:
- uses: actions/checkout@v4
with:
persist-credentials: false
- uses: crate-ci/typos@master
license-header-check:
@@ -41,6 +43,8 @@ jobs:
name: Check License Header
steps:
- uses: actions/checkout@v4
with:
persist-credentials: false
- uses: korandoru/hawkeye@v5
check:

View File

@@ -66,13 +66,6 @@ env:
NIGHTLY_RELEASE_PREFIX: nightly
# Use the different image name to avoid conflict with the release images.
# The DockerHub image will be greptime/greptimedb-nightly.
IMAGE_NAME: greptimedb-nightly
permissions:
issues: write
jobs:
allocate-runners:
name: Allocate runners
@@ -95,6 +88,7 @@ jobs:
uses: actions/checkout@v4
with:
fetch-depth: 0
persist-credentials: false
- name: Create version
id: create-version
@@ -147,6 +141,7 @@ jobs:
- uses: actions/checkout@v4
with:
fetch-depth: 0
persist-credentials: false
- uses: ./.github/actions/build-linux-artifacts
with:
@@ -168,6 +163,7 @@ jobs:
- uses: actions/checkout@v4
with:
fetch-depth: 0
persist-credentials: false
- uses: ./.github/actions/build-linux-artifacts
with:
@@ -193,17 +189,18 @@ jobs:
- uses: actions/checkout@v4
with:
fetch-depth: 0
persist-credentials: false
- name: Build and push images to dockerhub
uses: ./.github/actions/build-images
with:
image-registry: docker.io
image-namespace: ${{ vars.IMAGE_NAMESPACE }}
image-name: ${{ env.IMAGE_NAME }}
image-name: ${{ vars.NIGHTLY_BUILD_IMAGE_NAME }}
image-registry-username: ${{ secrets.DOCKERHUB_USERNAME }}
image-registry-password: ${{ secrets.DOCKERHUB_TOKEN }}
version: ${{ needs.allocate-runners.outputs.version }}
push-latest-tag: true
push-latest-tag: false
- name: Set nightly build result
id: set-nightly-build-result
@@ -226,13 +223,14 @@ jobs:
- uses: actions/checkout@v4
with:
fetch-depth: 0
persist-credentials: false
- name: Release artifacts to CN region
uses: ./.github/actions/release-cn-artifacts
with:
src-image-registry: docker.io
src-image-namespace: ${{ vars.IMAGE_NAMESPACE }}
src-image-name: ${{ env.IMAGE_NAME }}
src-image-name: ${{ vars.NIGHTLY_BUILD_IMAGE_NAME }}
dst-image-registry-username: ${{ secrets.ALICLOUD_USERNAME }}
dst-image-registry-password: ${{ secrets.ALICLOUD_PASSWORD }}
dst-image-registry: ${{ vars.ACR_IMAGE_REGISTRY }}
@@ -242,9 +240,10 @@ jobs:
aws-cn-access-key-id: ${{ secrets.AWS_CN_ACCESS_KEY_ID }}
aws-cn-secret-access-key: ${{ secrets.AWS_CN_SECRET_ACCESS_KEY }}
aws-cn-region: ${{ vars.AWS_RELEASE_BUCKET_REGION }}
upload-to-s3: false
dev-mode: false
update-version-info: false # Don't update version info in S3.
push-latest-tag: true
push-latest-tag: false
stop-linux-amd64-runner: # It's always run as the last job in the workflow to make sure that the runner is released.
name: Stop linux-amd64 runner
@@ -260,6 +259,7 @@ jobs:
uses: actions/checkout@v4
with:
fetch-depth: 0
persist-credentials: false
- name: Stop EC2 runner
uses: ./.github/actions/stop-runner
@@ -285,6 +285,7 @@ jobs:
uses: actions/checkout@v4
with:
fetch-depth: 0
persist-credentials: false
- name: Stop EC2 runner
uses: ./.github/actions/stop-runner
@@ -303,10 +304,14 @@ jobs:
release-images-to-dockerhub
]
runs-on: ubuntu-20.04
permissions:
issues: write
env:
SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL_DEVELOP_CHANNEL }}
steps:
- uses: actions/checkout@v4
with:
persist-credentials: false
- uses: ./.github/actions/setup-cyborg
- name: Report CI status
id: report-ci-status

View File

@@ -9,9 +9,6 @@ concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
permissions:
issues: write
jobs:
sqlness-test:
name: Run sqlness test
@@ -22,6 +19,7 @@ jobs:
uses: actions/checkout@v4
with:
fetch-depth: 0
persist-credentials: false
- name: Check install.sh
run: ./.github/scripts/check-install-script.sh
@@ -46,9 +44,14 @@ jobs:
name: Sqlness tests on Windows
if: ${{ github.repository == 'GreptimeTeam/greptimedb' }}
runs-on: windows-2022-8-cores
permissions:
issues: write
timeout-minutes: 60
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
persist-credentials: false
- uses: ./.github/actions/setup-cyborg
- uses: arduino/setup-protoc@v3
with:
@@ -76,6 +79,9 @@ jobs:
steps:
- run: git config --global core.autocrlf false
- uses: actions/checkout@v4
with:
fetch-depth: 0
persist-credentials: false
- uses: ./.github/actions/setup-cyborg
- uses: arduino/setup-protoc@v3
with:
@@ -111,9 +117,13 @@ jobs:
cleanbuild-linux-nix:
name: Run clean build on Linux
runs-on: ubuntu-latest
if: ${{ github.repository == 'GreptimeTeam/greptimedb' }}
timeout-minutes: 60
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
persist-credentials: false
- uses: cachix/install-nix-action@v27
with:
nix_path: nixpkgs=channel:nixos-24.11
@@ -141,6 +151,9 @@ jobs:
SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL_DEVELOP_CHANNEL }}
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
persist-credentials: false
- uses: ./.github/actions/setup-cyborg
- name: Report CI status
id: report-ci-status

View File

@@ -37,6 +37,7 @@ jobs:
uses: actions/checkout@v4
with:
fetch-depth: 0
persist-credentials: false
- name: Configure build image version
id: set-version
@@ -85,48 +86,66 @@ jobs:
- name: Push dev-builder-ubuntu image
shell: bash
if: ${{ inputs.release_dev_builder_ubuntu_image }}
env:
IMAGE_VERSION: ${{ needs.release-dev-builder-images.outputs.version }}
IMAGE_NAMESPACE: ${{ vars.IMAGE_NAMESPACE }}
ECR_IMAGE_REGISTRY: ${{ vars.ECR_IMAGE_REGISTRY }}
ECR_IMAGE_NAMESPACE: ${{ vars.ECR_IMAGE_NAMESPACE }}
run: |
docker run -v "${DOCKER_CONFIG:-$HOME/.docker}:/root/.docker:ro" \
-e "REGISTRY_AUTH_FILE=/root/.docker/config.json" \
quay.io/skopeo/stable:latest \
copy -a docker://docker.io/${{ vars.IMAGE_NAMESPACE }}/dev-builder-ubuntu:${{ needs.release-dev-builder-images.outputs.version }} \
docker://${{ vars.ECR_IMAGE_REGISTRY }}/${{ vars.ECR_IMAGE_NAMESPACE }}/dev-builder-ubuntu:${{ needs.release-dev-builder-images.outputs.version }}
copy -a docker://docker.io/$IMAGE_NAMESPACE/dev-builder-ubuntu:$IMAGE_VERSION \
docker://$ECR_IMAGE_REGISTRY/$ECR_IMAGE_NAMESPACE/dev-builder-ubuntu:$IMAGE_VERSION
docker run -v "${DOCKER_CONFIG:-$HOME/.docker}:/root/.docker:ro" \
-e "REGISTRY_AUTH_FILE=/root/.docker/config.json" \
quay.io/skopeo/stable:latest \
copy -a docker://docker.io/${{ vars.IMAGE_NAMESPACE }}/dev-builder-ubuntu:latest \
docker://${{ vars.ECR_IMAGE_REGISTRY }}/${{ vars.ECR_IMAGE_NAMESPACE }}/dev-builder-ubuntu:latest
copy -a docker://docker.io/$IMAGE_NAMESPACE/dev-builder-ubuntu:latest \
docker://$ECR_IMAGE_REGISTRY/$ECR_IMAGE_NAMESPACE/dev-builder-ubuntu:latest
- name: Push dev-builder-centos image
shell: bash
if: ${{ inputs.release_dev_builder_centos_image }}
env:
IMAGE_VERSION: ${{ needs.release-dev-builder-images.outputs.version }}
IMAGE_NAMESPACE: ${{ vars.IMAGE_NAMESPACE }}
ECR_IMAGE_REGISTRY: ${{ vars.ECR_IMAGE_REGISTRY }}
ECR_IMAGE_NAMESPACE: ${{ vars.ECR_IMAGE_NAMESPACE }}
run: |
docker run -v "${DOCKER_CONFIG:-$HOME/.docker}:/root/.docker:ro" \
-e "REGISTRY_AUTH_FILE=/root/.docker/config.json" \
quay.io/skopeo/stable:latest \
copy -a docker://docker.io/${{ vars.IMAGE_NAMESPACE }}/dev-builder-centos:${{ needs.release-dev-builder-images.outputs.version }} \
docker://${{ vars.ECR_IMAGE_REGISTRY }}/${{ vars.ECR_IMAGE_NAMESPACE }}/dev-builder-centos:${{ needs.release-dev-builder-images.outputs.version }}
copy -a docker://docker.io/$IMAGE_NAMESPACE/dev-builder-centos:$IMAGE_VERSION \
docker://$ECR_IMAGE_REGISTRY/$ECR_IMAGE_NAMESPACE/dev-builder-centos:$IMAGE_VERSION
docker run -v "${DOCKER_CONFIG:-$HOME/.docker}:/root/.docker:ro" \
-e "REGISTRY_AUTH_FILE=/root/.docker/config.json" \
quay.io/skopeo/stable:latest \
copy -a docker://docker.io/${{ vars.IMAGE_NAMESPACE }}/dev-builder-centos:latest \
docker://${{ vars.ECR_IMAGE_REGISTRY }}/${{ vars.ECR_IMAGE_NAMESPACE }}/dev-builder-centos:latest
copy -a docker://docker.io/$IMAGE_NAMESPACE/dev-builder-centos:latest \
docker://$ECR_IMAGE_REGISTRY/$ECR_IMAGE_NAMESPACE/dev-builder-centos:latest
- name: Push dev-builder-android image
shell: bash
if: ${{ inputs.release_dev_builder_android_image }}
env:
IMAGE_VERSION: ${{ needs.release-dev-builder-images.outputs.version }}
IMAGE_NAMESPACE: ${{ vars.IMAGE_NAMESPACE }}
ECR_IMAGE_REGISTRY: ${{ vars.ECR_IMAGE_REGISTRY }}
ECR_IMAGE_NAMESPACE: ${{ vars.ECR_IMAGE_NAMESPACE }}
run: |
docker run -v "${DOCKER_CONFIG:-$HOME/.docker}:/root/.docker:ro" \
-e "REGISTRY_AUTH_FILE=/root/.docker/config.json" \
quay.io/skopeo/stable:latest \
copy -a docker://docker.io/${{ vars.IMAGE_NAMESPACE }}/dev-builder-android:${{ needs.release-dev-builder-images.outputs.version }} \
docker://${{ vars.ECR_IMAGE_REGISTRY }}/${{ vars.ECR_IMAGE_NAMESPACE }}/dev-builder-android:${{ needs.release-dev-builder-images.outputs.version }}
copy -a docker://docker.io/$IMAGE_NAMESPACE/dev-builder-android:$IMAGE_VERSION \
docker://$ECR_IMAGE_REGISTRY/$ECR_IMAGE_NAMESPACE/dev-builder-android:$IMAGE_VERSION
docker run -v "${DOCKER_CONFIG:-$HOME/.docker}:/root/.docker:ro" \
-e "REGISTRY_AUTH_FILE=/root/.docker/config.json" \
quay.io/skopeo/stable:latest \
copy -a docker://docker.io/${{ vars.IMAGE_NAMESPACE }}/dev-builder-android:latest \
docker://${{ vars.ECR_IMAGE_REGISTRY }}/${{ vars.ECR_IMAGE_NAMESPACE }}/dev-builder-android:latest
copy -a docker://docker.io/$IMAGE_NAMESPACE/dev-builder-android:latest \
docker://$ECR_IMAGE_REGISTRY/$ECR_IMAGE_NAMESPACE/dev-builder-android:latest
release-dev-builder-images-cn: # Note: Be careful issue: https://github.com/containers/skopeo/issues/1874 and we decide to use the latest stable skopeo container.
name: Release dev builder images to CN region
runs-on: ubuntu-20.04
@@ -144,29 +163,41 @@ jobs:
- name: Push dev-builder-ubuntu image
shell: bash
if: ${{ inputs.release_dev_builder_ubuntu_image }}
env:
IMAGE_VERSION: ${{ needs.release-dev-builder-images.outputs.version }}
IMAGE_NAMESPACE: ${{ vars.IMAGE_NAMESPACE }}
ACR_IMAGE_REGISTRY: ${{ vars.ACR_IMAGE_REGISTRY }}
run: |
docker run -v "${DOCKER_CONFIG:-$HOME/.docker}:/root/.docker:ro" \
-e "REGISTRY_AUTH_FILE=/root/.docker/config.json" \
quay.io/skopeo/stable:latest \
copy -a docker://docker.io/${{ vars.IMAGE_NAMESPACE }}/dev-builder-ubuntu:${{ needs.release-dev-builder-images.outputs.version }} \
docker://${{ vars.ACR_IMAGE_REGISTRY }}/${{ vars.IMAGE_NAMESPACE }}/dev-builder-ubuntu:${{ needs.release-dev-builder-images.outputs.version }}
copy -a docker://docker.io/$IMAGE_NAMESPACE/dev-builder-ubuntu:$IMAGE_VERSION \
docker://$ACR_IMAGE_REGISTRY/$IMAGE_NAMESPACE/dev-builder-ubuntu:$IMAGE_VERSION
- name: Push dev-builder-centos image
shell: bash
if: ${{ inputs.release_dev_builder_centos_image }}
env:
IMAGE_VERSION: ${{ needs.release-dev-builder-images.outputs.version }}
IMAGE_NAMESPACE: ${{ vars.IMAGE_NAMESPACE }}
ACR_IMAGE_REGISTRY: ${{ vars.ACR_IMAGE_REGISTRY }}
run: |
docker run -v "${DOCKER_CONFIG:-$HOME/.docker}:/root/.docker:ro" \
-e "REGISTRY_AUTH_FILE=/root/.docker/config.json" \
quay.io/skopeo/stable:latest \
copy -a docker://docker.io/${{ vars.IMAGE_NAMESPACE }}/dev-builder-centos:${{ needs.release-dev-builder-images.outputs.version }} \
docker://${{ vars.ACR_IMAGE_REGISTRY }}/${{ vars.IMAGE_NAMESPACE }}/dev-builder-centos:${{ needs.release-dev-builder-images.outputs.version }}
copy -a docker://docker.io/$IMAGE_NAMESPACE/dev-builder-centos:$IMAGE_VERSION \
docker://$ACR_IMAGE_REGISTRY/$IMAGE_NAMESPACE/dev-builder-centos:$IMAGE_VERSION
- name: Push dev-builder-android image
shell: bash
if: ${{ inputs.release_dev_builder_android_image }}
env:
IMAGE_VERSION: ${{ needs.release-dev-builder-images.outputs.version }}
IMAGE_NAMESPACE: ${{ vars.IMAGE_NAMESPACE }}
ACR_IMAGE_REGISTRY: ${{ vars.ACR_IMAGE_REGISTRY }}
run: |
docker run -v "${DOCKER_CONFIG:-$HOME/.docker}:/root/.docker:ro" \
-e "REGISTRY_AUTH_FILE=/root/.docker/config.json" \
quay.io/skopeo/stable:latest \
copy -a docker://docker.io/${{ vars.IMAGE_NAMESPACE }}/dev-builder-android:${{ needs.release-dev-builder-images.outputs.version }} \
docker://${{ vars.ACR_IMAGE_REGISTRY }}/${{ vars.IMAGE_NAMESPACE }}/dev-builder-android:${{ needs.release-dev-builder-images.outputs.version }}
copy -a docker://docker.io/$IMAGE_NAMESPACE/dev-builder-android:$IMAGE_VERSION \
docker://$ACR_IMAGE_REGISTRY/$IMAGE_NAMESPACE/dev-builder-android:$IMAGE_VERSION

View File

@@ -93,11 +93,6 @@ env:
# Note: The NEXT_RELEASE_VERSION should be modified manually by every formal release.
NEXT_RELEASE_VERSION: v0.12.0
# Permission reference: https://docs.github.com/en/actions/using-jobs/assigning-permissions-to-jobs
permissions:
issues: write # Allows the action to create issues for cyborg.
contents: write # Allows the action to create a release.
jobs:
allocate-runners:
name: Allocate runners
@@ -122,6 +117,7 @@ jobs:
uses: actions/checkout@v4
with:
fetch-depth: 0
persist-credentials: false
- name: Check Rust toolchain version
shell: bash
@@ -181,6 +177,7 @@ jobs:
- uses: actions/checkout@v4
with:
fetch-depth: 0
persist-credentials: false
- uses: ./.github/actions/build-linux-artifacts
with:
@@ -202,6 +199,7 @@ jobs:
- uses: actions/checkout@v4
with:
fetch-depth: 0
persist-credentials: false
- uses: ./.github/actions/build-linux-artifacts
with:
@@ -237,6 +235,7 @@ jobs:
- uses: actions/checkout@v4
with:
fetch-depth: 0
persist-credentials: false
- uses: ./.github/actions/build-macos-artifacts
with:
@@ -276,6 +275,7 @@ jobs:
- uses: actions/checkout@v4
with:
fetch-depth: 0
persist-credentials: false
- uses: ./.github/actions/build-windows-artifacts
with:
@@ -306,15 +306,18 @@ jobs:
- uses: actions/checkout@v4
with:
fetch-depth: 0
persist-credentials: false
- name: Build and push images to dockerhub
uses: ./.github/actions/build-images
with:
image-registry: docker.io
image-namespace: ${{ vars.IMAGE_NAMESPACE }}
image-name: ${{ vars.GREPTIMEDB_IMAGE_NAME }}
image-registry-username: ${{ secrets.DOCKERHUB_USERNAME }}
image-registry-password: ${{ secrets.DOCKERHUB_TOKEN }}
version: ${{ needs.allocate-runners.outputs.version }}
push-latest-tag: true
- name: Set build image result
id: set-build-image-result
@@ -341,13 +344,14 @@ jobs:
- uses: actions/checkout@v4
with:
fetch-depth: 0
persist-credentials: false
- name: Release artifacts to CN region
uses: ./.github/actions/release-cn-artifacts
with:
src-image-registry: docker.io
src-image-namespace: ${{ vars.IMAGE_NAMESPACE }}
src-image-name: greptimedb
src-image-name: ${{ vars.GREPTIMEDB_IMAGE_NAME }}
dst-image-registry-username: ${{ secrets.ALICLOUD_USERNAME }}
dst-image-registry-password: ${{ secrets.ALICLOUD_PASSWORD }}
dst-image-registry: ${{ vars.ACR_IMAGE_REGISTRY }}
@@ -358,6 +362,7 @@ jobs:
aws-cn-secret-access-key: ${{ secrets.AWS_CN_SECRET_ACCESS_KEY }}
aws-cn-region: ${{ vars.AWS_RELEASE_BUCKET_REGION }}
dev-mode: false
upload-to-s3: true
update-version-info: true
push-latest-tag: true
@@ -377,6 +382,7 @@ jobs:
- uses: actions/checkout@v4
with:
fetch-depth: 0
persist-credentials: false
- name: Publish GitHub release
uses: ./.github/actions/publish-github-release
@@ -400,6 +406,7 @@ jobs:
uses: actions/checkout@v4
with:
fetch-depth: 0
persist-credentials: false
- name: Stop EC2 runner
uses: ./.github/actions/stop-runner
@@ -425,6 +432,7 @@ jobs:
uses: actions/checkout@v4
with:
fetch-depth: 0
persist-credentials: false
- name: Stop EC2 runner
uses: ./.github/actions/stop-runner
@@ -441,8 +449,15 @@ jobs:
if: ${{ github.event_name == 'push' || github.event_name == 'schedule' }}
needs: [allocate-runners]
runs-on: ubuntu-20.04
# Permission reference: https://docs.github.com/en/actions/using-jobs/assigning-permissions-to-jobs
permissions:
issues: write # Allows the action to create issues for cyborg.
contents: write # Allows the action to create a release.
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
persist-credentials: false
- uses: ./.github/actions/setup-cyborg
- name: Bump doc version
working-directory: cyborg
@@ -461,10 +476,17 @@ jobs:
build-windows-artifacts,
]
runs-on: ubuntu-20.04
# Permission reference: https://docs.github.com/en/actions/using-jobs/assigning-permissions-to-jobs
permissions:
issues: write # Allows the action to create issues for cyborg.
contents: write # Allows the action to create a release.
env:
SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL_DEVELOP_CHANNEL }}
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
persist-credentials: false
- uses: ./.github/actions/setup-cyborg
- name: Report CI status
id: report-ci-status

View File

@@ -4,18 +4,20 @@ on:
- cron: '4 2 * * *'
workflow_dispatch:
permissions:
contents: read
issues: write
pull-requests: write
jobs:
maintenance:
name: Periodic Maintenance
runs-on: ubuntu-latest
permissions:
contents: read
issues: write
pull-requests: write
if: ${{ github.repository == 'GreptimeTeam/greptimedb' }}
steps:
- uses: actions/checkout@v4
with:
persist-credentials: false
- uses: ./.github/actions/setup-cyborg
- name: Do Maintenance
working-directory: cyborg

View File

@@ -1,18 +1,24 @@
name: "Semantic Pull Request"
on:
pull_request_target:
pull_request:
types:
- opened
- reopened
- edited
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
jobs:
check:
runs-on: ubuntu-20.04
timeout-minutes: 10
steps:
- uses: actions/checkout@v4
with:
persist-credentials: false
- uses: ./.github/actions/setup-cyborg
- name: Check Pull Request
working-directory: cyborg

View File

@@ -3,30 +3,28 @@
## Individual Committers (in alphabetical order)
* [CookiePieWw](https://github.com/CookiePieWw)
* [KKould](https://github.com/KKould)
* [NiwakaDev](https://github.com/NiwakaDev)
* [etolbakov](https://github.com/etolbakov)
* [irenjj](https://github.com/irenjj)
* [tisonkun](https://github.com/tisonkun)
* [KKould](https://github.com/KKould)
* [Lanqing Yang](https://github.com/lyang24)
* [NiwakaDev](https://github.com/NiwakaDev)
* [tisonkun](https://github.com/tisonkun)
## Team Members (in alphabetical order)
* [Breeze-P](https://github.com/Breeze-P)
* [GrepTime](https://github.com/GrepTime)
* [MichaelScofield](https://github.com/MichaelScofield)
* [Wenjie0329](https://github.com/Wenjie0329)
* [WenyXu](https://github.com/WenyXu)
* [ZonaHex](https://github.com/ZonaHex)
* [apdong2022](https://github.com/apdong2022)
* [beryl678](https://github.com/beryl678)
* [Breeze-P](https://github.com/Breeze-P)
* [daviderli614](https://github.com/daviderli614)
* [discord9](https://github.com/discord9)
* [evenyag](https://github.com/evenyag)
* [fengjiachun](https://github.com/fengjiachun)
* [fengys1996](https://github.com/fengys1996)
* [GrepTime](https://github.com/GrepTime)
* [holalengyu](https://github.com/holalengyu)
* [killme2008](https://github.com/killme2008)
* [MichaelScofield](https://github.com/MichaelScofield)
* [nicecui](https://github.com/nicecui)
* [paomian](https://github.com/paomian)
* [shuiyisong](https://github.com/shuiyisong)
@@ -34,11 +32,14 @@
* [sunng87](https://github.com/sunng87)
* [v0y4g3r](https://github.com/v0y4g3r)
* [waynexia](https://github.com/waynexia)
* [Wenjie0329](https://github.com/Wenjie0329)
* [WenyXu](https://github.com/WenyXu)
* [xtang](https://github.com/xtang)
* [zhaoyingnan01](https://github.com/zhaoyingnan01)
* [zhongzc](https://github.com/zhongzc)
* [ZonaHex](https://github.com/ZonaHex)
* [zyy17](https://github.com/zyy17)
## All Contributors
[![All Contributors](https://contrib.rocks/image?repo=GreptimeTeam/greptimedb)](https://github.com/GreptimeTeam/greptimedb/graphs/contributors)
To see the full list of contributors, please visit our [Contributors page](https://github.com/GreptimeTeam/greptimedb/graphs/contributors)

Cargo.lock generated
View File

@@ -313,7 +313,7 @@ dependencies = [
"arrow-data",
"arrow-schema",
"chrono",
"chrono-tz 0.10.1",
"chrono-tz",
"half",
"hashbrown 0.15.2",
"num",
@@ -432,7 +432,7 @@ dependencies = [
"arrow-schema",
"chrono",
"half",
"indexmap 2.6.0",
"indexmap 2.7.1",
"lexical-core",
"num",
"serde",
@@ -1053,7 +1053,7 @@ dependencies = [
"bitflags 2.6.0",
"cexpr",
"clang-sys",
"itertools 0.11.0",
"itertools 0.13.0",
"proc-macro2",
"quote",
"regex",
@@ -1475,7 +1475,7 @@ version = "0.13.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6026d8cd82ada8bbcfe337805dd1eb6afdc9e80fa4d57e977b3a36315e0c5525"
dependencies = [
"indexmap 2.6.0",
"indexmap 2.7.1",
"lazy_static",
"num-traits",
"regex",
@@ -1508,28 +1508,6 @@ dependencies = [
"windows-targets 0.52.6",
]
[[package]]
name = "chrono-tz"
version = "0.8.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d59ae0466b83e838b81a54256c39d5d7c20b9d7daa10510a242d9b75abd5936e"
dependencies = [
"chrono",
"chrono-tz-build 0.2.1",
"phf",
]
[[package]]
name = "chrono-tz"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "93698b29de5e97ad0ae26447b344c482a7284c737d9ddc5f9e52b74a336671bb"
dependencies = [
"chrono",
"chrono-tz-build 0.3.0",
"phf",
]
[[package]]
name = "chrono-tz"
version = "0.10.1"
@@ -1537,32 +1515,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9c6ac4f2c0bf0f44e9161aec9675e1050aa4a530663c4a9e37e108fa948bca9f"
dependencies = [
"chrono",
"chrono-tz-build 0.4.0",
"chrono-tz-build",
"phf",
]
[[package]]
name = "chrono-tz-build"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "433e39f13c9a060046954e0592a8d0a4bcb1040125cbf91cb8ee58964cfb350f"
dependencies = [
"parse-zoneinfo",
"phf",
"phf_codegen",
]
[[package]]
name = "chrono-tz-build"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0c088aee841df9c3041febbb73934cfc39708749bf96dc827e3359cd39ef11b1"
dependencies = [
"parse-zoneinfo",
"phf",
"phf_codegen",
]
[[package]]
name = "chrono-tz-build"
version = "0.4.0"
@@ -2053,10 +2009,12 @@ dependencies = [
name = "common-function"
version = "0.12.0"
dependencies = [
"ahash 0.8.11",
"api",
"approx 0.5.1",
"arc-swap",
"async-trait",
"bincode",
"common-base",
"common-catalog",
"common-error",
@@ -2074,6 +2032,7 @@ dependencies = [
"geo-types",
"geohash",
"h3o",
"hyperloglogplus",
"jsonb",
"nalgebra 0.33.2",
"num",
@@ -2090,6 +2049,7 @@ dependencies = [
"store-api",
"table",
"tokio",
"uddsketch",
"wkt",
]
@@ -2425,7 +2385,7 @@ version = "0.12.0"
dependencies = [
"arrow",
"chrono",
"chrono-tz 0.8.6",
"chrono-tz",
"common-error",
"common-macro",
"humantime",
@@ -3016,7 +2976,7 @@ dependencies = [
"chrono",
"half",
"hashbrown 0.14.5",
"indexmap 2.6.0",
"indexmap 2.7.1",
"libc",
"object_store",
"parquet",
@@ -3076,7 +3036,7 @@ dependencies = [
"datafusion-functions-aggregate-common",
"datafusion-functions-window-common",
"datafusion-physical-expr-common",
"indexmap 2.6.0",
"indexmap 2.7.1",
"paste",
"recursive",
"serde_json",
@@ -3198,7 +3158,7 @@ dependencies = [
"datafusion-physical-expr-common",
"datafusion-physical-plan",
"half",
"indexmap 2.6.0",
"indexmap 2.7.1",
"log",
"parking_lot 0.12.3",
"paste",
@@ -3249,7 +3209,7 @@ dependencies = [
"datafusion-common",
"datafusion-expr",
"datafusion-physical-expr",
"indexmap 2.6.0",
"indexmap 2.7.1",
"itertools 0.13.0",
"log",
"recursive",
@@ -3274,7 +3234,7 @@ dependencies = [
"datafusion-physical-expr-common",
"half",
"hashbrown 0.14.5",
"indexmap 2.6.0",
"indexmap 2.7.1",
"itertools 0.13.0",
"log",
"paste",
@@ -3333,7 +3293,7 @@ dependencies = [
"futures",
"half",
"hashbrown 0.14.5",
"indexmap 2.6.0",
"indexmap 2.7.1",
"itertools 0.13.0",
"log",
"once_cell",
@@ -3353,7 +3313,7 @@ dependencies = [
"arrow-schema",
"datafusion-common",
"datafusion-expr",
"indexmap 2.6.0",
"indexmap 2.7.1",
"log",
"recursive",
"regex",
@@ -3420,6 +3380,7 @@ dependencies = [
"meta-client",
"metric-engine",
"mito2",
"num_cpus",
"object-store",
"prometheus",
"prost 0.13.3",
@@ -4204,6 +4165,7 @@ dependencies = [
"bytes",
"cache",
"catalog",
"chrono",
"client",
"common-base",
"common-catalog",
@@ -4240,6 +4202,7 @@ dependencies = [
"meta-client",
"nom",
"num-traits",
"num_cpus",
"operator",
"partition",
"pretty_assertions",
@@ -4336,6 +4299,7 @@ dependencies = [
"common-test-util",
"common-time",
"common-version",
"datafusion",
"datafusion-expr",
"datanode",
"datatypes",
@@ -4345,6 +4309,7 @@ dependencies = [
"log-query",
"log-store",
"meta-client",
"num_cpus",
"opentelemetry-proto 0.27.0",
"operator",
"partition",
@@ -4735,7 +4700,7 @@ dependencies = [
[[package]]
name = "greptime-proto"
version = "0.1.0"
source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=683e9d10ae7f3dfb8aaabd89082fc600c17e3795#683e9d10ae7f3dfb8aaabd89082fc600c17e3795"
source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=072ce580502e015df1a6b03a185b60309a7c2a7a#072ce580502e015df1a6b03a185b60309a7c2a7a"
dependencies = [
"prost 0.13.3",
"serde",
@@ -4758,7 +4723,7 @@ dependencies = [
"futures-sink",
"futures-util",
"http 0.2.12",
"indexmap 2.6.0",
"indexmap 2.7.1",
"slab",
"tokio",
"tokio-util",
@@ -4777,7 +4742,7 @@ dependencies = [
"futures-core",
"futures-sink",
"http 1.1.0",
"indexmap 2.6.0",
"indexmap 2.7.1",
"slab",
"tokio",
"tokio-util",
@@ -5327,6 +5292,15 @@ dependencies = [
"tracing",
]
[[package]]
name = "hyperloglogplus"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "621debdf94dcac33e50475fdd76d34d5ea9c0362a834b9db08c3024696c1fbe3"
dependencies = [
"serde",
]
[[package]]
name = "i_float"
version = "1.3.1"
@@ -5615,9 +5589,9 @@ dependencies = [
[[package]]
name = "indexmap"
version = "2.6.0"
version = "2.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "707907fe3c25f5424cce2cb7e1cbcafee6bdbe735ca90ef77c29e84591e5b9da"
checksum = "8c9c992b02b5b4c94ea26e32fe5bccb7aa7d9f390ab5c1221ff895bc7ea8b652"
dependencies = [
"equivalent",
"hashbrown 0.15.2",
@@ -5631,7 +5605,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "232929e1d75fe899576a3d5c7416ad0d88dbfbb3c3d6aa00873a7408a50ddb88"
dependencies = [
"ahash 0.8.11",
"indexmap 2.6.0",
"indexmap 2.7.1",
"is-terminal",
"itoa",
"log",
@@ -5978,7 +5952,7 @@ version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ee7893dab2e44ae5f9d0173f26ff4aa327c10b01b06a72b52dd9405b628640d"
dependencies = [
"indexmap 2.6.0",
"indexmap 2.7.1",
]
[[package]]
@@ -6268,7 +6242,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4979f22fdb869068da03c9f7528f8297c6fd2606bc3a4affe42e6a823fdb8da4"
dependencies = [
"cfg-if",
"windows-targets 0.48.5",
"windows-targets 0.52.6",
]
[[package]]
@@ -6461,7 +6435,7 @@ dependencies = [
"cactus",
"cfgrammar",
"filetime",
"indexmap 2.6.0",
"indexmap 2.7.1",
"lazy_static",
"lrtable",
"num-traits",
@@ -6780,6 +6754,7 @@ version = "0.12.0"
dependencies = [
"api",
"aquamarine",
"async-stream",
"async-trait",
"base64 0.21.7",
"common-base",
@@ -6792,6 +6767,7 @@ dependencies = [
"common-time",
"datafusion",
"datatypes",
"futures-util",
"itertools 0.10.5",
"lazy_static",
"mito2",
@@ -7700,7 +7676,7 @@ checksum = "1e32339a5dc40459130b3bd269e9892439f55b33e772d2a9d402a789baaf4e8a"
dependencies = [
"futures-core",
"futures-sink",
"indexmap 2.6.0",
"indexmap 2.7.1",
"js-sys",
"once_cell",
"pin-project-lite",
@@ -7884,7 +7860,7 @@ dependencies = [
"bytemuck",
"bytes",
"chrono",
"chrono-tz 0.10.1",
"chrono-tz",
"fallible-streaming-iterator",
"flate2",
"futures",
@@ -8272,7 +8248,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b4c5cc86750666a3ed20bdaf5ca2a0344f9c67674cae0515bec2da16fbaa47db"
dependencies = [
"fixedbitset",
"indexmap 2.6.0",
"indexmap 2.7.1",
]
[[package]]
@@ -8379,7 +8355,7 @@ dependencies = [
"async-trait",
"catalog",
"chrono",
"chrono-tz 0.9.0",
"chrono-tz",
"common-catalog",
"common-error",
"common-function",
@@ -8797,8 +8773,7 @@ dependencies = [
[[package]]
name = "promql-parser"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7fe99e6f80a79abccf1e8fb48dd63473a36057e600cc6ea36147c8318698ae6f"
source = "git+https://github.com/GreptimeTeam/promql-parser.git?rev=27abb8e16003a50c720f00d6c85f41f5fa2a2a8e#27abb8e16003a50c720f00d6c85f41f5fa2a2a8e"
dependencies = [
"cfgrammar",
"chrono",
@@ -8869,7 +8844,7 @@ checksum = "0c1318b19085f08681016926435853bbf7858f9c082d0999b80550ff5d9abe15"
dependencies = [
"bytes",
"heck 0.5.0",
"itertools 0.11.0",
"itertools 0.13.0",
"log",
"multimap",
"once_cell",
@@ -8915,7 +8890,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e9552f850d5f0964a4e4d0bf306459ac29323ddfbae05e35a7c0d35cb0803cc5"
dependencies = [
"anyhow",
"itertools 0.11.0",
"itertools 0.13.0",
"proc-macro2",
"quote",
"syn 2.0.96",
@@ -9129,6 +9104,7 @@ dependencies = [
"table",
"tokio",
"tokio-stream",
"unescaper",
"uuid",
]
@@ -10363,7 +10339,7 @@ version = "1.0.137"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "930cfb6e6abf99298aaad7d29abbef7a9999a9a8806a40088f55f0dcec03146b"
dependencies = [
"indexmap 2.6.0",
"indexmap 2.7.1",
"itoa",
"memchr",
"ryu",
@@ -10434,7 +10410,7 @@ dependencies = [
"chrono",
"hex",
"indexmap 1.9.3",
"indexmap 2.6.0",
"indexmap 2.7.1",
"serde",
"serde_derive",
"serde_json",
@@ -10460,7 +10436,7 @@ version = "0.9.34+deprecated"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6a8b1a1a2ebf674015cc02edccce75287f1a0130d394307b36743c2f5d504b47"
dependencies = [
"indexmap 2.6.0",
"indexmap 2.7.1",
"itoa",
"ryu",
"serde",
@@ -10521,6 +10497,7 @@ dependencies = [
"humantime",
"humantime-serde",
"hyper 1.4.1",
"indexmap 2.7.1",
"influxdb_line_protocol",
"itertools 0.10.5",
"json5",
@@ -10561,6 +10538,7 @@ dependencies = [
"session",
"snafu 0.8.5",
"snap",
"socket2",
"sql",
"store-api",
"strum 0.25.0",
@@ -10930,12 +10908,12 @@ dependencies = [
[[package]]
name = "sqlness"
version = "0.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "308a7338f2211813d6e9da117e9b9b7aee5d072872d11a934002fd2bd4ab5276"
source = "git+https://github.com/CeresDB/sqlness.git?rev=bb91f31ff58993e07ea89845791235138283a24c#bb91f31ff58993e07ea89845791235138283a24c"
dependencies = [
"async-trait",
"derive_builder 0.11.2",
"duration-str",
"futures",
"minijinja",
"prettydiff",
"regex",
@@ -10961,6 +10939,7 @@ dependencies = [
"hex",
"local-ip-address",
"mysql",
"num_cpus",
"reqwest",
"serde",
"serde_json",
@@ -11060,7 +11039,7 @@ dependencies = [
"futures-util",
"hashbrown 0.15.2",
"hashlink",
"indexmap 2.6.0",
"indexmap 2.7.1",
"log",
"memchr",
"once_cell",
@@ -12356,7 +12335,7 @@ version = "0.19.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1b5bb770da30e5cbfde35a2d7b9b8a2c4b8ef89548a7a6aeab5c9a576e3e7421"
dependencies = [
"indexmap 2.6.0",
"indexmap 2.7.1",
"toml_datetime",
"winnow 0.5.40",
]
@@ -12367,7 +12346,7 @@ version = "0.22.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ae48d6208a266e853d946088ed816055e556cc6028c5e8e2b84d9fa5dd7c7f5"
dependencies = [
"indexmap 2.6.0",
"indexmap 2.7.1",
"serde",
"serde_spanned",
"toml_datetime",
@@ -12505,7 +12484,7 @@ dependencies = [
"futures-core",
"futures-util",
"hdrhistogram",
"indexmap 2.6.0",
"indexmap 2.7.1",
"pin-project-lite",
"slab",
"sync_wrapper 1.0.1",
@@ -12993,6 +12972,23 @@ version = "0.1.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2896d95c02a80c6d6a5d6e953d479f5ddf2dfdb6a244441010e373ac0fb88971"
[[package]]
name = "uddsketch"
version = "0.1.0"
source = "git+https://github.com/GreptimeTeam/timescaledb-toolkit.git?rev=84828fe8fb494a6a61412a3da96517fc80f7bb20#84828fe8fb494a6a61412a3da96517fc80f7bb20"
dependencies = [
"serde",
]
[[package]]
name = "unescaper"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c878a167baa8afd137494101a688ef8c67125089ff2249284bd2b5f9bfedb815"
dependencies = [
"thiserror 1.0.64",
]
[[package]]
name = "unicase"
version = "2.7.0"
@@ -13409,7 +13405,7 @@ version = "0.1.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb"
dependencies = [
"windows-sys 0.48.0",
"windows-sys 0.59.0",
]
[[package]]

View File

@@ -81,6 +81,7 @@ rust.unknown_lints = "deny"
rust.unexpected_cfgs = { level = "warn", check-cfg = ['cfg(tokio_unstable)'] }
[workspace.dependencies]
# DO_NOT_REMOVE_THIS: BEGIN_OF_EXTERNAL_DEPENDENCIES
# We turn off default-features for some dependencies here so the workspaces which inherit them can
# selectively turn them on if needed, since we can override default-features = true (from false)
# for the inherited dependency but cannot do the reverse (override from true to false).
@@ -106,6 +107,7 @@ bitflags = "2.4.1"
bytemuck = "1.12"
bytes = { version = "1.7", features = ["serde"] }
chrono = { version = "0.4", features = ["serde"] }
chrono-tz = "0.10.1"
clap = { version = "4.4", features = ["derive"] }
config = "0.13.0"
crossbeam-utils = "0.8"
@@ -127,7 +129,7 @@ etcd-client = "0.14"
fst = "0.4.7"
futures = "0.3"
futures-util = "0.3"
greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "683e9d10ae7f3dfb8aaabd89082fc600c17e3795" }
greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "072ce580502e015df1a6b03a185b60309a7c2a7a" }
hex = "0.4"
http = "1"
humantime = "2.1"
@@ -158,7 +160,9 @@ parquet = { version = "53.0.0", default-features = false, features = ["arrow", "
paste = "1.0"
pin-project = "1.0"
prometheus = { version = "0.13.3", features = ["process"] }
promql-parser = { version = "0.4.3", features = ["ser"] }
promql-parser = { git = "https://github.com/GreptimeTeam/promql-parser.git", features = [
"ser",
], rev = "27abb8e16003a50c720f00d6c85f41f5fa2a2a8e" }
prost = "0.13"
raft-engine = { version = "0.4.1", default-features = false }
rand = "0.8"
@@ -207,6 +211,7 @@ tracing-subscriber = { version = "0.3", features = ["env-filter", "json", "fmt"]
typetag = "0.2"
uuid = { version = "1.7", features = ["serde", "v4", "fast-rng"] }
zstd = "0.13"
# DO_NOT_REMOVE_THIS: END_OF_EXTERNAL_DEPENDENCIES
## workspaces members
api = { path = "src/api" }

View File

@@ -40,6 +40,7 @@
| `mysql.enable` | Bool | `true` | Whether to enable. |
| `mysql.addr` | String | `127.0.0.1:4002` | The addr to bind the MySQL server. |
| `mysql.runtime_size` | Integer | `2` | The number of server worker threads. |
| `mysql.keep_alive` | String | `0s` | Server-side keep-alive time.<br/>Set to 0 (default) to disable. |
| `mysql.tls` | -- | -- | -- |
| `mysql.tls.mode` | String | `disable` | TLS mode, refer to https://www.postgresql.org/docs/current/libpq-ssl.html<br/>- `disable` (default value)<br/>- `prefer`<br/>- `require`<br/>- `verify-ca`<br/>- `verify-full` |
| `mysql.tls.cert_path` | String | Unset | Certificate file path. |
@@ -49,6 +50,7 @@
| `postgres.enable` | Bool | `true` | Whether to enable |
| `postgres.addr` | String | `127.0.0.1:4003` | The addr to bind the PostgresSQL server. |
| `postgres.runtime_size` | Integer | `2` | The number of server worker threads. |
| `postgres.keep_alive` | String | `0s` | Server-side keep-alive time.<br/>Set to 0 (default) to disable. |
| `postgres.tls` | -- | -- | PostgresSQL server TLS options, see `mysql.tls` section. |
| `postgres.tls.mode` | String | `disable` | TLS mode. |
| `postgres.tls.cert_path` | String | Unset | Certificate file path. |
@@ -58,6 +60,8 @@
| `opentsdb.enable` | Bool | `true` | Whether to enable OpenTSDB put in HTTP API. |
| `influxdb` | -- | -- | InfluxDB protocol options. |
| `influxdb.enable` | Bool | `true` | Whether to enable InfluxDB protocol in HTTP API. |
| `jaeger` | -- | -- | Jaeger protocol options. |
| `jaeger.enable` | Bool | `true` | Whether to enable Jaeger protocol in HTTP API. |
| `prom_store` | -- | -- | Prometheus remote storage options |
| `prom_store.enable` | Bool | `true` | Whether to enable Prometheus remote write and read in HTTP API. |
| `prom_store.with_metric_engine` | Bool | `true` | Whether to store the data from Prometheus remote write in metric engine. |
@@ -148,6 +152,7 @@
| `region_engine.mito.index` | -- | -- | The options for index in Mito engine. |
| `region_engine.mito.index.aux_path` | String | `""` | Auxiliary directory path for the index in filesystem, used to store intermediate files for<br/>creating the index and staging files for searching the index, defaults to `{data_home}/index_intermediate`.<br/>The default name for this directory is `index_intermediate` for backward compatibility.<br/><br/>This path contains two subdirectories:<br/>- `__intm`: for storing intermediate files used during creating index.<br/>- `staging`: for storing staging files used during searching index. |
| `region_engine.mito.index.staging_size` | String | `2GB` | The max capacity of the staging directory. |
| `region_engine.mito.index.staging_ttl` | String | `7d` | The TTL of the staging directory.<br/>Defaults to 7 days.<br/>Setting it to "0s" to disable TTL. |
| `region_engine.mito.index.metadata_cache_size` | String | `64MiB` | Cache size for inverted index metadata. |
| `region_engine.mito.index.content_cache_size` | String | `128MiB` | Cache size for inverted index content. |
| `region_engine.mito.index.content_cache_page_size` | String | `64KiB` | Page size for inverted index content cache. |
@@ -234,6 +239,7 @@
| `mysql.enable` | Bool | `true` | Whether to enable. |
| `mysql.addr` | String | `127.0.0.1:4002` | The addr to bind the MySQL server. |
| `mysql.runtime_size` | Integer | `2` | The number of server worker threads. |
| `mysql.keep_alive` | String | `0s` | Server-side keep-alive time.<br/>Set to 0 (default) to disable. |
| `mysql.tls` | -- | -- | -- |
| `mysql.tls.mode` | String | `disable` | TLS mode, refer to https://www.postgresql.org/docs/current/libpq-ssl.html<br/>- `disable` (default value)<br/>- `prefer`<br/>- `require`<br/>- `verify-ca`<br/>- `verify-full` |
| `mysql.tls.cert_path` | String | Unset | Certificate file path. |
@@ -243,6 +249,7 @@
| `postgres.enable` | Bool | `true` | Whether to enable |
| `postgres.addr` | String | `127.0.0.1:4003` | The addr to bind the PostgresSQL server. |
| `postgres.runtime_size` | Integer | `2` | The number of server worker threads. |
| `postgres.keep_alive` | String | `0s` | Server-side keep-alive time.<br/>Set to 0 (default) to disable. |
| `postgres.tls` | -- | -- | PostgresSQL server TLS options, see `mysql.tls` section. |
| `postgres.tls.mode` | String | `disable` | TLS mode. |
| `postgres.tls.cert_path` | String | Unset | Certificate file path. |
@@ -252,6 +259,8 @@
| `opentsdb.enable` | Bool | `true` | Whether to enable OpenTSDB put in HTTP API. |
| `influxdb` | -- | -- | InfluxDB protocol options. |
| `influxdb.enable` | Bool | `true` | Whether to enable InfluxDB protocol in HTTP API. |
| `jaeger` | -- | -- | Jaeger protocol options. |
| `jaeger.enable` | Bool | `true` | Whether to enable Jaeger protocol in HTTP API. |
| `prom_store` | -- | -- | Prometheus remote storage options |
| `prom_store.enable` | Bool | `true` | Whether to enable Prometheus remote write and read in HTTP API. |
| `prom_store.with_metric_engine` | Bool | `true` | Whether to store the data from Prometheus remote write in metric engine. |
@@ -310,6 +319,7 @@
| `selector` | String | `round_robin` | Datanode selector type.<br/>- `round_robin` (default value)<br/>- `lease_based`<br/>- `load_based`<br/>For details, please see "https://docs.greptime.com/developer-guide/metasrv/selector". |
| `use_memory_store` | Bool | `false` | Store data in memory. |
| `enable_region_failover` | Bool | `false` | Whether to enable region failover.<br/>This feature is only available on GreptimeDB running on cluster mode and<br/>- Using Remote WAL<br/>- Using shared storage (e.g., s3). |
| `node_max_idle_time` | String | `24hours` | Max allowed idle time before removing node info from metasrv memory. |
| `enable_telemetry` | Bool | `true` | Whether to enable greptimedb telemetry. Enabled by default. |
| `runtime` | -- | -- | The runtime options. |
| `runtime.global_rt_size` | Integer | `8` | The number of threads to execute the runtime for global read operations. |
@@ -483,6 +493,7 @@
| `region_engine.mito.index` | -- | -- | The options for index in Mito engine. |
| `region_engine.mito.index.aux_path` | String | `""` | Auxiliary directory path for the index in filesystem, used to store intermediate files for<br/>creating the index and staging files for searching the index, defaults to `{data_home}/index_intermediate`.<br/>The default name for this directory is `index_intermediate` for backward compatibility.<br/><br/>This path contains two subdirectories:<br/>- `__intm`: for storing intermediate files used during creating index.<br/>- `staging`: for storing staging files used during searching index. |
| `region_engine.mito.index.staging_size` | String | `2GB` | The max capacity of the staging directory. |
| `region_engine.mito.index.staging_ttl` | String | `7d` | The TTL of the staging directory.<br/>Defaults to 7 days.<br/>Setting it to "0s" to disable TTL. |
| `region_engine.mito.index.metadata_cache_size` | String | `64MiB` | Cache size for inverted index metadata. |
| `region_engine.mito.index.content_cache_size` | String | `128MiB` | Cache size for inverted index content. |
| `region_engine.mito.index.content_cache_page_size` | String | `64KiB` | Page size for inverted index content cache. |

View File

@@ -497,6 +497,11 @@ aux_path = ""
## The max capacity of the staging directory.
staging_size = "2GB"
## The TTL of the staging directory.
## Defaults to 7 days.
## Setting it to "0s" to disable TTL.
staging_ttl = "7d"
## Cache size for inverted index metadata.
metadata_cache_size = "64MiB"

View File

@@ -74,6 +74,9 @@ enable = true
addr = "127.0.0.1:4002"
## The number of server worker threads.
runtime_size = 2
## Server-side keep-alive time.
## Set to 0 (default) to disable.
keep_alive = "0s"
# MySQL server TLS options.
[mysql.tls]
@@ -105,6 +108,9 @@ enable = true
addr = "127.0.0.1:4003"
## The number of server worker threads.
runtime_size = 2
## Server-side keep-alive time.
## Set to 0 (default) to disable.
keep_alive = "0s"
## PostgresSQL server TLS options, see `mysql.tls` section.
[postgres.tls]
@@ -132,6 +138,11 @@ enable = true
## Whether to enable InfluxDB protocol in HTTP API.
enable = true
## Jaeger protocol options.
[jaeger]
## Whether to enable Jaeger protocol in HTTP API.
enable = true
## Prometheus remote storage options
[prom_store]
## Whether to enable Prometheus remote write and read in HTTP API.

View File

@@ -50,6 +50,9 @@ use_memory_store = false
## - Using shared storage (e.g., s3).
enable_region_failover = false
## Max allowed idle time before removing node info from metasrv memory.
node_max_idle_time = "24hours"
## Whether to enable greptimedb telemetry. Enabled by default.
#+ enable_telemetry = true

View File

@@ -78,6 +78,9 @@ enable = true
addr = "127.0.0.1:4002"
## The number of server worker threads.
runtime_size = 2
## Server-side keep-alive time.
## Set to 0 (default) to disable.
keep_alive = "0s"
# MySQL server TLS options.
[mysql.tls]
@@ -109,6 +112,9 @@ enable = true
addr = "127.0.0.1:4003"
## The number of server worker threads.
runtime_size = 2
## Server-side keep-alive time.
## Set to 0 (default) to disable.
keep_alive = "0s"
## PostgresSQL server TLS options, see `mysql.tls` section.
[postgres.tls]
@@ -136,6 +142,11 @@ enable = true
## Whether to enable InfluxDB protocol in HTTP API.
enable = true
## Jaeger protocol options.
[jaeger]
## Whether to enable Jaeger protocol in HTTP API.
enable = true
## Prometheus remote storage options
[prom_store]
## Whether to enable Prometheus remote write and read in HTTP API.
@@ -573,6 +584,11 @@ aux_path = ""
## The max capacity of the staging directory.
staging_size = "2GB"
## The TTL of the staging directory.
## Defaults to 7 days.
## Setting it to "0s" to disable TTL.
staging_ttl = "7d"
## Cache size for inverted index metadata.
metadata_cache_size = "64MiB"

Binary image file changed (not shown): size 36 KiB → 25 KiB

docs/logo-text-padding.png (Executable file → Normal file)
Binary image file changed (not shown): size 25 KiB → 21 KiB

File diff suppressed because it is too large.

View File

@@ -384,8 +384,8 @@
"rowHeight": 0.9,
"showValue": "auto",
"tooltip": {
"mode": "none",
"sort": "none"
"mode": "multi",
"sort": "desc"
}
},
"targets": [
@@ -483,8 +483,8 @@
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "10.2.3",
@@ -578,8 +578,8 @@
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "10.2.3",
@@ -601,7 +601,7 @@
"type": "timeseries"
},
{
"collapsed": true,
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
@@ -684,8 +684,8 @@
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
"mode": "multi",
"sort": "desc"
}
},
"targets": [
@@ -878,8 +878,8 @@
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
"mode": "multi",
"sort": "desc"
}
},
"targets": [
@@ -1124,8 +1124,8 @@
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
"mode": "multi",
"sort": "desc"
}
},
"targets": [
@@ -1223,8 +1223,8 @@
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
"mode": "multi",
"sort": "desc"
}
},
"targets": [
@@ -1322,8 +1322,8 @@
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
"mode": "multi",
"sort": "desc"
}
},
"targets": [
@@ -1456,8 +1456,8 @@
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
"mode": "multi",
"sort": "desc"
}
},
"targets": [
@@ -1573,8 +1573,8 @@
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
"mode": "multi",
"sort": "desc"
}
},
"targets": [
@@ -1673,8 +1673,8 @@
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
"mode": "multi",
"sort": "desc"
}
},
"targets": [
@@ -1773,8 +1773,8 @@
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
"mode": "multi",
"sort": "desc"
}
},
"targets": [
@@ -1890,8 +1890,8 @@
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
"mode": "multi",
"sort": "desc"
}
},
"targets": [
@@ -2002,8 +2002,8 @@
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
"mode": "multi",
"sort": "desc"
}
},
"targets": [
@@ -2120,8 +2120,8 @@
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
"mode": "multi",
"sort": "desc"
}
},
"targets": [
@@ -2233,8 +2233,8 @@
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
"mode": "multi",
"sort": "desc"
}
},
"targets": [
@@ -2334,8 +2334,8 @@
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
"mode": "multi",
"sort": "desc"
}
},
"targets": [
@@ -2435,8 +2435,8 @@
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
"mode": "multi",
"sort": "desc"
}
},
"targets": [
@@ -2548,8 +2548,8 @@
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
"mode": "multi",
"sort": "desc"
}
},
"targets": [
@@ -2661,8 +2661,8 @@
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
"mode": "multi",
"sort": "desc"
}
},
"targets": [
@@ -2788,8 +2788,8 @@
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
"mode": "multi",
"sort": "desc"
}
},
"targets": [
@@ -2889,8 +2889,8 @@
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
"mode": "multi",
"sort": "desc"
}
},
"targets": [
@@ -2990,8 +2990,8 @@
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
"mode": "multi",
"sort": "desc"
}
},
"targets": [
@@ -3091,8 +3091,8 @@
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
"mode": "multi",
"sort": "desc"
}
},
"targets": [
@@ -3191,8 +3191,8 @@
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
"mode": "multi",
"sort": "desc"
}
},
"targets": [
@@ -3302,8 +3302,8 @@
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
"mode": "multi",
"sort": "desc"
}
},
"targets": [
@@ -3432,8 +3432,8 @@
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
"mode": "multi",
"sort": "desc"
}
},
"targets": [
@@ -3543,8 +3543,8 @@
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
"mode": "multi",
"sort": "desc"
}
},
"targets": [
@@ -3657,8 +3657,8 @@
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
"mode": "multi",
"sort": "desc"
}
},
"targets": [
@@ -3808,8 +3808,8 @@
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
"mode": "multi",
"sort": "desc"
}
},
"targets": [
@@ -3909,8 +3909,8 @@
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
"mode": "multi",
"sort": "desc"
}
},
"targets": [
@@ -4011,8 +4011,8 @@
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
"mode": "multi",
"sort": "desc"
}
},
"targets": [
@@ -4113,8 +4113,8 @@
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
"mode": "multi",
"sort": "desc"
}
},
"targets": [

View File

@@ -15,13 +15,10 @@ common-macro.workspace = true
common-time.workspace = true
datatypes.workspace = true
greptime-proto.workspace = true
paste = "1.0"
paste.workspace = true
prost.workspace = true
serde_json.workspace = true
snafu.workspace = true
[build-dependencies]
tonic-build = "0.11"
[dev-dependencies]
paste = "1.0"

View File

@@ -15,10 +15,10 @@
use std::collections::HashMap;
use datatypes::schema::{
ColumnDefaultConstraint, ColumnSchema, FulltextAnalyzer, FulltextOptions, COMMENT_KEY,
FULLTEXT_KEY, INVERTED_INDEX_KEY, SKIPPING_INDEX_KEY,
ColumnDefaultConstraint, ColumnSchema, FulltextAnalyzer, FulltextOptions, SkippingIndexType,
COMMENT_KEY, FULLTEXT_KEY, INVERTED_INDEX_KEY, SKIPPING_INDEX_KEY,
};
use greptime_proto::v1::Analyzer;
use greptime_proto::v1::{Analyzer, SkippingIndexType as PbSkippingIndexType};
use snafu::ResultExt;
use crate::error::{self, Result};
@@ -121,6 +121,13 @@ pub fn as_fulltext_option(analyzer: Analyzer) -> FulltextAnalyzer {
}
}
/// Tries to construct a `SkippingIndexType` from the given skipping index type.
pub fn as_skipping_index_type(skipping_index_type: PbSkippingIndexType) -> SkippingIndexType {
match skipping_index_type {
PbSkippingIndexType::BloomFilter => SkippingIndexType::BloomFilter,
}
}
#[cfg(test)]
mod tests {

View File

@@ -15,7 +15,7 @@ api.workspace = true
arrow.workspace = true
arrow-schema.workspace = true
async-stream.workspace = true
async-trait = "0.1"
async-trait.workspace = true
bytes.workspace = true
common-catalog.workspace = true
common-error.workspace = true
@@ -31,7 +31,7 @@ common-version.workspace = true
dashmap.workspace = true
datafusion.workspace = true
datatypes.workspace = true
futures = "0.3"
futures.workspace = true
futures-util.workspace = true
humantime.workspace = true
itertools.workspace = true
@@ -39,7 +39,7 @@ lazy_static.workspace = true
meta-client.workspace = true
moka = { workspace = true, features = ["future", "sync"] }
partition.workspace = true
paste = "1.0"
paste.workspace = true
prometheus.workspace = true
rustc-hash.workspace = true
serde_json.workspace = true
@@ -49,7 +49,7 @@ sql.workspace = true
store-api.workspace = true
table.workspace = true
tokio.workspace = true
tokio-stream = "0.1"
tokio-stream.workspace = true
[dev-dependencies]
cache.workspace = true

View File

@@ -228,12 +228,6 @@ impl InformationSchemaKeyColumnUsageBuilder {
let keys = &table_info.meta.primary_key_indices;
let schema = table.schema();
// For compatibility, use primary key columns as inverted index columns.
let pk_as_inverted_index = !schema
.column_schemas()
.iter()
.any(|c| c.has_inverted_index_key());
for (idx, column) in schema.column_schemas().iter().enumerate() {
let mut constraints = vec![];
if column.is_time_index() {
@@ -251,10 +245,6 @@ impl InformationSchemaKeyColumnUsageBuilder {
// TODO(dimbtp): foreign key constraint not supported yet
if keys.contains(&idx) {
constraints.push(PRI_CONSTRAINT_NAME);
if pk_as_inverted_index {
constraints.push(INVERTED_INDEX_CONSTRAINT_NAME);
}
}
if column.is_inverted_indexed() {
constraints.push(INVERTED_INDEX_CONSTRAINT_NAME);

View File

@@ -24,7 +24,7 @@ use common_meta::key::{TableMetadataManager, TableMetadataManagerRef};
use common_meta::kv_backend::etcd::EtcdStore;
use common_meta::kv_backend::memory::MemoryKvBackend;
#[cfg(feature = "pg_kvbackend")]
use common_meta::kv_backend::postgres::PgStore;
use common_meta::kv_backend::rds::PgStore;
use common_meta::peer::Peer;
use common_meta::rpc::router::{Region, RegionRoute};
use common_telemetry::info;

View File

@@ -16,7 +16,6 @@
mod client;
pub mod client_manager;
#[cfg(feature = "testing")]
mod database;
pub mod error;
pub mod flow;
@@ -34,7 +33,6 @@ pub use common_recordbatch::{RecordBatches, SendableRecordBatchStream};
use snafu::OptionExt;
pub use self::client::Client;
#[cfg(feature = "testing")]
pub use self::database::Database;
pub use self::error::{Error, Result};
use crate::error::{IllegalDatabaseResponseSnafu, ServerSnafu};

View File

@@ -32,7 +32,7 @@ use common_meta::key::TableMetadataManager;
use common_telemetry::info;
use common_telemetry::logging::TracingOptions;
use common_version::{short_version, version};
use flow::{FlownodeBuilder, FlownodeInstance, FrontendInvoker};
use flow::{FlownodeBuilder, FlownodeInstance, FrontendClient, FrontendInvoker};
use meta_client::{MetaClientOptions, MetaClientType};
use servers::Mode;
use snafu::{OptionExt, ResultExt};
@@ -317,6 +317,8 @@ impl StartCommand {
Arc::new(executor),
);
let frontend_client = FrontendClient::from_meta_client(meta_client.clone());
let flow_metadata_manager = Arc::new(FlowMetadataManager::new(cached_meta_backend.clone()));
let flownode_builder = FlownodeBuilder::new(
opts,
@@ -324,6 +326,7 @@ impl StartCommand {
table_metadata_manager,
catalog_manager.clone(),
flow_metadata_manager,
Arc::new(frontend_client),
)
.with_heartbeat_task(heartbeat_task);

View File

@@ -42,7 +42,7 @@ pub struct Instance {
}
impl Instance {
fn new(instance: MetasrvInstance, guard: Vec<WorkerGuard>) -> Self {
pub fn new(instance: MetasrvInstance, guard: Vec<WorkerGuard>) -> Self {
Self {
instance,
_guard: guard,

View File

@@ -54,13 +54,17 @@ use datanode::config::{DatanodeOptions, ProcedureConfig, RegionEngineConfig, Sto
use datanode::datanode::{Datanode, DatanodeBuilder};
use datanode::region_server::RegionServer;
use file_engine::config::EngineConfig as FileEngineConfig;
use flow::{FlowConfig, FlowWorkerManager, FlownodeBuilder, FlownodeOptions, FrontendInvoker};
use flow::{
FlowConfig, FlowWorkerManager, FlownodeBuilder, FlownodeOptions, FrontendClient,
FrontendInvoker,
};
use frontend::frontend::FrontendOptions;
use frontend::instance::builder::FrontendBuilder;
use frontend::instance::{FrontendInstance, Instance as FeInstance, StandaloneDatanodeManager};
use frontend::server::Services;
use frontend::service_config::{
InfluxdbOptions, MysqlOptions, OpentsdbOptions, PostgresOptions, PromStoreOptions,
InfluxdbOptions, JaegerOptions, MysqlOptions, OpentsdbOptions, PostgresOptions,
PromStoreOptions,
};
use meta_srv::metasrv::{FLOW_ID_SEQ, TABLE_ID_SEQ};
use mito2::config::MitoConfig;
@@ -140,6 +144,7 @@ pub struct StandaloneOptions {
pub postgres: PostgresOptions,
pub opentsdb: OpentsdbOptions,
pub influxdb: InfluxdbOptions,
pub jaeger: JaegerOptions,
pub prom_store: PromStoreOptions,
pub wal: DatanodeWalConfig,
pub storage: StorageConfig,
@@ -169,6 +174,7 @@ impl Default for StandaloneOptions {
postgres: PostgresOptions::default(),
opentsdb: OpentsdbOptions::default(),
influxdb: InfluxdbOptions::default(),
jaeger: JaegerOptions::default(),
prom_store: PromStoreOptions::default(),
wal: DatanodeWalConfig::default(),
storage: StorageConfig::default(),
@@ -217,6 +223,7 @@ impl StandaloneOptions {
postgres: cloned_opts.postgres,
opentsdb: cloned_opts.opentsdb,
influxdb: cloned_opts.influxdb,
jaeger: cloned_opts.jaeger,
prom_store: cloned_opts.prom_store,
meta_client: None,
logging: cloned_opts.logging,
@@ -529,12 +536,16 @@ impl StartCommand {
flow: opts.flow.clone(),
..Default::default()
};
let fe_server_addr = fe_opts.grpc.bind_addr.clone();
let frontend_client = FrontendClient::from_static_grpc_addr(fe_server_addr);
let flow_builder = FlownodeBuilder::new(
flownode_options,
plugins.clone(),
table_metadata_manager.clone(),
catalog_manager.clone(),
flow_metadata_manager.clone(),
Arc::new(frontend_client),
);
let flownode = Arc::new(
flow_builder

View File

@@ -18,7 +18,7 @@ bytes.workspace = true
common-error.workspace = true
common-macro.workspace = true
futures.workspace = true
paste = "1.0"
paste.workspace = true
pin-project.workspace = true
rand.workspace = true
serde = { version = "1.0", features = ["derive"] }

View File

@@ -35,7 +35,7 @@ orc-rust = { version = "0.5", default-features = false, features = [
"async",
] }
parquet.workspace = true
paste = "1.0"
paste.workspace = true
rand.workspace = true
regex = "1.7"
serde.workspace = true

View File

@@ -12,9 +12,11 @@ default = ["geo"]
geo = ["geohash", "h3o", "s2", "wkt", "geo-types", "dep:geo"]
[dependencies]
ahash = "0.8"
api.workspace = true
arc-swap = "1.0"
async-trait.workspace = true
bincode = "1.3"
common-base.workspace = true
common-catalog.workspace = true
common-error.workspace = true
@@ -32,12 +34,13 @@ geo = { version = "0.29", optional = true }
geo-types = { version = "0.7", optional = true }
geohash = { version = "0.13", optional = true }
h3o = { version = "0.6", optional = true }
hyperloglogplus = "0.4"
jsonb.workspace = true
nalgebra.workspace = true
num = "0.4"
num-traits = "0.2"
once_cell.workspace = true
paste = "1.0"
paste.workspace = true
s2 = { version = "0.0.12", optional = true }
serde.workspace = true
serde_json.workspace = true
@@ -47,6 +50,7 @@ sql.workspace = true
statrs = "0.16"
store-api.workspace = true
table.workspace = true
uddsketch = { git = "https://github.com/GreptimeTeam/timescaledb-toolkit.git", rev = "84828fe8fb494a6a61412a3da96517fc80f7bb20" }
wkt = { version = "0.11", optional = true }
[dev-dependencies]

View File

@@ -0,0 +1,20 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
mod hll;
mod uddsketch_state;
pub(crate) use hll::HllStateType;
pub use hll::{HllState, HLL_MERGE_NAME, HLL_NAME};
pub use uddsketch_state::{UddSketchState, UDDSKETCH_STATE_NAME};

View File

@@ -0,0 +1,319 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::sync::Arc;
use common_query::prelude::*;
use common_telemetry::trace;
use datafusion::arrow::array::ArrayRef;
use datafusion::common::cast::{as_binary_array, as_string_array};
use datafusion::common::not_impl_err;
use datafusion::error::{DataFusionError, Result as DfResult};
use datafusion::logical_expr::function::AccumulatorArgs;
use datafusion::logical_expr::{Accumulator as DfAccumulator, AggregateUDF};
use datafusion::prelude::create_udaf;
use datatypes::arrow::datatypes::DataType;
use hyperloglogplus::{HyperLogLog, HyperLogLogPlus};
use crate::utils::FixedRandomState;
pub const HLL_NAME: &str = "hll";
pub const HLL_MERGE_NAME: &str = "hll_merge";
const DEFAULT_PRECISION: u8 = 14;
pub(crate) type HllStateType = HyperLogLogPlus<String, FixedRandomState>;
pub struct HllState {
hll: HllStateType,
}
impl std::fmt::Debug for HllState {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "HllState<Opaque>")
}
}
impl Default for HllState {
fn default() -> Self {
Self::new()
}
}
impl HllState {
pub fn new() -> Self {
Self {
// Safety: the DEFAULT_PRECISION is fixed and valid
hll: HllStateType::new(DEFAULT_PRECISION, FixedRandomState::new()).unwrap(),
}
}
/// Create a UDF for the `hll` function.
///
/// `hll` accepts a string column and aggregates the
/// values into a HyperLogLog state.
pub fn state_udf_impl() -> AggregateUDF {
create_udaf(
HLL_NAME,
vec![DataType::Utf8],
Arc::new(DataType::Binary),
Volatility::Immutable,
Arc::new(Self::create_accumulator),
Arc::new(vec![DataType::Binary]),
)
}
/// Create a UDF for the `hll_merge` function.
///
/// `hll_merge` accepts a binary column of states generated by `hll`
/// and merges them into a single state.
pub fn merge_udf_impl() -> AggregateUDF {
create_udaf(
HLL_MERGE_NAME,
vec![DataType::Binary],
Arc::new(DataType::Binary),
Volatility::Immutable,
Arc::new(Self::create_merge_accumulator),
Arc::new(vec![DataType::Binary]),
)
}
fn update(&mut self, value: &str) {
self.hll.insert(value);
}
fn merge(&mut self, raw: &[u8]) {
if let Ok(serialized) = bincode::deserialize::<HllStateType>(raw) {
if let Ok(()) = self.hll.merge(&serialized) {
return;
}
}
trace!("Warning: Failed to merge HyperLogLog from {:?}", raw);
}
fn create_accumulator(acc_args: AccumulatorArgs) -> DfResult<Box<dyn DfAccumulator>> {
let data_type = acc_args.exprs[0].data_type(acc_args.schema)?;
match data_type {
DataType::Utf8 => Ok(Box::new(HllState::new())),
other => not_impl_err!("{HLL_NAME} does not support data type: {other}"),
}
}
fn create_merge_accumulator(acc_args: AccumulatorArgs) -> DfResult<Box<dyn DfAccumulator>> {
let data_type = acc_args.exprs[0].data_type(acc_args.schema)?;
match data_type {
DataType::Binary => Ok(Box::new(HllState::new())),
other => not_impl_err!("{HLL_MERGE_NAME} does not support data type: {other}"),
}
}
}
impl DfAccumulator for HllState {
fn update_batch(&mut self, values: &[ArrayRef]) -> DfResult<()> {
let array = &values[0];
match array.data_type() {
DataType::Utf8 => {
let string_array = as_string_array(array)?;
for value in string_array.iter().flatten() {
self.update(value);
}
}
DataType::Binary => {
let binary_array = as_binary_array(array)?;
for v in binary_array.iter().flatten() {
self.merge(v);
}
}
_ => {
return not_impl_err!(
"HLL functions do not support data type: {}",
array.data_type()
)
}
}
Ok(())
}
fn evaluate(&mut self) -> DfResult<ScalarValue> {
Ok(ScalarValue::Binary(Some(
bincode::serialize(&self.hll).map_err(|e| {
DataFusionError::Internal(format!("Failed to serialize HyperLogLog: {}", e))
})?,
)))
}
fn size(&self) -> usize {
std::mem::size_of_val(&self.hll)
}
fn state(&mut self) -> DfResult<Vec<ScalarValue>> {
Ok(vec![ScalarValue::Binary(Some(
bincode::serialize(&self.hll).map_err(|e| {
DataFusionError::Internal(format!("Failed to serialize HyperLogLog: {}", e))
})?,
))])
}
fn merge_batch(&mut self, states: &[ArrayRef]) -> DfResult<()> {
let array = &states[0];
let binary_array = as_binary_array(array)?;
for v in binary_array.iter().flatten() {
self.merge(v);
}
Ok(())
}
}
#[cfg(test)]
mod tests {
use datafusion::arrow::array::{BinaryArray, StringArray};
use super::*;
#[test]
fn test_hll_basic() {
let mut state = HllState::new();
state.update("1");
state.update("2");
state.update("3");
let result = state.evaluate().unwrap();
if let ScalarValue::Binary(Some(bytes)) = result {
let mut hll: HllStateType = bincode::deserialize(&bytes).unwrap();
assert_eq!(hll.count().trunc() as u32, 3);
} else {
panic!("Expected binary scalar value");
}
}
#[test]
fn test_hll_roundtrip() {
let mut state = HllState::new();
state.update("1");
state.update("2");
// Serialize
let serialized = state.evaluate().unwrap();
// Create new state and merge the serialized data
let mut new_state = HllState::new();
if let ScalarValue::Binary(Some(bytes)) = &serialized {
new_state.merge(bytes);
// Verify the merged state matches original
let result = new_state.evaluate().unwrap();
if let ScalarValue::Binary(Some(new_bytes)) = result {
let mut original: HllStateType = bincode::deserialize(bytes).unwrap();
let mut merged: HllStateType = bincode::deserialize(&new_bytes).unwrap();
assert_eq!(original.count(), merged.count());
} else {
panic!("Expected binary scalar value");
}
} else {
panic!("Expected binary scalar value");
}
}
#[test]
fn test_hll_batch_update() {
let mut state = HllState::new();
// Test string values
let str_values = vec!["a", "b", "c", "d", "e", "f", "g", "h", "i"];
let str_array = Arc::new(StringArray::from(str_values)) as ArrayRef;
state.update_batch(&[str_array]).unwrap();
let result = state.evaluate().unwrap();
if let ScalarValue::Binary(Some(bytes)) = result {
let mut hll: HllStateType = bincode::deserialize(&bytes).unwrap();
assert_eq!(hll.count().trunc() as u32, 9);
} else {
panic!("Expected binary scalar value");
}
}
#[test]
fn test_hll_merge_batch() {
let mut state1 = HllState::new();
state1.update("1");
let state1_binary = state1.evaluate().unwrap();
let mut state2 = HllState::new();
state2.update("2");
let state2_binary = state2.evaluate().unwrap();
let mut merged_state = HllState::new();
if let (ScalarValue::Binary(Some(bytes1)), ScalarValue::Binary(Some(bytes2))) =
(&state1_binary, &state2_binary)
{
let binary_array = Arc::new(BinaryArray::from(vec![
bytes1.as_slice(),
bytes2.as_slice(),
])) as ArrayRef;
merged_state.merge_batch(&[binary_array]).unwrap();
let result = merged_state.evaluate().unwrap();
if let ScalarValue::Binary(Some(bytes)) = result {
let mut hll: HllStateType = bincode::deserialize(&bytes).unwrap();
assert_eq!(hll.count().trunc() as u32, 2);
} else {
panic!("Expected binary scalar value");
}
} else {
panic!("Expected binary scalar values");
}
}
#[test]
fn test_hll_merge_function() {
// Create two HLL states with different values
let mut state1 = HllState::new();
state1.update("1");
state1.update("2");
let state1_binary = state1.evaluate().unwrap();
let mut state2 = HllState::new();
state2.update("2");
state2.update("3");
let state2_binary = state2.evaluate().unwrap();
// Create a merge state and merge both states
let mut merge_state = HllState::new();
if let (ScalarValue::Binary(Some(bytes1)), ScalarValue::Binary(Some(bytes2))) =
(&state1_binary, &state2_binary)
{
let binary_array = Arc::new(BinaryArray::from(vec![
bytes1.as_slice(),
bytes2.as_slice(),
])) as ArrayRef;
merge_state.update_batch(&[binary_array]).unwrap();
let result = merge_state.evaluate().unwrap();
if let ScalarValue::Binary(Some(bytes)) = result {
let mut hll: HllStateType = bincode::deserialize(&bytes).unwrap();
// Should have 3 unique values: "1", "2", "3"
assert_eq!(hll.count().trunc() as u32, 3);
} else {
panic!("Expected binary scalar value");
}
} else {
panic!("Expected binary scalar values");
}
}
}
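
Note: the snippet below is an illustrative sketch only, not part of this change. It shows how the two UDAFs defined above could be registered with a plain DataFusion SessionContext and invoked from SQL; the table and column names (events, user_id, shard) are hypothetical, and GreptimeDB actually wires these functions up through its own function registry rather than this path.

use datafusion::error::Result;
use datafusion::prelude::SessionContext;

async fn hll_example(ctx: &SessionContext) -> Result<()> {
    // Register the aggregate UDFs built by the constructors above.
    ctx.register_udaf(HllState::state_udf_impl());
    ctx.register_udaf(HllState::merge_udf_impl());

    // `hll` folds a string column into a binary HyperLogLog state per group;
    // `hll_merge` combines those binary states into a single sketch.
    let df = ctx
        .sql(
            "SELECT hll_merge(state) AS merged \
             FROM (SELECT shard, hll(user_id) AS state FROM events GROUP BY shard)",
        )
        .await?;
    df.show().await?;
    Ok(())
}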

View File

@@ -0,0 +1,307 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::sync::Arc;
use common_query::prelude::*;
use common_telemetry::trace;
use datafusion::common::cast::{as_binary_array, as_primitive_array};
use datafusion::common::not_impl_err;
use datafusion::error::{DataFusionError, Result as DfResult};
use datafusion::logical_expr::function::AccumulatorArgs;
use datafusion::logical_expr::{Accumulator as DfAccumulator, AggregateUDF};
use datafusion::physical_plan::expressions::Literal;
use datafusion::prelude::create_udaf;
use datatypes::arrow::array::ArrayRef;
use datatypes::arrow::datatypes::{DataType, Float64Type};
use uddsketch::{SketchHashKey, UDDSketch};
pub const UDDSKETCH_STATE_NAME: &str = "uddsketch_state";
#[derive(Debug)]
pub struct UddSketchState {
uddsketch: UDDSketch,
}
impl UddSketchState {
pub fn new(bucket_size: u64, error_rate: f64) -> Self {
Self {
uddsketch: UDDSketch::new(bucket_size, error_rate),
}
}
pub fn udf_impl() -> AggregateUDF {
create_udaf(
UDDSKETCH_STATE_NAME,
vec![DataType::Int64, DataType::Float64, DataType::Float64],
Arc::new(DataType::Binary),
Volatility::Immutable,
Arc::new(|args| {
let (bucket_size, error_rate) = downcast_accumulator_args(args)?;
Ok(Box::new(UddSketchState::new(bucket_size, error_rate)))
}),
Arc::new(vec![DataType::Binary]),
)
}
fn update(&mut self, value: f64) {
self.uddsketch.add_value(value);
}
fn merge(&mut self, raw: &[u8]) {
if let Ok(uddsketch) = bincode::deserialize::<UDDSketch>(raw) {
if uddsketch.count() != 0 {
self.uddsketch.merge_sketch(&uddsketch);
}
} else {
trace!("Warning: Failed to deserialize UDDSketch from {:?}", raw);
}
}
}
fn downcast_accumulator_args(args: AccumulatorArgs) -> DfResult<(u64, f64)> {
let bucket_size = match args.exprs[0]
.as_any()
.downcast_ref::<Literal>()
.map(|lit| lit.value())
{
Some(ScalarValue::Int64(Some(value))) => *value as u64,
_ => {
return not_impl_err!(
"{} not supported for bucket size: {}",
UDDSKETCH_STATE_NAME,
&args.exprs[0]
)
}
};
let error_rate = match args.exprs[1]
.as_any()
.downcast_ref::<Literal>()
.map(|lit| lit.value())
{
Some(ScalarValue::Float64(Some(value))) => *value,
_ => {
return not_impl_err!(
"{} not supported for error rate: {}",
UDDSKETCH_STATE_NAME,
&args.exprs[1]
)
}
};
Ok((bucket_size, error_rate))
}
impl DfAccumulator for UddSketchState {
fn update_batch(&mut self, values: &[ArrayRef]) -> DfResult<()> {
let array = &values[2]; // the third column is data value
let f64_array = as_primitive_array::<Float64Type>(array)?;
for v in f64_array.iter().flatten() {
self.update(v);
}
Ok(())
}
fn evaluate(&mut self) -> DfResult<ScalarValue> {
Ok(ScalarValue::Binary(Some(
bincode::serialize(&self.uddsketch).map_err(|e| {
DataFusionError::Internal(format!("Failed to serialize UDDSketch: {}", e))
})?,
)))
}
fn size(&self) -> usize {
// Base size of UDDSketch struct fields
let mut total_size = std::mem::size_of::<f64>() * 3 + // alpha, gamma, values_sum
std::mem::size_of::<u32>() + // compactions
std::mem::size_of::<u64>() * 2; // max_buckets, num_values
// Size of buckets (SketchHashMap)
// Each bucket entry contains:
// - SketchHashKey (enum with i64/Zero/Invalid variants)
// - SketchHashEntry (count: u64, next: SketchHashKey)
let bucket_entry_size = std::mem::size_of::<SketchHashKey>() + // key
std::mem::size_of::<u64>() + // count
std::mem::size_of::<SketchHashKey>(); // next
total_size += self.uddsketch.current_buckets_count() * bucket_entry_size;
total_size
}
fn state(&mut self) -> DfResult<Vec<ScalarValue>> {
Ok(vec![ScalarValue::Binary(Some(
bincode::serialize(&self.uddsketch).map_err(|e| {
DataFusionError::Internal(format!("Failed to serialize UDDSketch: {}", e))
})?,
))])
}
fn merge_batch(&mut self, states: &[ArrayRef]) -> DfResult<()> {
let array = &states[0];
let binary_array = as_binary_array(array)?;
for v in binary_array.iter().flatten() {
self.merge(v);
}
Ok(())
}
}
#[cfg(test)]
mod tests {
use datafusion::arrow::array::{BinaryArray, Float64Array};
use super::*;
#[test]
fn test_uddsketch_state_basic() {
let mut state = UddSketchState::new(10, 0.01);
state.update(1.0);
state.update(2.0);
state.update(3.0);
let result = state.evaluate().unwrap();
if let ScalarValue::Binary(Some(bytes)) = result {
let deserialized: UDDSketch = bincode::deserialize(&bytes).unwrap();
assert_eq!(deserialized.count(), 3);
} else {
panic!("Expected binary scalar value");
}
}
#[test]
fn test_uddsketch_state_roundtrip() {
let mut state = UddSketchState::new(10, 0.01);
state.update(1.0);
state.update(2.0);
// Serialize
let serialized = state.evaluate().unwrap();
// Create new state and merge the serialized data
let mut new_state = UddSketchState::new(10, 0.01);
if let ScalarValue::Binary(Some(bytes)) = &serialized {
new_state.merge(bytes);
// Verify the merged state matches original by comparing deserialized values
let original_sketch: UDDSketch = bincode::deserialize(bytes).unwrap();
let new_result = new_state.evaluate().unwrap();
if let ScalarValue::Binary(Some(new_bytes)) = new_result {
let new_sketch: UDDSketch = bincode::deserialize(&new_bytes).unwrap();
assert_eq!(original_sketch.count(), new_sketch.count());
assert_eq!(original_sketch.sum(), new_sketch.sum());
assert_eq!(original_sketch.mean(), new_sketch.mean());
assert_eq!(original_sketch.max_error(), new_sketch.max_error());
// Compare a few quantiles to ensure statistical equivalence
for q in [0.1, 0.5, 0.9].iter() {
assert!(
(original_sketch.estimate_quantile(*q) - new_sketch.estimate_quantile(*q))
.abs()
< 1e-10,
"Quantile {} mismatch: original={}, new={}",
q,
original_sketch.estimate_quantile(*q),
new_sketch.estimate_quantile(*q)
);
}
} else {
panic!("Expected binary scalar value");
}
} else {
panic!("Expected binary scalar value");
}
}
#[test]
fn test_uddsketch_state_batch_update() {
let mut state = UddSketchState::new(10, 0.01);
let values = vec![1.0f64, 2.0, 3.0];
let array = Arc::new(Float64Array::from(values)) as ArrayRef;
state
.update_batch(&[array.clone(), array.clone(), array])
.unwrap();
let result = state.evaluate().unwrap();
if let ScalarValue::Binary(Some(bytes)) = result {
let deserialized: UDDSketch = bincode::deserialize(&bytes).unwrap();
assert_eq!(deserialized.count(), 3);
} else {
panic!("Expected binary scalar value");
}
}
#[test]
fn test_uddsketch_state_merge_batch() {
let mut state1 = UddSketchState::new(10, 0.01);
state1.update(1.0);
let state1_binary = state1.evaluate().unwrap();
let mut state2 = UddSketchState::new(10, 0.01);
state2.update(2.0);
let state2_binary = state2.evaluate().unwrap();
let mut merged_state = UddSketchState::new(10, 0.01);
if let (ScalarValue::Binary(Some(bytes1)), ScalarValue::Binary(Some(bytes2))) =
(&state1_binary, &state2_binary)
{
let binary_array = Arc::new(BinaryArray::from(vec![
bytes1.as_slice(),
bytes2.as_slice(),
])) as ArrayRef;
merged_state.merge_batch(&[binary_array]).unwrap();
let result = merged_state.evaluate().unwrap();
if let ScalarValue::Binary(Some(bytes)) = result {
let deserialized: UDDSketch = bincode::deserialize(&bytes).unwrap();
assert_eq!(deserialized.count(), 2);
} else {
panic!("Expected binary scalar value");
}
} else {
panic!("Expected binary scalar values");
}
}
#[test]
fn test_uddsketch_state_size() {
let mut state = UddSketchState::new(10, 0.01);
let initial_size = state.size();
// Add some values to create buckets
state.update(1.0);
state.update(2.0);
state.update(3.0);
let size_with_values = state.size();
assert!(
size_with_values > initial_size,
"Size should increase after adding values: initial={}, with_values={}",
initial_size,
size_with_values
);
// Verify size increases with more buckets
state.update(10.0); // This should create a new bucket
assert!(
state.size() > size_with_values,
"Size should increase after adding new bucket: prev={}, new={}",
size_with_values,
state.size()
);
}
}

View File

@@ -22,10 +22,12 @@ use crate::function::{AsyncFunctionRef, FunctionRef};
use crate::scalars::aggregate::{AggregateFunctionMetaRef, AggregateFunctions};
use crate::scalars::date::DateFunction;
use crate::scalars::expression::ExpressionFunction;
use crate::scalars::hll_count::HllCalcFunction;
use crate::scalars::json::JsonFunction;
use crate::scalars::matches::MatchesFunction;
use crate::scalars::math::MathFunction;
use crate::scalars::timestamp::TimestampFunction;
use crate::scalars::uddsketch_calc::UddSketchCalcFunction;
use crate::scalars::vector::VectorFunction;
use crate::system::SystemFunction;
use crate::table::TableFunction;
@@ -105,6 +107,8 @@ pub static FUNCTION_REGISTRY: Lazy<Arc<FunctionRegistry>> = Lazy::new(|| {
TimestampFunction::register(&function_registry);
DateFunction::register(&function_registry);
ExpressionFunction::register(&function_registry);
UddSketchCalcFunction::register(&function_registry);
HllCalcFunction::register(&function_registry);
// Aggregate functions
AggregateFunctions::register(&function_registry);

View File

@@ -21,6 +21,7 @@ pub mod scalars;
mod system;
mod table;
pub mod aggr;
pub mod function;
pub mod function_registry;
pub mod handlers;

View File

@@ -22,7 +22,9 @@ pub mod matches;
pub mod math;
pub mod vector;
pub(crate) mod hll_count;
#[cfg(test)]
pub(crate) mod test;
pub(crate) mod timestamp;
pub(crate) mod uddsketch_calc;
pub mod udf;

View File

@@ -0,0 +1,175 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Implementation of the scalar function `hll_count`.
use std::fmt;
use std::fmt::Display;
use std::sync::Arc;
use common_query::error::{DowncastVectorSnafu, InvalidFuncArgsSnafu, Result};
use common_query::prelude::{Signature, Volatility};
use datatypes::data_type::ConcreteDataType;
use datatypes::prelude::Vector;
use datatypes::scalars::{ScalarVector, ScalarVectorBuilder};
use datatypes::vectors::{BinaryVector, MutableVector, UInt64VectorBuilder, VectorRef};
use hyperloglogplus::HyperLogLog;
use snafu::OptionExt;
use crate::aggr::HllStateType;
use crate::function::{Function, FunctionContext};
use crate::function_registry::FunctionRegistry;
const NAME: &str = "hll_count";
/// HllCalcFunction implements the scalar function `hll_count`.
///
/// It accepts one argument:
/// 1. The serialized HyperLogLogPlus state, as produced by the aggregator (binary).
///
/// For each row, it deserializes the sketch and returns the estimated cardinality.
#[derive(Debug, Default)]
pub struct HllCalcFunction;
impl HllCalcFunction {
pub fn register(registry: &FunctionRegistry) {
registry.register(Arc::new(HllCalcFunction));
}
}
impl Display for HllCalcFunction {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "{}", NAME.to_ascii_uppercase())
}
}
impl Function for HllCalcFunction {
fn name(&self) -> &str {
NAME
}
fn return_type(&self, _input_types: &[ConcreteDataType]) -> Result<ConcreteDataType> {
Ok(ConcreteDataType::uint64_datatype())
}
fn signature(&self) -> Signature {
// Only argument: HyperLogLogPlus state (binary)
Signature::exact(
vec![ConcreteDataType::binary_datatype()],
Volatility::Immutable,
)
}
fn eval(&self, _func_ctx: FunctionContext, columns: &[VectorRef]) -> Result<VectorRef> {
if columns.len() != 1 {
return InvalidFuncArgsSnafu {
err_msg: format!("hll_count expects 1 argument, got {}", columns.len()),
}
.fail();
}
let hll_vec = columns[0]
.as_any()
.downcast_ref::<BinaryVector>()
.with_context(|| DowncastVectorSnafu {
err_msg: format!("expect BinaryVector, got {}", columns[0].vector_type_name()),
})?;
let len = hll_vec.len();
let mut builder = UInt64VectorBuilder::with_capacity(len);
for i in 0..len {
let hll_opt = hll_vec.get_data(i);
if hll_opt.is_none() {
builder.push_null();
continue;
}
let hll_bytes = hll_opt.unwrap();
// Deserialize the HyperLogLogPlus from its bincode representation
let mut hll: HllStateType = match bincode::deserialize(hll_bytes) {
Ok(h) => h,
Err(e) => {
common_telemetry::trace!("Failed to deserialize HyperLogLogPlus: {}", e);
builder.push_null();
continue;
}
};
builder.push(Some(hll.count().round() as u64));
}
Ok(builder.to_vector())
}
}
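// A minimal end-to-end sketch of how `hll_count` is meant to be used: it consumes the
// binary state produced by the companion HyperLogLog aggregator. The aggregate's SQL
// name (`hll` below) and the `events`/`user_id` schema are assumptions for
// illustration only; this file defines just the scalar `hll_count`.
//
//   SELECT hll_count(hll(user_id)) FROM events;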
#[cfg(test)]
mod tests {
use datatypes::vectors::BinaryVector;
use super::*;
use crate::utils::FixedRandomState;
#[test]
fn test_hll_count_function() {
let function = HllCalcFunction;
assert_eq!("hll_count", function.name());
assert_eq!(
ConcreteDataType::uint64_datatype(),
function
.return_type(&[ConcreteDataType::uint64_datatype()])
.unwrap()
);
// Create a test HLL
let mut hll = HllStateType::new(14, FixedRandomState::new()).unwrap();
for i in 1..=10 {
hll.insert(&i.to_string());
}
let serialized_bytes = bincode::serialize(&hll).unwrap();
let args: Vec<VectorRef> = vec![Arc::new(BinaryVector::from(vec![Some(serialized_bytes)]))];
let result = function.eval(FunctionContext::default(), &args).unwrap();
assert_eq!(result.len(), 1);
// Test cardinality estimate
if let datatypes::value::Value::UInt64(v) = result.get(0) {
assert_eq!(v, 10);
} else {
panic!("Expected uint64 value");
}
}
#[test]
fn test_hll_count_function_errors() {
let function = HllCalcFunction;
// Test with invalid number of arguments
let args: Vec<VectorRef> = vec![];
let result = function.eval(FunctionContext::default(), &args);
assert!(result.is_err());
assert!(result
.unwrap_err()
.to_string()
.contains("hll_count expects 1 argument"));
// Test with invalid binary data
let args: Vec<VectorRef> = vec![Arc::new(BinaryVector::from(vec![Some(vec![1, 2, 3])]))]; // Invalid binary data
let result = function.eval(FunctionContext::default(), &args).unwrap();
assert_eq!(result.len(), 1);
assert!(matches!(result.get(0), datatypes::value::Value::Null));
}
}

View File

@@ -13,7 +13,7 @@
// limitations under the License.
use std::sync::Arc;
mod json_get;
pub mod json_get;
mod json_is;
mod json_path_exists;
mod json_path_match;

View File

@@ -0,0 +1,211 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Implementation of the scalar function `uddsketch_calc`.
use std::fmt;
use std::fmt::Display;
use std::sync::Arc;
use common_query::error::{DowncastVectorSnafu, InvalidFuncArgsSnafu, Result};
use common_query::prelude::{Signature, Volatility};
use datatypes::data_type::ConcreteDataType;
use datatypes::prelude::Vector;
use datatypes::scalars::{ScalarVector, ScalarVectorBuilder};
use datatypes::vectors::{BinaryVector, Float64VectorBuilder, MutableVector, VectorRef};
use snafu::OptionExt;
use uddsketch::UDDSketch;
use crate::function::{Function, FunctionContext};
use crate::function_registry::FunctionRegistry;
const NAME: &str = "uddsketch_calc";
/// UddSketchCalcFunction implements the scalar function `uddsketch_calc`.
///
/// It accepts two arguments:
/// 1. A percentile (as f64) for which to compute the estimated quantile (e.g. 0.95 for p95).
/// 2. The serialized UDDSketch state, as produced by the aggregator (binary).
///
/// For each row, it deserializes the sketch and returns the computed quantile value.
#[derive(Debug, Default)]
pub struct UddSketchCalcFunction;
impl UddSketchCalcFunction {
pub fn register(registry: &FunctionRegistry) {
registry.register(Arc::new(UddSketchCalcFunction));
}
}
impl Display for UddSketchCalcFunction {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "{}", NAME.to_ascii_uppercase())
}
}
impl Function for UddSketchCalcFunction {
fn name(&self) -> &str {
NAME
}
fn return_type(&self, _input_types: &[ConcreteDataType]) -> Result<ConcreteDataType> {
Ok(ConcreteDataType::float64_datatype())
}
fn signature(&self) -> Signature {
// First argument: percentile (float64)
// Second argument: UDDSketch state (binary)
Signature::exact(
vec![
ConcreteDataType::float64_datatype(),
ConcreteDataType::binary_datatype(),
],
Volatility::Immutable,
)
}
fn eval(&self, _func_ctx: FunctionContext, columns: &[VectorRef]) -> Result<VectorRef> {
if columns.len() != 2 {
return InvalidFuncArgsSnafu {
err_msg: format!("uddsketch_calc expects 2 arguments, got {}", columns.len()),
}
.fail();
}
let perc_vec = &columns[0];
let sketch_vec = columns[1]
.as_any()
.downcast_ref::<BinaryVector>()
.with_context(|| DowncastVectorSnafu {
err_msg: format!("expect BinaryVector, got {}", columns[1].vector_type_name()),
})?;
let len = sketch_vec.len();
let mut builder = Float64VectorBuilder::with_capacity(len);
for i in 0..len {
let perc_opt = perc_vec.get(i).as_f64_lossy();
let sketch_opt = sketch_vec.get_data(i);
if sketch_opt.is_none() || perc_opt.is_none() {
builder.push_null();
continue;
}
let sketch_bytes = sketch_opt.unwrap();
let perc = perc_opt.unwrap();
// Deserialize the UDDSketch from its bincode representation
let sketch: UDDSketch = match bincode::deserialize(sketch_bytes) {
Ok(s) => s,
Err(e) => {
common_telemetry::trace!("Failed to deserialize UDDSketch: {}", e);
builder.push_null();
continue;
}
};
// Compute the estimated quantile from the sketch
let result = sketch.estimate_quantile(perc);
builder.push(Some(result));
}
Ok(builder.to_vector())
}
}
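// A minimal usage sketch showing the intended pairing with the `uddsketch_state`
// aggregator from the `aggr` module: the aggregate produces the serialized sketch and
// `uddsketch_calc` extracts a quantile from it. The `metrics` table and `value`
// column are hypothetical.
//
//   SELECT uddsketch_calc(0.95, uddsketch_state(128, 0.01, value)) FROM metrics;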
#[cfg(test)]
mod tests {
use std::sync::Arc;
use datatypes::vectors::{BinaryVector, Float64Vector};
use super::*;
#[test]
fn test_uddsketch_calc_function() {
let function = UddSketchCalcFunction;
assert_eq!("uddsketch_calc", function.name());
assert_eq!(
ConcreteDataType::float64_datatype(),
function
.return_type(&[ConcreteDataType::float64_datatype()])
.unwrap()
);
// Create a test sketch
let mut sketch = UDDSketch::new(128, 0.01);
sketch.add_value(10.0);
sketch.add_value(20.0);
sketch.add_value(30.0);
sketch.add_value(40.0);
sketch.add_value(50.0);
sketch.add_value(60.0);
sketch.add_value(70.0);
sketch.add_value(80.0);
sketch.add_value(90.0);
sketch.add_value(100.0);
// Get expected values directly from the sketch
let expected_p50 = sketch.estimate_quantile(0.5);
let expected_p90 = sketch.estimate_quantile(0.9);
let expected_p95 = sketch.estimate_quantile(0.95);
let serialized = bincode::serialize(&sketch).unwrap();
let percentiles = vec![0.5, 0.9, 0.95];
let args: Vec<VectorRef> = vec![
Arc::new(Float64Vector::from_vec(percentiles.clone())),
Arc::new(BinaryVector::from(vec![Some(serialized.clone()); 3])),
];
let result = function.eval(FunctionContext::default(), &args).unwrap();
assert_eq!(result.len(), 3);
// Test median (p50)
assert!(
matches!(result.get(0), datatypes::value::Value::Float64(v) if (v - expected_p50).abs() < 1e-10)
);
// Test p90
assert!(
matches!(result.get(1), datatypes::value::Value::Float64(v) if (v - expected_p90).abs() < 1e-10)
);
// Test p95
assert!(
matches!(result.get(2), datatypes::value::Value::Float64(v) if (v - expected_p95).abs() < 1e-10)
);
}
#[test]
fn test_uddsketch_calc_function_errors() {
let function = UddSketchCalcFunction;
// Test with invalid number of arguments
let args: Vec<VectorRef> = vec![Arc::new(Float64Vector::from_vec(vec![0.95]))];
let result = function.eval(FunctionContext::default(), &args);
assert!(result.is_err());
assert!(result
.unwrap_err()
.to_string()
.contains("uddsketch_calc expects 2 arguments"));
// Test with invalid binary data
let args: Vec<VectorRef> = vec![
Arc::new(Float64Vector::from_vec(vec![0.95])),
Arc::new(BinaryVector::from(vec![Some(vec![1, 2, 3])])), // Invalid binary data
];
let result = function.eval(FunctionContext::default(), &args).unwrap();
assert_eq!(result.len(), 1);
assert!(matches!(result.get(0), datatypes::value::Value::Null));
}
}

View File

@@ -12,6 +12,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::hash::BuildHasher;
use ahash::RandomState;
use serde::{Deserialize, Serialize};
/// Escapes special characters in the provided pattern string for `LIKE`.
///
/// Specifically, it prefixes the backslash (`\`), percent (`%`), and underscore (`_`)
@@ -32,6 +37,71 @@ pub fn escape_like_pattern(pattern: &str) -> String {
})
.collect::<String>()
}
/// A random state with fixed seeds.
///
/// This is used to ensure that hash values are consistent across
/// different processes, and that the state is easy to serialize and deserialize.
#[derive(Debug)]
pub struct FixedRandomState {
state: RandomState,
}
impl FixedRandomState {
// Fixed, arbitrarily chosen seeds so that hash values stay stable across processes.
const RANDOM_SEED_0: u64 = 0x517cc1b727220a95;
const RANDOM_SEED_1: u64 = 0x428a2f98d728ae22;
const RANDOM_SEED_2: u64 = 0x7137449123ef65cd;
const RANDOM_SEED_3: u64 = 0xb5c0fbcfec4d3b2f;
pub fn new() -> Self {
Self {
state: ahash::RandomState::with_seeds(
Self::RANDOM_SEED_0,
Self::RANDOM_SEED_1,
Self::RANDOM_SEED_2,
Self::RANDOM_SEED_3,
),
}
}
}
impl Default for FixedRandomState {
fn default() -> Self {
Self::new()
}
}
impl BuildHasher for FixedRandomState {
type Hasher = ahash::AHasher;
fn build_hasher(&self) -> Self::Hasher {
self.state.build_hasher()
}
fn hash_one<T: std::hash::Hash>(&self, x: T) -> u64 {
self.state.hash_one(x)
}
}
impl Serialize for FixedRandomState {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
serializer.serialize_unit()
}
}
impl<'de> Deserialize<'de> for FixedRandomState {
fn deserialize<D>(_deserializer: D) -> Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
{
Ok(Self::new())
}
}
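// A small sketch of the property the fixed seeds provide (the helper name is ours):
// two independently constructed states hash identical inputs to identical values,
// so hash-based structures built in different processes stay compatible.
#[allow(dead_code)]
fn fixed_random_state_is_stable() {
    let a = FixedRandomState::new();
    let b = FixedRandomState::new();
    assert_eq!(a.hash_one("greptime"), b.hash_one("greptime"));
}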
#[cfg(test)]
mod tests {
use super::*;

View File

@@ -22,4 +22,4 @@ store-api.workspace = true
table.workspace = true
[dev-dependencies]
paste = "1.0"
paste.workspace = true

View File

@@ -15,13 +15,14 @@
use api::helper::ColumnDataTypeWrapper;
use api::v1::add_column_location::LocationType;
use api::v1::alter_table_expr::Kind;
use api::v1::column_def::as_fulltext_option;
use api::v1::column_def::{as_fulltext_option, as_skipping_index_type};
use api::v1::{
column_def, AddColumnLocation as Location, AlterTableExpr, Analyzer, CreateTableExpr,
DropColumns, ModifyColumnTypes, RenameTable, SemanticType,
SkippingIndexType as PbSkippingIndexType,
};
use common_query::AddColumnLocation;
use datatypes::schema::{ColumnSchema, FulltextOptions, RawSchema};
use datatypes::schema::{ColumnSchema, FulltextOptions, RawSchema, SkippingIndexOptions};
use snafu::{ensure, OptionExt, ResultExt};
use store_api::region_request::{SetRegionOption, UnsetRegionOption};
use table::metadata::TableId;
@@ -31,7 +32,8 @@ use table::requests::{
};
use crate::error::{
InvalidColumnDefSnafu, InvalidSetFulltextOptionRequestSnafu, InvalidSetTableOptionRequestSnafu,
InvalidColumnDefSnafu, InvalidSetFulltextOptionRequestSnafu,
InvalidSetSkippingIndexOptionRequestSnafu, InvalidSetTableOptionRequestSnafu,
InvalidUnsetTableOptionRequestSnafu, MissingAlterIndexOptionSnafu, MissingFieldSnafu,
MissingTimestampColumnSnafu, Result, UnknownLocationTypeSnafu,
};
@@ -137,6 +139,18 @@ pub fn alter_expr_to_request(table_id: TableId, expr: AlterTableExpr) -> Result<
column_name: i.column_name,
},
},
api::v1::set_index::Options::Skipping(s) => AlterKind::SetIndex {
options: SetIndexOptions::Skipping {
column_name: s.column_name,
options: SkippingIndexOptions {
granularity: s.granularity as u32,
index_type: as_skipping_index_type(
PbSkippingIndexType::try_from(s.skipping_index_type)
.context(InvalidSetSkippingIndexOptionRequestSnafu)?,
),
},
},
},
},
None => return MissingAlterIndexOptionSnafu.fail(),
},
@@ -152,6 +166,11 @@ pub fn alter_expr_to_request(table_id: TableId, expr: AlterTableExpr) -> Result<
column_name: i.column_name,
},
},
api::v1::unset_index::Options::Skipping(s) => AlterKind::UnsetIndex {
options: UnsetIndexOptions::Skipping {
column_name: s.column_name,
},
},
},
None => return MissingAlterIndexOptionSnafu.fail(),
},

View File

@@ -140,6 +140,14 @@ pub enum Error {
error: prost::UnknownEnumValue,
},
#[snafu(display("Invalid set skipping index option request"))]
InvalidSetSkippingIndexOptionRequest {
#[snafu(implicit)]
location: Location,
#[snafu(source)]
error: prost::UnknownEnumValue,
},
#[snafu(display("Missing alter index options"))]
MissingAlterIndexOption {
#[snafu(implicit)]
@@ -171,6 +179,7 @@ impl ErrorExt for Error {
Error::InvalidSetTableOptionRequest { .. }
| Error::InvalidUnsetTableOptionRequest { .. }
| Error::InvalidSetFulltextOptionRequest { .. }
| Error::InvalidSetSkippingIndexOptionRequest { .. }
| Error::MissingAlterIndexOption { .. } => StatusCode::InvalidArguments,
}
}

View File

@@ -14,37 +14,12 @@
use api::helper;
use api::v1::column::Values;
use api::v1::{Column, CreateTableExpr};
use common_base::BitVec;
use datatypes::data_type::{ConcreteDataType, DataType};
use datatypes::prelude::VectorRef;
use snafu::{ensure, ResultExt};
use table::metadata::TableId;
use table::table_reference::TableReference;
use crate::error::{CreateVectorSnafu, Result, UnexpectedValuesLengthSnafu};
use crate::util;
use crate::util::ColumnExpr;
/// Try to build create table request from insert data.
pub fn build_create_expr_from_insertion(
catalog_name: &str,
schema_name: &str,
table_id: Option<TableId>,
table_name: &str,
columns: &[Column],
engine: &str,
) -> Result<CreateTableExpr> {
let table_name = TableReference::full(catalog_name, schema_name, table_name);
let column_exprs = ColumnExpr::from_columns(columns);
util::build_create_table_expr(
table_id,
&table_name,
column_exprs,
engine,
"Created on insertion",
)
}
pub(crate) fn add_values_to_builder(
data_type: ConcreteDataType,
@@ -87,276 +62,7 @@ fn is_null(null_mask: &BitVec, idx: usize) -> Option<bool> {
#[cfg(test)]
mod tests {
use std::sync::Arc;
use std::{assert_eq, vec};
use api::helper::ColumnDataTypeWrapper;
use api::v1::column::Values;
use api::v1::column_data_type_extension::TypeExt;
use api::v1::{
Column, ColumnDataType, ColumnDataTypeExtension, Decimal128, DecimalTypeExtension,
IntervalMonthDayNano, SemanticType,
};
use common_base::BitVec;
use common_catalog::consts::MITO_ENGINE;
use common_time::interval::IntervalUnit;
use common_time::timestamp::TimeUnit;
use datatypes::data_type::ConcreteDataType;
use datatypes::schema::{ColumnSchema, SchemaBuilder};
use snafu::ResultExt;
use super::*;
use crate::error;
use crate::error::ColumnDataTypeSnafu;
#[inline]
fn build_column_schema(
column_name: &str,
datatype: i32,
nullable: bool,
) -> error::Result<ColumnSchema> {
let datatype_wrapper =
ColumnDataTypeWrapper::try_new(datatype, None).context(ColumnDataTypeSnafu)?;
Ok(ColumnSchema::new(
column_name,
datatype_wrapper.into(),
nullable,
))
}
#[test]
fn test_build_create_table_request() {
let table_id = Some(10);
let table_name = "test_metric";
assert!(
build_create_expr_from_insertion("", "", table_id, table_name, &[], MITO_ENGINE)
.is_err()
);
let insert_batch = mock_insert_batch();
let create_expr = build_create_expr_from_insertion(
"",
"",
table_id,
table_name,
&insert_batch.0,
MITO_ENGINE,
)
.unwrap();
assert_eq!(table_id, create_expr.table_id.map(|x| x.id));
assert_eq!(table_name, create_expr.table_name);
assert_eq!("Created on insertion".to_string(), create_expr.desc);
assert_eq!(
vec![create_expr.column_defs[0].name.clone()],
create_expr.primary_keys
);
let column_defs = create_expr.column_defs;
assert_eq!(column_defs[5].name, create_expr.time_index);
assert_eq!(7, column_defs.len());
assert_eq!(
ConcreteDataType::string_datatype(),
ConcreteDataType::from(
ColumnDataTypeWrapper::try_new(
column_defs
.iter()
.find(|c| c.name == "host")
.unwrap()
.data_type,
None
)
.unwrap()
)
);
assert_eq!(
ConcreteDataType::float64_datatype(),
ConcreteDataType::from(
ColumnDataTypeWrapper::try_new(
column_defs
.iter()
.find(|c| c.name == "cpu")
.unwrap()
.data_type,
None
)
.unwrap()
)
);
assert_eq!(
ConcreteDataType::float64_datatype(),
ConcreteDataType::from(
ColumnDataTypeWrapper::try_new(
column_defs
.iter()
.find(|c| c.name == "memory")
.unwrap()
.data_type,
None
)
.unwrap()
)
);
assert_eq!(
ConcreteDataType::time_datatype(TimeUnit::Millisecond),
ConcreteDataType::from(
ColumnDataTypeWrapper::try_new(
column_defs
.iter()
.find(|c| c.name == "time")
.unwrap()
.data_type,
None
)
.unwrap()
)
);
assert_eq!(
ConcreteDataType::interval_datatype(IntervalUnit::MonthDayNano),
ConcreteDataType::from(
ColumnDataTypeWrapper::try_new(
column_defs
.iter()
.find(|c| c.name == "interval")
.unwrap()
.data_type,
None
)
.unwrap()
)
);
assert_eq!(
ConcreteDataType::timestamp_millisecond_datatype(),
ConcreteDataType::from(
ColumnDataTypeWrapper::try_new(
column_defs
.iter()
.find(|c| c.name == "ts")
.unwrap()
.data_type,
None
)
.unwrap()
)
);
let decimal_column = column_defs.iter().find(|c| c.name == "decimals").unwrap();
assert_eq!(
ConcreteDataType::decimal128_datatype(38, 10),
ConcreteDataType::from(
ColumnDataTypeWrapper::try_new(
decimal_column.data_type,
decimal_column.datatype_extension,
)
.unwrap()
)
);
}
#[test]
fn test_find_new_columns() {
let mut columns = Vec::with_capacity(1);
let cpu_column = build_column_schema("cpu", 10, true).unwrap();
let ts_column = build_column_schema("ts", 15, false)
.unwrap()
.with_time_index(true);
columns.push(cpu_column);
columns.push(ts_column);
let schema = Arc::new(SchemaBuilder::try_from(columns).unwrap().build().unwrap());
assert!(
util::extract_new_columns(&schema, ColumnExpr::from_columns(&[]))
.unwrap()
.is_none()
);
let insert_batch = mock_insert_batch();
let add_columns =
util::extract_new_columns(&schema, ColumnExpr::from_columns(&insert_batch.0))
.unwrap()
.unwrap();
assert_eq!(5, add_columns.add_columns.len());
let host_column = &add_columns.add_columns[0];
assert_eq!(
ConcreteDataType::string_datatype(),
ConcreteDataType::from(
ColumnDataTypeWrapper::try_new(
host_column.column_def.as_ref().unwrap().data_type,
None
)
.unwrap()
)
);
assert!(host_column.add_if_not_exists);
let memory_column = &add_columns.add_columns[1];
assert_eq!(
ConcreteDataType::float64_datatype(),
ConcreteDataType::from(
ColumnDataTypeWrapper::try_new(
memory_column.column_def.as_ref().unwrap().data_type,
None
)
.unwrap()
)
);
assert!(host_column.add_if_not_exists);
let time_column = &add_columns.add_columns[2];
assert_eq!(
ConcreteDataType::time_datatype(TimeUnit::Millisecond),
ConcreteDataType::from(
ColumnDataTypeWrapper::try_new(
time_column.column_def.as_ref().unwrap().data_type,
None
)
.unwrap()
)
);
assert!(host_column.add_if_not_exists);
let interval_column = &add_columns.add_columns[3];
assert_eq!(
ConcreteDataType::interval_datatype(IntervalUnit::MonthDayNano),
ConcreteDataType::from(
ColumnDataTypeWrapper::try_new(
interval_column.column_def.as_ref().unwrap().data_type,
None
)
.unwrap()
)
);
assert!(host_column.add_if_not_exists);
let decimal_column = &add_columns.add_columns[4];
assert_eq!(
ConcreteDataType::decimal128_datatype(38, 10),
ConcreteDataType::from(
ColumnDataTypeWrapper::try_new(
decimal_column.column_def.as_ref().unwrap().data_type,
decimal_column
.column_def
.as_ref()
.unwrap()
.datatype_extension
)
.unwrap()
)
);
assert!(host_column.add_if_not_exists);
}
#[test]
fn test_is_null() {
@@ -371,127 +77,4 @@ mod tests {
assert_eq!(None, is_null(&null_mask, 16));
assert_eq!(None, is_null(&null_mask, 99));
}
fn mock_insert_batch() -> (Vec<Column>, u32) {
let row_count = 2;
let host_vals = Values {
string_values: vec!["host1".to_string(), "host2".to_string()],
..Default::default()
};
let host_column = Column {
column_name: "host".to_string(),
semantic_type: SemanticType::Tag as i32,
values: Some(host_vals),
null_mask: vec![0],
datatype: ColumnDataType::String as i32,
..Default::default()
};
let cpu_vals = Values {
f64_values: vec![0.31],
..Default::default()
};
let cpu_column = Column {
column_name: "cpu".to_string(),
semantic_type: SemanticType::Field as i32,
values: Some(cpu_vals),
null_mask: vec![2],
datatype: ColumnDataType::Float64 as i32,
..Default::default()
};
let mem_vals = Values {
f64_values: vec![0.1],
..Default::default()
};
let mem_column = Column {
column_name: "memory".to_string(),
semantic_type: SemanticType::Field as i32,
values: Some(mem_vals),
null_mask: vec![1],
datatype: ColumnDataType::Float64 as i32,
..Default::default()
};
let time_vals = Values {
time_millisecond_values: vec![100, 101],
..Default::default()
};
let time_column = Column {
column_name: "time".to_string(),
semantic_type: SemanticType::Field as i32,
values: Some(time_vals),
null_mask: vec![0],
datatype: ColumnDataType::TimeMillisecond as i32,
..Default::default()
};
let interval1 = IntervalMonthDayNano {
months: 1,
days: 2,
nanoseconds: 3,
};
let interval2 = IntervalMonthDayNano {
months: 4,
days: 5,
nanoseconds: 6,
};
let interval_vals = Values {
interval_month_day_nano_values: vec![interval1, interval2],
..Default::default()
};
let interval_column = Column {
column_name: "interval".to_string(),
semantic_type: SemanticType::Field as i32,
values: Some(interval_vals),
null_mask: vec![0],
datatype: ColumnDataType::IntervalMonthDayNano as i32,
..Default::default()
};
let ts_vals = Values {
timestamp_millisecond_values: vec![100, 101],
..Default::default()
};
let ts_column = Column {
column_name: "ts".to_string(),
semantic_type: SemanticType::Timestamp as i32,
values: Some(ts_vals),
null_mask: vec![0],
datatype: ColumnDataType::TimestampMillisecond as i32,
..Default::default()
};
let decimal_vals = Values {
decimal128_values: vec![Decimal128 { hi: 0, lo: 123 }, Decimal128 { hi: 0, lo: 456 }],
..Default::default()
};
let decimal_column = Column {
column_name: "decimals".to_string(),
semantic_type: SemanticType::Field as i32,
values: Some(decimal_vals),
null_mask: vec![0],
datatype: ColumnDataType::Decimal128 as i32,
datatype_extension: Some(ColumnDataTypeExtension {
type_ext: Some(TypeExt::DecimalType(DecimalTypeExtension {
precision: 38,
scale: 10,
})),
}),
options: None,
};
(
vec![
host_column,
cpu_column,
mem_column,
time_column,
interval_column,
ts_column,
decimal_column,
],
row_count,
)
}
}

View File

@@ -19,4 +19,3 @@ pub mod insert;
pub mod util;
pub use alter::{alter_expr_to_request, create_table_schema};
pub use insert::build_create_expr_from_insertion;

View File

@@ -236,3 +236,414 @@ pub fn extract_new_columns(
}))
}
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use std::{assert_eq, vec};
use api::helper::ColumnDataTypeWrapper;
use api::v1::column::Values;
use api::v1::column_data_type_extension::TypeExt;
use api::v1::{
Column, ColumnDataType, ColumnDataTypeExtension, Decimal128, DecimalTypeExtension,
IntervalMonthDayNano, SemanticType,
};
use common_catalog::consts::MITO_ENGINE;
use common_time::interval::IntervalUnit;
use common_time::timestamp::TimeUnit;
use datatypes::data_type::ConcreteDataType;
use datatypes::schema::{ColumnSchema, SchemaBuilder};
use snafu::ResultExt;
use super::*;
use crate::error;
use crate::error::ColumnDataTypeSnafu;
#[inline]
fn build_column_schema(
column_name: &str,
datatype: i32,
nullable: bool,
) -> error::Result<ColumnSchema> {
let datatype_wrapper =
ColumnDataTypeWrapper::try_new(datatype, None).context(ColumnDataTypeSnafu)?;
Ok(ColumnSchema::new(
column_name,
datatype_wrapper.into(),
nullable,
))
}
fn build_create_expr_from_insertion(
catalog_name: &str,
schema_name: &str,
table_id: Option<TableId>,
table_name: &str,
columns: &[Column],
engine: &str,
) -> Result<CreateTableExpr> {
let table_name = TableReference::full(catalog_name, schema_name, table_name);
let column_exprs = ColumnExpr::from_columns(columns);
build_create_table_expr(
table_id,
&table_name,
column_exprs,
engine,
"Created on insertion",
)
}
#[test]
fn test_build_create_table_request() {
let table_id = Some(10);
let table_name = "test_metric";
assert!(
build_create_expr_from_insertion("", "", table_id, table_name, &[], MITO_ENGINE)
.is_err()
);
let insert_batch = mock_insert_batch();
let create_expr = build_create_expr_from_insertion(
"",
"",
table_id,
table_name,
&insert_batch.0,
MITO_ENGINE,
)
.unwrap();
assert_eq!(table_id, create_expr.table_id.map(|x| x.id));
assert_eq!(table_name, create_expr.table_name);
assert_eq!("Created on insertion".to_string(), create_expr.desc);
assert_eq!(
vec![create_expr.column_defs[0].name.clone()],
create_expr.primary_keys
);
let column_defs = create_expr.column_defs;
assert_eq!(column_defs[5].name, create_expr.time_index);
assert_eq!(7, column_defs.len());
assert_eq!(
ConcreteDataType::string_datatype(),
ConcreteDataType::from(
ColumnDataTypeWrapper::try_new(
column_defs
.iter()
.find(|c| c.name == "host")
.unwrap()
.data_type,
None
)
.unwrap()
)
);
assert_eq!(
ConcreteDataType::float64_datatype(),
ConcreteDataType::from(
ColumnDataTypeWrapper::try_new(
column_defs
.iter()
.find(|c| c.name == "cpu")
.unwrap()
.data_type,
None
)
.unwrap()
)
);
assert_eq!(
ConcreteDataType::float64_datatype(),
ConcreteDataType::from(
ColumnDataTypeWrapper::try_new(
column_defs
.iter()
.find(|c| c.name == "memory")
.unwrap()
.data_type,
None
)
.unwrap()
)
);
assert_eq!(
ConcreteDataType::time_datatype(TimeUnit::Millisecond),
ConcreteDataType::from(
ColumnDataTypeWrapper::try_new(
column_defs
.iter()
.find(|c| c.name == "time")
.unwrap()
.data_type,
None
)
.unwrap()
)
);
assert_eq!(
ConcreteDataType::interval_datatype(IntervalUnit::MonthDayNano),
ConcreteDataType::from(
ColumnDataTypeWrapper::try_new(
column_defs
.iter()
.find(|c| c.name == "interval")
.unwrap()
.data_type,
None
)
.unwrap()
)
);
assert_eq!(
ConcreteDataType::timestamp_millisecond_datatype(),
ConcreteDataType::from(
ColumnDataTypeWrapper::try_new(
column_defs
.iter()
.find(|c| c.name == "ts")
.unwrap()
.data_type,
None
)
.unwrap()
)
);
let decimal_column = column_defs.iter().find(|c| c.name == "decimals").unwrap();
assert_eq!(
ConcreteDataType::decimal128_datatype(38, 10),
ConcreteDataType::from(
ColumnDataTypeWrapper::try_new(
decimal_column.data_type,
decimal_column.datatype_extension,
)
.unwrap()
)
);
}
#[test]
fn test_find_new_columns() {
let mut columns = Vec::with_capacity(1);
let cpu_column = build_column_schema("cpu", 10, true).unwrap();
let ts_column = build_column_schema("ts", 15, false)
.unwrap()
.with_time_index(true);
columns.push(cpu_column);
columns.push(ts_column);
let schema = Arc::new(SchemaBuilder::try_from(columns).unwrap().build().unwrap());
assert!(extract_new_columns(&schema, ColumnExpr::from_columns(&[]))
.unwrap()
.is_none());
let insert_batch = mock_insert_batch();
let add_columns = extract_new_columns(&schema, ColumnExpr::from_columns(&insert_batch.0))
.unwrap()
.unwrap();
assert_eq!(5, add_columns.add_columns.len());
let host_column = &add_columns.add_columns[0];
assert_eq!(
ConcreteDataType::string_datatype(),
ConcreteDataType::from(
ColumnDataTypeWrapper::try_new(
host_column.column_def.as_ref().unwrap().data_type,
None
)
.unwrap()
)
);
assert!(host_column.add_if_not_exists);
let memory_column = &add_columns.add_columns[1];
assert_eq!(
ConcreteDataType::float64_datatype(),
ConcreteDataType::from(
ColumnDataTypeWrapper::try_new(
memory_column.column_def.as_ref().unwrap().data_type,
None
)
.unwrap()
)
);
assert!(memory_column.add_if_not_exists);
let time_column = &add_columns.add_columns[2];
assert_eq!(
ConcreteDataType::time_datatype(TimeUnit::Millisecond),
ConcreteDataType::from(
ColumnDataTypeWrapper::try_new(
time_column.column_def.as_ref().unwrap().data_type,
None
)
.unwrap()
)
);
assert!(time_column.add_if_not_exists);
let interval_column = &add_columns.add_columns[3];
assert_eq!(
ConcreteDataType::interval_datatype(IntervalUnit::MonthDayNano),
ConcreteDataType::from(
ColumnDataTypeWrapper::try_new(
interval_column.column_def.as_ref().unwrap().data_type,
None
)
.unwrap()
)
);
assert!(interval_column.add_if_not_exists);
let decimal_column = &add_columns.add_columns[4];
assert_eq!(
ConcreteDataType::decimal128_datatype(38, 10),
ConcreteDataType::from(
ColumnDataTypeWrapper::try_new(
decimal_column.column_def.as_ref().unwrap().data_type,
decimal_column
.column_def
.as_ref()
.unwrap()
.datatype_extension
)
.unwrap()
)
);
assert!(decimal_column.add_if_not_exists);
}
fn mock_insert_batch() -> (Vec<Column>, u32) {
let row_count = 2;
let host_vals = Values {
string_values: vec!["host1".to_string(), "host2".to_string()],
..Default::default()
};
let host_column = Column {
column_name: "host".to_string(),
semantic_type: SemanticType::Tag as i32,
values: Some(host_vals),
null_mask: vec![0],
datatype: ColumnDataType::String as i32,
..Default::default()
};
let cpu_vals = Values {
f64_values: vec![0.31],
..Default::default()
};
let cpu_column = Column {
column_name: "cpu".to_string(),
semantic_type: SemanticType::Field as i32,
values: Some(cpu_vals),
null_mask: vec![2],
datatype: ColumnDataType::Float64 as i32,
..Default::default()
};
let mem_vals = Values {
f64_values: vec![0.1],
..Default::default()
};
let mem_column = Column {
column_name: "memory".to_string(),
semantic_type: SemanticType::Field as i32,
values: Some(mem_vals),
null_mask: vec![1],
datatype: ColumnDataType::Float64 as i32,
..Default::default()
};
let time_vals = Values {
time_millisecond_values: vec![100, 101],
..Default::default()
};
let time_column = Column {
column_name: "time".to_string(),
semantic_type: SemanticType::Field as i32,
values: Some(time_vals),
null_mask: vec![0],
datatype: ColumnDataType::TimeMillisecond as i32,
..Default::default()
};
let interval1 = IntervalMonthDayNano {
months: 1,
days: 2,
nanoseconds: 3,
};
let interval2 = IntervalMonthDayNano {
months: 4,
days: 5,
nanoseconds: 6,
};
let interval_vals = Values {
interval_month_day_nano_values: vec![interval1, interval2],
..Default::default()
};
let interval_column = Column {
column_name: "interval".to_string(),
semantic_type: SemanticType::Field as i32,
values: Some(interval_vals),
null_mask: vec![0],
datatype: ColumnDataType::IntervalMonthDayNano as i32,
..Default::default()
};
let ts_vals = Values {
timestamp_millisecond_values: vec![100, 101],
..Default::default()
};
let ts_column = Column {
column_name: "ts".to_string(),
semantic_type: SemanticType::Timestamp as i32,
values: Some(ts_vals),
null_mask: vec![0],
datatype: ColumnDataType::TimestampMillisecond as i32,
..Default::default()
};
let decimal_vals = Values {
decimal128_values: vec![Decimal128 { hi: 0, lo: 123 }, Decimal128 { hi: 0, lo: 456 }],
..Default::default()
};
let decimal_column = Column {
column_name: "decimals".to_string(),
semantic_type: SemanticType::Field as i32,
values: Some(decimal_vals),
null_mask: vec![0],
datatype: ColumnDataType::Decimal128 as i32,
datatype_extension: Some(ColumnDataTypeExtension {
type_ext: Some(TypeExt::DecimalType(DecimalTypeExtension {
precision: 38,
scale: 10,
})),
}),
options: None,
};
(
vec![
host_column,
cpu_column,
mem_column,
time_column,
interval_column,
ts_column,
decimal_column,
],
row_count,
)
}
}

View File

@@ -445,10 +445,16 @@ impl Pool {
async fn recycle_channel_in_loop(pool: Arc<Pool>, interval_secs: u64) {
let mut interval = tokio::time::interval(Duration::from_secs(interval_secs));
// use a weak ref here to prevent the pool from being leaked
let pool_weak = Arc::downgrade(&pool);
loop {
let _ = interval.tick().await;
pool.retain_channel(|_, c| c.access.swap(0, Ordering::Relaxed) != 0)
if let Some(pool) = pool_weak.upgrade() {
pool.retain_channel(|_, c| c.access.swap(0, Ordering::Relaxed) != 0)
} else {
// no one else is using this pool, so stop the recycle loop as well
break;
}
}
}

View File

@@ -6,7 +6,7 @@ license.workspace = true
[features]
testing = []
pg_kvbackend = ["dep:tokio-postgres", "dep:backon"]
pg_kvbackend = ["dep:tokio-postgres", "dep:backon", "dep:deadpool-postgres", "dep:deadpool"]
[lints]
workspace = true
@@ -36,8 +36,8 @@ common-wal.workspace = true
datafusion-common.workspace = true
datafusion-expr.workspace = true
datatypes.workspace = true
deadpool.workspace = true
deadpool-postgres.workspace = true
deadpool = { workspace = true, optional = true }
deadpool-postgres = { workspace = true, optional = true }
derive_builder.workspace = true
etcd-client.workspace = true
futures.workspace = true

View File

@@ -16,7 +16,6 @@ use std::collections::HashMap;
use std::sync::Arc;
use futures::future::BoxFuture;
use futures::TryStreamExt;
use moka::future::Cache;
use moka::ops::compute::Op;
use table::metadata::TableId;
@@ -54,9 +53,13 @@ fn init_factory(table_flow_manager: TableFlowManagerRef) -> Initializer<TableId,
Box::pin(async move {
table_flow_manager
.flows(table_id)
.map_ok(|(key, value)| (key.flownode_id(), value.peer))
.try_collect::<HashMap<_, _>>()
.await
.map(|flows| {
flows
.into_iter()
.map(|(key, value)| (key.flownode_id(), value.peer))
.collect::<HashMap<_, _>>()
})
// We must cache the `HashSet` even if it's empty,
// to avoid repeated requests to the remote storage;
// If the value is added to the remote storage,

View File

@@ -12,8 +12,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::hash::{DefaultHasher, Hash, Hasher};
use std::str::FromStr;
use api::v1::meta::HeartbeatRequest;
use common_error::ext::ErrorExt;
use lazy_static::lazy_static;
use regex::Regex;
@@ -55,12 +57,10 @@ pub trait ClusterInfo {
}
/// The key of [NodeInfo] in the storage. The format is `__meta_cluster_node_info-{cluster_id}-{role}-{node_id}`.
///
/// This key cannot be used to describe the `Metasrv` because the `Metasrv` does not have
/// a `cluster_id`; it serves multiple clusters.
#[derive(Debug, Clone, Eq, Hash, PartialEq, Serialize, Deserialize)]
#[derive(Debug, Clone, Copy, Eq, Hash, PartialEq, Serialize, Deserialize)]
pub struct NodeInfoKey {
/// The cluster id.
// todo(hl): remove cluster_id as it is not assigned anywhere.
pub cluster_id: ClusterId,
/// The role of the node. It can be `[Role::Datanode]` or `[Role::Frontend]`.
pub role: Role,
@@ -69,6 +69,28 @@ pub struct NodeInfoKey {
}
impl NodeInfoKey {
/// Try to create a `NodeInfoKey` from a "good" heartbeat request, i.e. one in which
/// every needed piece of information is provided and valid.
pub fn new(request: &HeartbeatRequest) -> Option<Self> {
let HeartbeatRequest { header, peer, .. } = request;
let header = header.as_ref()?;
let peer = peer.as_ref()?;
let role = header.role.try_into().ok()?;
let node_id = match role {
// Because the Frontend is stateless, it's too easy to neglect choosing a unique id
// for it when setting up a cluster. So we calculate its id from its address.
Role::Frontend => calculate_node_id(&peer.addr),
_ => peer.id,
};
Some(NodeInfoKey {
cluster_id: header.cluster_id,
role,
node_id,
})
}
pub fn key_prefix_with_cluster_id(cluster_id: u64) -> String {
format!("{}-{}-", CLUSTER_NODE_INFO_PREFIX, cluster_id)
}
@@ -83,6 +105,13 @@ impl NodeInfoKey {
}
}
/// Calculate the node's id from its address using the `DefaultHasher`.
fn calculate_node_id(addr: &str) -> u64 {
let mut hasher = DefaultHasher::new();
addr.hash(&mut hasher);
hasher.finish()
}
/// The information of a node in the cluster.
#[derive(Debug, Serialize, Deserialize)]
pub struct NodeInfo {
@@ -100,7 +129,7 @@ pub struct NodeInfo {
pub start_time_ms: u64,
}
#[derive(Debug, Clone, Eq, Hash, PartialEq, Serialize, Deserialize)]
#[derive(Debug, Clone, Copy, Eq, Hash, PartialEq, Serialize, Deserialize)]
pub enum Role {
Datanode,
Frontend,
@@ -201,8 +230,8 @@ impl TryFrom<Vec<u8>> for NodeInfoKey {
}
}
impl From<NodeInfoKey> for Vec<u8> {
fn from(key: NodeInfoKey) -> Self {
impl From<&NodeInfoKey> for Vec<u8> {
fn from(key: &NodeInfoKey) -> Self {
format!(
"{}-{}-{}-{}",
CLUSTER_NODE_INFO_PREFIX,
@@ -271,6 +300,7 @@ impl TryFrom<i32> for Role {
mod tests {
use std::assert_matches::assert_matches;
use super::*;
use crate::cluster::Role::{Datanode, Frontend};
use crate::cluster::{DatanodeStatus, NodeInfo, NodeInfoKey, NodeStatus};
use crate::peer::Peer;
@@ -283,7 +313,7 @@ mod tests {
node_id: 2,
};
let key_bytes: Vec<u8> = key.into();
let key_bytes: Vec<u8> = (&key).into();
let new_key: NodeInfoKey = key_bytes.try_into().unwrap();
assert_eq!(1, new_key.cluster_id);
@@ -338,4 +368,26 @@ mod tests {
let prefix = NodeInfoKey::key_prefix_with_role(2, Frontend);
assert_eq!(prefix, "__meta_cluster_node_info-2-1-");
}
#[test]
fn test_calculate_node_id_from_addr() {
// Test empty string
assert_eq!(calculate_node_id(""), calculate_node_id(""));
// Test same addresses return same ids
let addr1 = "127.0.0.1:8080";
let id1 = calculate_node_id(addr1);
let id2 = calculate_node_id(addr1);
assert_eq!(id1, id2);
// Test different addresses return different ids
let addr2 = "127.0.0.1:8081";
let id3 = calculate_node_id(addr2);
assert_ne!(id1, id3);
// Test long address
let long_addr = "very.long.domain.name.example.com:9999";
let id4 = calculate_node_id(long_addr);
assert!(id4 > 0);
}
}

View File

@@ -15,6 +15,7 @@
mod metadata;
use std::collections::BTreeMap;
use std::fmt;
use api::v1::flow::flow_request::Body as PbFlowRequest;
use api::v1::flow::{CreateRequest, FlowRequest, FlowRequestHeader};
@@ -28,7 +29,6 @@ use common_procedure::{
use common_telemetry::info;
use common_telemetry::tracing_context::TracingContext;
use futures::future::join_all;
use futures::TryStreamExt;
use itertools::Itertools;
use serde::{Deserialize, Serialize};
use snafu::{ensure, ResultExt};
@@ -77,6 +77,7 @@ impl CreateFlowProcedure {
query_context,
state: CreateFlowState::Prepare,
prev_flow_info_value: None,
flow_type: None,
},
}
}
@@ -104,7 +105,7 @@ impl CreateFlowProcedure {
if create_if_not_exists && or_replace {
// this is forbidden because it's not clear what that combination would mean exactly
return error::UnsupportedSnafu {
operation: "Create flow with both `IF NOT EXISTS` and `OR REPLACE`".to_string(),
operation: "Create flow with both `IF NOT EXISTS` and `OR REPLACE`",
}
.fail();
}
@@ -129,9 +130,10 @@ impl CreateFlowProcedure {
.flow_metadata_manager
.flow_route_manager()
.routes(flow_id)
.map_ok(|(_, value)| value.peer)
.try_collect::<Vec<_>>()
.await?;
.await?
.into_iter()
.map(|(_, value)| value.peer)
.collect::<Vec<_>>();
self.data.flow_id = Some(flow_id);
self.data.peers = peers;
info!("Replacing flow, flow_id: {}", flow_id);
@@ -175,6 +177,8 @@ impl CreateFlowProcedure {
self.allocate_flow_id().await?;
}
self.data.state = CreateFlowState::CreateFlows;
// determine flow type
self.data.flow_type = Some(determine_flow_type(&self.data.task));
Ok(Status::executing(true))
}
@@ -309,6 +313,11 @@ impl Procedure for CreateFlowProcedure {
}
}
pub fn determine_flow_type(_flow_task: &CreateFlowTask) -> FlowType {
// TODO(discord9): determine flow type
FlowType::RecordingRule
}
/// The state of [CreateFlowProcedure].
#[derive(Debug, Clone, Serialize, Deserialize, AsRefStr, PartialEq)]
pub enum CreateFlowState {
@@ -322,6 +331,36 @@ pub enum CreateFlowState {
CreateMetadata,
}
/// The type of flow.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
pub enum FlowType {
/// The flow is a recording rule task.
RecordingRule,
/// The flow is a streaming task.
Streaming,
}
impl FlowType {
pub const RECORDING_RULE: &str = "recording_rule";
pub const STREAMING: &str = "streaming";
pub const FLOW_TYPE_KEY: &str = "flow_type";
}
impl Default for FlowType {
fn default() -> Self {
Self::RecordingRule
}
}
impl fmt::Display for FlowType {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
FlowType::RecordingRule => write!(f, "{}", FlowType::RECORDING_RULE),
FlowType::Streaming => write!(f, "{}", FlowType::STREAMING),
}
}
}
/// The serializable data.
#[derive(Debug, Serialize, Deserialize)]
pub struct CreateFlowData {
@@ -335,6 +374,7 @@ pub struct CreateFlowData {
/// For verify if prev value is consistent when need to update flow metadata.
/// only set when `or_replace` is true.
pub(crate) prev_flow_info_value: Option<DeserializedValueWithBytes<FlowInfoValue>>,
pub(crate) flow_type: Option<FlowType>,
}
impl From<&CreateFlowData> for CreateRequest {
@@ -342,7 +382,7 @@ impl From<&CreateFlowData> for CreateRequest {
let flow_id = value.flow_id.unwrap();
let source_table_ids = &value.source_table_ids;
CreateRequest {
let mut req = CreateRequest {
flow_id: Some(api::v1::FlowId { id: flow_id }),
source_table_ids: source_table_ids
.iter()
@@ -356,7 +396,12 @@ impl From<&CreateFlowData> for CreateRequest {
comment: value.task.comment.clone(),
sql: value.task.sql.clone(),
flow_options: value.task.flow_options.clone(),
}
};
let flow_type = value.flow_type.unwrap_or_default().to_string();
req.flow_options
.insert(FlowType::FLOW_TYPE_KEY.to_string(), flow_type);
req
}
}
@@ -369,7 +414,7 @@ impl From<&CreateFlowData> for (FlowInfoValue, Vec<(FlowPartitionId, FlowRouteVa
expire_after,
comment,
sql,
flow_options: options,
flow_options: mut options,
..
} = value.task.clone();
@@ -386,19 +431,21 @@ impl From<&CreateFlowData> for (FlowInfoValue, Vec<(FlowPartitionId, FlowRouteVa
.map(|(idx, peer)| (idx as u32, FlowRouteValue { peer: peer.clone() }))
.collect::<Vec<_>>();
(
FlowInfoValue {
source_table_ids: value.source_table_ids.clone(),
sink_table_name,
flownode_ids,
catalog_name,
flow_name,
raw_sql: sql,
expire_after,
comment,
options,
},
flow_routes,
)
let flow_type = value.flow_type.unwrap_or_default().to_string();
options.insert(FlowType::FLOW_TYPE_KEY.to_string(), flow_type);
let flow_info = FlowInfoValue {
source_table_ids: value.source_table_ids.clone(),
sink_table_name,
flownode_ids,
catalog_name,
flow_name,
raw_sql: sql,
expire_after,
comment,
options,
};
(flow_info, flow_routes)
}
}

View File

@@ -128,7 +128,7 @@ impl State for DropDatabaseExecutor {
.await?;
executor.invalidate_table_cache(ddl_ctx).await?;
executor
.on_drop_regions(ddl_ctx, &self.physical_region_routes)
.on_drop_regions(ddl_ctx, &self.physical_region_routes, true)
.await?;
info!("Table: {}({}) is dropped", self.table_name, self.table_id);

View File

@@ -13,7 +13,6 @@
// limitations under the License.
use common_catalog::format_full_flow_name;
use futures::TryStreamExt;
use snafu::{ensure, OptionExt};
use crate::ddl::drop_flow::DropFlowProcedure;
@@ -39,9 +38,10 @@ impl DropFlowProcedure {
.flow_metadata_manager
.flow_route_manager()
.routes(self.data.task.flow_id)
.map_ok(|(_, value)| value)
.try_collect::<Vec<_>>()
.await?;
.await?
.into_iter()
.map(|(_, value)| value)
.collect::<Vec<_>>();
ensure!(
!flow_route_values.is_empty(),
error::FlowRouteNotFoundSnafu {

View File

@@ -156,7 +156,7 @@ impl DropTableProcedure {
pub async fn on_datanode_drop_regions(&mut self) -> Result<Status> {
self.executor
.on_drop_regions(&self.context, &self.data.physical_region_routes)
.on_drop_regions(&self.context, &self.data.physical_region_routes, false)
.await?;
self.data.state = DropTableState::DeleteTombstone;
Ok(Status::executing(true))

View File

@@ -214,6 +214,7 @@ impl DropTableExecutor {
&self,
ctx: &DdlContext,
region_routes: &[RegionRoute],
fast_path: bool,
) -> Result<()> {
let leaders = find_leaders(region_routes);
let mut drop_region_tasks = Vec::with_capacity(leaders.len());
@@ -236,6 +237,7 @@ impl DropTableExecutor {
}),
body: Some(region_request::Body::Drop(PbDropRegionRequest {
region_id: region_id.as_u64(),
fast_path,
})),
};
let datanode = datanode.clone();

View File

@@ -219,7 +219,7 @@ async fn test_replace_view_metadata() {
assert_eq!(err.status_code(), StatusCode::TableAlreadyExists);
}
// Set `or_replce` to be `true` and try again
// Set `or_replace` to be `true` and try again
task.create_view.or_replace = true;
task.create_view.logical_plan = vec![4, 5, 6];
task.create_view.definition = "new_definition".to_string();

View File

@@ -686,8 +686,8 @@ pub enum Error {
},
#[cfg(feature = "pg_kvbackend")]
#[snafu(display("Postgres transaction retry failed"))]
PostgresTransactionRetryFailed {
#[snafu(display("Rds transaction retry failed"))]
RdsTransactionRetryFailed {
#[snafu(implicit)]
location: Location,
},
@@ -824,7 +824,7 @@ impl ErrorExt for Error {
| CreatePostgresPool { .. }
| GetPostgresConnection { .. }
| PostgresTransaction { .. }
| PostgresTransactionRetryFailed { .. } => StatusCode::Internal,
| RdsTransactionRetryFailed { .. } => StatusCode::Internal,
Error::DatanodeTableInfoNotFound { .. } => StatusCode::Internal,
}
}

View File

@@ -16,9 +16,9 @@ pub mod flow_info;
pub(crate) mod flow_name;
pub(crate) mod flow_route;
pub mod flow_state;
mod flownode_addr_helper;
pub(crate) mod flownode_flow;
pub(crate) mod table_flow;
use std::ops::Deref;
use std::sync::Arc;
@@ -506,7 +506,6 @@ mod tests {
let routes = flow_metadata_manager
.flow_route_manager()
.routes(flow_id)
.try_collect::<Vec<_>>()
.await
.unwrap();
assert_eq!(
@@ -538,7 +537,6 @@ mod tests {
let nodes = flow_metadata_manager
.table_flow_manager()
.flows(table_id)
.try_collect::<Vec<_>>()
.await
.unwrap();
assert_eq!(
@@ -727,7 +725,6 @@ mod tests {
let routes = flow_metadata_manager
.flow_route_manager()
.routes(flow_id)
.try_collect::<Vec<_>>()
.await
.unwrap();
assert_eq!(
@@ -759,7 +756,6 @@ mod tests {
let nodes = flow_metadata_manager
.table_flow_manager()
.flows(table_id)
.try_collect::<Vec<_>>()
.await
.unwrap();
assert_eq!(

View File

@@ -12,14 +12,15 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use futures::stream::BoxStream;
use futures::TryStreamExt;
use lazy_static::lazy_static;
use regex::Regex;
use serde::{Deserialize, Serialize};
use snafu::OptionExt;
use crate::error::{self, Result};
use crate::key::flow::FlowScoped;
use crate::key::flow::{flownode_addr_helper, FlowScoped};
use crate::key::node_address::NodeAddressKey;
use crate::key::{BytesAdapter, FlowId, FlowPartitionId, MetadataKey, MetadataValue};
use crate::kv_backend::txn::{Txn, TxnOp};
use crate::kv_backend::KvBackendRef;
@@ -167,10 +168,7 @@ impl FlowRouteManager {
}
/// Retrieves all [FlowRouteValue]s of the specified `flow_id`.
pub fn routes(
&self,
flow_id: FlowId,
) -> BoxStream<'static, Result<(FlowRouteKey, FlowRouteValue)>> {
pub async fn routes(&self, flow_id: FlowId) -> Result<Vec<(FlowRouteKey, FlowRouteValue)>> {
let start_key = FlowRouteKey::range_start_key(flow_id);
let req = RangeRequest::new().with_prefix(start_key);
let stream = PaginationStream::new(
@@ -181,7 +179,9 @@ impl FlowRouteManager {
)
.into_stream();
Box::pin(stream)
let mut res = stream.try_collect::<Vec<_>>().await?;
self.remap_flow_route_addresses(&mut res).await?;
Ok(res)
}
/// Builds a create flow routes transaction.
@@ -203,6 +203,28 @@ impl FlowRouteManager {
Ok(Txn::new().and_then(txns))
}
async fn remap_flow_route_addresses(
&self,
flow_routes: &mut [(FlowRouteKey, FlowRouteValue)],
) -> Result<()> {
let keys = flow_routes
.iter()
.map(|(_, value)| NodeAddressKey::with_flownode(value.peer.id))
.collect();
let flow_node_addrs =
flownode_addr_helper::get_flownode_addresses(&self.kv_backend, keys).await?;
for (_, flow_route_value) in flow_routes.iter_mut() {
let flownode_id = flow_route_value.peer.id;
// If an id lacks a corresponding address in `flow_node_addrs`,
// it means the old address in `flow_route_value` is still valid,
// which is expected.
if let Some(node_addr) = flow_node_addrs.get(&flownode_id) {
flow_route_value.peer.addr = node_addr.peer.addr.clone();
}
}
Ok(())
}
}
#[cfg(test)]

View File

@@ -0,0 +1,47 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::HashMap;
use crate::error::Result;
use crate::key::node_address::{NodeAddressKey, NodeAddressValue};
use crate::key::{MetadataKey, MetadataValue};
use crate::kv_backend::KvBackendRef;
use crate::rpc::store::BatchGetRequest;
/// Get the addresses of the flownodes.
/// The result is a map: node_id -> NodeAddressValue
pub(crate) async fn get_flownode_addresses(
kv_backend: &KvBackendRef,
keys: Vec<NodeAddressKey>,
) -> Result<HashMap<u64, NodeAddressValue>> {
if keys.is_empty() {
return Ok(HashMap::default());
}
let req = BatchGetRequest {
keys: keys.into_iter().map(|k| k.to_bytes()).collect(),
};
kv_backend
.batch_get(req)
.await?
.kvs
.into_iter()
.map(|kv| {
let key = NodeAddressKey::from_bytes(&kv.key)?;
let value = NodeAddressValue::try_from_raw_value(&kv.value)?;
Ok((key.node_id, value))
})
.collect()
}

View File

@@ -14,7 +14,7 @@
use std::sync::Arc;
use futures::stream::BoxStream;
use futures::TryStreamExt;
use lazy_static::lazy_static;
use regex::Regex;
use serde::{Deserialize, Serialize};
@@ -22,7 +22,8 @@ use snafu::OptionExt;
use table::metadata::TableId;
use crate::error::{self, Result};
use crate::key::flow::FlowScoped;
use crate::key::flow::{flownode_addr_helper, FlowScoped};
use crate::key::node_address::NodeAddressKey;
use crate::key::{BytesAdapter, FlowId, FlowPartitionId, MetadataKey, MetadataValue};
use crate::kv_backend::txn::{Txn, TxnOp};
use crate::kv_backend::KvBackendRef;
@@ -196,10 +197,7 @@ impl TableFlowManager {
/// Retrieves all [TableFlowKey]s of the specified `table_id`.
///
/// TODO(discord9): add a cache for it since range requests do not support caching.
pub fn flows(
&self,
table_id: TableId,
) -> BoxStream<'static, Result<(TableFlowKey, TableFlowValue)>> {
pub async fn flows(&self, table_id: TableId) -> Result<Vec<(TableFlowKey, TableFlowValue)>> {
let start_key = TableFlowKey::range_start_key(table_id);
let req = RangeRequest::new().with_prefix(start_key);
let stream = PaginationStream::new(
@@ -210,7 +208,9 @@ impl TableFlowManager {
)
.into_stream();
Box::pin(stream)
let mut res = stream.try_collect::<Vec<_>>().await?;
self.remap_table_flow_addresses(&mut res).await?;
Ok(res)
}
/// Builds a create table flow transaction.
@@ -238,6 +238,28 @@ impl TableFlowManager {
Ok(Txn::new().and_then(txns))
}
async fn remap_table_flow_addresses(
&self,
table_flows: &mut [(TableFlowKey, TableFlowValue)],
) -> Result<()> {
let keys = table_flows
.iter()
.map(|(_, value)| NodeAddressKey::with_flownode(value.peer.id))
.collect::<Vec<_>>();
let flownode_addrs =
flownode_addr_helper::get_flownode_addresses(&self.kv_backend, keys).await?;
for (_, table_flow_value) in table_flows.iter_mut() {
let flownode_id = table_flow_value.peer.id;
// If an id lacks a corresponding address in `flownode_addrs`,
// it means the old address in `table_flow_value` is still valid,
// which is expected.
if let Some(flownode_addr) = flownode_addrs.get(&flownode_id) {
table_flow_value.peer.addr = flownode_addr.peer.addr.clone();
}
}
Ok(())
}
}
#[cfg(test)]


@@ -39,6 +39,10 @@ impl NodeAddressKey {
pub fn with_datanode(node_id: u64) -> Self {
Self::new(Role::Datanode, node_id)
}
pub fn with_flownode(node_id: u64) -> Self {
Self::new(Role::Flownode, node_id)
}
}
#[derive(Debug, PartialEq, Serialize, Deserialize, Clone)]


@@ -32,7 +32,7 @@ pub mod chroot;
pub mod etcd;
pub mod memory;
#[cfg(feature = "pg_kvbackend")]
pub mod postgres;
pub mod rds;
pub mod test;
pub mod txn;

File diff suppressed because it is too large


@@ -0,0 +1,548 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::any::Any;
use std::collections::HashMap;
use std::marker::PhantomData;
use std::time::Duration;
use backon::{BackoffBuilder, ExponentialBuilder};
use common_telemetry::debug;
use crate::error::{Error, RdsTransactionRetryFailedSnafu, Result};
use crate::kv_backend::txn::{
Compare, Txn as KvTxn, TxnOp, TxnOpResponse, TxnResponse as KvTxnResponse,
};
use crate::kv_backend::{KvBackend, TxnService};
use crate::metrics::METRIC_META_TXN_REQUEST;
use crate::rpc::store::{
BatchDeleteRequest, BatchDeleteResponse, BatchGetRequest, BatchGetResponse, BatchPutRequest,
BatchPutResponse, DeleteRangeRequest, DeleteRangeResponse, PutRequest, PutResponse,
RangeRequest, RangeResponse,
};
use crate::rpc::KeyValue;
mod postgres;
pub use postgres::PgStore;
const RDS_STORE_TXN_RETRY_COUNT: usize = 3;
/// Query executor for RDS backends. It can execute queries or create a transaction executor.
#[async_trait::async_trait]
pub trait Executor: Send + Sync {
type Transaction<'a>: 'a + Transaction<'a>
where
Self: 'a;
fn name() -> &'static str;
async fn query(&mut self, query: &str, params: &[&Vec<u8>]) -> Result<Vec<KeyValue>>;
/// Some queries don't need to return any result, such as `DELETE`.
async fn execute(&mut self, query: &str, params: &[&Vec<u8>]) -> Result<()> {
self.query(query, params).await?;
Ok(())
}
async fn txn_executor<'a>(&'a mut self) -> Result<Self::Transaction<'a>>;
}
/// Transaction query executor for RDS backends. It can execute queries within a transaction or commit the transaction.
#[async_trait::async_trait]
pub trait Transaction<'a>: Send + Sync {
async fn query(&mut self, query: &str, params: &[&Vec<u8>]) -> Result<Vec<KeyValue>>;
async fn execute(&mut self, query: &str, params: &[&Vec<u8>]) -> Result<()> {
self.query(query, params).await?;
Ok(())
}
async fn commit(self) -> Result<()>;
}
/// Factory for creating default and transaction query executors.
#[async_trait::async_trait]
pub trait ExecutorFactory<T: Executor>: Send + Sync {
async fn default_executor(&self) -> Result<T>;
async fn txn_executor<'a>(&self, default_executor: &'a mut T) -> Result<T::Transaction<'a>>;
}
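A much-simplified, synchronous sketch of the Executor / Transaction / ExecutorFactory split above (the real traits are async and return key-value rows). `MemExecutor` and `MemTxn` are hypothetical toy types, meant only to show how the generic associated type `Transaction<'a>` ties a transaction's lifetime to the executor that produced it:

trait Transaction<'a> {
    fn execute(&mut self, query: &str);
    fn commit(self);
}

trait Executor {
    type Transaction<'a>: Transaction<'a>
    where
        Self: 'a;

    fn execute(&mut self, query: &str);
    fn txn_executor(&mut self) -> Self::Transaction<'_>;
}

struct MemExecutor {
    log: Vec<String>,
}

struct MemTxn<'a> {
    executor: &'a mut MemExecutor,
    buffered: Vec<String>,
}

impl<'a> Transaction<'a> for MemTxn<'a> {
    fn execute(&mut self, query: &str) {
        self.buffered.push(query.to_string());
    }
    fn commit(self) {
        // Apply all buffered statements "atomically" on commit.
        let MemTxn { executor, buffered } = self;
        executor.log.extend(buffered);
    }
}

impl Executor for MemExecutor {
    type Transaction<'a>
        = MemTxn<'a>
    where
        Self: 'a;

    fn execute(&mut self, query: &str) {
        self.log.push(query.to_string());
    }
    fn txn_executor(&mut self) -> Self::Transaction<'_> {
        MemTxn { executor: self, buffered: Vec::new() }
    }
}

fn main() {
    let mut exec = MemExecutor { log: Vec::new() };
    exec.execute("PUT a");
    let mut txn = exec.txn_executor();
    txn.execute("PUT b");
    txn.execute("PUT c");
    txn.commit();
    assert_eq!(exec.log.len(), 3);
}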
/// RDS-backed store for metasrv.
pub struct RdsStore<T, S, R>
where
T: Executor + Send + Sync,
S: ExecutorFactory<T> + Send + Sync,
{
max_txn_ops: usize,
txn_retry_count: usize,
executor_factory: S,
sql_template_set: R,
_phantom: PhantomData<T>,
}
pub enum ExecutorImpl<'a, T: Executor + 'a> {
Default(T),
Txn(T::Transaction<'a>),
}
impl<T: Executor> ExecutorImpl<'_, T> {
async fn query(&mut self, query: &str, params: &Vec<&Vec<u8>>) -> Result<Vec<KeyValue>> {
match self {
Self::Default(executor) => executor.query(query, params).await,
Self::Txn(executor) => executor.query(query, params).await,
}
}
async fn commit(self) -> Result<()> {
match self {
Self::Txn(executor) => executor.commit().await,
_ => Ok(()),
}
}
}
#[async_trait::async_trait]
pub trait KvQueryExecutor<T: Executor> {
async fn range_with_query_executor(
&self,
query_executor: &mut ExecutorImpl<'_, T>,
req: RangeRequest,
) -> Result<RangeResponse>;
async fn put_with_query_executor(
&self,
query_executor: &mut ExecutorImpl<'_, T>,
req: PutRequest,
) -> Result<PutResponse> {
let kv = KeyValue {
key: req.key,
value: req.value,
};
let mut res = self
.batch_put_with_query_executor(
query_executor,
BatchPutRequest {
kvs: vec![kv],
prev_kv: req.prev_kv,
},
)
.await?;
if !res.prev_kvs.is_empty() {
debug_assert!(req.prev_kv);
return Ok(PutResponse {
prev_kv: Some(res.prev_kvs.remove(0)),
});
}
Ok(PutResponse::default())
}
async fn batch_put_with_query_executor(
&self,
query_executor: &mut ExecutorImpl<'_, T>,
req: BatchPutRequest,
) -> Result<BatchPutResponse>;
/// Batch get with a given client. It's needed when the client holds a transaction.
async fn batch_get_with_query_executor(
&self,
query_executor: &mut ExecutorImpl<'_, T>,
req: BatchGetRequest,
) -> Result<BatchGetResponse>;
async fn delete_range_with_query_executor(
&self,
query_executor: &mut ExecutorImpl<'_, T>,
req: DeleteRangeRequest,
) -> Result<DeleteRangeResponse>;
async fn batch_delete_with_query_executor(
&self,
query_executor: &mut ExecutorImpl<'_, T>,
req: BatchDeleteRequest,
) -> Result<BatchDeleteResponse>;
}
impl<T, S, R> RdsStore<T, S, R>
where
Self: KvQueryExecutor<T> + Send + Sync,
T: Executor + Send + Sync,
S: ExecutorFactory<T> + Send + Sync,
{
async fn execute_txn_cmp(
&self,
query_executor: &mut ExecutorImpl<'_, T>,
cmp: &[Compare],
) -> Result<bool> {
let batch_get_req = BatchGetRequest {
keys: cmp.iter().map(|c| c.key.clone()).collect(),
};
let res = self
.batch_get_with_query_executor(query_executor, batch_get_req)
.await?;
debug!("batch get res: {:?}", res);
let res_map = res
.kvs
.into_iter()
.map(|kv| (kv.key, kv.value))
.collect::<HashMap<Vec<u8>, Vec<u8>>>();
for c in cmp {
let value = res_map.get(&c.key);
if !c.compare_value(value) {
return Ok(false);
}
}
Ok(true)
}
/// Execute a batch of transaction operations. This function is only used for transactions with the same operation type.
async fn try_batch_txn(
&self,
query_executor: &mut ExecutorImpl<'_, T>,
txn_ops: &[TxnOp],
) -> Result<Option<Vec<TxnOpResponse>>> {
if !check_txn_ops(txn_ops)? {
return Ok(None);
}
// Safety: txn_ops is not empty
match txn_ops.first().unwrap() {
TxnOp::Delete(_) => self.handle_batch_delete(query_executor, txn_ops).await,
TxnOp::Put(_, _) => self.handle_batch_put(query_executor, txn_ops).await,
TxnOp::Get(_) => self.handle_batch_get(query_executor, txn_ops).await,
}
}
async fn handle_batch_delete(
&self,
query_executor: &mut ExecutorImpl<'_, T>,
txn_ops: &[TxnOp],
) -> Result<Option<Vec<TxnOpResponse>>> {
let mut batch_del_req = BatchDeleteRequest {
keys: vec![],
prev_kv: true,
};
for op in txn_ops {
if let TxnOp::Delete(key) = op {
batch_del_req.keys.push(key.clone());
}
}
let res = self
.batch_delete_with_query_executor(query_executor, batch_del_req)
.await?;
let res_map = res
.prev_kvs
.into_iter()
.map(|kv| (kv.key, kv.value))
.collect::<HashMap<Vec<u8>, Vec<u8>>>();
let mut resps = Vec::with_capacity(txn_ops.len());
for op in txn_ops {
if let TxnOp::Delete(key) = op {
let value = res_map.get(key);
resps.push(TxnOpResponse::ResponseDelete(DeleteRangeResponse {
deleted: if value.is_some() { 1 } else { 0 },
prev_kvs: vec![],
}));
}
}
Ok(Some(resps))
}
async fn handle_batch_put(
&self,
query_executor: &mut ExecutorImpl<'_, T>,
txn_ops: &[TxnOp],
) -> Result<Option<Vec<TxnOpResponse>>> {
let mut batch_put_req = BatchPutRequest {
kvs: vec![],
prev_kv: false,
};
for op in txn_ops {
if let TxnOp::Put(key, value) = op {
batch_put_req.kvs.push(KeyValue {
key: key.clone(),
value: value.clone(),
});
}
}
let _ = self
.batch_put_with_query_executor(query_executor, batch_put_req)
.await?;
let mut resps = Vec::with_capacity(txn_ops.len());
for op in txn_ops {
if let TxnOp::Put(_, _) = op {
resps.push(TxnOpResponse::ResponsePut(PutResponse { prev_kv: None }));
}
}
Ok(Some(resps))
}
async fn handle_batch_get(
&self,
query_executor: &mut ExecutorImpl<'_, T>,
txn_ops: &[TxnOp],
) -> Result<Option<Vec<TxnOpResponse>>> {
let mut batch_get_req = BatchGetRequest { keys: vec![] };
for op in txn_ops {
if let TxnOp::Get(key) = op {
batch_get_req.keys.push(key.clone());
}
}
let res = self
.batch_get_with_query_executor(query_executor, batch_get_req)
.await?;
let res_map = res
.kvs
.into_iter()
.map(|kv| (kv.key, kv.value))
.collect::<HashMap<Vec<u8>, Vec<u8>>>();
let mut resps = Vec::with_capacity(txn_ops.len());
for op in txn_ops {
if let TxnOp::Get(key) = op {
let value = res_map.get(key);
resps.push(TxnOpResponse::ResponseGet(RangeResponse {
kvs: value
.map(|v| {
vec![KeyValue {
key: key.clone(),
value: v.clone(),
}]
})
.unwrap_or_default(),
more: false,
}));
}
}
Ok(Some(resps))
}
async fn execute_txn_op(
&self,
query_executor: &mut ExecutorImpl<'_, T>,
op: &TxnOp,
) -> Result<TxnOpResponse> {
match op {
TxnOp::Put(key, value) => {
let res = self
.put_with_query_executor(
query_executor,
PutRequest {
key: key.clone(),
value: value.clone(),
prev_kv: false,
},
)
.await?;
Ok(TxnOpResponse::ResponsePut(res))
}
TxnOp::Get(key) => {
let res = self
.range_with_query_executor(
query_executor,
RangeRequest {
key: key.clone(),
range_end: vec![],
limit: 1,
keys_only: false,
},
)
.await?;
Ok(TxnOpResponse::ResponseGet(res))
}
TxnOp::Delete(key) => {
let res = self
.delete_range_with_query_executor(
query_executor,
DeleteRangeRequest {
key: key.clone(),
range_end: vec![],
prev_kv: false,
},
)
.await?;
Ok(TxnOpResponse::ResponseDelete(res))
}
}
}
async fn txn_inner(&self, txn: &KvTxn) -> Result<KvTxnResponse> {
let mut default_executor = self.executor_factory.default_executor().await?;
let mut txn_executor = ExecutorImpl::Txn(
self.executor_factory
.txn_executor(&mut default_executor)
.await?,
);
let mut success = true;
if txn.c_when {
success = self
.execute_txn_cmp(&mut txn_executor, &txn.req.compare)
.await?;
}
let mut responses = vec![];
if success && txn.c_then {
match self
.try_batch_txn(&mut txn_executor, &txn.req.success)
.await?
{
Some(res) => responses.extend(res),
None => {
for txnop in &txn.req.success {
let res = self.execute_txn_op(&mut txn_executor, txnop).await?;
responses.push(res);
}
}
}
} else if !success && txn.c_else {
match self
.try_batch_txn(&mut txn_executor, &txn.req.failure)
.await?
{
Some(res) => responses.extend(res),
None => {
for txnop in &txn.req.failure {
let res = self.execute_txn_op(&mut txn_executor, txnop).await?;
responses.push(res);
}
}
}
}
txn_executor.commit().await?;
Ok(KvTxnResponse {
responses,
succeeded: success,
})
}
}
#[async_trait::async_trait]
impl<T, S, R> KvBackend for RdsStore<T, S, R>
where
R: 'static,
Self: KvQueryExecutor<T> + Send + Sync,
T: Executor + 'static,
S: ExecutorFactory<T> + 'static,
{
fn name(&self) -> &str {
T::name()
}
fn as_any(&self) -> &dyn Any {
self
}
async fn range(&self, req: RangeRequest) -> Result<RangeResponse> {
let client = self.executor_factory.default_executor().await?;
let mut query_executor = ExecutorImpl::Default(client);
self.range_with_query_executor(&mut query_executor, req)
.await
}
async fn put(&self, req: PutRequest) -> Result<PutResponse> {
let client = self.executor_factory.default_executor().await?;
let mut query_executor = ExecutorImpl::Default(client);
self.put_with_query_executor(&mut query_executor, req).await
}
async fn batch_put(&self, req: BatchPutRequest) -> Result<BatchPutResponse> {
let client = self.executor_factory.default_executor().await?;
let mut query_executor = ExecutorImpl::Default(client);
self.batch_put_with_query_executor(&mut query_executor, req)
.await
}
async fn batch_get(&self, req: BatchGetRequest) -> Result<BatchGetResponse> {
let client = self.executor_factory.default_executor().await?;
let mut query_executor = ExecutorImpl::Default(client);
self.batch_get_with_query_executor(&mut query_executor, req)
.await
}
async fn delete_range(&self, req: DeleteRangeRequest) -> Result<DeleteRangeResponse> {
let client = self.executor_factory.default_executor().await?;
let mut query_executor = ExecutorImpl::Default(client);
self.delete_range_with_query_executor(&mut query_executor, req)
.await
}
async fn batch_delete(&self, req: BatchDeleteRequest) -> Result<BatchDeleteResponse> {
let client = self.executor_factory.default_executor().await?;
let mut query_executor = ExecutorImpl::Default(client);
self.batch_delete_with_query_executor(&mut query_executor, req)
.await
}
}
#[async_trait::async_trait]
impl<T, S, R> TxnService for RdsStore<T, S, R>
where
Self: KvQueryExecutor<T> + Send + Sync,
T: Executor + 'static,
S: ExecutorFactory<T> + 'static,
{
type Error = Error;
async fn txn(&self, txn: KvTxn) -> Result<KvTxnResponse> {
let _timer = METRIC_META_TXN_REQUEST
.with_label_values(&[T::name(), "txn"])
.start_timer();
let mut backoff = ExponentialBuilder::default()
.with_min_delay(Duration::from_millis(10))
.with_max_delay(Duration::from_millis(200))
.with_max_times(self.txn_retry_count)
.build();
loop {
match self.txn_inner(&txn).await {
Ok(res) => return Ok(res),
Err(e) => {
if e.is_serialization_error() {
let d = backoff.next();
if let Some(d) = d {
tokio::time::sleep(d).await;
continue;
}
break;
} else {
return Err(e);
}
}
}
}
RdsTransactionRetryFailedSnafu {}.fail()
}
fn max_txn_ops(&self) -> usize {
self.max_txn_ops
}
}
/// Checks if the transaction operations are the same type.
fn check_txn_ops(txn_ops: &[TxnOp]) -> Result<bool> {
if txn_ops.is_empty() {
return Ok(false);
}
let same = txn_ops.windows(2).all(|a| {
matches!(
(&a[0], &a[1]),
(TxnOp::Put(_, _), TxnOp::Put(_, _))
| (TxnOp::Get(_), TxnOp::Get(_))
| (TxnOp::Delete(_), TxnOp::Delete(_))
)
});
Ok(same)
}
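The `TxnService::txn` implementation above retries `txn_inner` only when the backend reports a serialization conflict, backing off exponentially (10 ms doubling up to 200 ms) for at most `txn_retry_count` attempts before giving up. A std-only sketch of that retry policy, with `run_txn` and `is_serialization_error` as hypothetical stand-ins for the real transaction call and error check:

use std::thread::sleep;
use std::time::Duration;

#[derive(Debug)]
enum TxnError {
    Serialization,
    Other(String),
}

fn is_serialization_error(e: &TxnError) -> bool {
    matches!(e, TxnError::Serialization)
}

fn run_txn(attempt: usize) -> Result<&'static str, TxnError> {
    // Pretend the first two attempts hit a serialization conflict.
    if attempt < 2 {
        Err(TxnError::Serialization)
    } else {
        Ok("committed")
    }
}

fn txn_with_retry(max_retries: usize) -> Result<&'static str, TxnError> {
    let mut delay = Duration::from_millis(10);
    let max_delay = Duration::from_millis(200);
    for attempt in 0..=max_retries {
        match run_txn(attempt) {
            Ok(res) => return Ok(res),
            // Non-retriable errors are surfaced immediately.
            Err(e) if !is_serialization_error(&e) => return Err(e),
            // Serialization conflict on the last allowed attempt: give up.
            Err(_) if attempt == max_retries => {
                return Err(TxnError::Other("retry budget exhausted".into()))
            }
            // Serialization conflict with budget left: back off and retry.
            Err(_) => {
                sleep(delay);
                delay = (delay * 2).min(max_delay);
            }
        }
    }
    unreachable!("the final iteration always returns")
}

fn main() {
    assert_eq!(txn_with_retry(3).unwrap(), "committed");
}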


@@ -0,0 +1,624 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::marker::PhantomData;
use std::sync::Arc;
use common_telemetry::debug;
use deadpool_postgres::{Config, Pool, Runtime};
use snafu::ResultExt;
use tokio_postgres::types::ToSql;
use tokio_postgres::{IsolationLevel, NoTls, Row};
use crate::error::{
CreatePostgresPoolSnafu, GetPostgresConnectionSnafu, PostgresExecutionSnafu,
PostgresTransactionSnafu, Result,
};
use crate::kv_backend::rds::{
Executor, ExecutorFactory, ExecutorImpl, KvQueryExecutor, RdsStore, Transaction,
RDS_STORE_TXN_RETRY_COUNT,
};
use crate::kv_backend::KvBackendRef;
use crate::rpc::store::{
BatchDeleteRequest, BatchDeleteResponse, BatchGetRequest, BatchGetResponse, BatchPutRequest,
BatchPutResponse, DeleteRangeRequest, DeleteRangeResponse, RangeRequest, RangeResponse,
};
use crate::rpc::KeyValue;
pub struct PgClient(deadpool::managed::Object<deadpool_postgres::Manager>);
pub struct PgTxnClient<'a>(deadpool_postgres::Transaction<'a>);
/// Converts a row to a [`KeyValue`].
fn key_value_from_row(r: Row) -> KeyValue {
KeyValue {
key: r.get(0),
value: r.get(1),
}
}
const EMPTY: &[u8] = &[0];
/// Type of range template.
#[derive(Debug, Clone, Copy)]
enum RangeTemplateType {
Point,
Range,
Full,
LeftBounded,
Prefix,
}
/// Builds params for the given range template type.
impl RangeTemplateType {
fn build_params(&self, mut key: Vec<u8>, range_end: Vec<u8>) -> Vec<Vec<u8>> {
match self {
RangeTemplateType::Point => vec![key],
RangeTemplateType::Range => vec![key, range_end],
RangeTemplateType::Full => vec![],
RangeTemplateType::LeftBounded => vec![key],
RangeTemplateType::Prefix => {
key.push(b'%');
vec![key]
}
}
}
}
/// Templates for range request.
#[derive(Debug, Clone)]
struct RangeTemplate {
point: String,
range: String,
full: String,
left_bounded: String,
prefix: String,
}
impl RangeTemplate {
/// Gets the template for the given type.
fn get(&self, typ: RangeTemplateType) -> &str {
match typ {
RangeTemplateType::Point => &self.point,
RangeTemplateType::Range => &self.range,
RangeTemplateType::Full => &self.full,
RangeTemplateType::LeftBounded => &self.left_bounded,
RangeTemplateType::Prefix => &self.prefix,
}
}
/// Adds limit to the template.
fn with_limit(template: &str, limit: i64) -> String {
if limit == 0 {
return format!("{};", template);
}
format!("{} LIMIT {};", template, limit)
}
}
fn is_prefix_range(start: &[u8], end: &[u8]) -> bool {
if start.len() != end.len() {
return false;
}
let l = start.len();
let same_prefix = start[0..l - 1] == end[0..l - 1];
if let (Some(rhs), Some(lhs)) = (start.last(), end.last()) {
return same_prefix && (*rhs + 1) == *lhs;
}
false
}
/// Determine the template type for range request.
fn range_template(key: &[u8], range_end: &[u8]) -> RangeTemplateType {
match (key, range_end) {
(_, &[]) => RangeTemplateType::Point,
(EMPTY, EMPTY) => RangeTemplateType::Full,
(_, EMPTY) => RangeTemplateType::LeftBounded,
(start, end) => {
if is_prefix_range(start, end) {
RangeTemplateType::Prefix
} else {
RangeTemplateType::Range
}
}
}
}
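`range_template` above picks the SQL template from the raw key pair; the interesting case is the prefix scan, where the range end equals the start key with its last byte incremented (the shape a prefix range request typically produces) and the query becomes a `LIKE 'prefix%'`. A standalone restatement of that check, written with a wrapping add to sidestep the overflow edge case:

fn is_prefix_range(start: &[u8], end: &[u8]) -> bool {
    start.len() == end.len()
        && !start.is_empty()
        && start[..start.len() - 1] == end[..end.len() - 1]
        && start.last().map(|b| b.wrapping_add(1)) == end.last().copied()
}

fn main() {
    // b'/' + 1 == b'0', so this is exactly "everything under the prefix".
    assert!(is_prefix_range(b"greptime/", b"greptime0"));
    // An arbitrary [start, end) range is not a prefix scan.
    assert!(!is_prefix_range(b"a", b"c"));
}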
/// Generates `IN` clause placeholders (`$1`, `$2`, ...) for PostgreSQL.
fn pg_generate_in_placeholders(from: usize, to: usize) -> Vec<String> {
(from..=to).map(|i| format!("${}", i)).collect()
}
/// Factory for building sql templates.
struct PgSqlTemplateFactory<'a> {
table_name: &'a str,
}
impl<'a> PgSqlTemplateFactory<'a> {
/// Creates a new [`PgSqlTemplateFactory`] with the given table name.
fn new(table_name: &'a str) -> Self {
Self { table_name }
}
/// Builds the template set for the given table name.
fn build(&self) -> PgSqlTemplateSet {
let table_name = self.table_name;
PgSqlTemplateSet {
table_name: table_name.to_string(),
create_table_statement: format!(
"CREATE TABLE IF NOT EXISTS {table_name}(k bytea PRIMARY KEY, v bytea)",
),
range_template: RangeTemplate {
point: format!("SELECT k, v FROM {table_name} WHERE k = $1"),
range: format!("SELECT k, v FROM {table_name} WHERE k >= $1 AND k < $2 ORDER BY k"),
full: format!("SELECT k, v FROM {table_name} ORDER BY k"),
left_bounded: format!("SELECT k, v FROM {table_name} WHERE k >= $1 ORDER BY k"),
prefix: format!("SELECT k, v FROM {table_name} WHERE k LIKE $1 ORDER BY k"),
},
delete_template: RangeTemplate {
point: format!("DELETE FROM {table_name} WHERE k = $1 RETURNING k,v;"),
range: format!("DELETE FROM {table_name} WHERE k >= $1 AND k < $2 RETURNING k,v;"),
full: format!("DELETE FROM {table_name} RETURNING k,v"),
left_bounded: format!("DELETE FROM {table_name} WHERE k >= $1 RETURNING k,v;"),
prefix: format!("DELETE FROM {table_name} WHERE k LIKE $1 RETURNING k,v;"),
},
}
}
}
/// Templates for the given table name.
#[derive(Debug, Clone)]
pub struct PgSqlTemplateSet {
table_name: String,
create_table_statement: String,
range_template: RangeTemplate,
delete_template: RangeTemplate,
}
impl PgSqlTemplateSet {
/// Generates the sql for batch get.
fn generate_batch_get_query(&self, key_len: usize) -> String {
let table_name = &self.table_name;
let in_clause = pg_generate_in_placeholders(1, key_len).join(", ");
format!("SELECT k, v FROM {table_name} WHERE k in ({});", in_clause)
}
/// Generates the sql for batch delete.
fn generate_batch_delete_query(&self, key_len: usize) -> String {
let table_name = &self.table_name;
let in_clause = pg_generate_in_placeholders(1, key_len).join(", ");
format!(
"DELETE FROM {table_name} WHERE k in ({}) RETURNING k,v;",
in_clause
)
}
/// Generates the sql for batch upsert.
fn generate_batch_upsert_query(&self, kv_len: usize) -> String {
let table_name = &self.table_name;
let in_placeholders: Vec<String> = (1..=kv_len).map(|i| format!("${}", i)).collect();
let in_clause = in_placeholders.join(", ");
let mut param_index = kv_len + 1;
let mut values_placeholders = Vec::new();
for _ in 0..kv_len {
values_placeholders.push(format!("(${0}, ${1})", param_index, param_index + 1));
param_index += 2;
}
let values_clause = values_placeholders.join(", ");
format!(
r#"
WITH prev AS (
SELECT k,v FROM {table_name} WHERE k IN ({in_clause})
), update AS (
INSERT INTO {table_name} (k, v) VALUES
{values_clause}
ON CONFLICT (
k
) DO UPDATE SET
v = excluded.v
)
SELECT k, v FROM prev;
"#
)
}
}
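The batch queries above rely on a simple placeholder layout: the first `kv_len` parameters feed the `IN (...)` clause that captures previous values, and the following `(key, value)` pairs feed the `VALUES` list of the upsert, which matches how `batch_put_with_query_executor` later assembles its parameter vector. A small standalone sketch (a re-derivation for illustration, not the crate's API) that prints the clauses for two key-value pairs:

fn placeholders(from: usize, to: usize) -> Vec<String> {
    (from..=to).map(|i| format!("${i}")).collect()
}

fn main() {
    let kv_len = 2;
    // Keys first: they feed the IN (...) clause that reads previous values.
    let in_clause = placeholders(1, kv_len).join(", ");
    // Then (key, value) pairs: they feed the VALUES list of the upsert.
    let values_clause = (0..kv_len)
        .map(|i| format!("(${}, ${})", kv_len + 1 + 2 * i, kv_len + 2 + 2 * i))
        .collect::<Vec<_>>()
        .join(", ");
    println!("IN ({in_clause})");       // IN ($1, $2)
    println!("VALUES {values_clause}"); // VALUES ($3, $4), ($5, $6)
}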
#[async_trait::async_trait]
impl Executor for PgClient {
type Transaction<'a>
= PgTxnClient<'a>
where
Self: 'a;
fn name() -> &'static str {
"Postgres"
}
async fn query(&mut self, query: &str, params: &[&Vec<u8>]) -> Result<Vec<KeyValue>> {
let params: Vec<&(dyn ToSql + Sync)> = params.iter().map(|p| p as _).collect();
let stmt = self
.0
.prepare_cached(query)
.await
.context(PostgresExecutionSnafu { sql: query })?;
let rows = self
.0
.query(&stmt, &params)
.await
.context(PostgresExecutionSnafu { sql: query })?;
Ok(rows.into_iter().map(key_value_from_row).collect())
}
async fn txn_executor<'a>(&'a mut self) -> Result<Self::Transaction<'a>> {
let txn = self
.0
.build_transaction()
.isolation_level(IsolationLevel::Serializable)
.start()
.await
.context(PostgresTransactionSnafu {
operation: "begin".to_string(),
})?;
Ok(PgTxnClient(txn))
}
}
#[async_trait::async_trait]
impl<'a> Transaction<'a> for PgTxnClient<'a> {
async fn query(&mut self, query: &str, params: &[&Vec<u8>]) -> Result<Vec<KeyValue>> {
let params: Vec<&(dyn ToSql + Sync)> = params.iter().map(|p| p as _).collect();
let stmt = self
.0
.prepare_cached(query)
.await
.context(PostgresExecutionSnafu { sql: query })?;
let rows = self
.0
.query(&stmt, &params)
.await
.context(PostgresExecutionSnafu { sql: query })?;
Ok(rows.into_iter().map(key_value_from_row).collect())
}
async fn commit(self) -> Result<()> {
self.0.commit().await.context(PostgresTransactionSnafu {
operation: "commit",
})?;
Ok(())
}
}
pub struct PgExecutorFactory {
pool: Pool,
}
impl PgExecutorFactory {
async fn client(&self) -> Result<PgClient> {
match self.pool.get().await {
Ok(client) => Ok(PgClient(client)),
Err(e) => GetPostgresConnectionSnafu {
reason: e.to_string(),
}
.fail(),
}
}
}
#[async_trait::async_trait]
impl ExecutorFactory<PgClient> for PgExecutorFactory {
async fn default_executor(&self) -> Result<PgClient> {
self.client().await
}
async fn txn_executor<'a>(
&self,
default_executor: &'a mut PgClient,
) -> Result<PgTxnClient<'a>> {
default_executor.txn_executor().await
}
}
/// A PostgreSQL-backed key-value store for metasrv.
/// It uses [deadpool_postgres::Pool] as the connection pool for [RdsStore].
pub type PgStore = RdsStore<PgClient, PgExecutorFactory, PgSqlTemplateSet>;
#[async_trait::async_trait]
impl KvQueryExecutor<PgClient> for PgStore {
async fn range_with_query_executor(
&self,
query_executor: &mut ExecutorImpl<'_, PgClient>,
req: RangeRequest,
) -> Result<RangeResponse> {
let template_type = range_template(&req.key, &req.range_end);
let template = self.sql_template_set.range_template.get(template_type);
let params = template_type.build_params(req.key, req.range_end);
let params_ref = params.iter().collect::<Vec<_>>();
// Always add 1 to limit to check if there is more data
let query =
RangeTemplate::with_limit(template, if req.limit == 0 { 0 } else { req.limit + 1 });
let limit = req.limit as usize;
debug!("query: {:?}, params: {:?}", query, params);
let mut kvs = query_executor.query(&query, &params_ref).await?;
if req.keys_only {
kvs.iter_mut().for_each(|kv| kv.value = vec![]);
}
// If limit is 0, we always return all data
if limit == 0 || kvs.len() <= limit {
return Ok(RangeResponse { kvs, more: false });
}
// Otherwise we fetched the extra probe row: remove it and set `more` to true
let removed = kvs.pop();
debug_assert!(removed.is_some());
Ok(RangeResponse { kvs, more: true })
}
async fn batch_put_with_query_executor(
&self,
query_executor: &mut ExecutorImpl<'_, PgClient>,
req: BatchPutRequest,
) -> Result<BatchPutResponse> {
let mut in_params = Vec::with_capacity(req.kvs.len() * 3);
let mut values_params = Vec::with_capacity(req.kvs.len() * 2);
for kv in &req.kvs {
let processed_key = &kv.key;
in_params.push(processed_key);
let processed_value = &kv.value;
values_params.push(processed_key);
values_params.push(processed_value);
}
in_params.extend(values_params);
let params = in_params.iter().map(|x| x as _).collect::<Vec<_>>();
let query = self
.sql_template_set
.generate_batch_upsert_query(req.kvs.len());
let kvs = query_executor.query(&query, &params).await?;
if req.prev_kv {
Ok(BatchPutResponse { prev_kvs: kvs })
} else {
Ok(BatchPutResponse::default())
}
}
/// Batch get with a given client. It's needed when the client holds a transaction.
async fn batch_get_with_query_executor(
&self,
query_executor: &mut ExecutorImpl<'_, PgClient>,
req: BatchGetRequest,
) -> Result<BatchGetResponse> {
if req.keys.is_empty() {
return Ok(BatchGetResponse { kvs: vec![] });
}
let query = self
.sql_template_set
.generate_batch_get_query(req.keys.len());
let params = req.keys.iter().map(|x| x as _).collect::<Vec<_>>();
let kvs = query_executor.query(&query, &params).await?;
Ok(BatchGetResponse { kvs })
}
async fn delete_range_with_query_executor(
&self,
query_executor: &mut ExecutorImpl<'_, PgClient>,
req: DeleteRangeRequest,
) -> Result<DeleteRangeResponse> {
let template_type = range_template(&req.key, &req.range_end);
let template = self.sql_template_set.delete_template.get(template_type);
let params = template_type.build_params(req.key, req.range_end);
let params_ref = params.iter().map(|x| x as _).collect::<Vec<_>>();
let kvs = query_executor.query(template, &params_ref).await?;
let mut resp = DeleteRangeResponse::new(kvs.len() as i64);
if req.prev_kv {
resp.with_prev_kvs(kvs);
}
Ok(resp)
}
async fn batch_delete_with_query_executor(
&self,
query_executor: &mut ExecutorImpl<'_, PgClient>,
req: BatchDeleteRequest,
) -> Result<BatchDeleteResponse> {
if req.keys.is_empty() {
return Ok(BatchDeleteResponse::default());
}
let query = self
.sql_template_set
.generate_batch_delete_query(req.keys.len());
let params = req.keys.iter().map(|x| x as _).collect::<Vec<_>>();
let kvs = query_executor.query(&query, &params).await?;
if req.prev_kv {
Ok(BatchDeleteResponse { prev_kvs: kvs })
} else {
Ok(BatchDeleteResponse::default())
}
}
}
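`range_with_query_executor` above uses the classic probe-row trick: request `limit + 1` rows, and if the extra row comes back, drop it and report `more = true` so the pagination stream issues another request. A tiny standalone sketch of that decision:

fn page(mut rows: Vec<u32>, limit: usize) -> (Vec<u32>, bool) {
    // Pretend `rows` came back from `SELECT ... LIMIT limit + 1`.
    if limit == 0 || rows.len() <= limit {
        return (rows, false);
    }
    rows.pop(); // drop the extra probe row
    (rows, true)
}

fn main() {
    assert_eq!(page(vec![1, 2, 3], 2), (vec![1, 2], true));
    assert_eq!(page(vec![1, 2], 2), (vec![1, 2], false));
    assert_eq!(page(vec![1, 2, 3], 0), (vec![1, 2, 3], false));
}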
impl PgStore {
/// Create [PgStore] impl of [KvBackendRef] from url.
pub async fn with_url(url: &str, table_name: &str, max_txn_ops: usize) -> Result<KvBackendRef> {
let mut cfg = Config::new();
cfg.url = Some(url.to_string());
// TODO(weny, CookiePie): add tls support
let pool = cfg
.create_pool(Some(Runtime::Tokio1), NoTls)
.context(CreatePostgresPoolSnafu)?;
Self::with_pg_pool(pool, table_name, max_txn_ops).await
}
/// Create [PgStore] impl of [KvBackendRef] from [deadpool_postgres::Pool].
pub async fn with_pg_pool(
pool: Pool,
table_name: &str,
max_txn_ops: usize,
) -> Result<KvBackendRef> {
// This step ensures the postgres metadata backend is ready to use.
// We check whether the metadata table exists, and create it if it does not.
let client = match pool.get().await {
Ok(client) => client,
Err(e) => {
return GetPostgresConnectionSnafu {
reason: e.to_string(),
}
.fail();
}
};
let template_factory = PgSqlTemplateFactory::new(table_name);
let sql_template_set = template_factory.build();
client
.execute(&sql_template_set.create_table_statement, &[])
.await
.with_context(|_| PostgresExecutionSnafu {
sql: sql_template_set.create_table_statement.to_string(),
})?;
Ok(Arc::new(Self {
max_txn_ops,
sql_template_set,
txn_retry_count: RDS_STORE_TXN_RETRY_COUNT,
executor_factory: PgExecutorFactory { pool },
_phantom: PhantomData,
}))
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::kv_backend::test::{
prepare_kv_with_prefix, test_kv_batch_delete_with_prefix, test_kv_batch_get_with_prefix,
test_kv_compare_and_put_with_prefix, test_kv_delete_range_with_prefix,
test_kv_put_with_prefix, test_kv_range_2_with_prefix, test_kv_range_with_prefix,
test_txn_compare_equal, test_txn_compare_greater, test_txn_compare_less,
test_txn_compare_not_equal, test_txn_one_compare_op, text_txn_multi_compare_op,
unprepare_kv,
};
async fn build_pg_kv_backend(table_name: &str) -> Option<PgStore> {
let endpoints = std::env::var("GT_POSTGRES_ENDPOINTS").unwrap_or_default();
if endpoints.is_empty() {
return None;
}
let mut cfg = Config::new();
cfg.url = Some(endpoints);
let pool = cfg
.create_pool(Some(Runtime::Tokio1), NoTls)
.context(CreatePostgresPoolSnafu)
.unwrap();
let client = pool.get().await.unwrap();
let template_factory = PgSqlTemplateFactory::new(table_name);
let sql_templates = template_factory.build();
client
.execute(&sql_templates.create_table_statement, &[])
.await
.context(PostgresExecutionSnafu {
sql: sql_templates.create_table_statement.to_string(),
})
.unwrap();
Some(PgStore {
max_txn_ops: 128,
sql_template_set: sql_templates,
txn_retry_count: RDS_STORE_TXN_RETRY_COUNT,
executor_factory: PgExecutorFactory { pool },
_phantom: PhantomData,
})
}
#[tokio::test]
async fn test_pg_put() {
let kv_backend = build_pg_kv_backend("put_test").await.unwrap();
let prefix = b"put/";
prepare_kv_with_prefix(&kv_backend, prefix.to_vec()).await;
test_kv_put_with_prefix(&kv_backend, prefix.to_vec()).await;
unprepare_kv(&kv_backend, prefix).await;
}
#[tokio::test]
async fn test_pg_range() {
let kv_backend = build_pg_kv_backend("range_test").await.unwrap();
let prefix = b"range/";
prepare_kv_with_prefix(&kv_backend, prefix.to_vec()).await;
test_kv_range_with_prefix(&kv_backend, prefix.to_vec()).await;
unprepare_kv(&kv_backend, prefix).await;
}
#[tokio::test]
async fn test_pg_range_2() {
let kv_backend = build_pg_kv_backend("range2_test").await.unwrap();
let prefix = b"range2/";
test_kv_range_2_with_prefix(&kv_backend, prefix.to_vec()).await;
unprepare_kv(&kv_backend, prefix).await;
}
#[tokio::test]
async fn test_pg_batch_get() {
let kv_backend = build_pg_kv_backend("batch_get_test").await.unwrap();
let prefix = b"batch_get/";
prepare_kv_with_prefix(&kv_backend, prefix.to_vec()).await;
test_kv_batch_get_with_prefix(&kv_backend, prefix.to_vec()).await;
unprepare_kv(&kv_backend, prefix).await;
}
#[tokio::test]
async fn test_pg_batch_delete() {
let kv_backend = build_pg_kv_backend("batch_delete_test").await.unwrap();
let prefix = b"batch_delete/";
prepare_kv_with_prefix(&kv_backend, prefix.to_vec()).await;
test_kv_delete_range_with_prefix(&kv_backend, prefix.to_vec()).await;
unprepare_kv(&kv_backend, prefix).await;
}
#[tokio::test]
async fn test_pg_batch_delete_with_prefix() {
let kv_backend = build_pg_kv_backend("batch_delete_with_prefix_test")
.await
.unwrap();
let prefix = b"batch_delete/";
prepare_kv_with_prefix(&kv_backend, prefix.to_vec()).await;
test_kv_batch_delete_with_prefix(&kv_backend, prefix.to_vec()).await;
unprepare_kv(&kv_backend, prefix).await;
}
#[tokio::test]
async fn test_pg_delete_range() {
let kv_backend = build_pg_kv_backend("delete_range_test").await.unwrap();
let prefix = b"delete_range/";
prepare_kv_with_prefix(&kv_backend, prefix.to_vec()).await;
test_kv_delete_range_with_prefix(&kv_backend, prefix.to_vec()).await;
unprepare_kv(&kv_backend, prefix).await;
}
#[tokio::test]
async fn test_pg_compare_and_put() {
let kv_backend = build_pg_kv_backend("compare_and_put_test").await.unwrap();
let prefix = b"compare_and_put/";
let kv_backend = Arc::new(kv_backend);
test_kv_compare_and_put_with_prefix(kv_backend.clone(), prefix.to_vec()).await;
}
#[tokio::test]
async fn test_pg_txn() {
let kv_backend = build_pg_kv_backend("txn_test").await.unwrap();
test_txn_one_compare_op(&kv_backend).await;
text_txn_multi_compare_op(&kv_backend).await;
test_txn_compare_equal(&kv_backend).await;
test_txn_compare_greater(&kv_backend).await;
test_txn_compare_less(&kv_backend).await;
test_txn_compare_not_equal(&kv_backend).await;
}
}


@@ -34,6 +34,7 @@ pub mod kv_backend;
pub mod leadership_notifier;
pub mod lock_key;
pub mod metrics;
pub mod node_expiry_listener;
pub mod node_manager;
pub mod peer;
pub mod range_stream;


@@ -0,0 +1,152 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::sync::Mutex;
use std::time::Duration;
use common_telemetry::{debug, error, info, warn};
use tokio::task::JoinHandle;
use tokio::time::{interval, MissedTickBehavior};
use crate::cluster::{NodeInfo, NodeInfoKey};
use crate::error;
use crate::kv_backend::ResettableKvBackendRef;
use crate::leadership_notifier::LeadershipChangeListener;
use crate::rpc::store::RangeRequest;
use crate::rpc::KeyValue;
/// [NodeExpiryListener] periodically checks all node info in memory and removes
/// expired node info to prevent memory leaks.
pub struct NodeExpiryListener {
handle: Mutex<Option<JoinHandle<()>>>,
max_idle_time: Duration,
in_memory: ResettableKvBackendRef,
}
impl Drop for NodeExpiryListener {
fn drop(&mut self) {
self.stop();
}
}
impl NodeExpiryListener {
pub fn new(max_idle_time: Duration, in_memory: ResettableKvBackendRef) -> Self {
Self {
handle: Mutex::new(None),
max_idle_time,
in_memory,
}
}
async fn start(&self) {
let mut handle = self.handle.lock().unwrap();
if handle.is_none() {
let in_memory = self.in_memory.clone();
let max_idle_time = self.max_idle_time;
let ticker_loop = tokio::spawn(async move {
// Run clean task every minute.
let mut interval = interval(Duration::from_secs(60));
interval.set_missed_tick_behavior(MissedTickBehavior::Skip);
loop {
interval.tick().await;
if let Err(e) = Self::clean_expired_nodes(&in_memory, max_idle_time).await {
error!(e; "Failed to clean expired node");
}
}
});
*handle = Some(ticker_loop);
}
}
fn stop(&self) {
if let Some(handle) = self.handle.lock().unwrap().take() {
handle.abort();
info!("Node expiry listener stopped")
}
}
/// Cleans expired nodes from memory.
async fn clean_expired_nodes(
in_memory: &ResettableKvBackendRef,
max_idle_time: Duration,
) -> error::Result<()> {
let node_keys = Self::list_expired_nodes(in_memory, max_idle_time).await?;
for key in node_keys {
let key_bytes: Vec<u8> = (&key).into();
if let Err(e) = in_memory.delete(&key_bytes, false).await {
warn!(e; "Failed to delete expired node: {:?}", key_bytes);
} else {
debug!("Deleted expired node key: {:?}", key);
}
}
Ok(())
}
/// Lists expired nodes that have been inactive for more than `max_idle_time`.
async fn list_expired_nodes(
in_memory: &ResettableKvBackendRef,
max_idle_time: Duration,
) -> error::Result<impl Iterator<Item = NodeInfoKey>> {
let prefix = NodeInfoKey::key_prefix_with_cluster_id(0);
let req = RangeRequest::new().with_prefix(prefix);
let current_time_millis = common_time::util::current_time_millis();
let resp = in_memory.range(req).await?;
Ok(resp
.kvs
.into_iter()
.filter_map(move |KeyValue { key, value }| {
let Ok(info) = NodeInfo::try_from(value).inspect_err(|e| {
warn!(e; "Unrecognized node info value");
}) else {
return None;
};
if (current_time_millis - info.last_activity_ts) > max_idle_time.as_millis() as i64
{
NodeInfoKey::try_from(key)
.inspect_err(|e| {
warn!(e; "Unrecognized node info key: {:?}", info.peer);
})
.ok()
.inspect(|node_key| {
debug!("Found expired node: {:?}", node_key);
})
} else {
None
}
}))
}
}
#[async_trait::async_trait]
impl LeadershipChangeListener for NodeExpiryListener {
fn name(&self) -> &str {
"NodeExpiryListener"
}
async fn on_leader_start(&self) -> error::Result<()> {
self.start().await;
info!(
"On leader start, node expiry listener started with max idle time: {:?}",
self.max_idle_time
);
Ok(())
}
async fn on_leader_stop(&self) -> error::Result<()> {
self.stop();
info!("On leader stop, node expiry listener stopped");
Ok(())
}
}
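A standalone restatement of the expiry predicate used by `list_expired_nodes` above: a node counts as expired once the gap between the current wall-clock time and its last reported activity exceeds `max_idle_time`. Timestamps are plain epoch milliseconds here, and the one-day cutoff is only an assumed example:

use std::time::Duration;

fn is_expired(now_ms: i64, last_activity_ms: i64, max_idle_time: Duration) -> bool {
    (now_ms - last_activity_ms) > max_idle_time.as_millis() as i64
}

fn main() {
    let max_idle = Duration::from_secs(24 * 60 * 60); // assumed one-day cutoff
    let now = 1_700_000_000_000;
    assert!(is_expired(now, now - 2 * 86_400_000, max_idle)); // idle for two days
    assert!(!is_expired(now, now - 3_600_000, max_idle)); // active an hour ago
}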


@@ -1239,6 +1239,7 @@ impl From<QueryContext> for PbQueryContext {
timezone,
extensions,
channel: channel as u32,
snapshot_seqs: None,
}
}
}


@@ -10,7 +10,7 @@ workspace = true
[dependencies]
arrow.workspace = true
chrono.workspace = true
chrono-tz = "0.8"
chrono-tz.workspace = true
common-error.workspace = true
common-macro.workspace = true
humantime.workspace = true


@@ -39,7 +39,7 @@ datafusion-common.workspace = true
datafusion-expr.workspace = true
datatypes.workspace = true
file-engine.workspace = true
futures = "0.3"
futures.workspace = true
futures-util.workspace = true
humantime-serde.workspace = true
lazy_static.workspace = true
@@ -47,6 +47,7 @@ log-store.workspace = true
meta-client.workspace = true
metric-engine.workspace = true
mito2.workspace = true
num_cpus.workspace = true
object-store.workspace = true
prometheus.workspace = true
prost.workspace = true


@@ -260,6 +260,14 @@ pub enum Error {
source: BoxedError,
},
#[snafu(display("Failed to handle batch ddl request, ddl_type: {}", ddl_type))]
HandleBatchDdlRequest {
#[snafu(implicit)]
location: Location,
source: BoxedError,
ddl_type: String,
},
#[snafu(display("RegionId {} not found", region_id))]
RegionNotFound {
region_id: RegionId,
@@ -438,7 +446,8 @@ impl ErrorExt for Error {
UnsupportedOutput { .. } => StatusCode::Unsupported,
HandleRegionRequest { source, .. }
| GetRegionMetadata { source, .. }
| HandleBatchOpenRequest { source, .. } => source.status_code(),
| HandleBatchOpenRequest { source, .. }
| HandleBatchDdlRequest { source, .. } => source.status_code(),
StopRegionEngine { source, .. } => source.status_code(),
FindLogicalRegions { source, .. } => source.status_code(),


@@ -224,6 +224,20 @@ impl HeartbeatTask {
common_runtime::spawn_hb(async move {
let sleep = tokio::time::sleep(Duration::from_millis(0));
tokio::pin!(sleep);
let build_info = common_version::build_info();
let heartbeat_request = HeartbeatRequest {
peer: self_peer,
node_epoch,
info: Some(NodeInfo {
version: build_info.version.to_string(),
git_commit: build_info.commit_short.to_string(),
start_time_ms: node_epoch,
cpus: num_cpus::get() as u32,
}),
..Default::default()
};
loop {
if !running.load(Ordering::Relaxed) {
info!("shutdown heartbeat task");
@@ -235,9 +249,8 @@ impl HeartbeatTask {
match outgoing_message_to_mailbox_message(message) {
Ok(message) => {
let req = HeartbeatRequest {
peer: self_peer.clone(),
mailbox_message: Some(message),
..Default::default()
..heartbeat_request.clone()
};
HEARTBEAT_RECV_COUNT.with_label_values(&["success"]).inc();
Some(req)
@@ -253,22 +266,13 @@ impl HeartbeatTask {
}
}
_ = &mut sleep => {
let build_info = common_version::build_info();
let region_stats = Self::load_region_stats(&region_server_clone);
let now = Instant::now();
let duration_since_epoch = (now - epoch).as_millis() as u64;
let req = HeartbeatRequest {
peer: self_peer.clone(),
region_stats,
duration_since_epoch,
node_epoch,
info: Some(NodeInfo {
version: build_info.version.to_string(),
git_commit: build_info.commit_short.to_string(),
// The start timestamp is the same as node_epoch currently.
start_time_ms: node_epoch,
}),
..Default::default()
..heartbeat_request.clone()
};
sleep.as_mut().reset(now + Duration::from_millis(interval));
Some(req)
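The refactor above builds the invariant part of the heartbeat request once (peer, node epoch, node info) and stamps only the varying fields onto a clone with struct update syntax. A minimal sketch of that pattern; `Heartbeat` is a hypothetical stand-in for the protobuf `HeartbeatRequest`:

#[derive(Clone, Debug, Default)]
struct Heartbeat {
    peer: String,
    node_epoch: u64,
    cpus: u32,
    mailbox_message: Option<String>,
    duration_since_epoch: u64,
}

fn main() {
    // Built once, outside the heartbeat loop.
    let template = Heartbeat {
        peer: "datanode-1:4001".to_string(),
        node_epoch: 1,
        cpus: 8,
        ..Default::default()
    };

    // Per-tick request: only the varying fields are filled in.
    let tick = Heartbeat {
        duration_since_epoch: 5_000,
        ..template.clone()
    };
    // Mailbox reply: same template, a different varying field.
    let reply = Heartbeat {
        mailbox_message: Some("ack".to_string()),
        ..template.clone()
    };
    println!("{tick:?}\n{reply:?}");
}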


@@ -59,7 +59,7 @@ use store_api::region_engine::{
SettableRegionRoleState,
};
use store_api::region_request::{
AffectedRows, RegionCloseRequest, RegionOpenRequest, RegionRequest,
AffectedRows, BatchRegionDdlRequest, RegionCloseRequest, RegionOpenRequest, RegionRequest,
};
use store_api::storage::RegionId;
use tokio::sync::{Semaphore, SemaphorePermit};
@@ -69,9 +69,10 @@ use tonic::{Request, Response, Result as TonicResult};
use crate::error::{
self, BuildRegionRequestsSnafu, ConcurrentQueryLimiterClosedSnafu,
ConcurrentQueryLimiterTimeoutSnafu, DataFusionSnafu, DecodeLogicalPlanSnafu,
ExecuteLogicalPlanSnafu, FindLogicalRegionsSnafu, HandleBatchOpenRequestSnafu,
HandleRegionRequestSnafu, NewPlanDecoderSnafu, RegionEngineNotFoundSnafu, RegionNotFoundSnafu,
RegionNotReadySnafu, Result, StopRegionEngineSnafu, UnexpectedSnafu, UnsupportedOutputSnafu,
ExecuteLogicalPlanSnafu, FindLogicalRegionsSnafu, HandleBatchDdlRequestSnafu,
HandleBatchOpenRequestSnafu, HandleRegionRequestSnafu, NewPlanDecoderSnafu,
RegionEngineNotFoundSnafu, RegionNotFoundSnafu, RegionNotReadySnafu, Result,
StopRegionEngineSnafu, UnexpectedSnafu, UnsupportedOutputSnafu,
};
use crate::event_listener::RegionServerEventListenerRef;
@@ -158,7 +159,12 @@ impl RegionServer {
self.inner.handle_request(region_id, request).await
}
async fn table_provider(&self, region_id: RegionId) -> Result<Arc<dyn TableProvider>> {
/// Returns a table provider for the region, setting the snapshot sequence if it is available in the context.
async fn table_provider(
&self,
region_id: RegionId,
ctx: Option<&session::context::QueryContext>,
) -> Result<Arc<dyn TableProvider>> {
let status = self
.inner
.region_map
@@ -172,7 +178,7 @@ impl RegionServer {
self.inner
.table_provider_factory
.create(region_id, status.into_engine())
.create(region_id, status.into_engine(), ctx)
.await
.context(ExecuteLogicalPlanSnafu)
}
@@ -187,9 +193,6 @@ impl RegionServer {
} else {
None
};
let region_id = RegionId::from_u64(request.region_id);
let provider = self.table_provider(region_id).await?;
let catalog_list = Arc::new(DummyCatalogList::with_table_provider(provider));
let query_ctx: QueryContextRef = request
.header
@@ -197,6 +200,10 @@ impl RegionServer {
.map(|h| Arc::new(h.into()))
.unwrap_or_else(|| Arc::new(QueryContextBuilder::default().build()));
let region_id = RegionId::from_u64(request.region_id);
let provider = self.table_provider(region_id, Some(&query_ctx)).await?;
let catalog_list = Arc::new(DummyCatalogList::with_table_provider(provider));
let decoder = self
.inner
.query_engine
@@ -225,7 +232,10 @@ impl RegionServer {
} else {
None
};
let provider = self.table_provider(request.region_id).await?;
let ctx: Option<session::context::QueryContext> = request.header.as_ref().map(|h| h.into());
let provider = self.table_provider(request.region_id, ctx.as_ref()).await?;
struct RegionDataSourceInjector {
source: Arc<dyn TableSource>,
@@ -344,62 +354,47 @@ impl RegionServer {
.region_map
.insert(region_id, RegionEngineWithStatus::Ready(engine));
}
}
#[async_trait]
impl RegionServerHandler for RegionServer {
async fn handle(&self, request: region_request::Body) -> ServerResult<RegionResponseV1> {
let is_parallel = matches!(
request,
region_request::Body::Inserts(_) | region_request::Body::Deletes(_)
);
let requests = RegionRequest::try_from_request_body(request)
.context(BuildRegionRequestsSnafu)
.map_err(BoxedError::new)
.context(ExecuteGrpcRequestSnafu)?;
async fn handle_batch_ddl_requests(
&self,
request: region_request::Body,
) -> Result<RegionResponse> {
// Safety: we have already checked the request type in `RegionServer::handle()`.
let batch_request = BatchRegionDdlRequest::try_from_request_body(request)
.context(BuildRegionRequestsSnafu)?
.unwrap();
let tracing_context = TracingContext::from_current_span();
let results = if is_parallel {
let join_tasks = requests.into_iter().map(|(region_id, req)| {
let self_to_move = self.clone();
let span = tracing_context.attach(info_span!(
"RegionServer::handle_region_request",
region_id = region_id.to_string()
));
async move {
self_to_move
.handle_request(region_id, req)
.trace(span)
.await
}
});
let span = tracing_context.attach(info_span!("RegionServer::handle_batch_ddl_requests"));
self.inner
.handle_batch_request(batch_request)
.trace(span)
.await
}
try_join_all(join_tasks)
.await
.map_err(BoxedError::new)
.context(ExecuteGrpcRequestSnafu)?
} else {
let mut results = Vec::with_capacity(requests.len());
// FIXME(jeremy, ruihang): Once the engine supports merged calls, we should immediately
// modify this part to avoid inefficient serial loop calls.
for (region_id, req) in requests {
let span = tracing_context.attach(info_span!(
"RegionServer::handle_region_request",
region_id = region_id.to_string()
));
let result = self
async fn handle_requests_in_parallel(
&self,
request: region_request::Body,
) -> Result<RegionResponse> {
let requests =
RegionRequest::try_from_request_body(request).context(BuildRegionRequestsSnafu)?;
let tracing_context = TracingContext::from_current_span();
let join_tasks = requests.into_iter().map(|(region_id, req)| {
let self_to_move = self;
let span = tracing_context.attach(info_span!(
"RegionServer::handle_region_request",
region_id = region_id.to_string()
));
async move {
self_to_move
.handle_request(region_id, req)
.trace(span)
.await
.map_err(BoxedError::new)
.context(ExecuteGrpcRequestSnafu)?;
results.push(result);
}
results
};
});
// Merge results by summing up affected rows and merging extensions.
let results = try_join_all(join_tasks).await?;
let mut affected_rows = 0;
let mut extensions = HashMap::new();
for result in results {
@@ -407,6 +402,57 @@ impl RegionServerHandler for RegionServer {
extensions.extend(result.extensions);
}
Ok(RegionResponse {
affected_rows,
extensions,
})
}
async fn handle_requests_in_serial(
&self,
request: region_request::Body,
) -> Result<RegionResponse> {
let requests =
RegionRequest::try_from_request_body(request).context(BuildRegionRequestsSnafu)?;
let tracing_context = TracingContext::from_current_span();
let mut affected_rows = 0;
let mut extensions = HashMap::new();
// FIXME(jeremy, ruihang): Once the engine supports merged calls, we should immediately
// modify this part to avoid inefficient serial loop calls.
for (region_id, req) in requests {
let span = tracing_context.attach(info_span!(
"RegionServer::handle_region_request",
region_id = region_id.to_string()
));
let result = self.handle_request(region_id, req).trace(span).await?;
affected_rows += result.affected_rows;
extensions.extend(result.extensions);
}
Ok(RegionResponse {
affected_rows,
extensions,
})
}
}
#[async_trait]
impl RegionServerHandler for RegionServer {
async fn handle(&self, request: region_request::Body) -> ServerResult<RegionResponseV1> {
let response = match &request {
region_request::Body::Creates(_)
| region_request::Body::Drops(_)
| region_request::Body::Alters(_) => self.handle_batch_ddl_requests(request).await,
region_request::Body::Inserts(_) | region_request::Body::Deletes(_) => {
self.handle_requests_in_parallel(request).await
}
_ => self.handle_requests_in_serial(request).await,
}
.map_err(BoxedError::new)
.context(ExecuteGrpcRequestSnafu)?;
Ok(RegionResponseV1 {
header: Some(ResponseHeader {
status: Some(Status {
@@ -414,8 +460,8 @@ impl RegionServerHandler for RegionServer {
..Default::default()
}),
}),
affected_rows: affected_rows as _,
extensions,
affected_rows: response.affected_rows as _,
extensions: response.extensions,
})
}
}
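The new `handle` above is a three-way dispatch on the request body: DDL bodies (creates, drops, alters) take the batch path, write bodies (inserts, deletes) fan out in parallel, and everything else is handled serially. A condensed sketch of that routing; `Body` and the returned labels are hypothetical stand-ins for the gRPC types and handler methods:

enum Body {
    Creates,
    Drops,
    Alters,
    Inserts,
    Deletes,
    Other,
}

fn dispatch(body: &Body) -> &'static str {
    match body {
        // DDL bodies are merged into a single batch call per engine.
        Body::Creates | Body::Drops | Body::Alters => "batch_ddl",
        // Writes are independent per region, so they run in parallel.
        Body::Inserts | Body::Deletes => "parallel",
        // Everything else falls back to the serial loop.
        _ => "serial",
    }
}

fn main() {
    assert_eq!(dispatch(&Body::Creates), "batch_ddl");
    assert_eq!(dispatch(&Body::Drops), "batch_ddl");
    assert_eq!(dispatch(&Body::Alters), "batch_ddl");
    assert_eq!(dispatch(&Body::Inserts), "parallel");
    assert_eq!(dispatch(&Body::Deletes), "parallel");
    assert_eq!(dispatch(&Body::Other), "serial");
}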
@@ -661,7 +707,7 @@ impl RegionServerInner {
}
}
Err(e) => {
self.unset_region_status(region_id, *region_change);
self.unset_region_status(region_id, &engine, *region_change);
error!(e; "Failed to open region: {}", region_id);
errors.push(e);
}
@@ -670,7 +716,7 @@ impl RegionServerInner {
}
Err(e) => {
for (&region_id, region_change) in &region_changes {
self.unset_region_status(region_id, *region_change);
self.unset_region_status(region_id, &engine, *region_change);
}
error!(e; "Failed to open batch regions");
errors.push(BoxedError::new(e));
@@ -727,6 +773,71 @@ impl RegionServerInner {
.collect::<Vec<_>>())
}
// Handles requests in batch.
//
// Limitation: all requests must be in the same engine.
pub async fn handle_batch_request(
&self,
batch_request: BatchRegionDdlRequest,
) -> Result<RegionResponse> {
let region_changes = match &batch_request {
BatchRegionDdlRequest::Create(requests) => requests
.iter()
.map(|(region_id, create)| {
let attribute = parse_region_attribute(&create.engine, &create.options)?;
Ok((*region_id, RegionChange::Register(attribute)))
})
.collect::<Result<Vec<_>>>()?,
BatchRegionDdlRequest::Drop(requests) => requests
.iter()
.map(|(region_id, _)| (*region_id, RegionChange::Deregisters))
.collect::<Vec<_>>(),
BatchRegionDdlRequest::Alter(requests) => requests
.iter()
.map(|(region_id, _)| (*region_id, RegionChange::None))
.collect::<Vec<_>>(),
};
// The ddl procedure will ensure all requests are in the same engine.
// Therefore, we can get the engine from the first request.
let (first_region_id, first_region_change) = region_changes.first().unwrap();
let engine = match self.get_engine(*first_region_id, first_region_change)? {
CurrentEngine::Engine(engine) => engine,
CurrentEngine::EarlyReturn(rows) => return Ok(RegionResponse::new(rows)),
};
for (region_id, region_change) in region_changes.iter() {
self.set_region_status_not_ready(*region_id, &engine, region_change);
}
let ddl_type = batch_request.request_type();
let result = engine
.handle_batch_ddl_requests(batch_request)
.await
.context(HandleBatchDdlRequestSnafu { ddl_type });
match result {
Ok(result) => {
for (region_id, region_change) in region_changes {
self.set_region_status_ready(region_id, engine.clone(), region_change)
.await?;
}
Ok(RegionResponse {
affected_rows: result.affected_rows,
extensions: result.extensions,
})
}
Err(err) => {
for (region_id, region_change) in region_changes {
self.unset_region_status(region_id, &engine, region_change);
}
Err(err)
}
}
}
pub async fn handle_request(
&self,
region_id: RegionId,
@@ -780,7 +891,7 @@ impl RegionServerInner {
}
Err(err) => {
// Removes the region status if the operation fails.
self.unset_region_status(region_id, region_change);
self.unset_region_status(region_id, &engine, region_change);
Err(err)
}
}
@@ -809,12 +920,21 @@ impl RegionServerInner {
}
}
fn unset_region_status(&self, region_id: RegionId, region_change: RegionChange) {
fn unset_region_status(
&self,
region_id: RegionId,
engine: &RegionEngineRef,
region_change: RegionChange,
) {
match region_change {
RegionChange::None => {}
RegionChange::Register(_) | RegionChange::Deregisters => {
RegionChange::Register(_) => {
self.region_map.remove(&region_id);
}
RegionChange::Deregisters => {
self.region_map
.insert(region_id, RegionEngineWithStatus::Ready(engine.clone()));
}
RegionChange::Catchup => {}
}
}
@@ -1098,7 +1218,10 @@ mod tests {
);
let response = mock_region_server
.handle_request(region_id, RegionRequest::Drop(RegionDropRequest {}))
.handle_request(
region_id,
RegionRequest::Drop(RegionDropRequest { fast_path: false }),
)
.await
.unwrap();
assert_eq!(response.affected_rows, 0);
@@ -1190,12 +1313,15 @@ mod tests {
.insert(region_id, RegionEngineWithStatus::Ready(engine.clone()));
mock_region_server
.handle_request(region_id, RegionRequest::Drop(RegionDropRequest {}))
.handle_request(
region_id,
RegionRequest::Drop(RegionDropRequest { fast_path: false }),
)
.await
.unwrap_err();
let status = mock_region_server.inner.region_map.get(&region_id);
assert!(status.is_none());
assert!(status.is_some());
}
struct CurrentEngineTest {


@@ -37,7 +37,7 @@ use store_api::region_engine::{
SettableRegionRoleState,
};
use store_api::region_request::{AffectedRows, RegionRequest};
use store_api::storage::{RegionId, ScanRequest};
use store_api::storage::{RegionId, ScanRequest, SequenceNumber};
use table::TableRef;
use tokio::sync::mpsc::{Receiver, Sender};
@@ -218,6 +218,10 @@ impl RegionEngine for MockRegionEngine {
unimplemented!()
}
async fn get_last_seq_num(&self, _: RegionId) -> Result<Option<SequenceNumber>, BoxedError> {
unimplemented!()
}
async fn stop(&self) -> Result<(), BoxedError> {
Ok(())
}


@@ -29,7 +29,7 @@ jsonb.workspace = true
num = "0.4"
num-traits = "0.2"
ordered-float = { version = "3.0", features = ["serde"] }
paste = "1.0"
paste.workspace = true
serde.workspace = true
serde_json.workspace = true
snafu.workspace = true


@@ -32,5 +32,5 @@ pub mod types;
pub mod value;
pub mod vectors;
pub use arrow;
pub use arrow::{self, compute};
pub use error::{Error, Result};


@@ -183,12 +183,6 @@ impl ColumnSchema {
self
}
// Put a placeholder to invalidate schemas.all(!has_inverted_index_key).
pub fn insert_inverted_index_placeholder(&mut self) {
self.metadata
.insert(INVERTED_INDEX_KEY.to_string(), "".to_string());
}
pub fn is_inverted_indexed(&self) -> bool {
self.metadata
.get(INVERTED_INDEX_KEY)
@@ -386,6 +380,11 @@ impl ColumnSchema {
);
Ok(())
}
pub fn unset_skipping_options(&mut self) -> Result<()> {
self.metadata.remove(SKIPPING_INDEX_KEY);
Ok(())
}
}
/// Column extended type set in column schema's metadata.


@@ -13,7 +13,7 @@ workspace = true
[dependencies]
api.workspace = true
async-trait = "0.1"
async-trait.workspace = true
common-catalog.workspace = true
common-datasource.workspace = true
common-error.workspace = true


@@ -33,7 +33,7 @@ use store_api::region_request::{
AffectedRows, RegionCloseRequest, RegionCreateRequest, RegionDropRequest, RegionOpenRequest,
RegionRequest,
};
use store_api::storage::{RegionId, ScanRequest};
use store_api::storage::{RegionId, ScanRequest, SequenceNumber};
use tokio::sync::Mutex;
use crate::config::EngineConfig;
@@ -114,6 +114,10 @@ impl RegionEngine for FileRegionEngine {
None
}
async fn get_last_seq_num(&self, _: RegionId) -> Result<Option<SequenceNumber>, BoxedError> {
Ok(None)
}
fn set_region_role(&self, region_id: RegionId, role: RegionRole) -> Result<(), BoxedError> {
self.inner
.set_region_role(region_id, role)


@@ -16,6 +16,7 @@ async-trait.workspace = true
bytes.workspace = true
cache.workspace = true
catalog.workspace = true
chrono.workspace = true
client.workspace = true
common-base.workspace = true
common-config.workspace = true
@@ -41,7 +42,7 @@ datafusion-substrait.workspace = true
datatypes.workspace = true
enum-as-inner = "0.6.0"
enum_dispatch = "0.3"
futures = "0.3"
futures.workspace = true
get-size2 = "0.1.2"
greptime-proto.workspace = true
# This fork of hydroflow is simply for keeping our dependency in our org, and pin the version
@@ -53,6 +54,7 @@ lazy_static.workspace = true
meta-client.workspace = true
nom = "7.1.3"
num-traits = "0.2"
num_cpus.workspace = true
operator.workspace = true
partition.workspace = true
prometheus.workspace = true


@@ -49,12 +49,13 @@ pub(crate) use crate::adapter::node_context::FlownodeContext;
use crate::adapter::refill::RefillTask;
use crate::adapter::table_source::ManagedTableSource;
use crate::adapter::util::relation_desc_to_column_schemas_with_fallback;
pub(crate) use crate::adapter::worker::{create_worker, Worker, WorkerHandle};
pub(crate) use crate::adapter::worker::{create_worker, WorkerHandle};
use crate::compute::ErrCollector;
use crate::df_optimizer::sql_to_flow_plan;
use crate::error::{EvalSnafu, ExternalSnafu, InternalSnafu, InvalidQuerySnafu, UnexpectedSnafu};
use crate::expr::Batch;
use crate::metrics::{METRIC_FLOW_INSERT_ELAPSED, METRIC_FLOW_ROWS, METRIC_FLOW_RUN_INTERVAL_MS};
use crate::recording_rules::RecordingRuleEngine;
use crate::repr::{self, DiffRow, RelationDesc, Row, BATCH_SIZE};
mod flownode_impl;
@@ -63,7 +64,7 @@ pub(crate) mod refill;
mod stat;
#[cfg(test)]
mod tests;
mod util;
pub(crate) mod util;
mod worker;
pub(crate) mod node_context;
@@ -171,6 +172,8 @@ pub struct FlowWorkerManager {
flush_lock: RwLock<()>,
/// receive a oneshot sender to send state size report
state_report_handler: RwLock<Option<StateReportHandler>>,
/// engine for recording rule
rule_engine: RecordingRuleEngine,
}
/// Building FlownodeManager
@@ -185,6 +188,7 @@ impl FlowWorkerManager {
node_id: Option<u32>,
query_engine: Arc<dyn QueryEngine>,
table_meta: TableMetadataManagerRef,
rule_engine: RecordingRuleEngine,
) -> Self {
let srv_map = ManagedTableSource::new(
table_meta.table_info_manager().clone(),
@@ -207,6 +211,7 @@ impl FlowWorkerManager {
node_id,
flush_lock: RwLock::new(()),
state_report_handler: RwLock::new(None),
rule_engine,
}
}
@@ -215,25 +220,6 @@ impl FlowWorkerManager {
self
}
/// Create a flownode manager with one worker
pub fn new_with_workers<'s>(
node_id: Option<u32>,
query_engine: Arc<dyn QueryEngine>,
table_meta: TableMetadataManagerRef,
num_workers: usize,
) -> (Self, Vec<Worker<'s>>) {
let mut zelf = Self::new(node_id, query_engine, table_meta);
let workers: Vec<_> = (0..num_workers)
.map(|_| {
let (handle, worker) = create_worker();
zelf.add_worker_handle(handle);
worker
})
.collect();
(zelf, workers)
}
/// Add a worker handle to the manager, meaning the corresponding worker is under its management
pub fn add_worker_handle(&mut self, handle: WorkerHandle) {
self.worker_handles.push(handle);
@@ -751,7 +737,11 @@ pub struct CreateFlowArgs {
/// Create&Remove flow
impl FlowWorkerManager {
/// Remove a flow by its id
#[allow(unreachable_code)]
pub async fn remove_flow(&self, flow_id: FlowId) -> Result<(), Error> {
// TODO(discord9): reroute some back to streaming engine later
return self.rule_engine.remove_flow(flow_id).await;
for handle in self.worker_handles.iter() {
if handle.contains_flow(flow_id).await? {
handle.remove_flow(flow_id).await?;
@@ -767,8 +757,10 @@ impl FlowWorkerManager {
/// Steps to create a task:
/// 1. parse the query into a typed plan (and optionally parse the expire_after expr)
/// 2. render source/sink with the output table id and the used input table ids
#[allow(clippy::too_many_arguments)]
#[allow(clippy::too_many_arguments, unreachable_code)]
pub async fn create_flow(&self, args: CreateFlowArgs) -> Result<Option<FlowId>, Error> {
// TODO(discord9): reroute some back to streaming engine later
return self.rule_engine.create_flow(args).await;
let CreateFlowArgs {
flow_id,
sink_table_name,

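The two adapter methods above temporarily reroute into the recording-rule engine with an early return, keeping the original streaming-engine body compiled but unreachable (hence #[allow(unreachable_code)]) so it can be re-enabled later. A minimal, self-contained sketch of that pattern; the types here are hypothetical stand-ins, not the real flownode types:

struct RuleEngine;
impl RuleEngine {
    fn remove_flow(&self, _flow_id: u64) -> Result<(), String> {
        Ok(())
    }
}

struct Manager {
    rule_engine: RuleEngine,
}

impl Manager {
    #[allow(unreachable_code)]
    fn remove_flow(&self, flow_id: u64) -> Result<(), String> {
        // TODO: reroute some flows back to the streaming engine later
        return self.rule_engine.remove_flow(flow_id);
        // original streaming-engine path, kept compiled but currently unreachable
        Err(format!("flow {flow_id} not handled by streaming engine"))
    }
}

fn main() {
    let manager = Manager { rule_engine: RuleEngine };
    assert!(manager.remove_flow(1).is_ok());
}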
View File

@@ -153,7 +153,13 @@ impl Flownode for FlowWorkerManager {
}
}
#[allow(unreachable_code, unused)]
async fn handle_inserts(&self, request: InsertRequests) -> Result<FlowResponse> {
return self
.rule_engine
.handle_inserts(request)
.await
.map_err(to_meta_err(snafu::location!()));
// using try_read to ensure two things:
// 1. a flush won't happen until the inserts before it have been applied
// 2. inserts happening concurrently with a flush won't be blocked by the flush
@@ -206,15 +212,15 @@ impl Flownode for FlowWorkerManager {
.collect_vec();
let table_col_names = table_schema.relation_desc.names;
let table_col_names = table_col_names
.iter().enumerate()
.map(|(idx,name)| match name {
Some(name) => Ok(name.clone()),
None => InternalSnafu {
reason: format!("Expect column {idx} of table id={table_id} to have name in table schema, found None"),
}
.fail().map_err(BoxedError::new).context(ExternalSnafu),
})
.collect::<Result<Vec<_>>>()?;
.iter().enumerate()
.map(|(idx,name)| match name {
Some(name) => Ok(name.clone()),
None => InternalSnafu {
reason: format!("Expect column {idx} of table id={table_id} to have name in table schema, found None"),
}
.fail().map_err(BoxedError::new).context(ExternalSnafu),
})
.collect::<Result<Vec<_>>>()?;
let name_to_col = HashMap::<_, _>::from_iter(
insert_schema
.iter()

View File

@@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
//! Some utility functions
use std::sync::Arc;
use api::helper::ColumnDataTypeWrapper;
@@ -22,7 +24,7 @@ use common_meta::key::table_info::TableInfoValue;
use datatypes::prelude::ConcreteDataType;
use datatypes::schema::ColumnSchema;
use itertools::Itertools;
use operator::expr_factory::CreateExprFactory;
use operator::expr_helper;
use session::context::QueryContextBuilder;
use snafu::{OptionExt, ResultExt};
use table::table_reference::TableReference;
@@ -32,7 +34,6 @@ use crate::adapter::{TableName, WorkerHandle, AUTO_CREATED_PLACEHOLDER_TS_COL};
use crate::error::{Error, ExternalSnafu, UnexpectedSnafu};
use crate::repr::{ColumnType, RelationDesc, RelationType};
use crate::FlowWorkerManager;
impl FlowWorkerManager {
/// Get a worker handle for creating flow, using round robin to select a worker
pub(crate) async fn get_worker_handle_for_create_flow(&self) -> &WorkerHandle {
@@ -66,19 +67,18 @@ impl FlowWorkerManager {
let proto_schema = column_schemas_to_proto(tys.clone(), &pks)?;
// create sink table
let create_expr = CreateExprFactory {}
.create_table_expr_by_column_schemas(
&TableReference {
catalog: &table_name[0],
schema: &table_name[1],
table: &table_name[2],
},
&proto_schema,
"mito",
Some(&format!("Sink table for flow {}", flow_name)),
)
.map_err(BoxedError::new)
.context(ExternalSnafu)?;
let create_expr = expr_helper::create_table_expr_by_column_schemas(
&TableReference {
catalog: &table_name[0],
schema: &table_name[1],
table: &table_name[2],
},
&proto_schema,
"mito",
Some(&format!("Sink table for flow {}", flow_name)),
)
.map_err(BoxedError::new)
.context(ExternalSnafu)?;
self.submit_create_sink_table_ddl(create_expr).await?;
Ok(true)

View File

@@ -16,6 +16,7 @@
use std::any::Any;
use arrow_schema::ArrowError;
use common_error::ext::BoxedError;
use common_error::{define_into_tonic_status, from_err_code_msg_to_header};
use common_macro::stack_trace_debug;
@@ -53,6 +54,13 @@ pub enum Error {
location: Location,
},
#[snafu(display("Time error"))]
Time {
source: common_time::error::Error,
#[snafu(implicit)]
location: Location,
},
#[snafu(display("External error"))]
External {
source: BoxedError,
@@ -156,6 +164,15 @@ pub enum Error {
location: Location,
},
#[snafu(display("Arrow error: {raw:?} in context: {context}"))]
Arrow {
#[snafu(source)]
raw: ArrowError,
context: String,
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Datafusion error: {raw:?} in context: {context}"))]
Datafusion {
#[snafu(source)]
@@ -230,6 +247,7 @@ impl ErrorExt for Error {
match self {
Self::Eval { .. }
| Self::JoinTask { .. }
| Self::Arrow { .. }
| Self::Datafusion { .. }
| Self::InsertIntoFlow { .. } => StatusCode::Internal,
Self::FlowAlreadyExist { .. } => StatusCode::TableAlreadyExists,
@@ -238,7 +256,9 @@ impl ErrorExt for Error {
| Self::FlowNotFound { .. }
| Self::ListFlows { .. } => StatusCode::TableNotFound,
Self::Plan { .. } | Self::Datatypes { .. } => StatusCode::PlanQuery,
Self::InvalidQuery { .. } | Self::CreateFlow { .. } => StatusCode::EngineExecuteQuery,
Self::InvalidQuery { .. } | Self::CreateFlow { .. } | Self::Time { .. } => {
StatusCode::EngineExecuteQuery
}
Self::Unexpected { .. } => StatusCode::Unexpected,
Self::NotImplemented { .. } | Self::UnsupportedTemporalFilter { .. } => {
StatusCode::Unsupported

View File

@@ -238,6 +238,7 @@ mod test {
for (sql, current, expected) in &testcases {
let plan = sql_to_substrait(engine.clone(), sql).await;
let mut ctx = create_test_ctx();
let flow_plan = TypedPlan::from_substrait_plan(&mut ctx, &plan)
.await

View File

@@ -60,12 +60,12 @@ async fn query_flow_state(
#[derive(Clone)]
pub struct HeartbeatTask {
node_id: u64,
node_epoch: u64,
peer_addr: String,
meta_client: Arc<MetaClient>,
report_interval: Duration,
retry_interval: Duration,
resp_handler_executor: HeartbeatResponseHandlerExecutorRef,
start_time_ms: u64,
running: Arc<AtomicBool>,
query_stat_size: Option<SizeReportSender>,
}
@@ -83,12 +83,12 @@ impl HeartbeatTask {
) -> Self {
Self {
node_id: opts.node_id.unwrap_or(0),
node_epoch: common_time::util::current_time_millis() as u64,
peer_addr: addrs::resolve_addr(&opts.grpc.bind_addr, Some(&opts.grpc.server_addr)),
meta_client,
report_interval: heartbeat_opts.interval,
retry_interval: heartbeat_opts.retry_interval,
resp_handler_executor,
start_time_ms: common_time::util::current_time_millis() as u64,
running: Arc::new(AtomicBool::new(false)),
query_stat_size: None,
}
@@ -103,6 +103,11 @@ impl HeartbeatTask {
warn!("Heartbeat task started multiple times");
return Ok(());
}
self.create_streams().await
}
async fn create_streams(&self) -> Result<(), Error> {
info!("Start to establish the heartbeat connection to metasrv.");
let (req_sender, resp_stream) = self
.meta_client
@@ -125,19 +130,11 @@ impl HeartbeatTask {
pub fn shutdown(&self) {
info!("Close heartbeat task for flownode");
if self
.running
.compare_exchange(true, false, Ordering::AcqRel, Ordering::Acquire)
.is_err()
{
warn!("Call close heartbeat task multiple times");
}
}
fn create_heartbeat_request(
fn new_heartbeat_request(
heartbeat_request: &HeartbeatRequest,
message: Option<OutgoingMessage>,
peer: Option<Peer>,
start_time_ms: u64,
latest_report: &Option<FlowStat>,
) -> Option<HeartbeatRequest> {
let mailbox_message = match message.map(outgoing_message_to_mailbox_message) {
@@ -161,10 +158,8 @@ impl HeartbeatTask {
Some(HeartbeatRequest {
mailbox_message,
peer,
info: Self::build_node_info(start_time_ms),
flow_stat,
..Default::default()
..heartbeat_request.clone()
})
}
@@ -174,6 +169,7 @@ impl HeartbeatTask {
version: build_info.version.to_string(),
git_commit: build_info.commit_short.to_string(),
start_time_ms,
cpus: num_cpus::get() as u32,
})
}
@@ -183,7 +179,7 @@ impl HeartbeatTask {
mut outgoing_rx: mpsc::Receiver<OutgoingMessage>,
) {
let report_interval = self.report_interval;
let start_time_ms = self.start_time_ms;
let node_epoch = self.node_epoch;
let self_peer = Some(Peer {
id: self.node_id,
addr: self.peer_addr.clone(),
@@ -198,18 +194,25 @@ impl HeartbeatTask {
interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay);
let mut latest_report = None;
let heartbeat_request = HeartbeatRequest {
peer: self_peer,
node_epoch,
info: Self::build_node_info(node_epoch),
..Default::default()
};
loop {
let req = tokio::select! {
message = outgoing_rx.recv() => {
if let Some(message) = message {
Self::create_heartbeat_request(Some(message), self_peer.clone(), start_time_ms, &latest_report)
Self::new_heartbeat_request(&heartbeat_request, Some(message), &latest_report)
} else {
// Receiving None means the Sender was dropped, so break the current loop
break
}
}
_ = interval.tick() => {
Self::create_heartbeat_request(None, self_peer.clone(), start_time_ms, &latest_report)
Self::new_heartbeat_request(&heartbeat_request, None, &latest_report)
}
};
@@ -226,6 +229,8 @@ impl HeartbeatTask {
// set the timeout to half of the report interval so that it wouldn't delay heartbeat if something went horribly wrong
latest_report = query_flow_state(&query_stat_size, report_interval / 2).await;
}
info!("flownode heartbeat task stopped.");
});
}
@@ -269,7 +274,7 @@ impl HeartbeatTask {
info!("Try to re-establish the heartbeat connection to metasrv.");
if self.start().await.is_ok() {
if self.create_streams().await.is_ok() {
break;
}
}

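The heartbeat change above builds a base HeartbeatRequest once (peer, node_epoch, node info) and then only fills in the per-tick fields, inheriting the rest via struct-update syntax. A small sketch of that pattern, assuming a hypothetical HeartbeatRequest struct rather than the real proto type:

#[derive(Clone, Default, Debug)]
struct HeartbeatRequest {
    peer: Option<String>,
    node_epoch: u64,
    mailbox_message: Option<String>,
    flow_stat: Option<u64>,
}

fn new_heartbeat_request(
    template: &HeartbeatRequest,
    mailbox_message: Option<String>,
    flow_stat: Option<u64>,
) -> HeartbeatRequest {
    HeartbeatRequest {
        // per-tick fields
        mailbox_message,
        flow_stat,
        // everything else (peer, node_epoch, ...) comes from the template
        ..template.clone()
    }
}

fn main() {
    let template = HeartbeatRequest {
        peer: Some("127.0.0.1:3004".to_string()),
        node_epoch: 1_700_000_000_000,
        ..Default::default()
    };
    let req = new_heartbeat_request(&template, None, Some(42));
    println!("{req:?}");
}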
View File

@@ -33,6 +33,7 @@ mod expr;
pub mod heartbeat;
mod metrics;
mod plan;
mod recording_rules;
mod repr;
mod server;
mod transform;
@@ -43,4 +44,5 @@ mod test_utils;
pub use adapter::{FlowConfig, FlowWorkerManager, FlowWorkerManagerRef, FlownodeOptions};
pub use error::{Error, Result};
pub use recording_rules::FrontendClient;
pub use server::{FlownodeBuilder, FlownodeInstance, FlownodeServer, FrontendInvoker};

View File

@@ -28,6 +28,32 @@ lazy_static! {
&["table_id"]
)
.unwrap();
pub static ref METRIC_FLOW_RULE_ENGINE_QUERY_TIME: HistogramVec = register_histogram_vec!(
"greptime_flow_rule_engine_query_time",
"flow rule engine query time",
&["flow_id"],
vec![
0.0,
1.,
3.,
5.,
10.,
20.,
30.,
60.,
2. * 60.,
5. * 60.,
10. * 60.
]
)
.unwrap();
pub static ref METRIC_FLOW_RULE_ENGINE_SLOW_QUERY: HistogramVec = register_histogram_vec!(
"greptime_flow_rule_engine_slow_query",
"flow rule engine slow query",
&["flow_id", "sql", "peer"],
vec![60., 2. * 60., 3. * 60., 5. * 60., 10. * 60.]
)
.unwrap();
pub static ref METRIC_FLOW_RUN_INTERVAL_MS: IntGauge =
register_int_gauge!("greptime_flow_run_interval_ms", "flow run interval in ms").unwrap();
pub static ref METRIC_FLOW_ROWS: IntCounterVec = register_int_counter_vec!(

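The new histograms above use second-scale buckets up to ten minutes so that both fast queries and slow recording-rule runs land in a meaningful bucket. A hedged sketch of how an observation might be recorded with the prometheus crate; the metric name and flow id below are made up:

use std::time::Instant;

use lazy_static::lazy_static;
use prometheus::{register_histogram_vec, HistogramVec};

lazy_static! {
    static ref QUERY_TIME: HistogramVec = register_histogram_vec!(
        "example_rule_engine_query_time",
        "example rule engine query time in seconds",
        &["flow_id"],
        vec![0.0, 1., 3., 5., 10., 20., 30., 60., 120., 300., 600.]
    )
    .unwrap();
}

fn main() {
    let start = Instant::now();
    // ... run the recording-rule query here ...
    QUERY_TIME
        .with_label_values(&["1024"]) // label value is a stand-in flow id
        .observe(start.elapsed().as_secs_f64());
}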
View File

@@ -0,0 +1,940 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Run a flow as a recording rule: a time-window-aware normal query triggered on every tick configured by the user
mod engine;
mod frontend_client;
use std::collections::BTreeSet;
use std::sync::Arc;
use api::helper::pb_value_to_value_ref;
use catalog::CatalogManagerRef;
use common_error::ext::BoxedError;
use common_recordbatch::DfRecordBatch;
use common_telemetry::warn;
use common_time::timestamp::TimeUnit;
use common_time::Timestamp;
use datafusion::error::Result as DfResult;
use datafusion::logical_expr::Expr;
use datafusion::physical_planner::{DefaultPhysicalPlanner, PhysicalPlanner};
use datafusion::prelude::SessionContext;
use datafusion::sql::unparser::Unparser;
use datafusion_common::tree_node::{Transformed, TreeNode, TreeNodeRecursion, TreeNodeRewriter};
use datafusion_common::{DFSchema, TableReference};
use datafusion_expr::{ColumnarValue, LogicalPlan};
use datafusion_physical_expr::PhysicalExprRef;
use datatypes::prelude::{ConcreteDataType, DataType};
use datatypes::scalars::ScalarVector;
use datatypes::schema::TIME_INDEX_KEY;
use datatypes::value::Value;
use datatypes::vectors::{
TimestampMicrosecondVector, TimestampMillisecondVector, TimestampNanosecondVector,
TimestampSecondVector, Vector,
};
pub use engine::RecordingRuleEngine;
pub use frontend_client::FrontendClient;
use itertools::Itertools;
use query::parser::QueryLanguageParser;
use query::QueryEngineRef;
use session::context::QueryContextRef;
use snafu::{ensure, OptionExt, ResultExt};
use crate::adapter::util::from_proto_to_data_type;
use crate::df_optimizer::apply_df_optimizer;
use crate::error::{ArrowSnafu, DatafusionSnafu, DatatypesSnafu, ExternalSnafu, UnexpectedSnafu};
use crate::expr::error::DataTypeSnafu;
use crate::Error;
#[derive(Debug, Clone)]
pub struct TimeWindowExpr {
phy_expr: PhysicalExprRef,
column_name: String,
logical_expr: Expr,
df_schema: DFSchema,
}
impl TimeWindowExpr {
pub fn from_expr(expr: &Expr, column_name: &str, df_schema: &DFSchema) -> Result<Self, Error> {
let phy_planner = DefaultPhysicalPlanner::default();
let phy_expr: PhysicalExprRef = phy_planner
.create_physical_expr(expr, df_schema, &SessionContext::new().state())
.with_context(|_e| DatafusionSnafu {
context: format!(
"Failed to create physical expression from {expr:?} using {df_schema:?}"
),
})?;
Ok(Self {
phy_expr,
column_name: column_name.to_string(),
logical_expr: expr.clone(),
df_schema: df_schema.clone(),
})
}
pub fn eval(
&self,
current: Timestamp,
) -> Result<(Option<Timestamp>, Option<Timestamp>), Error> {
let lower_bound =
find_expr_time_window_lower_bound(&self.logical_expr, &self.df_schema, current)?;
let upper_bound =
find_expr_time_window_upper_bound(&self.logical_expr, &self.df_schema, current)?;
Ok((lower_bound, upper_bound))
}
/// Find timestamps from rows using time window expr
pub async fn handle_rows(
&self,
rows_list: Vec<api::v1::Rows>,
) -> Result<BTreeSet<Timestamp>, Error> {
let mut time_windows = BTreeSet::new();
for rows in rows_list {
// pick the time index column and use it to evaluate `self.phy_expr`
let ts_col_index = rows
.schema
.iter()
.map(|col| col.column_name.clone())
.position(|name| name == self.column_name);
let Some(ts_col_index) = ts_col_index else {
warn!("can't found time index column in schema: {:?}", rows.schema);
continue;
};
let col_schema = &rows.schema[ts_col_index];
let cdt = from_proto_to_data_type(col_schema)?;
let column_values = rows
.rows
.iter()
.map(|row| &row.values[ts_col_index])
.collect_vec();
let mut vector = cdt.create_mutable_vector(column_values.len());
for value in column_values {
let value = pb_value_to_value_ref(value, &None);
vector.try_push_value_ref(value).context(DataTypeSnafu {
msg: "Failed to convert rows to columns",
})?;
}
let vector = vector.to_vector();
let df_schema = create_df_schema_for_ts_column(&self.column_name, cdt)?;
let rb =
DfRecordBatch::try_new(df_schema.inner().clone(), vec![vector.to_arrow_array()])
.with_context(|_e| ArrowSnafu {
context: format!(
"Failed to create record batch from {df_schema:?} and {vector:?}"
),
})?;
let eval_res = self
.phy_expr
.evaluate(&rb)
.with_context(|_| DatafusionSnafu {
context: format!(
"Failed to evaluate physical expression {:?} on {rb:?}",
self.phy_expr
),
})?;
let res = columnar_to_ts_vector(&eval_res)?;
for ts in res.into_iter().flatten() {
time_windows.insert(ts);
}
}
Ok(time_windows)
}
}
fn create_df_schema_for_ts_column(name: &str, cdt: ConcreteDataType) -> Result<DFSchema, Error> {
let arrow_schema = Arc::new(arrow_schema::Schema::new(vec![arrow_schema::Field::new(
name,
cdt.as_arrow_type(),
false,
)]));
let df_schema = DFSchema::from_field_specific_qualified_schema(
vec![Some(TableReference::bare("TimeIndexOnlyTable"))],
&arrow_schema,
)
.with_context(|_e| DatafusionSnafu {
context: format!("Failed to create DFSchema from arrow schema {arrow_schema:?}"),
})?;
Ok(df_schema)
}
/// Convert `ColumnarValue` to `Vec<Option<Timestamp>>`
fn columnar_to_ts_vector(columnar: &ColumnarValue) -> Result<Vec<Option<Timestamp>>, Error> {
let val = match columnar {
datafusion_expr::ColumnarValue::Array(array) => {
let ty = array.data_type();
let ty = ConcreteDataType::from_arrow_type(ty);
let time_unit = if let ConcreteDataType::Timestamp(ty) = ty {
ty.unit()
} else {
return UnexpectedSnafu {
reason: format!("Non-timestamp type: {ty:?}"),
}
.fail();
};
match time_unit {
TimeUnit::Second => TimestampSecondVector::try_from_arrow_array(array.clone())
.with_context(|_| DatatypesSnafu {
extra: format!("Failed to create vector from arrow array {array:?}"),
})?
.iter_data()
.map(|d| d.map(|d| d.0))
.collect_vec(),
TimeUnit::Millisecond => {
TimestampMillisecondVector::try_from_arrow_array(array.clone())
.with_context(|_| DatatypesSnafu {
extra: format!("Failed to create vector from arrow array {array:?}"),
})?
.iter_data()
.map(|d| d.map(|d| d.0))
.collect_vec()
}
TimeUnit::Microsecond => {
TimestampMicrosecondVector::try_from_arrow_array(array.clone())
.with_context(|_| DatatypesSnafu {
extra: format!("Failed to create vector from arrow array {array:?}"),
})?
.iter_data()
.map(|d| d.map(|d| d.0))
.collect_vec()
}
TimeUnit::Nanosecond => {
TimestampNanosecondVector::try_from_arrow_array(array.clone())
.with_context(|_| DatatypesSnafu {
extra: format!("Failed to create vector from arrow array {array:?}"),
})?
.iter_data()
.map(|d| d.map(|d| d.0))
.collect_vec()
}
}
}
datafusion_expr::ColumnarValue::Scalar(scalar) => {
let value = Value::try_from(scalar.clone()).with_context(|_| DatatypesSnafu {
extra: format!("Failed to convert scalar {scalar:?} to value"),
})?;
let ts = value.as_timestamp().context(UnexpectedSnafu {
reason: format!("Expect Timestamp, found {:?}", value),
})?;
vec![Some(ts)]
}
};
Ok(val)
}
/// Convert sql to datafusion logical plan
pub async fn sql_to_df_plan(
query_ctx: QueryContextRef,
engine: QueryEngineRef,
sql: &str,
optimize: bool,
) -> Result<LogicalPlan, Error> {
let stmt = QueryLanguageParser::parse_sql(sql, &query_ctx)
.map_err(BoxedError::new)
.context(ExternalSnafu)?;
let plan = engine
.planner()
.plan(&stmt, query_ctx)
.await
.map_err(BoxedError::new)
.context(ExternalSnafu)?;
let plan = if optimize {
apply_df_optimizer(plan).await?
} else {
plan
};
Ok(plan)
}
/// Return (the column name of time index column, the time window expr, the expected time unit of time index column, the expr's schema for evaluating the time window)
async fn find_time_window_expr(
plan: &LogicalPlan,
catalog_man: CatalogManagerRef,
query_ctx: QueryContextRef,
) -> Result<(String, Option<datafusion_expr::Expr>, TimeUnit, DFSchema), Error> {
// TODO(discord9): find the expr that does the time window
let mut table_name = None;
// first find the table source in the logical plan
plan.apply(|plan| {
let LogicalPlan::TableScan(table_scan) = plan else {
return Ok(TreeNodeRecursion::Continue);
};
table_name = Some(table_scan.table_name.clone());
Ok(TreeNodeRecursion::Stop)
})
.with_context(|_| DatafusionSnafu {
context: format!("Can't find table source in plan {plan:?}"),
})?;
let Some(table_name) = table_name else {
UnexpectedSnafu {
reason: format!("Can't find table source in plan {plan:?}"),
}
.fail()?
};
let current_schema = query_ctx.current_schema();
let catalog_name = table_name.catalog().unwrap_or(query_ctx.current_catalog());
let schema_name = table_name.schema().unwrap_or(&current_schema);
let table_name = table_name.table();
let Some(table_ref) = catalog_man
.table(catalog_name, schema_name, table_name, Some(&query_ctx))
.await
.map_err(BoxedError::new)
.context(ExternalSnafu)?
else {
UnexpectedSnafu {
reason: format!(
"Can't find table {table_name:?} in catalog {catalog_name:?}/{schema_name:?}"
),
}
.fail()?
};
let schema = &table_ref.table_info().meta.schema;
let ts_index = schema.timestamp_column().context(UnexpectedSnafu {
reason: format!("Can't find timestamp column in table {table_name:?}"),
})?;
let ts_col_name = ts_index.name.clone();
let expected_time_unit = ts_index.data_type.as_timestamp().with_context(|| UnexpectedSnafu {
reason: format!(
"Expected timestamp column {ts_col_name:?} in table {table_name:?} to be timestamp, but got {ts_index:?}"
),
})?.unit();
let arrow_schema = Arc::new(arrow_schema::Schema::new(vec![arrow_schema::Field::new(
ts_col_name.clone(),
ts_index.data_type.as_arrow_type(),
false,
)]));
let df_schema = DFSchema::from_field_specific_qualified_schema(
vec![Some(TableReference::bare(table_name))],
&arrow_schema,
)
.with_context(|_e| DatafusionSnafu {
context: format!("Failed to create DFSchema from arrow schema {arrow_schema:?}"),
})?;
// find the time window expr which refers to the time index column
let mut aggr_expr = None;
let mut time_window_expr: Option<Expr> = None;
let find_inner_aggr_expr = |plan: &LogicalPlan| {
if let LogicalPlan::Aggregate(aggregate) = plan {
aggr_expr = Some(aggregate.clone());
};
Ok(TreeNodeRecursion::Continue)
};
plan.apply(find_inner_aggr_expr)
.with_context(|_| DatafusionSnafu {
context: format!("Can't find aggr expr in plan {plan:?}"),
})?;
if let Some(aggregate) = aggr_expr {
for group_expr in &aggregate.group_expr {
let refs = group_expr.column_refs();
if refs.len() != 1 {
continue;
}
let ref_col = refs.iter().next().unwrap();
let index = aggregate.input.schema().maybe_index_of_column(ref_col);
let Some(index) = index else {
continue;
};
let field = aggregate.input.schema().field(index);
let is_time_index = field.metadata().get(TIME_INDEX_KEY) == Some(&"true".to_string());
if is_time_index {
let rewrite_column = group_expr.clone();
let rewritten = rewrite_column
.rewrite(&mut RewriteColumn {
table_name: table_name.to_string(),
})
.with_context(|_| DatafusionSnafu {
context: format!("Rewrite expr failed, expr={:?}", group_expr),
})?
.data;
struct RewriteColumn {
table_name: String,
}
impl TreeNodeRewriter for RewriteColumn {
type Node = Expr;
fn f_down(&mut self, node: Self::Node) -> DfResult<Transformed<Self::Node>> {
let Expr::Column(mut column) = node else {
return Ok(Transformed::no(node));
};
column.relation = Some(TableReference::bare(self.table_name.clone()));
Ok(Transformed::yes(Expr::Column(column)))
}
}
time_window_expr = Some(rewritten);
break;
}
}
Ok((ts_col_name, time_window_expr, expected_time_unit, df_schema))
} else {
// can't find the time window expr, return None
Ok((ts_col_name, None, expected_time_unit, df_schema))
}
}
/// Find the nearest lower bound of the time window for time `current` in the given `plan`.
/// i.e. for time window expr being `date_bin(INTERVAL '5 minutes', ts) as time_window` and `current="2021-07-01 00:01:01.000"`,
/// return `Some("2021-07-01 00:00:00.000")`
/// if `plan` doesn't contain a `TIME INDEX` column, return `None`
///
/// Time window expr is an expr that:
/// 1. refers only to a time index column
/// 2. is monotonically increasing
/// 3. shows up in the GROUP BY clause
///
/// note this plan should only contain one TableScan
pub async fn find_plan_time_window_bound(
plan: &LogicalPlan,
current: Timestamp,
query_ctx: QueryContextRef,
engine: QueryEngineRef,
) -> Result<(String, Option<Timestamp>, Option<Timestamp>), Error> {
// TODO(discord9): find the expr that does the time window
let catalog_man = engine.engine_state().catalog_manager();
let (ts_col_name, time_window_expr, expected_time_unit, df_schema) =
find_time_window_expr(plan, catalog_man.clone(), query_ctx).await?;
// cast current to ts_index's type
let new_current = current
.convert_to(expected_time_unit)
.with_context(|| UnexpectedSnafu {
reason: format!("Failed to cast current timestamp {current:?} to {expected_time_unit}"),
})?;
// if no time_window_expr is found, return None
if let Some(time_window_expr) = time_window_expr {
let lower_bound =
find_expr_time_window_lower_bound(&time_window_expr, &df_schema, new_current)?;
let upper_bound =
find_expr_time_window_upper_bound(&time_window_expr, &df_schema, new_current)?;
Ok((ts_col_name, lower_bound, upper_bound))
} else {
Ok((ts_col_name, None, None))
}
}
/// Find the lower bound of time window in given `expr` and `current` timestamp.
///
/// i.e. for `current="2021-07-01 00:01:01.000"` and `expr=date_bin(INTERVAL '5 minutes', ts) as time_window` and `ts_col=ts`,
/// return `Some("2021-07-01 00:00:00.000")` since it's the lower bound
/// of the current time window given the current timestamp
///
/// If this returns None, this time window has no lower bound
fn find_expr_time_window_lower_bound(
expr: &Expr,
df_schema: &DFSchema,
current: Timestamp,
) -> Result<Option<Timestamp>, Error> {
let phy_planner = DefaultPhysicalPlanner::default();
let phy_expr: PhysicalExprRef = phy_planner
.create_physical_expr(expr, df_schema, &SessionContext::new().state())
.with_context(|_e| DatafusionSnafu {
context: format!(
"Failed to create physical expression from {expr:?} using {df_schema:?}"
),
})?;
let cur_time_window = eval_ts_to_ts(&phy_expr, df_schema, current)?;
let input_time_unit = cur_time_window.unit();
Ok(cur_time_window.convert_to(input_time_unit))
}
/// Find the upper bound for time window expression
fn find_expr_time_window_upper_bound(
expr: &Expr,
df_schema: &DFSchema,
current: Timestamp,
) -> Result<Option<Timestamp>, Error> {
use std::cmp::Ordering;
let phy_planner = DefaultPhysicalPlanner::default();
let phy_expr: PhysicalExprRef = phy_planner
.create_physical_expr(expr, df_schema, &SessionContext::new().state())
.with_context(|_e| DatafusionSnafu {
context: format!(
"Failed to create physical expression from {expr:?} using {df_schema:?}"
),
})?;
let cur_time_window = eval_ts_to_ts(&phy_expr, df_schema, current)?;
// search to find the upper bound
let mut offset: i64 = 1;
let mut lower_bound = Some(current);
let upper_bound;
// first, exponential probe to find a range for binary search
loop {
let Some(next_val) = current.value().checked_add(offset) else {
// no upper bound if overflow
return Ok(None);
};
let next_time_probe = common_time::Timestamp::new(next_val, current.unit());
let next_time_window = eval_ts_to_ts(&phy_expr, df_schema, next_time_probe)?;
match next_time_window.cmp(&cur_time_window) {
Ordering::Less => {UnexpectedSnafu {
reason: format!(
"Unsupported time window expression, expect monotonic increasing for time window expression {expr:?}"
),
}
.fail()?
}
Ordering::Equal => {
lower_bound = Some(next_time_probe);
}
Ordering::Greater => {
upper_bound = Some(next_time_probe);
break
}
}
let Some(new_offset) = offset.checked_mul(2) else {
// no upper bound if overflow
return Ok(None);
};
offset = new_offset;
}
// binary search for the exact upper bound
ensure!(lower_bound.map(|v| v.unit()) == upper_bound.map(|v| v.unit()), UnexpectedSnafu {
reason: format!("Unit mismatch for time window expression {expr:?}, found {lower_bound:?} and {upper_bound:?}"),
});
let output_unit = upper_bound
.context(UnexpectedSnafu {
reason: "should have lower bound",
})?
.unit();
let mut low = lower_bound
.context(UnexpectedSnafu {
reason: "should have lower bound",
})?
.value();
let mut high = upper_bound
.context(UnexpectedSnafu {
reason: "should have upper bound",
})?
.value();
while low < high {
let mid = (low + high) / 2;
let mid_probe = common_time::Timestamp::new(mid, output_unit);
let mid_time_window = eval_ts_to_ts(&phy_expr, df_schema, mid_probe)?;
match mid_time_window.cmp(&cur_time_window) {
Ordering::Less => UnexpectedSnafu {
reason: format!("Binary search failed for time window expression {expr:?}"),
}
.fail()?,
Ordering::Equal => low = mid + 1,
Ordering::Greater => high = mid,
}
}
let final_upper_bound_for_time_window = common_time::Timestamp::new(high, output_unit);
Ok(Some(final_upper_bound_for_time_window))
}
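// Illustrative sketch (not part of this change): the probe strategy above,
// reduced to plain integers. `bucket` stands in for the time window expr; it
// must be monotonically increasing and map a value to its window start.
// Exponential probing finds some point in the next window, then binary search
// narrows it down to the exact first value of that next window.
fn upper_bound_of_window(bucket: impl Fn(i64) -> i64, current: i64) -> Option<i64> {
    let cur_window = bucket(current);
    // exponential probe: double the offset until we step into the next window
    let mut offset: i64 = 1;
    let mut high = loop {
        let probe = current.checked_add(offset)?;
        if bucket(probe) > cur_window {
            break probe;
        }
        offset = offset.checked_mul(2)?;
    };
    // binary search for the first value whose window is greater than `cur_window`
    let mut low = current;
    while low < high {
        let mid = (low + high) / 2;
        if bucket(mid) == cur_window {
            low = mid + 1;
        } else {
            high = mid;
        }
    }
    Some(high)
}
// e.g. with 5-minute (300 s) windows: upper_bound_of_window(|v| v - v.rem_euclid(300), 61) == Some(300)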
fn eval_ts_to_ts(
phy: &PhysicalExprRef,
df_schema: &DFSchema,
input_value: Timestamp,
) -> Result<Timestamp, Error> {
let schema_ty = df_schema.field(0).data_type();
let schema_cdt = ConcreteDataType::from_arrow_type(schema_ty);
let schema_unit = if let ConcreteDataType::Timestamp(ts) = schema_cdt {
ts.unit()
} else {
return UnexpectedSnafu {
reason: format!("Expect Timestamp, found {:?}", schema_cdt),
}
.fail();
};
let input_value = input_value
.convert_to(schema_unit)
.with_context(|| UnexpectedSnafu {
reason: format!("Failed to convert timestamp {input_value:?} to {schema_unit}"),
})?;
let ts_vector = match schema_unit {
TimeUnit::Second => {
TimestampSecondVector::from_vec(vec![input_value.value()]).to_arrow_array()
}
TimeUnit::Millisecond => {
TimestampMillisecondVector::from_vec(vec![input_value.value()]).to_arrow_array()
}
TimeUnit::Microsecond => {
TimestampMicrosecondVector::from_vec(vec![input_value.value()]).to_arrow_array()
}
TimeUnit::Nanosecond => {
TimestampNanosecondVector::from_vec(vec![input_value.value()]).to_arrow_array()
}
};
let rb = DfRecordBatch::try_new(df_schema.inner().clone(), vec![ts_vector.clone()])
.with_context(|_| ArrowSnafu {
context: format!("Failed to create record batch from {df_schema:?} and {ts_vector:?}"),
})?;
let eval_res = phy.evaluate(&rb).with_context(|_| DatafusionSnafu {
context: format!("Failed to evaluate physical expression {phy:?} on {rb:?}"),
})?;
if let Some(Some(ts)) = columnar_to_ts_vector(&eval_res)?.first() {
Ok(*ts)
} else {
UnexpectedSnafu {
reason: format!(
"Expected timestamp in expression {phy:?} but got {:?}",
eval_res
),
}
.fail()?
}
}
// TODO(discord9): a method to find out the precise time window
/// Find out the `Filter` Node corresponding to outermost `WHERE` and add a new filter expr to it
#[derive(Debug)]
pub struct AddFilterRewriter {
extra_filter: Expr,
is_rewritten: bool,
}
impl AddFilterRewriter {
fn new(filter: Expr) -> Self {
Self {
extra_filter: filter,
is_rewritten: false,
}
}
}
impl TreeNodeRewriter for AddFilterRewriter {
type Node = LogicalPlan;
fn f_up(&mut self, node: Self::Node) -> DfResult<Transformed<Self::Node>> {
if self.is_rewritten {
return Ok(Transformed::no(node));
}
match node {
LogicalPlan::Filter(mut filter) if !filter.having => {
filter.predicate = filter.predicate.and(self.extra_filter.clone());
self.is_rewritten = true;
Ok(Transformed::yes(LogicalPlan::Filter(filter)))
}
LogicalPlan::TableScan(_) => {
// add a new filter
let filter =
datafusion_expr::Filter::try_new(self.extra_filter.clone(), Arc::new(node))?;
self.is_rewritten = true;
Ok(Transformed::yes(LogicalPlan::Filter(filter)))
}
_ => Ok(Transformed::no(node)),
}
}
}
fn df_plan_to_sql(plan: &LogicalPlan) -> Result<String, Error> {
/// A dialect that forces all identifiers to be quoted
struct ForceQuoteIdentifiers;
impl datafusion::sql::unparser::dialect::Dialect for ForceQuoteIdentifiers {
fn identifier_quote_style(&self, identifier: &str) -> Option<char> {
if identifier.to_lowercase() != identifier {
Some('"')
} else {
None
}
}
}
let unparser = Unparser::new(&ForceQuoteIdentifiers);
// first make all columns qualified
let sql = unparser
.plan_to_sql(plan)
.with_context(|_e| DatafusionSnafu {
context: format!("Failed to unparse logical plan {plan:?}"),
})?;
Ok(sql.to_string())
}
#[cfg(test)]
mod test {
use datafusion_common::tree_node::TreeNode;
use pretty_assertions::assert_eq;
use session::context::QueryContext;
use super::{sql_to_df_plan, *};
use crate::recording_rules::{df_plan_to_sql, AddFilterRewriter};
use crate::test_utils::create_test_query_engine;
#[tokio::test]
async fn test_sql_plan_convert() {
let query_engine = create_test_query_engine();
let ctx = QueryContext::arc();
let old = r#"SELECT "NUMBER" FROM "UPPERCASE_NUMBERS_WITH_TS""#;
let new = sql_to_df_plan(ctx.clone(), query_engine.clone(), old, false)
.await
.unwrap();
let new_sql = df_plan_to_sql(&new).unwrap();
assert_eq!(
r#"SELECT "UPPERCASE_NUMBERS_WITH_TS"."NUMBER" FROM "UPPERCASE_NUMBERS_WITH_TS""#,
new_sql
);
}
#[tokio::test]
async fn test_add_filter() {
let testcases = vec![
(
"SELECT number FROM numbers_with_ts GROUP BY number","SELECT numbers_with_ts.number FROM numbers_with_ts WHERE (number > 4) GROUP BY numbers_with_ts.number"
),
(
"SELECT number FROM numbers_with_ts WHERE number < 2 OR number >10",
"SELECT numbers_with_ts.number FROM numbers_with_ts WHERE ((numbers_with_ts.number < 2) OR (numbers_with_ts.number > 10)) AND (number > 4)"
),
(
"SELECT date_bin('5 minutes', ts) as time_window FROM numbers_with_ts GROUP BY time_window",
"SELECT date_bin('5 minutes', numbers_with_ts.ts) AS time_window FROM numbers_with_ts WHERE (number > 4) GROUP BY date_bin('5 minutes', numbers_with_ts.ts)"
)
];
use datafusion_expr::{col, lit};
let query_engine = create_test_query_engine();
let ctx = QueryContext::arc();
for (before, after) in testcases {
let sql = before;
let plan = sql_to_df_plan(ctx.clone(), query_engine.clone(), sql, false)
.await
.unwrap();
let mut add_filter = AddFilterRewriter::new(col("number").gt(lit(4u32)));
let plan = plan.rewrite(&mut add_filter).unwrap().data;
let new_sql = df_plan_to_sql(&plan).unwrap();
assert_eq!(after, new_sql);
}
}
#[tokio::test]
async fn test_plan_time_window_lower_bound() {
use datafusion_expr::{col, lit};
let query_engine = create_test_query_engine();
let ctx = QueryContext::arc();
let testcases = [
// same alias is not same column
(
"SELECT arrow_cast(date_bin(INTERVAL '1 MINS', numbers_with_ts.ts), 'Timestamp(Second, None)') AS ts FROM numbers_with_ts GROUP BY ts;",
Timestamp::new(1740394109, TimeUnit::Second),
(
"ts".to_string(),
Some(Timestamp::new(1740394109000, TimeUnit::Millisecond)),
Some(Timestamp::new(1740394109001, TimeUnit::Millisecond)),
),
r#"SELECT arrow_cast(date_bin(INTERVAL '1 MINS', numbers_with_ts.ts), 'Timestamp(Second, None)') AS ts FROM numbers_with_ts WHERE ((ts >= CAST('2025-02-24 10:48:29' AS TIMESTAMP)) AND (ts <= CAST('2025-02-24 10:48:29.001' AS TIMESTAMP))) GROUP BY numbers_with_ts.ts"#
),
// complex time window index
(
"SELECT arrow_cast(date_bin(INTERVAL '1 MINS', numbers_with_ts.ts), 'Timestamp(Second, None)') AS time_window FROM numbers_with_ts GROUP BY time_window;",
Timestamp::new(1740394109, TimeUnit::Second),
(
"ts".to_string(),
Some(Timestamp::new(1740394080, TimeUnit::Second)),
Some(Timestamp::new(1740394140, TimeUnit::Second)),
),
"SELECT arrow_cast(date_bin(INTERVAL '1 MINS', numbers_with_ts.ts), 'Timestamp(Second, None)') AS time_window FROM numbers_with_ts WHERE ((ts >= CAST('2025-02-24 10:48:00' AS TIMESTAMP)) AND (ts <= CAST('2025-02-24 10:49:00' AS TIMESTAMP))) GROUP BY arrow_cast(date_bin(INTERVAL '1 MINS', numbers_with_ts.ts), 'Timestamp(Second, None)')"
),
// no time index
(
"SELECT date_bin('5 minutes', ts) FROM numbers_with_ts;",
Timestamp::new(23, TimeUnit::Millisecond),
("ts".to_string(), None, None),
"SELECT date_bin('5 minutes', ts) FROM numbers_with_ts;"
),
// time index
(
"SELECT date_bin('5 minutes', ts) as time_window FROM numbers_with_ts GROUP BY time_window;",
Timestamp::new(23, TimeUnit::Nanosecond),
(
"ts".to_string(),
Some(Timestamp::new(0, TimeUnit::Millisecond)),
Some(Timestamp::new(300000, TimeUnit::Millisecond)),
),
"SELECT date_bin('5 minutes', numbers_with_ts.ts) AS time_window FROM numbers_with_ts WHERE ((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts <= CAST('1970-01-01 00:05:00' AS TIMESTAMP))) GROUP BY date_bin('5 minutes', numbers_with_ts.ts)"
),
// on spot
(
"SELECT date_bin('5 minutes', ts) as time_window FROM numbers_with_ts GROUP BY time_window;",
Timestamp::new(0, TimeUnit::Nanosecond),
(
"ts".to_string(),
Some(Timestamp::new(0, TimeUnit::Millisecond)),
Some(Timestamp::new(300000, TimeUnit::Millisecond)),
),
"SELECT date_bin('5 minutes', numbers_with_ts.ts) AS time_window FROM numbers_with_ts WHERE ((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts <= CAST('1970-01-01 00:05:00' AS TIMESTAMP))) GROUP BY date_bin('5 minutes', numbers_with_ts.ts)"
),
// different time unit
(
"SELECT date_bin('5 minutes', ts) as time_window FROM numbers_with_ts GROUP BY time_window;",
Timestamp::new(23_000_000, TimeUnit::Nanosecond),
(
"ts".to_string(),
Some(Timestamp::new(0, TimeUnit::Millisecond)),
Some(Timestamp::new(300000, TimeUnit::Millisecond)),
),
"SELECT date_bin('5 minutes', numbers_with_ts.ts) AS time_window FROM numbers_with_ts WHERE ((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts <= CAST('1970-01-01 00:05:00' AS TIMESTAMP))) GROUP BY date_bin('5 minutes', numbers_with_ts.ts)"
),
// time index with other fields
(
"SELECT sum(number) as sum_up, date_bin('5 minutes', ts) as time_window FROM numbers_with_ts GROUP BY time_window;",
Timestamp::new(23, TimeUnit::Millisecond),
(
"ts".to_string(),
Some(Timestamp::new(0, TimeUnit::Millisecond)),
Some(Timestamp::new(300000, TimeUnit::Millisecond)),
),
"SELECT sum(numbers_with_ts.number) AS sum_up, date_bin('5 minutes', numbers_with_ts.ts) AS time_window FROM numbers_with_ts WHERE ((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts <= CAST('1970-01-01 00:05:00' AS TIMESTAMP))) GROUP BY date_bin('5 minutes', numbers_with_ts.ts)"
),
// time index with other pks
(
"SELECT number, date_bin('5 minutes', ts) as time_window FROM numbers_with_ts GROUP BY time_window, number;",
Timestamp::new(23, TimeUnit::Millisecond),
(
"ts".to_string(),
Some(Timestamp::new(0, TimeUnit::Millisecond)),
Some(Timestamp::new(300000, TimeUnit::Millisecond)),
),
"SELECT numbers_with_ts.number, date_bin('5 minutes', numbers_with_ts.ts) AS time_window FROM numbers_with_ts WHERE ((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts <= CAST('1970-01-01 00:05:00' AS TIMESTAMP))) GROUP BY date_bin('5 minutes', numbers_with_ts.ts), numbers_with_ts.number"
),
// subquery
(
"SELECT number, time_window FROM (SELECT number, date_bin('5 minutes', ts) as time_window FROM numbers_with_ts GROUP BY time_window, number);",
Timestamp::new(23, TimeUnit::Millisecond),
(
"ts".to_string(),
Some(Timestamp::new(0, TimeUnit::Millisecond)),
Some(Timestamp::new(300000, TimeUnit::Millisecond)),
),
"SELECT numbers_with_ts.number, time_window FROM (SELECT numbers_with_ts.number, date_bin('5 minutes', numbers_with_ts.ts) AS time_window FROM numbers_with_ts WHERE ((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts <= CAST('1970-01-01 00:05:00' AS TIMESTAMP))) GROUP BY date_bin('5 minutes', numbers_with_ts.ts), numbers_with_ts.number)"
),
// cte
(
"with cte as (select number, date_bin('5 minutes', ts) as time_window from numbers_with_ts GROUP BY time_window, number) select number, time_window from cte;",
Timestamp::new(23, TimeUnit::Millisecond),
(
"ts".to_string(),
Some(Timestamp::new(0, TimeUnit::Millisecond)),
Some(Timestamp::new(300000, TimeUnit::Millisecond)),
),
"SELECT cte.number, cte.time_window FROM (SELECT numbers_with_ts.number, date_bin('5 minutes', numbers_with_ts.ts) AS time_window FROM numbers_with_ts WHERE ((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts <= CAST('1970-01-01 00:05:00' AS TIMESTAMP))) GROUP BY date_bin('5 minutes', numbers_with_ts.ts), numbers_with_ts.number) AS cte"
),
// complex subquery without alias
(
"SELECT sum(number), number, date_bin('5 minutes', ts) as time_window, bucket_name FROM (SELECT number, ts, case when number < 5 THEN 'bucket_0_5' when number >= 5 THEN 'bucket_5_inf' END as bucket_name FROM numbers_with_ts) GROUP BY number, time_window, bucket_name;",
Timestamp::new(23, TimeUnit::Millisecond),
(
"ts".to_string(),
Some(Timestamp::new(0, TimeUnit::Millisecond)),
Some(Timestamp::new(300000, TimeUnit::Millisecond)),
),
"SELECT sum(numbers_with_ts.number), numbers_with_ts.number, date_bin('5 minutes', numbers_with_ts.ts) AS time_window, bucket_name FROM (SELECT numbers_with_ts.number, numbers_with_ts.ts, CASE WHEN (numbers_with_ts.number < 5) THEN 'bucket_0_5' WHEN (numbers_with_ts.number >= 5) THEN 'bucket_5_inf' END AS bucket_name FROM numbers_with_ts WHERE ((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts <= CAST('1970-01-01 00:05:00' AS TIMESTAMP)))) GROUP BY numbers_with_ts.number, date_bin('5 minutes', numbers_with_ts.ts), bucket_name"
),
// complex subquery alias
(
"SELECT sum(number), number, date_bin('5 minutes', ts) as time_window, bucket_name FROM (SELECT number, ts, case when number < 5 THEN 'bucket_0_5' when number >= 5 THEN 'bucket_5_inf' END as bucket_name FROM numbers_with_ts) as cte GROUP BY number, time_window, bucket_name;",
Timestamp::new(23, TimeUnit::Millisecond),
(
"ts".to_string(),
Some(Timestamp::new(0, TimeUnit::Millisecond)),
Some(Timestamp::new(300000, TimeUnit::Millisecond)),
),
"SELECT sum(cte.number), cte.number, date_bin('5 minutes', cte.ts) AS time_window, cte.bucket_name FROM (SELECT numbers_with_ts.number, numbers_with_ts.ts, CASE WHEN (numbers_with_ts.number < 5) THEN 'bucket_0_5' WHEN (numbers_with_ts.number >= 5) THEN 'bucket_5_inf' END AS bucket_name FROM numbers_with_ts WHERE ((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts <= CAST('1970-01-01 00:05:00' AS TIMESTAMP)))) AS cte GROUP BY cte.number, date_bin('5 minutes', cte.ts), cte.bucket_name"
),
];
for (sql, current, expected, expected_unparsed) in testcases {
let plan = sql_to_df_plan(ctx.clone(), query_engine.clone(), sql, true)
.await
.unwrap();
let real =
find_plan_time_window_bound(&plan, current, ctx.clone(), query_engine.clone())
.await
.unwrap();
assert_eq!(expected, real);
let plan = sql_to_df_plan(ctx.clone(), query_engine.clone(), sql, false)
.await
.unwrap();
let (col_name, lower, upper) = real;
let new_sql = if lower.is_some() {
let to_df_literal = |value| {
let value = Value::from(value);
value.try_to_scalar_value(&value.data_type()).unwrap()
};
let lower = to_df_literal(lower.unwrap());
let upper = to_df_literal(upper.unwrap());
let expr = col(&col_name)
.gt_eq(lit(lower))
.and(col(&col_name).lt_eq(lit(upper)));
let mut add_filter = AddFilterRewriter::new(expr);
let plan = plan.rewrite(&mut add_filter).unwrap().data;
df_plan_to_sql(&plan).unwrap()
} else {
sql.to_string()
};
assert_eq!(expected_unparsed, new_sql);
}
}
}

Some files were not shown because too many files have changed in this diff.