fix: conn pool leak & placeholder feature so ci can compile

fix: placeholder feature so ci can compile
fix: time window filter expr use OR
2026-01-06 13:22:57 +00:00 · 2025-04-10 15:01:07 +08:00 · 2025-04-08 14:37:55 +08:00 · 2025-04-07 16:50:17 +08:00 · 2025-04-07 16:50:17 +08:00 · 2025-04-07 16:50:17 +08:00
234 changed files with 10924 additions and 3908 deletions
--- a/.github/actions/build-greptime-images/action.yml
+++ b/.github/actions/build-greptime-images/action.yml
@@ -34,8 +34,8 @@ inputs:
    required: true
  push-latest-tag:
    description: Whether to push the latest tag
-    required: false
-    default: 'true'
+    required: true
+    default: 'false'
 runs:
  using: composite
  steps:
--- a/.github/actions/build-images/action.yml
+++ b/.github/actions/build-images/action.yml
@@ -22,8 +22,8 @@ inputs:
    required: true
  push-latest-tag:
    description: Whether to push the latest tag
-    required: false
-    default: 'true'
+    required: true
+    default: 'false'
  dev-mode:
    description: Enable dev mode, only build standard greptime
    required: false
--- a/.github/actions/release-cn-artifacts/action.yaml
+++ b/.github/actions/release-cn-artifacts/action.yaml
@@ -51,8 +51,8 @@ inputs:
    required: true
  upload-to-s3:
    description: Upload to S3
-    required: false
-    default: 'true'
+    required: true
+    default: 'false'
  artifacts-dir:
    description: Directory to store artifacts
    required: false
@@ -77,13 +77,21 @@ runs:
      with:
        path: ${{ inputs.artifacts-dir }}

+    - name: Install s5cmd
+      shell: bash
+      run: |
+        wget https://github.com/peak/s5cmd/releases/download/v2.3.0/s5cmd_2.3.0_Linux-64bit.tar.gz
+        tar -xzf s5cmd_2.3.0_Linux-64bit.tar.gz
+        sudo mv s5cmd /usr/local/bin/
+        sudo chmod +x /usr/local/bin/s5cmd
+
    - name: Release artifacts to cn region
      uses: nick-invision/retry@v2
      if: ${{ inputs.upload-to-s3 == 'true' }}
      env:
        AWS_ACCESS_KEY_ID: ${{ inputs.aws-cn-access-key-id }}
        AWS_SECRET_ACCESS_KEY: ${{ inputs.aws-cn-secret-access-key }}
-        AWS_DEFAULT_REGION: ${{ inputs.aws-cn-region }}
+        AWS_REGION: ${{ inputs.aws-cn-region }}
        UPDATE_VERSION_INFO: ${{ inputs.update-version-info }}
      with:
        max_attempts: ${{ inputs.upload-max-retry-times }}
--- a/.github/scripts/upload-artifacts-to-s3.sh
+++ b/.github/scripts/upload-artifacts-to-s3.sh
@@ -33,7 +33,7 @@ function upload_artifacts() {
  #    ├── greptime-darwin-amd64-v0.2.0.sha256sum
  #    └── greptime-darwin-amd64-v0.2.0.tar.gz
  find "$ARTIFACTS_DIR" -type f \( -name "*.tar.gz" -o -name "*.sha256sum" \) | while IFS= read -r file; do
-    aws s3 cp \
+    s5cmd cp \
      "$file" "s3://$AWS_S3_BUCKET/$RELEASE_DIRS/$VERSION/$(basename "$file")"
  done
 }
@@ -45,7 +45,7 @@ function update_version_info() {
    if [[ "$VERSION" =~ ^v[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
      echo "Updating latest-version.txt"
      echo "$VERSION" > latest-version.txt
-      aws s3 cp \
+      s5cmd cp \
        latest-version.txt "s3://$AWS_S3_BUCKET/$RELEASE_DIRS/latest-version.txt"
    fi

@@ -53,7 +53,7 @@ function update_version_info() {
    if [[ "$VERSION" == *"nightly"* ]]; then
      echo "Updating latest-nightly-version.txt"
      echo "$VERSION" > latest-nightly-version.txt
-      aws s3 cp \
+      s5cmd cp \
        latest-nightly-version.txt "s3://$AWS_S3_BUCKET/$RELEASE_DIRS/latest-nightly-version.txt"
    fi
  fi
--- a/.github/workflows/dev-build.yml
+++ b/.github/workflows/dev-build.yml
@@ -274,6 +274,7 @@ jobs:
          aws-cn-access-key-id: ${{ secrets.AWS_CN_ACCESS_KEY_ID }}
          aws-cn-secret-access-key: ${{ secrets.AWS_CN_SECRET_ACCESS_KEY }}
          aws-cn-region: ${{ vars.AWS_RELEASE_BUCKET_REGION }}
+          upload-to-s3: false
          dev-mode: true                     # Only build the standard images(exclude centos images).
          push-latest-tag: false             # Don't push the latest tag to registry.
          update-version-info: false         # Don't update the version info in S3.
--- a/.github/workflows/docbot.yml
+++ b/.github/workflows/docbot.yml
@@ -3,6 +3,10 @@ on:
  pull_request_target:
    types: [opened, edited]

+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
 jobs:
  docbot:
    runs-on: ubuntu-20.04
--- a/.github/workflows/nightly-build.yml
+++ b/.github/workflows/nightly-build.yml
@@ -200,7 +200,7 @@ jobs:
          image-registry-username: ${{ secrets.DOCKERHUB_USERNAME }}
          image-registry-password: ${{ secrets.DOCKERHUB_TOKEN }}
          version: ${{ needs.allocate-runners.outputs.version }}
-          push-latest-tag: true
+          push-latest-tag: false

      - name: Set nightly build result
        id: set-nightly-build-result
@@ -240,9 +240,10 @@ jobs:
          aws-cn-access-key-id: ${{ secrets.AWS_CN_ACCESS_KEY_ID }}
          aws-cn-secret-access-key: ${{ secrets.AWS_CN_SECRET_ACCESS_KEY }}
          aws-cn-region: ${{ vars.AWS_RELEASE_BUCKET_REGION }}
+          upload-to-s3: false
          dev-mode: false
          update-version-info: false  # Don't update version info in S3.
-          push-latest-tag: true
+          push-latest-tag: false

  stop-linux-amd64-runner: # It's always run as the last job in the workflow to make sure that the runner is released.
    name: Stop linux-amd64 runner
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -317,6 +317,7 @@ jobs:
          image-registry-username: ${{ secrets.DOCKERHUB_USERNAME }}
          image-registry-password: ${{ secrets.DOCKERHUB_TOKEN }}
          version: ${{ needs.allocate-runners.outputs.version }}
+          push-latest-tag: true

      - name: Set build image result
        id: set-build-image-result
@@ -361,6 +362,7 @@ jobs:
          aws-cn-secret-access-key: ${{ secrets.AWS_CN_SECRET_ACCESS_KEY }}
          aws-cn-region: ${{ vars.AWS_RELEASE_BUCKET_REGION }}
          dev-mode: false
+          upload-to-s3: true
          update-version-info: true
          push-latest-tag: true

--- a/.github/workflows/semantic-pull-request.yml
+++ b/.github/workflows/semantic-pull-request.yml
@@ -7,6 +7,10 @@ on:
      - reopened
      - edited

+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
 jobs:
  check:
    runs-on: ubuntu-20.04
--- a/AUTHOR.md
+++ b/AUTHOR.md
@@ -3,30 +3,28 @@
 ## Individual Committers (in alphabetical order)

 * [CookiePieWw](https://github.com/CookiePieWw)
-* [KKould](https://github.com/KKould)
-* [NiwakaDev](https://github.com/NiwakaDev)
 * [etolbakov](https://github.com/etolbakov)
 * [irenjj](https://github.com/irenjj)
-* [tisonkun](https://github.com/tisonkun)
+* [KKould](https://github.com/KKould)
 * [Lanqing Yang](https://github.com/lyang24)
+* [NiwakaDev](https://github.com/NiwakaDev)
+* [tisonkun](https://github.com/tisonkun)
+

 ## Team Members (in alphabetical order)

-* [Breeze-P](https://github.com/Breeze-P)
-* [GrepTime](https://github.com/GrepTime)
-* [MichaelScofield](https://github.com/MichaelScofield)
-* [Wenjie0329](https://github.com/Wenjie0329)
-* [WenyXu](https://github.com/WenyXu)
-* [ZonaHex](https://github.com/ZonaHex)
 * [apdong2022](https://github.com/apdong2022)
 * [beryl678](https://github.com/beryl678)
+* [Breeze-P](https://github.com/Breeze-P)
 * [daviderli614](https://github.com/daviderli614)
 * [discord9](https://github.com/discord9)
 * [evenyag](https://github.com/evenyag)
 * [fengjiachun](https://github.com/fengjiachun)
 * [fengys1996](https://github.com/fengys1996)
+* [GrepTime](https://github.com/GrepTime)
 * [holalengyu](https://github.com/holalengyu)
 * [killme2008](https://github.com/killme2008)
+* [MichaelScofield](https://github.com/MichaelScofield)
 * [nicecui](https://github.com/nicecui)
 * [paomian](https://github.com/paomian)
 * [shuiyisong](https://github.com/shuiyisong)
@@ -34,11 +32,14 @@
 * [sunng87](https://github.com/sunng87)
 * [v0y4g3r](https://github.com/v0y4g3r)
 * [waynexia](https://github.com/waynexia)
+* [Wenjie0329](https://github.com/Wenjie0329)
+* [WenyXu](https://github.com/WenyXu)
 * [xtang](https://github.com/xtang)
 * [zhaoyingnan01](https://github.com/zhaoyingnan01)
 * [zhongzc](https://github.com/zhongzc)
+* [ZonaHex](https://github.com/ZonaHex)
 * [zyy17](https://github.com/zyy17)

 ## All Contributors

-[![All Contributors](https://contrib.rocks/image?repo=GreptimeTeam/greptimedb)](https://github.com/GreptimeTeam/greptimedb/graphs/contributors)
+To see the full list of contributors, please visit our [Contributors page](https://github.com/GreptimeTeam/greptimedb/graphs/contributors).
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -432,7 +432,7 @@ dependencies = [
 "arrow-schema",
 "chrono",
 "half",
- "indexmap 2.6.0",
+ "indexmap 2.7.1",
 "lexical-core",
 "num",
 "serde",
@@ -1475,7 +1475,7 @@ version = "0.13.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6026d8cd82ada8bbcfe337805dd1eb6afdc9e80fa4d57e977b3a36315e0c5525"
 dependencies = [
- "indexmap 2.6.0",
+ "indexmap 2.7.1",
 "lazy_static",
 "num-traits",
 "regex",
@@ -2009,10 +2009,12 @@ dependencies = [
 name = "common-function"
 version = "0.12.0"
 dependencies = [
+ "ahash 0.8.11",
 "api",
 "approx 0.5.1",
 "arc-swap",
 "async-trait",
+ "bincode",
 "common-base",
 "common-catalog",
 "common-error",
@@ -2030,6 +2032,7 @@ dependencies = [
 "geo-types",
 "geohash",
 "h3o",
+ "hyperloglogplus",
 "jsonb",
 "nalgebra 0.33.2",
 "num",
@@ -2046,6 +2049,7 @@ dependencies = [
 "store-api",
 "table",
 "tokio",
+ "uddsketch",
 "wkt",
 ]

@@ -2972,7 +2976,7 @@ dependencies = [
 "chrono",
 "half",
 "hashbrown 0.14.5",
- "indexmap 2.6.0",
+ "indexmap 2.7.1",
 "libc",
 "object_store",
 "parquet",
@@ -3032,7 +3036,7 @@ dependencies = [
 "datafusion-functions-aggregate-common",
 "datafusion-functions-window-common",
 "datafusion-physical-expr-common",
- "indexmap 2.6.0",
+ "indexmap 2.7.1",
 "paste",
 "recursive",
 "serde_json",
@@ -3154,7 +3158,7 @@ dependencies = [
 "datafusion-physical-expr-common",
 "datafusion-physical-plan",
 "half",
- "indexmap 2.6.0",
+ "indexmap 2.7.1",
 "log",
 "parking_lot 0.12.3",
 "paste",
@@ -3205,7 +3209,7 @@ dependencies = [
 "datafusion-common",
 "datafusion-expr",
 "datafusion-physical-expr",
- "indexmap 2.6.0",
+ "indexmap 2.7.1",
 "itertools 0.13.0",
 "log",
 "recursive",
@@ -3230,7 +3234,7 @@ dependencies = [
 "datafusion-physical-expr-common",
 "half",
 "hashbrown 0.14.5",
- "indexmap 2.6.0",
+ "indexmap 2.7.1",
 "itertools 0.13.0",
 "log",
 "paste",
@@ -3289,7 +3293,7 @@ dependencies = [
 "futures",
 "half",
 "hashbrown 0.14.5",
- "indexmap 2.6.0",
+ "indexmap 2.7.1",
 "itertools 0.13.0",
 "log",
 "once_cell",
@@ -3309,7 +3313,7 @@ dependencies = [
 "arrow-schema",
 "datafusion-common",
 "datafusion-expr",
- "indexmap 2.6.0",
+ "indexmap 2.7.1",
 "log",
 "recursive",
 "regex",
@@ -3376,6 +3380,7 @@ dependencies = [
 "meta-client",
 "metric-engine",
 "mito2",
+ "num_cpus",
 "object-store",
 "prometheus",
 "prost 0.13.3",
@@ -4160,6 +4165,7 @@ dependencies = [
 "bytes",
 "cache",
 "catalog",
+ "chrono",
 "client",
 "common-base",
 "common-catalog",
@@ -4196,6 +4202,7 @@ dependencies = [
 "meta-client",
 "nom",
 "num-traits",
+ "num_cpus",
 "operator",
 "partition",
 "pretty_assertions",
@@ -4302,6 +4309,7 @@ dependencies = [
 "log-query",
 "log-store",
 "meta-client",
+ "num_cpus",
 "opentelemetry-proto 0.27.0",
 "operator",
 "partition",
@@ -4692,7 +4700,7 @@ dependencies = [
 [[package]]
 name = "greptime-proto"
 version = "0.1.0"
-source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=fc09a5696608d2a0aa718cc835d5cb9c4e8e9387#fc09a5696608d2a0aa718cc835d5cb9c4e8e9387"
+source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=072ce580502e015df1a6b03a185b60309a7c2a7a#072ce580502e015df1a6b03a185b60309a7c2a7a"
 dependencies = [
 "prost 0.13.3",
 "serde",
@@ -4715,7 +4723,7 @@ dependencies = [
 "futures-sink",
 "futures-util",
 "http 0.2.12",
- "indexmap 2.6.0",
+ "indexmap 2.7.1",
 "slab",
 "tokio",
 "tokio-util",
@@ -4734,7 +4742,7 @@ dependencies = [
 "futures-core",
 "futures-sink",
 "http 1.1.0",
- "indexmap 2.6.0",
+ "indexmap 2.7.1",
 "slab",
 "tokio",
 "tokio-util",
@@ -5284,6 +5292,15 @@ dependencies = [
 "tracing",
 ]

+[[package]]
+name = "hyperloglogplus"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "621debdf94dcac33e50475fdd76d34d5ea9c0362a834b9db08c3024696c1fbe3"
+dependencies = [
+ "serde",
+]
+
 [[package]]
 name = "i_float"
 version = "1.3.1"
@@ -5572,9 +5589,9 @@ dependencies = [

 [[package]]
 name = "indexmap"
-version = "2.6.0"
+version = "2.7.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "707907fe3c25f5424cce2cb7e1cbcafee6bdbe735ca90ef77c29e84591e5b9da"
+checksum = "8c9c992b02b5b4c94ea26e32fe5bccb7aa7d9f390ab5c1221ff895bc7ea8b652"
 dependencies = [
 "equivalent",
 "hashbrown 0.15.2",
@@ -5588,7 +5605,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "232929e1d75fe899576a3d5c7416ad0d88dbfbb3c3d6aa00873a7408a50ddb88"
 dependencies = [
 "ahash 0.8.11",
- "indexmap 2.6.0",
+ "indexmap 2.7.1",
 "is-terminal",
 "itoa",
 "log",
@@ -5935,7 +5952,7 @@ version = "0.4.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4ee7893dab2e44ae5f9d0173f26ff4aa327c10b01b06a72b52dd9405b628640d"
 dependencies = [
- "indexmap 2.6.0",
+ "indexmap 2.7.1",
 ]

 [[package]]
@@ -6418,7 +6435,7 @@ dependencies = [
 "cactus",
 "cfgrammar",
 "filetime",
- "indexmap 2.6.0",
+ "indexmap 2.7.1",
 "lazy_static",
 "lrtable",
 "num-traits",
@@ -7659,7 +7676,7 @@ checksum = "1e32339a5dc40459130b3bd269e9892439f55b33e772d2a9d402a789baaf4e8a"
 dependencies = [
 "futures-core",
 "futures-sink",
- "indexmap 2.6.0",
+ "indexmap 2.7.1",
 "js-sys",
 "once_cell",
 "pin-project-lite",
@@ -8231,7 +8248,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b4c5cc86750666a3ed20bdaf5ca2a0344f9c67674cae0515bec2da16fbaa47db"
 dependencies = [
 "fixedbitset",
- "indexmap 2.6.0",
+ "indexmap 2.7.1",
 ]

 [[package]]
@@ -8756,8 +8773,7 @@ dependencies = [
 [[package]]
 name = "promql-parser"
 version = "0.4.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7fe99e6f80a79abccf1e8fb48dd63473a36057e600cc6ea36147c8318698ae6f"
+source = "git+https://github.com/GreptimeTeam/promql-parser.git?rev=27abb8e16003a50c720f00d6c85f41f5fa2a2a8e#27abb8e16003a50c720f00d6c85f41f5fa2a2a8e"
 dependencies = [
 "cfgrammar",
 "chrono",
@@ -10323,7 +10339,7 @@ version = "1.0.137"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "930cfb6e6abf99298aaad7d29abbef7a9999a9a8806a40088f55f0dcec03146b"
 dependencies = [
- "indexmap 2.6.0",
+ "indexmap 2.7.1",
 "itoa",
 "memchr",
 "ryu",
@@ -10394,7 +10410,7 @@ dependencies = [
 "chrono",
 "hex",
 "indexmap 1.9.3",
- "indexmap 2.6.0",
+ "indexmap 2.7.1",
 "serde",
 "serde_derive",
 "serde_json",
@@ -10420,7 +10436,7 @@ version = "0.9.34+deprecated"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6a8b1a1a2ebf674015cc02edccce75287f1a0130d394307b36743c2f5d504b47"
 dependencies = [
- "indexmap 2.6.0",
+ "indexmap 2.7.1",
 "itoa",
 "ryu",
 "serde",
@@ -10481,6 +10497,7 @@ dependencies = [
 "humantime",
 "humantime-serde",
 "hyper 1.4.1",
+ "indexmap 2.7.1",
 "influxdb_line_protocol",
 "itertools 0.10.5",
 "json5",
@@ -10891,12 +10908,12 @@ dependencies = [
 [[package]]
 name = "sqlness"
 version = "0.6.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "308a7338f2211813d6e9da117e9b9b7aee5d072872d11a934002fd2bd4ab5276"
+source = "git+https://github.com/CeresDB/sqlness.git?rev=bb91f31ff58993e07ea89845791235138283a24c#bb91f31ff58993e07ea89845791235138283a24c"
 dependencies = [
 "async-trait",
 "derive_builder 0.11.2",
 "duration-str",
+ "futures",
 "minijinja",
 "prettydiff",
 "regex",
@@ -10922,6 +10939,7 @@ dependencies = [
 "hex",
 "local-ip-address",
 "mysql",
+ "num_cpus",
 "reqwest",
 "serde",
 "serde_json",
@@ -11021,7 +11039,7 @@ dependencies = [
 "futures-util",
 "hashbrown 0.15.2",
 "hashlink",
- "indexmap 2.6.0",
+ "indexmap 2.7.1",
 "log",
 "memchr",
 "once_cell",
@@ -12317,7 +12335,7 @@ version = "0.19.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1b5bb770da30e5cbfde35a2d7b9b8a2c4b8ef89548a7a6aeab5c9a576e3e7421"
 dependencies = [
- "indexmap 2.6.0",
+ "indexmap 2.7.1",
 "toml_datetime",
 "winnow 0.5.40",
 ]
@@ -12328,7 +12346,7 @@ version = "0.22.22"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4ae48d6208a266e853d946088ed816055e556cc6028c5e8e2b84d9fa5dd7c7f5"
 dependencies = [
- "indexmap 2.6.0",
+ "indexmap 2.7.1",
 "serde",
 "serde_spanned",
 "toml_datetime",
@@ -12466,7 +12484,7 @@ dependencies = [
 "futures-core",
 "futures-util",
 "hdrhistogram",
- "indexmap 2.6.0",
+ "indexmap 2.7.1",
 "pin-project-lite",
 "slab",
 "sync_wrapper 1.0.1",
@@ -12954,6 +12972,14 @@ version = "0.1.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2896d95c02a80c6d6a5d6e953d479f5ddf2dfdb6a244441010e373ac0fb88971"

+[[package]]
+name = "uddsketch"
+version = "0.1.0"
+source = "git+https://github.com/GreptimeTeam/timescaledb-toolkit.git?rev=84828fe8fb494a6a61412a3da96517fc80f7bb20#84828fe8fb494a6a61412a3da96517fc80f7bb20"
+dependencies = [
+ "serde",
+]
+
 [[package]]
 name = "unescaper"
 version = "0.1.5"
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -129,7 +129,7 @@ etcd-client = "0.14"
 fst = "0.4.7"
 futures = "0.3"
 futures-util = "0.3"
-greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "fc09a5696608d2a0aa718cc835d5cb9c4e8e9387" }
+greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "072ce580502e015df1a6b03a185b60309a7c2a7a" }
 hex = "0.4"
 http = "1"
 humantime = "2.1"
@@ -160,7 +160,9 @@ parquet = { version = "53.0.0", default-features = false, features = ["arrow", "
 paste = "1.0"
 pin-project = "1.0"
 prometheus = { version = "0.13.3", features = ["process"] }
-promql-parser = { version = "0.4.3", features = ["ser"] }
+promql-parser = { git = "https://github.com/GreptimeTeam/promql-parser.git", features = [
+    "ser",
+], rev = "27abb8e16003a50c720f00d6c85f41f5fa2a2a8e" }
 prost = "0.13"
 raft-engine = { version = "0.4.1", default-features = false }
 rand = "0.8"
--- a/config/config.md
+++ b/config/config.md
@@ -152,6 +152,7 @@
 | `region_engine.mito.index` | -- | -- | The options for index in Mito engine. |
 | `region_engine.mito.index.aux_path` | String | `""` | Auxiliary directory path for the index in filesystem, used to store intermediate files for<br/>creating the index and staging files for searching the index, defaults to `{data_home}/index_intermediate`.<br/>The default name for this directory is `index_intermediate` for backward compatibility.<br/><br/>This path contains two subdirectories:<br/>- `__intm`: for storing intermediate files used during creating index.<br/>- `staging`: for storing staging files used during searching index. |
 | `region_engine.mito.index.staging_size` | String | `2GB` | The max capacity of the staging directory. |
+| `region_engine.mito.index.staging_ttl` | String | `7d` | The TTL of the staging directory.<br/>Defaults to 7 days.<br/>Set it to "0s" to disable TTL. |
 | `region_engine.mito.index.metadata_cache_size` | String | `64MiB` | Cache size for inverted index metadata. |
 | `region_engine.mito.index.content_cache_size` | String | `128MiB` | Cache size for inverted index content. |
 | `region_engine.mito.index.content_cache_page_size` | String | `64KiB` | Page size for inverted index content cache. |
@@ -318,6 +319,7 @@
 | `selector` | String | `round_robin` | Datanode selector type.<br/>- `round_robin` (default value)<br/>- `lease_based`<br/>- `load_based`<br/>For details, please see "https://docs.greptime.com/developer-guide/metasrv/selector". |
 | `use_memory_store` | Bool | `false` | Store data in memory. |
 | `enable_region_failover` | Bool | `false` | Whether to enable region failover.<br/>This feature is only available on GreptimeDB running on cluster mode and<br/>- Using Remote WAL<br/>- Using shared storage (e.g., s3). |
+| `node_max_idle_time` | String | `24hours` | Max allowed idle time before removing node info from metasrv memory. |
 | `enable_telemetry` | Bool | `true` | Whether to enable greptimedb telemetry. Enabled by default. |
 | `runtime` | -- | -- | The runtime options. |
 | `runtime.global_rt_size` | Integer | `8` | The number of threads to execute the runtime for global read operations. |
@@ -491,6 +493,7 @@
 | `region_engine.mito.index` | -- | -- | The options for index in Mito engine. |
 | `region_engine.mito.index.aux_path` | String | `""` | Auxiliary directory path for the index in filesystem, used to store intermediate files for<br/>creating the index and staging files for searching the index, defaults to `{data_home}/index_intermediate`.<br/>The default name for this directory is `index_intermediate` for backward compatibility.<br/><br/>This path contains two subdirectories:<br/>- `__intm`: for storing intermediate files used during creating index.<br/>- `staging`: for storing staging files used during searching index. |
 | `region_engine.mito.index.staging_size` | String | `2GB` | The max capacity of the staging directory. |
+| `region_engine.mito.index.staging_ttl` | String | `7d` | The TTL of the staging directory.<br/>Defaults to 7 days.<br/>Set it to "0s" to disable TTL. |
 | `region_engine.mito.index.metadata_cache_size` | String | `64MiB` | Cache size for inverted index metadata. |
 | `region_engine.mito.index.content_cache_size` | String | `128MiB` | Cache size for inverted index content. |
 | `region_engine.mito.index.content_cache_page_size` | String | `64KiB` | Page size for inverted index content cache. |
--- a/config/datanode.example.toml
+++ b/config/datanode.example.toml
@@ -497,6 +497,11 @@ aux_path = ""
 ## The max capacity of the staging directory.
 staging_size = "2GB"

+## The TTL of the staging directory.
+## Defaults to 7 days.
+## Set it to "0s" to disable TTL.
+staging_ttl = "7d"
+
 ## Cache size for inverted index metadata.
 metadata_cache_size = "64MiB"

--- a/config/metasrv.example.toml
+++ b/config/metasrv.example.toml
@@ -50,6 +50,9 @@ use_memory_store = false
 ## - Using shared storage (e.g., s3).
 enable_region_failover = false

+## Max allowed idle time before removing node info from metasrv memory.
+node_max_idle_time = "24hours"
+
 ## Whether to enable greptimedb telemetry. Enabled by default.
 #+ enable_telemetry = true

--- a/config/standalone.example.toml
+++ b/config/standalone.example.toml
@@ -584,6 +584,11 @@ aux_path = ""
 ## The max capacity of the staging directory.
 staging_size = "2GB"

+## The TTL of the staging directory.
+## Defaults to 7 days.
+## Set it to "0s" to disable TTL.
+staging_ttl = "7d"
+
 ## Cache size for inverted index metadata.
 metadata_cache_size = "64MiB"

--- a/grafana/greptimedb-cluster.json
+++ b/grafana/greptimedb-cluster.json
--- a/grafana/greptimedb.json
+++ b/grafana/greptimedb.json
@@ -384,8 +384,8 @@
        "rowHeight": 0.9,
        "showValue": "auto",
        "tooltip": {
-          "mode": "none",
-          "sort": "none"
+          "mode": "multi",
+          "sort": "desc"
        }
      },
      "targets": [
@@ -483,8 +483,8 @@
          "showLegend": true
        },
        "tooltip": {
-          "mode": "single",
-          "sort": "none"
+          "mode": "multi",
+          "sort": "desc"
        }
      },
      "pluginVersion": "10.2.3",
@@ -578,8 +578,8 @@
          "showLegend": true
        },
        "tooltip": {
-          "mode": "single",
-          "sort": "none"
+          "mode": "multi",
+          "sort": "desc"
        }
      },
      "pluginVersion": "10.2.3",
@@ -601,7 +601,7 @@
      "type": "timeseries"
    },
    {
-      "collapsed": true,
+      "collapsed": false,
      "gridPos": {
        "h": 1,
        "w": 24,
@@ -684,8 +684,8 @@
              "showLegend": true
            },
            "tooltip": {
-              "mode": "single",
-              "sort": "none"
+              "mode": "multi",
+              "sort": "desc"
            }
          },
          "targets": [
@@ -878,8 +878,8 @@
              "showLegend": true
            },
            "tooltip": {
-              "mode": "single",
-              "sort": "none"
+              "mode": "multi",
+              "sort": "desc"
            }
          },
          "targets": [
@@ -1124,8 +1124,8 @@
              "showLegend": true
            },
            "tooltip": {
-              "mode": "single",
-              "sort": "none"
+              "mode": "multi",
+              "sort": "desc"
            }
          },
          "targets": [
@@ -1223,8 +1223,8 @@
              "showLegend": true
            },
            "tooltip": {
-              "mode": "single",
-              "sort": "none"
+              "mode": "multi",
+              "sort": "desc"
            }
          },
          "targets": [
@@ -1322,8 +1322,8 @@
              "showLegend": true
            },
            "tooltip": {
-              "mode": "single",
-              "sort": "none"
+              "mode": "multi",
+              "sort": "desc"
            }
          },
          "targets": [
@@ -1456,8 +1456,8 @@
          "showLegend": true
        },
        "tooltip": {
-          "mode": "single",
-          "sort": "none"
+          "mode": "multi",
+          "sort": "desc"
        }
      },
      "targets": [
@@ -1573,8 +1573,8 @@
          "showLegend": true
        },
        "tooltip": {
-          "mode": "single",
-          "sort": "none"
+          "mode": "multi",
+          "sort": "desc"
        }
      },
      "targets": [
@@ -1673,8 +1673,8 @@
          "showLegend": true
        },
        "tooltip": {
-          "mode": "single",
-          "sort": "none"
+          "mode": "multi",
+          "sort": "desc"
        }
      },
      "targets": [
@@ -1773,8 +1773,8 @@
          "showLegend": true
        },
        "tooltip": {
-          "mode": "single",
-          "sort": "none"
+          "mode": "multi",
+          "sort": "desc"
        }
      },
      "targets": [
@@ -1890,8 +1890,8 @@
          "showLegend": true
        },
        "tooltip": {
-          "mode": "single",
-          "sort": "none"
+          "mode": "multi",
+          "sort": "desc"
        }
      },
      "targets": [
@@ -2002,8 +2002,8 @@
          "showLegend": true
        },
        "tooltip": {
-          "mode": "single",
-          "sort": "none"
+          "mode": "multi",
+          "sort": "desc"
        }
      },
      "targets": [
@@ -2120,8 +2120,8 @@
          "showLegend": true
        },
        "tooltip": {
-          "mode": "single",
-          "sort": "none"
+          "mode": "multi",
+          "sort": "desc"
        }
      },
      "targets": [
@@ -2233,8 +2233,8 @@
          "showLegend": true
        },
        "tooltip": {
-          "mode": "single",
-          "sort": "none"
+          "mode": "multi",
+          "sort": "desc"
        }
      },
      "targets": [
@@ -2334,8 +2334,8 @@
          "showLegend": true
        },
        "tooltip": {
-          "mode": "single",
-          "sort": "none"
+          "mode": "multi",
+          "sort": "desc"
        }
      },
      "targets": [
@@ -2435,8 +2435,8 @@
          "showLegend": true
        },
        "tooltip": {
-          "mode": "single",
-          "sort": "none"
+          "mode": "multi",
+          "sort": "desc"
        }
      },
      "targets": [
@@ -2548,8 +2548,8 @@
          "showLegend": true
        },
        "tooltip": {
-          "mode": "single",
-          "sort": "none"
+          "mode": "multi",
+          "sort": "desc"
        }
      },
      "targets": [
@@ -2661,8 +2661,8 @@
          "showLegend": true
        },
        "tooltip": {
-          "mode": "single",
-          "sort": "none"
+          "mode": "multi",
+          "sort": "desc"
        }
      },
      "targets": [
@@ -2788,8 +2788,8 @@
          "showLegend": true
        },
        "tooltip": {
-          "mode": "single",
-          "sort": "none"
+          "mode": "multi",
+          "sort": "desc"
        }
      },
      "targets": [
@@ -2889,8 +2889,8 @@
          "showLegend": true
        },
        "tooltip": {
-          "mode": "single",
-          "sort": "none"
+          "mode": "multi",
+          "sort": "desc"
        }
      },
      "targets": [
@@ -2990,8 +2990,8 @@
          "showLegend": true
        },
        "tooltip": {
-          "mode": "single",
-          "sort": "none"
+          "mode": "multi",
+          "sort": "desc"
        }
      },
      "targets": [
@@ -3091,8 +3091,8 @@
          "showLegend": true
        },
        "tooltip": {
-          "mode": "single",
-          "sort": "none"
+          "mode": "multi",
+          "sort": "desc"
        }
      },
      "targets": [
@@ -3191,8 +3191,8 @@
          "showLegend": true
        },
        "tooltip": {
-          "mode": "single",
-          "sort": "none"
+          "mode": "multi",
+          "sort": "desc"
        }
      },
      "targets": [
@@ -3302,8 +3302,8 @@
          "showLegend": true
        },
        "tooltip": {
-          "mode": "single",
-          "sort": "none"
+          "mode": "multi",
+          "sort": "desc"
        }
      },
      "targets": [
@@ -3432,8 +3432,8 @@
          "showLegend": true
        },
        "tooltip": {
-          "mode": "single",
-          "sort": "none"
+          "mode": "multi",
+          "sort": "desc"
        }
      },
      "targets": [
@@ -3543,8 +3543,8 @@
          "showLegend": true
        },
        "tooltip": {
-          "mode": "single",
-          "sort": "none"
+          "mode": "multi",
+          "sort": "desc"
        }
      },
      "targets": [
@@ -3657,8 +3657,8 @@
          "showLegend": true
        },
        "tooltip": {
-          "mode": "single",
-          "sort": "none"
+          "mode": "multi",
+          "sort": "desc"
        }
      },
      "targets": [
@@ -3808,8 +3808,8 @@
          "showLegend": true
        },
        "tooltip": {
-          "mode": "single",
-          "sort": "none"
+          "mode": "multi",
+          "sort": "desc"
        }
      },
      "targets": [
@@ -3909,8 +3909,8 @@
          "showLegend": true
        },
        "tooltip": {
-          "mode": "single",
-          "sort": "none"
+          "mode": "multi",
+          "sort": "desc"
        }
      },
      "targets": [
@@ -4011,8 +4011,8 @@
          "showLegend": true
        },
        "tooltip": {
-          "mode": "single",
-          "sort": "none"
+          "mode": "multi",
+          "sort": "desc"
        }
      },
      "targets": [
@@ -4113,8 +4113,8 @@
          "showLegend": true
        },
        "tooltip": {
-          "mode": "single",
-          "sort": "none"
+          "mode": "multi",
+          "sort": "desc"
        }
      },
      "targets": [
--- a/src/api/Cargo.toml
+++ b/src/api/Cargo.toml
@@ -15,13 +15,10 @@ common-macro.workspace = true
 common-time.workspace = true
 datatypes.workspace = true
 greptime-proto.workspace = true
-paste = "1.0"
+paste.workspace = true
 prost.workspace = true
 serde_json.workspace = true
 snafu.workspace = true

 [build-dependencies]
 tonic-build = "0.11"
-
-[dev-dependencies]
-paste = "1.0"
--- a/src/catalog/Cargo.toml
+++ b/src/catalog/Cargo.toml
@@ -15,7 +15,7 @@ api.workspace = true
 arrow.workspace = true
 arrow-schema.workspace = true
 async-stream.workspace = true
-async-trait = "0.1"
+async-trait.workspace = true
 bytes.workspace = true
 common-catalog.workspace = true
 common-error.workspace = true
@@ -31,7 +31,7 @@ common-version.workspace = true
 dashmap.workspace = true
 datafusion.workspace = true
 datatypes.workspace = true
-futures = "0.3"
+futures.workspace = true
 futures-util.workspace = true
 humantime.workspace = true
 itertools.workspace = true
@@ -39,7 +39,7 @@ lazy_static.workspace = true
 meta-client.workspace = true
 moka = { workspace = true, features = ["future", "sync"] }
 partition.workspace = true
-paste = "1.0"
+paste.workspace = true
 prometheus.workspace = true
 rustc-hash.workspace = true
 serde_json.workspace = true
@@ -49,7 +49,7 @@ sql.workspace = true
 store-api.workspace = true
 table.workspace = true
 tokio.workspace = true
-tokio-stream = "0.1"
+tokio-stream.workspace = true

 [dev-dependencies]
 cache.workspace = true
--- a/src/client/src/lib.rs
+++ b/src/client/src/lib.rs
@@ -16,7 +16,6 @@

 mod client;
 pub mod client_manager;
-#[cfg(feature = "testing")]
 mod database;
 pub mod error;
 pub mod flow;
@@ -34,7 +33,6 @@ pub use common_recordbatch::{RecordBatches, SendableRecordBatchStream};
 use snafu::OptionExt;

 pub use self::client::Client;
-#[cfg(feature = "testing")]
 pub use self::database::Database;
 pub use self::error::{Error, Result};
 use crate::error::{IllegalDatabaseResponseSnafu, ServerSnafu};
--- a/src/cmd/src/flownode.rs
+++ b/src/cmd/src/flownode.rs
@@ -32,7 +32,7 @@ use common_meta::key::TableMetadataManager;
 use common_telemetry::info;
 use common_telemetry::logging::TracingOptions;
 use common_version::{short_version, version};
-use flow::{FlownodeBuilder, FlownodeInstance, FrontendInvoker};
+use flow::{FlownodeBuilder, FlownodeInstance, FrontendClient, FrontendInvoker};
 use meta_client::{MetaClientOptions, MetaClientType};
 use servers::Mode;
 use snafu::{OptionExt, ResultExt};
@@ -317,6 +317,8 @@ impl StartCommand {
            Arc::new(executor),
        );

+        let frontend_client = FrontendClient::from_meta_client(meta_client.clone());
+
        let flow_metadata_manager = Arc::new(FlowMetadataManager::new(cached_meta_backend.clone()));
        let flownode_builder = FlownodeBuilder::new(
            opts,
@@ -324,6 +326,7 @@ impl StartCommand {
            table_metadata_manager,
            catalog_manager.clone(),
            flow_metadata_manager,
+            Arc::new(frontend_client),
        )
        .with_heartbeat_task(heartbeat_task);

--- a/src/cmd/src/metasrv.rs
+++ b/src/cmd/src/metasrv.rs
@@ -42,7 +42,7 @@ pub struct Instance {
 }

 impl Instance {
-    fn new(instance: MetasrvInstance, guard: Vec<WorkerGuard>) -> Self {
+    pub fn new(instance: MetasrvInstance, guard: Vec<WorkerGuard>) -> Self {
        Self {
            instance,
            _guard: guard,
--- a/src/cmd/src/standalone.rs
+++ b/src/cmd/src/standalone.rs
@@ -54,7 +54,10 @@ use datanode::config::{DatanodeOptions, ProcedureConfig, RegionEngineConfig, Sto
 use datanode::datanode::{Datanode, DatanodeBuilder};
 use datanode::region_server::RegionServer;
 use file_engine::config::EngineConfig as FileEngineConfig;
-use flow::{FlowConfig, FlowWorkerManager, FlownodeBuilder, FlownodeOptions, FrontendInvoker};
+use flow::{
+    FlowConfig, FlowWorkerManager, FlownodeBuilder, FlownodeOptions, FrontendClient,
+    FrontendInvoker,
+};
 use frontend::frontend::FrontendOptions;
 use frontend::instance::builder::FrontendBuilder;
 use frontend::instance::{FrontendInstance, Instance as FeInstance, StandaloneDatanodeManager};
@@ -533,12 +536,16 @@ impl StartCommand {
            flow: opts.flow.clone(),
            ..Default::default()
        };
+
+        let fe_server_addr = fe_opts.grpc.bind_addr.clone();
+        let frontend_client = FrontendClient::from_static_grpc_addr(fe_server_addr);
        let flow_builder = FlownodeBuilder::new(
            flownode_options,
            plugins.clone(),
            table_metadata_manager.clone(),
            catalog_manager.clone(),
            flow_metadata_manager.clone(),
+            Arc::new(frontend_client),
        );
        let flownode = Arc::new(
            flow_builder
--- a/src/common/base/Cargo.toml
+++ b/src/common/base/Cargo.toml
@@ -18,7 +18,7 @@ bytes.workspace = true
 common-error.workspace = true
 common-macro.workspace = true
 futures.workspace = true
-paste = "1.0"
+paste.workspace = true
 pin-project.workspace = true
 rand.workspace = true
 serde = { version = "1.0", features = ["derive"] }
--- a/src/common/datasource/Cargo.toml
+++ b/src/common/datasource/Cargo.toml
@@ -35,7 +35,7 @@ orc-rust = { version = "0.5", default-features = false, features = [
    "async",
 ] }
 parquet.workspace = true
-paste = "1.0"
+paste.workspace = true
 rand.workspace = true
 regex = "1.7"
 serde.workspace = true
--- a/src/common/function/Cargo.toml
+++ b/src/common/function/Cargo.toml
@@ -12,9 +12,11 @@ default = ["geo"]
 geo = ["geohash", "h3o", "s2", "wkt", "geo-types", "dep:geo"]

 [dependencies]
+ahash = "0.8"
 api.workspace = true
 arc-swap = "1.0"
 async-trait.workspace = true
+bincode = "1.3"
 common-base.workspace = true
 common-catalog.workspace = true
 common-error.workspace = true
@@ -32,12 +34,13 @@ geo = { version = "0.29", optional = true }
 geo-types = { version = "0.7", optional = true }
 geohash = { version = "0.13", optional = true }
 h3o = { version = "0.6", optional = true }
+hyperloglogplus = "0.4"
 jsonb.workspace = true
 nalgebra.workspace = true
 num = "0.4"
 num-traits = "0.2"
 once_cell.workspace = true
-paste = "1.0"
+paste.workspace = true
 s2 = { version = "0.0.12", optional = true }
 serde.workspace = true
 serde_json.workspace = true
@@ -47,6 +50,7 @@ sql.workspace = true
 statrs = "0.16"
 store-api.workspace = true
 table.workspace = true
+uddsketch = { git = "https://github.com/GreptimeTeam/timescaledb-toolkit.git", rev = "84828fe8fb494a6a61412a3da96517fc80f7bb20" }
 wkt = { version = "0.11", optional = true }

 [dev-dependencies]
--- a/src/common/function/src/aggr.rs
+++ b/src/common/function/src/aggr.rs
@@ -0,0 +1,20 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+mod hll;
+mod uddsketch_state;
+
+pub(crate) use hll::HllStateType;
+pub use hll::{HllState, HLL_MERGE_NAME, HLL_NAME};
+pub use uddsketch_state::{UddSketchState, UDDSKETCH_STATE_NAME};
--- a/src/common/function/src/aggr/hll.rs
+++ b/src/common/function/src/aggr/hll.rs
@@ -0,0 +1,319 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::sync::Arc;
+
+use common_query::prelude::*;
+use common_telemetry::trace;
+use datafusion::arrow::array::ArrayRef;
+use datafusion::common::cast::{as_binary_array, as_string_array};
+use datafusion::common::not_impl_err;
+use datafusion::error::{DataFusionError, Result as DfResult};
+use datafusion::logical_expr::function::AccumulatorArgs;
+use datafusion::logical_expr::{Accumulator as DfAccumulator, AggregateUDF};
+use datafusion::prelude::create_udaf;
+use datatypes::arrow::datatypes::DataType;
+use hyperloglogplus::{HyperLogLog, HyperLogLogPlus};
+
+use crate::utils::FixedRandomState;
+
+pub const HLL_NAME: &str = "hll";
+pub const HLL_MERGE_NAME: &str = "hll_merge";
+
+const DEFAULT_PRECISION: u8 = 14;
+
+pub(crate) type HllStateType = HyperLogLogPlus<String, FixedRandomState>;
+
+/// Accumulator state shared by the `hll` and `hll_merge` aggregate functions.
+///
+/// Holds a single [`HllStateType`] sketch that is fed either with raw strings or with
+/// previously serialized sketches.
+pub struct HllState {
+    hll: HllStateType,
+}
+
+impl std::fmt::Debug for HllState {
+    /// Emits a fixed placeholder; the contents of the sketch are never rendered.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.write_str("HllState<Opaque>")
+    }
+}
+
+impl Default for HllState {
+    /// Same as [`HllState::new`]: an empty sketch with the default precision.
+    fn default() -> Self {
+        HllState::new()
+    }
+}
+
+impl HllState {
+    /// Creates an empty sketch from `DEFAULT_PRECISION` and `FixedRandomState::new()`.
+    pub fn new() -> Self {
+        // Safety: the DEFAULT_PRECISION is fixed and valid
+        let hll = HllStateType::new(DEFAULT_PRECISION, FixedRandomState::new()).unwrap();
+        Self { hll }
+    }
+
+    /// Builds the aggregate UDF registered under [`HLL_NAME`].
+    ///
+    /// It takes a single `Utf8` column and returns the resulting HyperLogLog state as
+    /// `Binary`; the intermediate state is a single `Binary` value as well.
+    pub fn state_udf_impl() -> AggregateUDF {
+        let input_types = vec![DataType::Utf8];
+        let state_types = vec![DataType::Binary];
+        create_udaf(
+            HLL_NAME,
+            input_types,
+            Arc::new(DataType::Binary),
+            Volatility::Immutable,
+            Arc::new(Self::create_accumulator),
+            Arc::new(state_types),
+        )
+    }
+
+    /// Builds the aggregate UDF registered under [`HLL_MERGE_NAME`].
+    ///
+    /// It takes a `Binary` column of states produced by `hll` and folds them into one
+    /// `Binary` state.
+    pub fn merge_udf_impl() -> AggregateUDF {
+        let input_types = vec![DataType::Binary];
+        let state_types = vec![DataType::Binary];
+        create_udaf(
+            HLL_MERGE_NAME,
+            input_types,
+            Arc::new(DataType::Binary),
+            Volatility::Immutable,
+            Arc::new(Self::create_merge_accumulator),
+            Arc::new(state_types),
+        )
+    }
+
+    /// Inserts one string value into the sketch.
+    fn update(&mut self, value: &str) {
+        self.hll.insert(value);
+    }
+
+    /// Merges a bincode-serialized sketch into this one.
+    ///
+    /// This is best effort: when `raw` cannot be deserialized, or the sketch merge itself
+    /// reports an error, the input is skipped and only a trace-level message is logged.
+    fn merge(&mut self, raw: &[u8]) {
+        let merged = bincode::deserialize::<HllStateType>(raw)
+            .ok()
+            .and_then(|other| self.hll.merge(&other).ok());
+
+        if merged.is_none() {
+            trace!("Warning: Failed to merge HyperLogLog from {:?}", raw);
+        }
+    }
+
+    /// Accumulator factory for `hll`; only a `Utf8` first argument is accepted.
+    fn create_accumulator(acc_args: AccumulatorArgs) -> DfResult<Box<dyn DfAccumulator>> {
+        match acc_args.exprs[0].data_type(acc_args.schema)? {
+            DataType::Utf8 => Ok(Box::new(Self::new())),
+            other => not_impl_err!("{HLL_NAME} does not support data type: {other}"),
+        }
+    }
+
+    /// Accumulator factory for `hll_merge`; only a `Binary` first argument is accepted.
+    fn create_merge_accumulator(acc_args: AccumulatorArgs) -> DfResult<Box<dyn DfAccumulator>> {
+        match acc_args.exprs[0].data_type(acc_args.schema)? {
+            DataType::Binary => Ok(Box::new(Self::new())),
+            other => not_impl_err!("{HLL_MERGE_NAME} does not support data type: {other}"),
+        }
+    }
+}
+
+impl DfAccumulator for HllState {
+    /// Consumes the first input column.
+    ///
+    /// `Utf8` values are inserted into the sketch, `Binary` values are treated as
+    /// serialized sketches and merged. Null entries are skipped; any other data type is
+    /// rejected with a "not implemented" error.
+    fn update_batch(&mut self, values: &[ArrayRef]) -> DfResult<()> {
+        let input = &values[0];
+
+        match input.data_type() {
+            DataType::Utf8 => as_string_array(input)?
+                .iter()
+                .flatten()
+                .for_each(|text| self.update(text)),
+            DataType::Binary => as_binary_array(input)?
+                .iter()
+                .flatten()
+                .for_each(|raw| self.merge(raw)),
+            unsupported => {
+                return not_impl_err!(
+                    "HLL functions do not support data type: {}",
+                    unsupported
+                )
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Returns the bincode-serialized sketch as a `Binary` scalar.
+    fn evaluate(&mut self) -> DfResult<ScalarValue> {
+        let bytes = bincode::serialize(&self.hll).map_err(|e| {
+            DataFusionError::Internal(format!("Failed to serialize HyperLogLog: {}", e))
+        })?;
+
+        Ok(ScalarValue::Binary(Some(bytes)))
+    }
+
+    /// Reports `size_of_val` of the wrapped sketch.
+    // NOTE(review): this is the size of the `HllStateType` value itself; whether it covers
+    // any heap data owned by the sketch cannot be told from here - confirm.
+    fn size(&self) -> usize {
+        std::mem::size_of_val(&self.hll)
+    }
+
+    /// Returns the intermediate state: one `Binary` scalar holding the serialized sketch.
+    fn state(&mut self) -> DfResult<Vec<ScalarValue>> {
+        let bytes = bincode::serialize(&self.hll).map_err(|e| {
+            DataFusionError::Internal(format!("Failed to serialize HyperLogLog: {}", e))
+        })?;
+
+        Ok(vec![ScalarValue::Binary(Some(bytes))])
+    }
+
+    /// Merges every non-null serialized sketch found in the first state column.
+    fn merge_batch(&mut self, states: &[ArrayRef]) -> DfResult<()> {
+        as_binary_array(&states[0])?
+            .iter()
+            .flatten()
+            .for_each(|raw| self.merge(raw));
+
+        Ok(())
+    }
+}
+
+// Unit tests for `HllState`: each test serializes the state through `evaluate` and checks
+// the estimated count of the deserialized sketch.
+#[cfg(test)]
+mod tests {
+    use datafusion::arrow::array::{BinaryArray, StringArray};
+
+    use super::*;
+
+    // Three distinct values inserted one by one must be counted as 3.
+    #[test]
+    fn test_hll_basic() {
+        let mut state = HllState::new();
+        state.update("1");
+        state.update("2");
+        state.update("3");
+
+        let result = state.evaluate().unwrap();
+        if let ScalarValue::Binary(Some(bytes)) = result {
+            let mut hll: HllStateType = bincode::deserialize(&bytes).unwrap();
+            assert_eq!(hll.count().trunc() as u32, 3);
+        } else {
+            panic!("Expected binary scalar value");
+        }
+    }
+
+    // Merging a serialized state into a fresh one must yield the same estimated count.
+    #[test]
+    fn test_hll_roundtrip() {
+        let mut state = HllState::new();
+        state.update("1");
+        state.update("2");
+
+        // Serialize
+        let serialized = state.evaluate().unwrap();
+
+        // Create new state and merge the serialized data
+        let mut new_state = HllState::new();
+        if let ScalarValue::Binary(Some(bytes)) = &serialized {
+            new_state.merge(bytes);
+
+            // Verify the merged state matches original
+            let result = new_state.evaluate().unwrap();
+            if let ScalarValue::Binary(Some(new_bytes)) = result {
+                let mut original: HllStateType = bincode::deserialize(bytes).unwrap();
+                let mut merged: HllStateType = bincode::deserialize(&new_bytes).unwrap();
+                assert_eq!(original.count(), merged.count());
+            } else {
+                panic!("Expected binary scalar value");
+            }
+        } else {
+            panic!("Expected binary scalar value");
+        }
+    }
+
+    // `update_batch` with a string array of nine distinct values must be counted as 9.
+    #[test]
+    fn test_hll_batch_update() {
+        let mut state = HllState::new();
+
+        // Test string values
+        let str_values = vec!["a", "b", "c", "d", "e", "f", "g", "h", "i"];
+        let str_array = Arc::new(StringArray::from(str_values)) as ArrayRef;
+        state.update_batch(&[str_array]).unwrap();
+
+        let result = state.evaluate().unwrap();
+        if let ScalarValue::Binary(Some(bytes)) = result {
+            let mut hll: HllStateType = bincode::deserialize(&bytes).unwrap();
+            assert_eq!(hll.count().trunc() as u32, 9);
+        } else {
+            panic!("Expected binary scalar value");
+        }
+    }
+
+    // `merge_batch` over two single-value states must be counted as 2.
+    #[test]
+    fn test_hll_merge_batch() {
+        let mut state1 = HllState::new();
+        state1.update("1");
+        let state1_binary = state1.evaluate().unwrap();
+
+        let mut state2 = HllState::new();
+        state2.update("2");
+        let state2_binary = state2.evaluate().unwrap();
+
+        let mut merged_state = HllState::new();
+        if let (ScalarValue::Binary(Some(bytes1)), ScalarValue::Binary(Some(bytes2))) =
+            (&state1_binary, &state2_binary)
+        {
+            let binary_array = Arc::new(BinaryArray::from(vec![
+                bytes1.as_slice(),
+                bytes2.as_slice(),
+            ])) as ArrayRef;
+            merged_state.merge_batch(&[binary_array]).unwrap();
+
+            let result = merged_state.evaluate().unwrap();
+            if let ScalarValue::Binary(Some(bytes)) = result {
+                let mut hll: HllStateType = bincode::deserialize(&bytes).unwrap();
+                assert_eq!(hll.count().trunc() as u32, 2);
+            } else {
+                panic!("Expected binary scalar value");
+            }
+        } else {
+            panic!("Expected binary scalar values");
+        }
+    }
+
+    // Exercises the `Binary` branch of `update_batch` (the path used by `hll_merge`) with
+    // two overlapping states.
+    #[test]
+    fn test_hll_merge_function() {
+        // Create two HLL states with different values
+        let mut state1 = HllState::new();
+        state1.update("1");
+        state1.update("2");
+        let state1_binary = state1.evaluate().unwrap();
+
+        let mut state2 = HllState::new();
+        state2.update("2");
+        state2.update("3");
+        let state2_binary = state2.evaluate().unwrap();
+
+        // Create a merge state and merge both states
+        let mut merge_state = HllState::new();
+        if let (ScalarValue::Binary(Some(bytes1)), ScalarValue::Binary(Some(bytes2))) =
+            (&state1_binary, &state2_binary)
+        {
+            let binary_array = Arc::new(BinaryArray::from(vec![
+                bytes1.as_slice(),
+                bytes2.as_slice(),
+            ])) as ArrayRef;
+            merge_state.update_batch(&[binary_array]).unwrap();
+
+            let result = merge_state.evaluate().unwrap();
+            if let ScalarValue::Binary(Some(bytes)) = result {
+                let mut hll: HllStateType = bincode::deserialize(&bytes).unwrap();
+                // Should have 3 unique values: "1", "2", "3"
+                assert_eq!(hll.count().trunc() as u32, 3);
+            } else {
+                panic!("Expected binary scalar value");
+            }
+        } else {
+            panic!("Expected binary scalar values");
+        }
+    }
+}
--- a/src/common/function/src/aggr/uddsketch_state.rs
+++ b/src/common/function/src/aggr/uddsketch_state.rs
@@ -0,0 +1,307 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::sync::Arc;
+
+use common_query::prelude::*;
+use common_telemetry::trace;
+use datafusion::common::cast::{as_binary_array, as_primitive_array};
+use datafusion::common::not_impl_err;
+use datafusion::error::{DataFusionError, Result as DfResult};
+use datafusion::logical_expr::function::AccumulatorArgs;
+use datafusion::logical_expr::{Accumulator as DfAccumulator, AggregateUDF};
+use datafusion::physical_plan::expressions::Literal;
+use datafusion::prelude::create_udaf;
+use datatypes::arrow::array::ArrayRef;
+use datatypes::arrow::datatypes::{DataType, Float64Type};
+use uddsketch::{SketchHashKey, UDDSketch};
+
+pub const UDDSKETCH_STATE_NAME: &str = "uddsketch_state";
+
+/// Accumulator state behind the `uddsketch_state` aggregate function.
+///
+/// Wraps one [`UDDSketch`] that collects `f64` values and can absorb other
+/// bincode-serialized sketches.
+#[derive(Debug)]
+pub struct UddSketchState {
+    uddsketch: UDDSketch,
+}
+
+impl UddSketchState {
+    /// Creates an empty sketch; both arguments are forwarded to [`UDDSketch::new`].
+    pub fn new(bucket_size: u64, error_rate: f64) -> Self {
+        let uddsketch = UDDSketch::new(bucket_size, error_rate);
+        Self { uddsketch }
+    }
+
+    /// Builds the aggregate UDF registered under [`UDDSKETCH_STATE_NAME`].
+    ///
+    /// Its arguments are `(Int64, Float64, Float64)`: the first two are read by
+    /// `downcast_accumulator_args` as bucket size and error rate, the third one is the
+    /// data column. Both the result and the intermediate state are `Binary`.
+    pub fn udf_impl() -> AggregateUDF {
+        let input_types = vec![DataType::Int64, DataType::Float64, DataType::Float64];
+        let state_types = vec![DataType::Binary];
+        create_udaf(
+            UDDSKETCH_STATE_NAME,
+            input_types,
+            Arc::new(DataType::Binary),
+            Volatility::Immutable,
+            Arc::new(|args| {
+                let (bucket_size, error_rate) = downcast_accumulator_args(args)?;
+                Ok(Box::new(UddSketchState::new(bucket_size, error_rate)))
+            }),
+            Arc::new(state_types),
+        )
+    }
+
+    /// Adds one value to the sketch.
+    fn update(&mut self, value: f64) {
+        self.uddsketch.add_value(value);
+    }
+
+    /// Merges a bincode-serialized sketch into this one.
+    ///
+    /// Sketches with a zero count are ignored; input that fails to deserialize is skipped
+    /// with a trace-level message.
+    fn merge(&mut self, raw: &[u8]) {
+        match bincode::deserialize::<UDDSketch>(raw) {
+            // Nothing was recorded in the other sketch, so there is nothing to merge.
+            Ok(other) if other.count() == 0 => {}
+            Ok(other) => {
+                self.uddsketch.merge_sketch(&other);
+            }
+            Err(_) => {
+                trace!("Warning: Failed to deserialize UDDSketch from {:?}", raw);
+            }
+        }
+    }
+}
+
+/// Extracts the sketch parameters `(bucket_size, error_rate)` from the accumulator args.
+///
+/// Both parameters must be literal expressions: `args.exprs[0]` a non-null, non-negative
+/// `Int64` (bucket size) and `args.exprs[1]` a non-null `Float64` (error rate).
+///
+/// # Errors
+///
+/// Returns a "not implemented" error naming the offending expression when an argument
+/// is not a literal of the expected type, is null, or when the bucket size is negative.
+fn downcast_accumulator_args(args: AccumulatorArgs) -> DfResult<(u64, f64)> {
+    let bucket_size = match args.exprs[0]
+        .as_any()
+        .downcast_ref::<Literal>()
+        .map(|lit| lit.value())
+    {
+        // Negative literals are rejected: `as u64` would silently wrap them into an
+        // enormous bucket count instead of reporting the bad argument. Zero is still
+        // passed through unchanged.
+        Some(ScalarValue::Int64(Some(value))) if *value >= 0 => *value as u64,
+        _ => {
+            return not_impl_err!(
+                "{} not supported for bucket size: {}",
+                UDDSKETCH_STATE_NAME,
+                &args.exprs[0]
+            )
+        }
+    };
+
+    let error_rate = match args.exprs[1]
+        .as_any()
+        .downcast_ref::<Literal>()
+        .map(|lit| lit.value())
+    {
+        Some(ScalarValue::Float64(Some(value))) => *value,
+        _ => {
+            return not_impl_err!(
+                "{} not supported for error rate: {}",
+                UDDSKETCH_STATE_NAME,
+                &args.exprs[1]
+            )
+        }
+    };
+
+    Ok((bucket_size, error_rate))
+}
+
+impl DfAccumulator for UddSketchState {
+    /// Adds every non-null `f64` of the data column to the sketch.
+    fn update_batch(&mut self, values: &[ArrayRef]) -> DfResult<()> {
+        // the third column is data value
+        let data_column = &values[2];
+        as_primitive_array::<Float64Type>(data_column)?
+            .iter()
+            .flatten()
+            .for_each(|value| self.update(value));
+
+        Ok(())
+    }
+
+    /// Returns the bincode-serialized sketch as a `Binary` scalar.
+    fn evaluate(&mut self) -> DfResult<ScalarValue> {
+        let bytes = bincode::serialize(&self.uddsketch).map_err(|e| {
+            DataFusionError::Internal(format!("Failed to serialize UDDSketch: {}", e))
+        })?;
+
+        Ok(ScalarValue::Binary(Some(bytes)))
+    }
+
+    /// Estimates the memory footprint: a fixed part for the scalar fields plus a
+    /// per-bucket part multiplied by the current number of buckets.
+    fn size(&self) -> usize {
+        // Base size of UDDSketch struct fields:
+        // - three f64: alpha, gamma, values_sum
+        // - one u32: compactions
+        // - two u64: max_buckets, num_values
+        let fixed_bytes = 3 * std::mem::size_of::<f64>()
+            + std::mem::size_of::<u32>()
+            + 2 * std::mem::size_of::<u64>();
+
+        // Size of one bucket entry (SketchHashMap):
+        // - the key and the `next` link, both SketchHashKey
+        //   (enum with i64/Zero/Invalid variants)
+        // - the u64 count
+        let per_bucket_bytes =
+            2 * std::mem::size_of::<SketchHashKey>() + std::mem::size_of::<u64>();
+
+        fixed_bytes + self.uddsketch.current_buckets_count() * per_bucket_bytes
+    }
+
+    /// Returns the intermediate state: one `Binary` scalar holding the serialized sketch.
+    fn state(&mut self) -> DfResult<Vec<ScalarValue>> {
+        let bytes = bincode::serialize(&self.uddsketch).map_err(|e| {
+            DataFusionError::Internal(format!("Failed to serialize UDDSketch: {}", e))
+        })?;
+
+        Ok(vec![ScalarValue::Binary(Some(bytes))])
+    }
+
+    /// Merges every non-null serialized sketch found in the first state column.
+    fn merge_batch(&mut self, states: &[ArrayRef]) -> DfResult<()> {
+        as_binary_array(&states[0])?
+            .iter()
+            .flatten()
+            .for_each(|raw| self.merge(raw));
+
+        Ok(())
+    }
+}
+
+// Unit tests for `UddSketchState`: results are checked by deserializing the `Binary`
+// output of `evaluate` back into a `UDDSketch`.
+#[cfg(test)]
+mod tests {
+    use datafusion::arrow::array::{BinaryArray, Float64Array};
+
+    use super::*;
+
+    // Three values added one by one must be reflected in the sketch count.
+    #[test]
+    fn test_uddsketch_state_basic() {
+        let mut state = UddSketchState::new(10, 0.01);
+        state.update(1.0);
+        state.update(2.0);
+        state.update(3.0);
+
+        let result = state.evaluate().unwrap();
+        if let ScalarValue::Binary(Some(bytes)) = result {
+            let deserialized: UDDSketch = bincode::deserialize(&bytes).unwrap();
+            assert_eq!(deserialized.count(), 3);
+        } else {
+            panic!("Expected binary scalar value");
+        }
+    }
+
+    // Merging a serialized state into a fresh one must preserve count, sum, mean,
+    // max error and a few quantile estimates.
+    #[test]
+    fn test_uddsketch_state_roundtrip() {
+        let mut state = UddSketchState::new(10, 0.01);
+        state.update(1.0);
+        state.update(2.0);
+
+        // Serialize
+        let serialized = state.evaluate().unwrap();
+
+        // Create new state and merge the serialized data
+        let mut new_state = UddSketchState::new(10, 0.01);
+        if let ScalarValue::Binary(Some(bytes)) = &serialized {
+            new_state.merge(bytes);
+
+            // Verify the merged state matches original by comparing deserialized values
+            let original_sketch: UDDSketch = bincode::deserialize(bytes).unwrap();
+            let new_result = new_state.evaluate().unwrap();
+            if let ScalarValue::Binary(Some(new_bytes)) = new_result {
+                let new_sketch: UDDSketch = bincode::deserialize(&new_bytes).unwrap();
+                assert_eq!(original_sketch.count(), new_sketch.count());
+                assert_eq!(original_sketch.sum(), new_sketch.sum());
+                assert_eq!(original_sketch.mean(), new_sketch.mean());
+                assert_eq!(original_sketch.max_error(), new_sketch.max_error());
+                // Compare a few quantiles to ensure statistical equivalence
+                for q in [0.1, 0.5, 0.9].iter() {
+                    assert!(
+                        (original_sketch.estimate_quantile(*q) - new_sketch.estimate_quantile(*q))
+                            .abs()
+                            < 1e-10,
+                        "Quantile {} mismatch: original={}, new={}",
+                        q,
+                        original_sketch.estimate_quantile(*q),
+                        new_sketch.estimate_quantile(*q)
+                    );
+                }
+            } else {
+                panic!("Expected binary scalar value");
+            }
+        } else {
+            panic!("Expected binary scalar value");
+        }
+    }
+
+    // `update_batch` reads only the third column; the same array is passed three times
+    // just to satisfy the three-column layout.
+    #[test]
+    fn test_uddsketch_state_batch_update() {
+        let mut state = UddSketchState::new(10, 0.01);
+        let values = vec![1.0f64, 2.0, 3.0];
+        let array = Arc::new(Float64Array::from(values)) as ArrayRef;
+
+        state
+            .update_batch(&[array.clone(), array.clone(), array])
+            .unwrap();
+
+        let result = state.evaluate().unwrap();
+        if let ScalarValue::Binary(Some(bytes)) = result {
+            let deserialized: UDDSketch = bincode::deserialize(&bytes).unwrap();
+            assert_eq!(deserialized.count(), 3);
+        } else {
+            panic!("Expected binary scalar value");
+        }
+    }
+
+    // `merge_batch` over two single-value states must produce a sketch with count 2.
+    #[test]
+    fn test_uddsketch_state_merge_batch() {
+        let mut state1 = UddSketchState::new(10, 0.01);
+        state1.update(1.0);
+        let state1_binary = state1.evaluate().unwrap();
+
+        let mut state2 = UddSketchState::new(10, 0.01);
+        state2.update(2.0);
+        let state2_binary = state2.evaluate().unwrap();
+
+        let mut merged_state = UddSketchState::new(10, 0.01);
+        if let (ScalarValue::Binary(Some(bytes1)), ScalarValue::Binary(Some(bytes2))) =
+            (&state1_binary, &state2_binary)
+        {
+            let binary_array = Arc::new(BinaryArray::from(vec![
+                bytes1.as_slice(),
+                bytes2.as_slice(),
+            ])) as ArrayRef;
+            merged_state.merge_batch(&[binary_array]).unwrap();
+
+            let result = merged_state.evaluate().unwrap();
+            if let ScalarValue::Binary(Some(bytes)) = result {
+                let deserialized: UDDSketch = bincode::deserialize(&bytes).unwrap();
+                assert_eq!(deserialized.count(), 2);
+            } else {
+                panic!("Expected binary scalar value");
+            }
+        } else {
+            panic!("Expected binary scalar values");
+        }
+    }
+
+    // The size estimate reported by `size` must grow as values land in new buckets.
+    #[test]
+    fn test_uddsketch_state_size() {
+        let mut state = UddSketchState::new(10, 0.01);
+        let initial_size = state.size();
+
+        // Add some values to create buckets
+        state.update(1.0);
+        state.update(2.0);
+        state.update(3.0);
+
+        let size_with_values = state.size();
+        assert!(
+            size_with_values > initial_size,
+            "Size should increase after adding values: initial={}, with_values={}",
+            initial_size,
+            size_with_values
+        );
+
+        // Verify size increases with more buckets
+        state.update(10.0); // This should create a new bucket
+        assert!(
+            state.size() > size_with_values,
+            "Size should increase after adding new bucket: prev={}, new={}",
+            size_with_values,
+            state.size()
+        );
+    }
+}
--- a/src/common/function/src/function_registry.rs
+++ b/src/common/function/src/function_registry.rs
@@ -22,10 +22,12 @@ use crate::function::{AsyncFunctionRef, FunctionRef};
 use crate::scalars::aggregate::{AggregateFunctionMetaRef, AggregateFunctions};
 use crate::scalars::date::DateFunction;
 use crate::scalars::expression::ExpressionFunction;
+use crate::scalars::hll_count::HllCalcFunction;
 use crate::scalars::json::JsonFunction;
 use crate::scalars::matches::MatchesFunction;
 use crate::scalars::math::MathFunction;
 use crate::scalars::timestamp::TimestampFunction;
+use crate::scalars::uddsketch_calc::UddSketchCalcFunction;
 use crate::scalars::vector::VectorFunction;
 use crate::system::SystemFunction;
 use crate::table::TableFunction;
@@ -105,6 +107,8 @@ pub static FUNCTION_REGISTRY: Lazy<Arc<FunctionRegistry>> = Lazy::new(|| {
    TimestampFunction::register(&function_registry);
    DateFunction::register(&function_registry);
    ExpressionFunction::register(&function_registry);
+    UddSketchCalcFunction::register(&function_registry);
+    HllCalcFunction::register(&function_registry);

    // Aggregate functions
    AggregateFunctions::register(&function_registry);
--- a/src/common/function/src/lib.rs
+++ b/src/common/function/src/lib.rs
@@ -21,6 +21,7 @@ pub mod scalars;
 mod system;
 mod table;

+pub mod aggr;
 pub mod function;
 pub mod function_registry;
 pub mod handlers;
--- a/src/common/function/src/scalars.rs
+++ b/src/common/function/src/scalars.rs
@@ -22,7 +22,9 @@ pub mod matches;
 pub mod math;
 pub mod vector;

+pub(crate) mod hll_count;
 #[cfg(test)]
 pub(crate) mod test;
 pub(crate) mod timestamp;
+pub(crate) mod uddsketch_calc;
 pub mod udf;
--- a/src/common/function/src/scalars/hll_count.rs
+++ b/src/common/function/src/scalars/hll_count.rs
@@ -0,0 +1,175 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Implementation of the scalar function `hll_count`.
+
+use std::fmt;
+use std::fmt::Display;
+use std::sync::Arc;
+
+use common_query::error::{DowncastVectorSnafu, InvalidFuncArgsSnafu, Result};
+use common_query::prelude::{Signature, Volatility};
+use datatypes::data_type::ConcreteDataType;
+use datatypes::prelude::Vector;
+use datatypes::scalars::{ScalarVector, ScalarVectorBuilder};
+use datatypes::vectors::{BinaryVector, MutableVector, UInt64VectorBuilder, VectorRef};
+use hyperloglogplus::HyperLogLog;
+use snafu::OptionExt;
+
+use crate::aggr::HllStateType;
+use crate::function::{Function, FunctionContext};
+use crate::function_registry::FunctionRegistry;
+
+const NAME: &str = "hll_count";
+
+/// HllCalcFunction implements the scalar function `hll_count`.
+///
+/// It accepts one argument:
+/// 1. The serialized HyperLogLogPlus state, as produced by the aggregator (binary).
+///
+/// For each row, it deserializes the sketch and returns the estimated cardinality.
+#[derive(Debug, Default)]
+pub struct HllCalcFunction;
+
+impl HllCalcFunction {
+    pub fn register(registry: &FunctionRegistry) {
+        registry.register(Arc::new(HllCalcFunction));
+    }
+}
+
+impl Display for HllCalcFunction {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "{}", NAME.to_ascii_uppercase())
+    }
+}
+
+impl Function for HllCalcFunction {
+    fn name(&self) -> &str {
+        NAME
+    }
+
+    /// The estimated cardinality is always reported as a `UInt64`.
+    fn return_type(&self, _input_types: &[ConcreteDataType]) -> Result<ConcreteDataType> {
+        Ok(ConcreteDataType::uint64_datatype())
+    }
+
+    /// Signature: `(hll_state: binary)`.
+    fn signature(&self) -> Signature {
+        let arg_types = vec![ConcreteDataType::binary_datatype()];
+        Signature::exact(arg_types, Volatility::Immutable)
+    }
+
+    /// Evaluates row by row. A row yields NULL when the state is NULL or cannot
+    /// be deserialized; a wrong argument count or a non-binary column is an error.
+    fn eval(&self, _func_ctx: FunctionContext, columns: &[VectorRef]) -> Result<VectorRef> {
+        let [state_col] = columns else {
+            return InvalidFuncArgsSnafu {
+                err_msg: format!("hll_count expects 1 argument, got {}", columns.len()),
+            }
+            .fail();
+        };
+
+        let states = state_col
+            .as_any()
+            .downcast_ref::<BinaryVector>()
+            .with_context(|| DowncastVectorSnafu {
+                err_msg: format!("expect BinaryVector, got {}", state_col.vector_type_name()),
+            })?;
+        let rows = states.len();
+        let mut counts = UInt64VectorBuilder::with_capacity(rows);
+
+        for row in 0..rows {
+            // NULL state in, NULL count out.
+            let Some(raw) = states.get_data(row) else {
+                counts.push_null();
+                continue;
+            };
+
+            // The state is the bincode encoding of the HyperLogLogPlus sketch;
+            // undecodable bytes are traced and reported as NULL for this row.
+            let decoded: std::result::Result<HllStateType, _> = bincode::deserialize(raw);
+            match decoded {
+                Ok(mut sketch) => {
+                    let estimate = sketch.count().round() as u64;
+                    counts.push(Some(estimate));
+                }
+                Err(e) => {
+                    common_telemetry::trace!("Failed to deserialize HyperLogLogPlus: {}", e);
+                    counts.push_null();
+                }
+            }
+        }
+
+        Ok(counts.to_vector())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use datatypes::vectors::BinaryVector;
+
+    use super::*;
+    use crate::utils::FixedRandomState;
+
+    #[test]
+    fn test_hll_count_function() {
+        let function = HllCalcFunction;
+        assert_eq!("hll_count", function.name());
+        assert_eq!(
+            ConcreteDataType::uint64_datatype(),
+            function
+                .return_type(&[ConcreteDataType::uint64_datatype()])
+                .unwrap()
+        );
+
+        // Create a test HLL
+        let mut hll = HllStateType::new(14, FixedRandomState::new()).unwrap();
+        for i in 1..=10 {
+            hll.insert(&i.to_string());
+        }
+
+        let serialized_bytes = bincode::serialize(&hll).unwrap();
+        let args: Vec<VectorRef> = vec![Arc::new(BinaryVector::from(vec![Some(serialized_bytes)]))];
+
+        let result = function.eval(FunctionContext::default(), &args).unwrap();
+        assert_eq!(result.len(), 1);
+
+        // Test cardinality estimate
+        if let datatypes::value::Value::UInt64(v) = result.get(0) {
+            assert_eq!(v, 10);
+        } else {
+            panic!("Expected uint64 value");
+        }
+    }
+
+    #[test]
+    fn test_hll_count_function_errors() {
+        let function = HllCalcFunction;
+
+        // Test with invalid number of arguments
+        let args: Vec<VectorRef> = vec![];
+        let result = function.eval(FunctionContext::default(), &args);
+        assert!(result.is_err());
+        assert!(result
+            .unwrap_err()
+            .to_string()
+            .contains("hll_count expects 1 argument"));
+
+        // Test with invalid binary data
+        let args: Vec<VectorRef> = vec![Arc::new(BinaryVector::from(vec![Some(vec![1, 2, 3])]))]; // Invalid binary data
+        let result = function.eval(FunctionContext::default(), &args).unwrap();
+        assert_eq!(result.len(), 1);
+        assert!(matches!(result.get(0), datatypes::value::Value::Null));
+    }
+}
--- a/src/common/function/src/scalars/uddsketch_calc.rs
+++ b/src/common/function/src/scalars/uddsketch_calc.rs
@@ -0,0 +1,211 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Implementation of the scalar function `uddsketch_calc`.
+
+use std::fmt;
+use std::fmt::Display;
+use std::sync::Arc;
+
+use common_query::error::{DowncastVectorSnafu, InvalidFuncArgsSnafu, Result};
+use common_query::prelude::{Signature, Volatility};
+use datatypes::data_type::ConcreteDataType;
+use datatypes::prelude::Vector;
+use datatypes::scalars::{ScalarVector, ScalarVectorBuilder};
+use datatypes::vectors::{BinaryVector, Float64VectorBuilder, MutableVector, VectorRef};
+use snafu::OptionExt;
+use uddsketch::UDDSketch;
+
+use crate::function::{Function, FunctionContext};
+use crate::function_registry::FunctionRegistry;
+
+const NAME: &str = "uddsketch_calc";
+
+/// UddSketchCalcFunction implements the scalar function `uddsketch_calc`.
+///
+/// It accepts two arguments:
+/// 1. A percentile (as f64) for which to compute the estimated quantile (e.g. 0.95 for p95).
+/// 2. The serialized UDDSketch state, as produced by the aggregator (binary).
+///
+/// For each row, it deserializes the sketch and returns the computed quantile value.
+#[derive(Debug, Default)]
+pub struct UddSketchCalcFunction;
+
+impl UddSketchCalcFunction {
+    pub fn register(registry: &FunctionRegistry) {
+        registry.register(Arc::new(UddSketchCalcFunction));
+    }
+}
+
+impl Display for UddSketchCalcFunction {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "{}", NAME.to_ascii_uppercase())
+    }
+}
+
+impl Function for UddSketchCalcFunction {
+    fn name(&self) -> &str {
+        NAME
+    }
+
+    /// The estimated quantile is always reported as a `Float64`.
+    fn return_type(&self, _input_types: &[ConcreteDataType]) -> Result<ConcreteDataType> {
+        Ok(ConcreteDataType::float64_datatype())
+    }
+
+    /// Signature: `(percentile: float64, sketch_state: binary)`.
+    fn signature(&self) -> Signature {
+        Signature::exact(
+            vec![
+                ConcreteDataType::float64_datatype(),
+                ConcreteDataType::binary_datatype(),
+            ],
+            Volatility::Immutable,
+        )
+    }
+
+    /// Evaluates row by row. A row yields NULL when either input is NULL, the
+    /// percentile is NaN or outside `[0, 1]`, or the state cannot be deserialized;
+    /// a wrong argument count or a non-binary second column is an error.
+    fn eval(&self, _func_ctx: FunctionContext, columns: &[VectorRef]) -> Result<VectorRef> {
+        if columns.len() != 2 {
+            return InvalidFuncArgsSnafu {
+                err_msg: format!("uddsketch_calc expects 2 arguments, got {}", columns.len()),
+            }
+            .fail();
+        }
+
+        let perc_vec = &columns[0];
+        let sketch_vec = columns[1]
+            .as_any()
+            .downcast_ref::<BinaryVector>()
+            .with_context(|| DowncastVectorSnafu {
+                err_msg: format!("expect BinaryVector, got {}", columns[1].vector_type_name()),
+            })?;
+        let len = sketch_vec.len();
+        let mut builder = Float64VectorBuilder::with_capacity(len);
+
+        for i in 0..len {
+            let perc_opt = perc_vec.get(i).as_f64_lossy();
+            let (Some(perc), Some(sketch_bytes)) = (perc_opt, sketch_vec.get_data(i)) else {
+                builder.push_null();
+                continue;
+            };
+            // A quantile is only defined on [0, 1]; `contains` is also false for NaN.
+            if !(0.0..=1.0).contains(&perc) {
+                builder.push_null();
+                continue;
+            }
+
+            // Deserialize the UDDSketch from its bincode representation
+            let sketch: UDDSketch = match bincode::deserialize(sketch_bytes) {
+                Ok(s) => s,
+                Err(e) => {
+                    common_telemetry::trace!("Failed to deserialize UDDSketch: {}", e);
+                    builder.push_null();
+                    continue;
+                }
+            };
+            builder.push(Some(sketch.estimate_quantile(perc)));
+        }
+
+        Ok(builder.to_vector())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::sync::Arc;
+
+    use datatypes::vectors::{BinaryVector, Float64Vector};
+
+    use super::*;
+
+    #[test]
+    fn test_uddsketch_calc_function() {
+        let function = UddSketchCalcFunction;
+        assert_eq!("uddsketch_calc", function.name());
+        assert_eq!(
+            ConcreteDataType::float64_datatype(),
+            function
+                .return_type(&[ConcreteDataType::float64_datatype()])
+                .unwrap()
+        );
+
+        // Create a test sketch
+        let mut sketch = UDDSketch::new(128, 0.01);
+        sketch.add_value(10.0);
+        sketch.add_value(20.0);
+        sketch.add_value(30.0);
+        sketch.add_value(40.0);
+        sketch.add_value(50.0);
+        sketch.add_value(60.0);
+        sketch.add_value(70.0);
+        sketch.add_value(80.0);
+        sketch.add_value(90.0);
+        sketch.add_value(100.0);
+
+        // Get expected values directly from the sketch
+        let expected_p50 = sketch.estimate_quantile(0.5);
+        let expected_p90 = sketch.estimate_quantile(0.9);
+        let expected_p95 = sketch.estimate_quantile(0.95);
+
+        let serialized = bincode::serialize(&sketch).unwrap();
+        let percentiles = vec![0.5, 0.9, 0.95];
+
+        let args: Vec<VectorRef> = vec![
+            Arc::new(Float64Vector::from_vec(percentiles.clone())),
+            Arc::new(BinaryVector::from(vec![Some(serialized.clone()); 3])),
+        ];
+
+        let result = function.eval(FunctionContext::default(), &args).unwrap();
+        assert_eq!(result.len(), 3);
+
+        // Test median (p50)
+        assert!(
+            matches!(result.get(0), datatypes::value::Value::Float64(v) if (v - expected_p50).abs() < 1e-10)
+        );
+        // Test p90
+        assert!(
+            matches!(result.get(1), datatypes::value::Value::Float64(v) if (v - expected_p90).abs() < 1e-10)
+        );
+        // Test p95
+        assert!(
+            matches!(result.get(2), datatypes::value::Value::Float64(v) if (v - expected_p95).abs() < 1e-10)
+        );
+    }
+
+    #[test]
+    fn test_uddsketch_calc_function_errors() {
+        let function = UddSketchCalcFunction;
+
+        // Test with invalid number of arguments
+        let args: Vec<VectorRef> = vec![Arc::new(Float64Vector::from_vec(vec![0.95]))];
+        let result = function.eval(FunctionContext::default(), &args);
+        assert!(result.is_err());
+        assert!(result
+            .unwrap_err()
+            .to_string()
+            .contains("uddsketch_calc expects 2 arguments"));
+
+        // Test with invalid binary data
+        let args: Vec<VectorRef> = vec![
+            Arc::new(Float64Vector::from_vec(vec![0.95])),
+            Arc::new(BinaryVector::from(vec![Some(vec![1, 2, 3])])), // Invalid binary data
+        ];
+        let result = function.eval(FunctionContext::default(), &args).unwrap();
+        assert_eq!(result.len(), 1);
+        assert!(matches!(result.get(0), datatypes::value::Value::Null));
+    }
+}
--- a/src/common/function/src/utils.rs
+++ b/src/common/function/src/utils.rs
@@ -12,6 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+use std::hash::BuildHasher;
+
+use ahash::RandomState;
+use serde::{Deserialize, Serialize};
+
 /// Escapes special characters in the provided pattern string for `LIKE`.
 ///
 /// Specifically, it prefixes the backslash (`\`), percent (`%`), and underscore (`_`)
@@ -32,6 +37,71 @@ pub fn escape_like_pattern(pattern: &str) -> String {
        })
        .collect::<String>()
 }
+
+/// A random state with fixed seeds.
+///
+/// This is used to ensure that the hash values are consistent across
+/// different processes, and easy to serialize and deserialize.
+#[derive(Debug)]
+pub struct FixedRandomState {
+    state: RandomState,
+}
+
+impl FixedRandomState {
+    // some random seeds
+    const RANDOM_SEED_0: u64 = 0x517cc1b727220a95;
+    const RANDOM_SEED_1: u64 = 0x428a2f98d728ae22;
+    const RANDOM_SEED_2: u64 = 0x7137449123ef65cd;
+    const RANDOM_SEED_3: u64 = 0xb5c0fbcfec4d3b2f;
+
+    pub fn new() -> Self {
+        Self {
+            state: ahash::RandomState::with_seeds(
+                Self::RANDOM_SEED_0,
+                Self::RANDOM_SEED_1,
+                Self::RANDOM_SEED_2,
+                Self::RANDOM_SEED_3,
+            ),
+        }
+    }
+}
+
+impl Default for FixedRandomState {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl BuildHasher for FixedRandomState {
+    type Hasher = ahash::AHasher;
+
+    fn build_hasher(&self) -> Self::Hasher {
+        self.state.build_hasher()
+    }
+
+    fn hash_one<T: std::hash::Hash>(&self, x: T) -> u64 {
+        self.state.hash_one(x)
+    }
+}
+
+impl Serialize for FixedRandomState {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        serializer.serialize_unit()
+    }
+}
+
+impl<'de> Deserialize<'de> for FixedRandomState {
+    /// Consumes the unit written by `Serialize`, then rebuilds the fixed-seed state.
+    /// Skipping the read would leave that token unconsumed in the input, which
+    /// breaks formats that encode the unit explicitly (e.g. JSON `null`).
+    fn deserialize<D: serde::Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
+        <()>::deserialize(deserializer).map(|()| Self::new())
+    }
+}
+
 #[cfg(test)]
 mod tests {
    use super::*;
--- a/src/common/grpc-expr/Cargo.toml
+++ b/src/common/grpc-expr/Cargo.toml
@@ -22,4 +22,4 @@ store-api.workspace = true
 table.workspace = true

 [dev-dependencies]
-paste = "1.0"
+paste.workspace = true
--- a/src/common/grpc/src/channel_manager.rs
+++ b/src/common/grpc/src/channel_manager.rs
@@ -445,10 +445,16 @@ impl Pool {

 async fn recycle_channel_in_loop(pool: Arc<Pool>, interval_secs: u64) {
     let mut interval = tokio::time::interval(Duration::from_secs(interval_secs));
-
+    // Keep only a weak ref: holding the `Arc` argument would pin the pool forever,
+    // because this task would itself be an owner and `upgrade()` could never fail.
+    let pool_weak = Arc::downgrade(&pool);
+    drop(pool);
     loop {
         let _ = interval.tick().await;
-        pool.retain_channel(|_, c| c.access.swap(0, Ordering::Relaxed) != 0)
+        // Every owner is gone once the upgrade fails, so stop recycling.
+        let Some(pool) = pool_weak.upgrade() else {
+            break;
+        };
+        pool.retain_channel(|_, c| c.access.swap(0, Ordering::Relaxed) != 0);
     }
 }

--- a/src/common/meta/src/cache/flow/table_flownode.rs
+++ b/src/common/meta/src/cache/flow/table_flownode.rs
@@ -16,7 +16,6 @@ use std::collections::HashMap;
 use std::sync::Arc;

 use futures::future::BoxFuture;
-use futures::TryStreamExt;
 use moka::future::Cache;
 use moka::ops::compute::Op;
 use table::metadata::TableId;
@@ -54,9 +53,13 @@ fn init_factory(table_flow_manager: TableFlowManagerRef) -> Initializer<TableId,
        Box::pin(async move {
            table_flow_manager
                .flows(table_id)
-                .map_ok(|(key, value)| (key.flownode_id(), value.peer))
-                .try_collect::<HashMap<_, _>>()
                .await
+                .map(|flows| {
+                    flows
+                        .into_iter()
+                        .map(|(key, value)| (key.flownode_id(), value.peer))
+                        .collect::<HashMap<_, _>>()
+                })
                // We must cache the `HashSet` even if it's empty,
                // to avoid future requests to the remote storage next time;
                // If the value is added to the remote storage,
--- a/src/common/meta/src/cluster.rs
+++ b/src/common/meta/src/cluster.rs
@@ -12,8 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+use std::hash::{DefaultHasher, Hash, Hasher};
 use std::str::FromStr;

+use api::v1::meta::HeartbeatRequest;
 use common_error::ext::ErrorExt;
 use lazy_static::lazy_static;
 use regex::Regex;
@@ -55,12 +57,10 @@ pub trait ClusterInfo {
 }

 /// The key of [NodeInfo] in the storage. The format is `__meta_cluster_node_info-{cluster_id}-{role}-{node_id}`.
-///
-/// This key cannot be used to describe the `Metasrv` because the `Metasrv` does not have
-/// a `cluster_id`, it serves multiple clusters.
-#[derive(Debug, Clone, Eq, Hash, PartialEq, Serialize, Deserialize)]
+#[derive(Debug, Clone, Copy, Eq, Hash, PartialEq, Serialize, Deserialize)]
 pub struct NodeInfoKey {
    /// The cluster id.
+    // todo(hl): remove cluster_id as it is not assigned anywhere.
    pub cluster_id: ClusterId,
    /// The role of the node. It can be `[Role::Datanode]` or `[Role::Frontend]`.
    pub role: Role,
@@ -69,6 +69,28 @@ pub struct NodeInfoKey {
 }

 impl NodeInfoKey {
+    /// Try to create a `NodeInfoKey` from a "good" heartbeat request. "good" as in every needed
+    /// piece of information is provided and valid.  
+    pub fn new(request: &HeartbeatRequest) -> Option<Self> {
+        let HeartbeatRequest { header, peer, .. } = request;
+        let header = header.as_ref()?;
+        let peer = peer.as_ref()?;
+
+        let role = header.role.try_into().ok()?;
+        let node_id = match role {
+            // Because the Frontend is stateless, it's too easy to neglect choosing a unique id
+            // for it when setting up a cluster. So we calculate its id from its address.
+            Role::Frontend => calculate_node_id(&peer.addr),
+            _ => peer.id,
+        };
+
+        Some(NodeInfoKey {
+            cluster_id: header.cluster_id,
+            role,
+            node_id,
+        })
+    }
+
    pub fn key_prefix_with_cluster_id(cluster_id: u64) -> String {
        format!("{}-{}-", CLUSTER_NODE_INFO_PREFIX, cluster_id)
    }
@@ -83,6 +105,13 @@ impl NodeInfoKey {
    }
 }

+/// Derives a node id by feeding `addr` through the std `DefaultHasher`.
+fn calculate_node_id(addr: &str) -> u64 {
+    let mut state = DefaultHasher::new();
+    Hash::hash(addr, &mut state);
+    Hasher::finish(&state)
+}
+
 /// The information of a node in the cluster.
 #[derive(Debug, Serialize, Deserialize)]
 pub struct NodeInfo {
@@ -100,7 +129,7 @@ pub struct NodeInfo {
    pub start_time_ms: u64,
 }

-#[derive(Debug, Clone, Eq, Hash, PartialEq, Serialize, Deserialize)]
+#[derive(Debug, Clone, Copy, Eq, Hash, PartialEq, Serialize, Deserialize)]
 pub enum Role {
    Datanode,
    Frontend,
@@ -201,8 +230,8 @@ impl TryFrom<Vec<u8>> for NodeInfoKey {
    }
 }

-impl From<NodeInfoKey> for Vec<u8> {
-    fn from(key: NodeInfoKey) -> Self {
+impl From<&NodeInfoKey> for Vec<u8> {
+    fn from(key: &NodeInfoKey) -> Self {
        format!(
            "{}-{}-{}-{}",
            CLUSTER_NODE_INFO_PREFIX,
@@ -271,6 +300,7 @@ impl TryFrom<i32> for Role {
 mod tests {
    use std::assert_matches::assert_matches;

+    use super::*;
    use crate::cluster::Role::{Datanode, Frontend};
    use crate::cluster::{DatanodeStatus, NodeInfo, NodeInfoKey, NodeStatus};
    use crate::peer::Peer;
@@ -283,7 +313,7 @@ mod tests {
            node_id: 2,
        };

-        let key_bytes: Vec<u8> = key.into();
+        let key_bytes: Vec<u8> = (&key).into();
        let new_key: NodeInfoKey = key_bytes.try_into().unwrap();

        assert_eq!(1, new_key.cluster_id);
@@ -338,4 +368,26 @@ mod tests {
        let prefix = NodeInfoKey::key_prefix_with_role(2, Frontend);
        assert_eq!(prefix, "__meta_cluster_node_info-2-1-");
    }
+
+    #[test]
+    fn test_calculate_node_id_from_addr() {
+        // Test empty string
+        assert_eq!(calculate_node_id(""), calculate_node_id(""));
+
+        // Test same addresses return same ids
+        let addr1 = "127.0.0.1:8080";
+        let id1 = calculate_node_id(addr1);
+        let id2 = calculate_node_id(addr1);
+        assert_eq!(id1, id2);
+
+        // Test different addresses return different ids
+        let addr2 = "127.0.0.1:8081";
+        let id3 = calculate_node_id(addr2);
+        assert_ne!(id1, id3);
+
+        // Test long address
+        let long_addr = "very.long.domain.name.example.com:9999";
+        let id4 = calculate_node_id(long_addr);
+        assert!(id4 > 0);
+    }
 }
--- a/src/common/meta/src/ddl/create_flow.rs
+++ b/src/common/meta/src/ddl/create_flow.rs
@@ -15,6 +15,7 @@
 mod metadata;

 use std::collections::BTreeMap;
+use std::fmt;

 use api::v1::flow::flow_request::Body as PbFlowRequest;
 use api::v1::flow::{CreateRequest, FlowRequest, FlowRequestHeader};
@@ -28,7 +29,6 @@ use common_procedure::{
 use common_telemetry::info;
 use common_telemetry::tracing_context::TracingContext;
 use futures::future::join_all;
-use futures::TryStreamExt;
 use itertools::Itertools;
 use serde::{Deserialize, Serialize};
 use snafu::{ensure, ResultExt};
@@ -77,6 +77,7 @@ impl CreateFlowProcedure {
                query_context,
                state: CreateFlowState::Prepare,
                prev_flow_info_value: None,
+                flow_type: None,
            },
        }
    }
@@ -104,7 +105,7 @@ impl CreateFlowProcedure {
        if create_if_not_exists && or_replace {
            // this is forbidden because not clear what does that mean exactly
            return error::UnsupportedSnafu {
-                operation: "Create flow with both `IF NOT EXISTS` and `OR REPLACE`".to_string(),
+                operation: "Create flow with both `IF NOT EXISTS` and `OR REPLACE`",
            }
            .fail();
        }
@@ -129,9 +130,10 @@ impl CreateFlowProcedure {
                .flow_metadata_manager
                .flow_route_manager()
                .routes(flow_id)
-                .map_ok(|(_, value)| value.peer)
-                .try_collect::<Vec<_>>()
-                .await?;
+                .await?
+                .into_iter()
+                .map(|(_, value)| value.peer)
+                .collect::<Vec<_>>();
            self.data.flow_id = Some(flow_id);
            self.data.peers = peers;
            info!("Replacing flow, flow_id: {}", flow_id);
@@ -175,6 +177,8 @@ impl CreateFlowProcedure {
            self.allocate_flow_id().await?;
        }
        self.data.state = CreateFlowState::CreateFlows;
+        // determine flow type
+        self.data.flow_type = Some(determine_flow_type(&self.data.task));

        Ok(Status::executing(true))
    }
@@ -309,6 +313,11 @@ impl Procedure for CreateFlowProcedure {
    }
 }

+pub fn determine_flow_type(_flow_task: &CreateFlowTask) -> FlowType {
+    // TODO(discord9): determine flow type
+    FlowType::RecordingRule
+}
+
 /// The state of [CreateFlowProcedure].
 #[derive(Debug, Clone, Serialize, Deserialize, AsRefStr, PartialEq)]
 pub enum CreateFlowState {
@@ -322,6 +331,36 @@ pub enum CreateFlowState {
    CreateMetadata,
 }

+/// The type of flow.
+#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
+pub enum FlowType {
+    /// The flow is a recording rule task.
+    RecordingRule,
+    /// The flow is a streaming task.
+    Streaming,
+}
+
+impl FlowType {
+    pub const RECORDING_RULE: &str = "recording_rule";
+    pub const STREAMING: &str = "streaming";
+    pub const FLOW_TYPE_KEY: &str = "flow_type";
+}
+
+impl Default for FlowType {
+    fn default() -> Self {
+        Self::RecordingRule
+    }
+}
+
+impl fmt::Display for FlowType {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match self {
+            FlowType::RecordingRule => write!(f, "{}", FlowType::RECORDING_RULE),
+            FlowType::Streaming => write!(f, "{}", FlowType::STREAMING),
+        }
+    }
+}
+
 /// The serializable data.
 #[derive(Debug, Serialize, Deserialize)]
 pub struct CreateFlowData {
@@ -335,6 +374,7 @@ pub struct CreateFlowData {
    /// For verify if prev value is consistent when need to update flow metadata.
    /// only set when `or_replace` is true.
    pub(crate) prev_flow_info_value: Option<DeserializedValueWithBytes<FlowInfoValue>>,
+    pub(crate) flow_type: Option<FlowType>,
 }

 impl From<&CreateFlowData> for CreateRequest {
@@ -342,7 +382,7 @@ impl From<&CreateFlowData> for CreateRequest {
        let flow_id = value.flow_id.unwrap();
        let source_table_ids = &value.source_table_ids;

-        CreateRequest {
+        let mut req = CreateRequest {
            flow_id: Some(api::v1::FlowId { id: flow_id }),
            source_table_ids: source_table_ids
                .iter()
@@ -356,7 +396,12 @@ impl From<&CreateFlowData> for CreateRequest {
            comment: value.task.comment.clone(),
            sql: value.task.sql.clone(),
            flow_options: value.task.flow_options.clone(),
-        }
+        };
+
+        let flow_type = value.flow_type.unwrap_or_default().to_string();
+        req.flow_options
+            .insert(FlowType::FLOW_TYPE_KEY.to_string(), flow_type);
+        req
    }
 }

@@ -369,7 +414,7 @@ impl From<&CreateFlowData> for (FlowInfoValue, Vec<(FlowPartitionId, FlowRouteVa
            expire_after,
            comment,
            sql,
-            flow_options: options,
+            flow_options: mut options,
            ..
        } = value.task.clone();

@@ -386,19 +431,21 @@ impl From<&CreateFlowData> for (FlowInfoValue, Vec<(FlowPartitionId, FlowRouteVa
            .map(|(idx, peer)| (idx as u32, FlowRouteValue { peer: peer.clone() }))
            .collect::<Vec<_>>();

-        (
-            FlowInfoValue {
-                source_table_ids: value.source_table_ids.clone(),
-                sink_table_name,
-                flownode_ids,
-                catalog_name,
-                flow_name,
-                raw_sql: sql,
-                expire_after,
-                comment,
-                options,
-            },
-            flow_routes,
-        )
+        let flow_type = value.flow_type.unwrap_or_default().to_string();
+        options.insert(FlowType::FLOW_TYPE_KEY.to_string(), flow_type);
+
+        let flow_info = FlowInfoValue {
+            source_table_ids: value.source_table_ids.clone(),
+            sink_table_name,
+            flownode_ids,
+            catalog_name,
+            flow_name,
+            raw_sql: sql,
+            expire_after,
+            comment,
+            options,
+        };
+
+        (flow_info, flow_routes)
    }
 }
--- a/src/common/meta/src/ddl/drop_database/executor.rs
+++ b/src/common/meta/src/ddl/drop_database/executor.rs
@@ -128,7 +128,7 @@ impl State for DropDatabaseExecutor {
            .await?;
        executor.invalidate_table_cache(ddl_ctx).await?;
        executor
-            .on_drop_regions(ddl_ctx, &self.physical_region_routes)
+            .on_drop_regions(ddl_ctx, &self.physical_region_routes, true)
            .await?;
        info!("Table: {}({}) is dropped", self.table_name, self.table_id);

--- a/src/common/meta/src/ddl/drop_flow/metadata.rs
+++ b/src/common/meta/src/ddl/drop_flow/metadata.rs
@@ -13,7 +13,6 @@
 // limitations under the License.

 use common_catalog::format_full_flow_name;
-use futures::TryStreamExt;
 use snafu::{ensure, OptionExt};

 use crate::ddl::drop_flow::DropFlowProcedure;
@@ -39,9 +38,10 @@ impl DropFlowProcedure {
            .flow_metadata_manager
            .flow_route_manager()
            .routes(self.data.task.flow_id)
-            .map_ok(|(_, value)| value)
-            .try_collect::<Vec<_>>()
-            .await?;
+            .await?
+            .into_iter()
+            .map(|(_, value)| value)
+            .collect::<Vec<_>>();
        ensure!(
            !flow_route_values.is_empty(),
            error::FlowRouteNotFoundSnafu {
--- a/src/common/meta/src/ddl/drop_table.rs
+++ b/src/common/meta/src/ddl/drop_table.rs
@@ -156,7 +156,7 @@ impl DropTableProcedure {

    pub async fn on_datanode_drop_regions(&mut self) -> Result<Status> {
        self.executor
-            .on_drop_regions(&self.context, &self.data.physical_region_routes)
+            .on_drop_regions(&self.context, &self.data.physical_region_routes, false)
            .await?;
        self.data.state = DropTableState::DeleteTombstone;
        Ok(Status::executing(true))
--- a/src/common/meta/src/ddl/drop_table/executor.rs
+++ b/src/common/meta/src/ddl/drop_table/executor.rs
@@ -214,6 +214,7 @@ impl DropTableExecutor {
        &self,
        ctx: &DdlContext,
        region_routes: &[RegionRoute],
+        fast_path: bool,
    ) -> Result<()> {
        let leaders = find_leaders(region_routes);
        let mut drop_region_tasks = Vec::with_capacity(leaders.len());
@@ -236,6 +237,7 @@ impl DropTableExecutor {
                    }),
                    body: Some(region_request::Body::Drop(PbDropRegionRequest {
                        region_id: region_id.as_u64(),
+                        fast_path,
                    })),
                };
                let datanode = datanode.clone();
--- a/src/common/meta/src/key/flow.rs
+++ b/src/common/meta/src/key/flow.rs
@@ -16,9 +16,9 @@ pub mod flow_info;
 pub(crate) mod flow_name;
 pub(crate) mod flow_route;
 pub mod flow_state;
+mod flownode_addr_helper;
 pub(crate) mod flownode_flow;
 pub(crate) mod table_flow;
-
 use std::ops::Deref;
 use std::sync::Arc;

@@ -506,7 +506,6 @@ mod tests {
        let routes = flow_metadata_manager
            .flow_route_manager()
            .routes(flow_id)
-            .try_collect::<Vec<_>>()
            .await
            .unwrap();
        assert_eq!(
@@ -538,7 +537,6 @@ mod tests {
            let nodes = flow_metadata_manager
                .table_flow_manager()
                .flows(table_id)
-                .try_collect::<Vec<_>>()
                .await
                .unwrap();
            assert_eq!(
@@ -727,7 +725,6 @@ mod tests {
        let routes = flow_metadata_manager
            .flow_route_manager()
            .routes(flow_id)
-            .try_collect::<Vec<_>>()
            .await
            .unwrap();
        assert_eq!(
@@ -759,7 +756,6 @@ mod tests {
            let nodes = flow_metadata_manager
                .table_flow_manager()
                .flows(table_id)
-                .try_collect::<Vec<_>>()
                .await
                .unwrap();
            assert_eq!(
--- a/src/common/meta/src/key/flow/flow_route.rs
+++ b/src/common/meta/src/key/flow/flow_route.rs
@@ -12,14 +12,15 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-use futures::stream::BoxStream;
+use futures::TryStreamExt;
 use lazy_static::lazy_static;
 use regex::Regex;
 use serde::{Deserialize, Serialize};
 use snafu::OptionExt;

 use crate::error::{self, Result};
-use crate::key::flow::FlowScoped;
+use crate::key::flow::{flownode_addr_helper, FlowScoped};
+use crate::key::node_address::NodeAddressKey;
 use crate::key::{BytesAdapter, FlowId, FlowPartitionId, MetadataKey, MetadataValue};
 use crate::kv_backend::txn::{Txn, TxnOp};
 use crate::kv_backend::KvBackendRef;
@@ -167,10 +168,7 @@ impl FlowRouteManager {
    }

    /// Retrieves all [FlowRouteValue]s of the specified `flow_id`.
-    pub fn routes(
-        &self,
-        flow_id: FlowId,
-    ) -> BoxStream<'static, Result<(FlowRouteKey, FlowRouteValue)>> {
+    pub async fn routes(&self, flow_id: FlowId) -> Result<Vec<(FlowRouteKey, FlowRouteValue)>> {
        let start_key = FlowRouteKey::range_start_key(flow_id);
        let req = RangeRequest::new().with_prefix(start_key);
        let stream = PaginationStream::new(
@@ -181,7 +179,9 @@ impl FlowRouteManager {
        )
        .into_stream();

-        Box::pin(stream)
+        let mut res = stream.try_collect::<Vec<_>>().await?;
+        self.remap_flow_route_addresses(&mut res).await?;
+        Ok(res)
    }

    /// Builds a create flow routes transaction.
@@ -203,6 +203,28 @@ impl FlowRouteManager {

        Ok(Txn::new().and_then(txns))
    }
+
+    async fn remap_flow_route_addresses(
+        &self,
+        flow_routes: &mut [(FlowRouteKey, FlowRouteValue)],
+    ) -> Result<()> {
+        let keys = flow_routes
+            .iter()
+            .map(|(_, value)| NodeAddressKey::with_flownode(value.peer.id))
+            .collect();
+        let flow_node_addrs =
+            flownode_addr_helper::get_flownode_addresses(&self.kv_backend, keys).await?;
+        for (_, flow_route_value) in flow_routes.iter_mut() {
+            let flownode_id = flow_route_value.peer.id;
+            // If an id lacks a corresponding address in `flow_node_addrs`,
+            // it means the old address in `flow_route_value` is still valid,
+            // which is expected, so that route is left untouched.
+            if let Some(node_addr) = flow_node_addrs.get(&flownode_id) {
+                flow_route_value.peer.addr = node_addr.peer.addr.clone();
+            }
+        }
+        Ok(())
+    }
 }

 #[cfg(test)]
--- a/src/common/meta/src/key/flow/flownode_addr_helper.rs
+++ b/src/common/meta/src/key/flow/flownode_addr_helper.rs
@@ -0,0 +1,47 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::collections::HashMap;
+
+use crate::error::Result;
+use crate::key::node_address::{NodeAddressKey, NodeAddressValue};
+use crate::key::{MetadataKey, MetadataValue};
+use crate::kv_backend::KvBackendRef;
+use crate::rpc::store::BatchGetRequest;
+
+/// Batch-gets the addresses of the flownodes; empty `keys` short-circuits to an empty map.
+/// The result is a map: node_id -> NodeAddressValue, built from the returned key-values.
+pub(crate) async fn get_flownode_addresses(
+    kv_backend: &KvBackendRef,
+    keys: Vec<NodeAddressKey>,
+) -> Result<HashMap<u64, NodeAddressValue>> {
+    if keys.is_empty() {
+        return Ok(HashMap::default());
+    }
+
+    let req = BatchGetRequest {
+        keys: keys.into_iter().map(|k| k.to_bytes()).collect(),
+    };
+    kv_backend
+        .batch_get(req)
+        .await?
+        .kvs
+        .into_iter()
+        .map(|kv| {
+            let key = NodeAddressKey::from_bytes(&kv.key)?;
+            let value = NodeAddressValue::try_from_raw_value(&kv.value)?;
+            Ok((key.node_id, value))
+        })
+        .collect()
+}
--- a/src/common/meta/src/key/flow/table_flow.rs
+++ b/src/common/meta/src/key/flow/table_flow.rs
@@ -14,7 +14,7 @@

 use std::sync::Arc;

-use futures::stream::BoxStream;
+use futures::TryStreamExt;
 use lazy_static::lazy_static;
 use regex::Regex;
 use serde::{Deserialize, Serialize};
@@ -22,7 +22,8 @@ use snafu::OptionExt;
 use table::metadata::TableId;

 use crate::error::{self, Result};
-use crate::key::flow::FlowScoped;
+use crate::key::flow::{flownode_addr_helper, FlowScoped};
+use crate::key::node_address::NodeAddressKey;
 use crate::key::{BytesAdapter, FlowId, FlowPartitionId, MetadataKey, MetadataValue};
 use crate::kv_backend::txn::{Txn, TxnOp};
 use crate::kv_backend::KvBackendRef;
@@ -196,10 +197,7 @@ impl TableFlowManager {
    /// Retrieves all [TableFlowKey]s of the specified `table_id`.
    ///
    /// TODO(discord9): add cache for it since range request does not support cache.
-    pub fn flows(
-        &self,
-        table_id: TableId,
-    ) -> BoxStream<'static, Result<(TableFlowKey, TableFlowValue)>> {
+    pub async fn flows(&self, table_id: TableId) -> Result<Vec<(TableFlowKey, TableFlowValue)>> {
        let start_key = TableFlowKey::range_start_key(table_id);
        let req = RangeRequest::new().with_prefix(start_key);
        let stream = PaginationStream::new(
@@ -210,7 +208,9 @@ impl TableFlowManager {
        )
        .into_stream();

-        Box::pin(stream)
+        let mut res = stream.try_collect::<Vec<_>>().await?;
+        self.remap_table_flow_addresses(&mut res).await?;
+        Ok(res)
    }

    /// Builds a create table flow transaction.
@@ -238,6 +238,28 @@ impl TableFlowManager {

        Ok(Txn::new().and_then(txns))
    }
+
+    async fn remap_table_flow_addresses(
+        &self,
+        table_flows: &mut [(TableFlowKey, TableFlowValue)],
+    ) -> Result<()> {
+        let keys = table_flows
+            .iter()
+            .map(|(_, value)| NodeAddressKey::with_flownode(value.peer.id))
+            .collect::<Vec<_>>();
+        let flownode_addrs =
+            flownode_addr_helper::get_flownode_addresses(&self.kv_backend, keys).await?;
+        for (_, table_flow_value) in table_flows.iter_mut() {
+            let flownode_id = table_flow_value.peer.id;
+            // If an id lacks a corresponding address in `flownode_addrs`,
+            // it means the old address in `table_flow_value` is still valid,
+            // which is expected, so that entry is left untouched.
+            if let Some(flownode_addr) = flownode_addrs.get(&flownode_id) {
+                table_flow_value.peer.addr = flownode_addr.peer.addr.clone();
+            }
+        }
+        Ok(())
+    }
 }

 #[cfg(test)]
--- a/src/common/meta/src/key/node_address.rs
+++ b/src/common/meta/src/key/node_address.rs
@@ -39,6 +39,10 @@ impl NodeAddressKey {
    pub fn with_datanode(node_id: u64) -> Self {
        Self::new(Role::Datanode, node_id)
    }
+
+    pub fn with_flownode(node_id: u64) -> Self {
+        Self::new(Role::Flownode, node_id)
+    }
 }

 #[derive(Debug, PartialEq, Serialize, Deserialize, Clone)]
--- a/src/common/meta/src/lib.rs
+++ b/src/common/meta/src/lib.rs
@@ -34,6 +34,7 @@ pub mod kv_backend;
 pub mod leadership_notifier;
 pub mod lock_key;
 pub mod metrics;
+pub mod node_expiry_listener;
 pub mod node_manager;
 pub mod peer;
 pub mod range_stream;
--- a/src/common/meta/src/node_expiry_listener.rs
+++ b/src/common/meta/src/node_expiry_listener.rs
@@ -0,0 +1,152 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::sync::Mutex;
+use std::time::Duration;
+
+use common_telemetry::{debug, error, info, warn};
+use tokio::task::JoinHandle;
+use tokio::time::{interval, MissedTickBehavior};
+
+use crate::cluster::{NodeInfo, NodeInfoKey};
+use crate::error;
+use crate::kv_backend::ResettableKvBackendRef;
+use crate::leadership_notifier::LeadershipChangeListener;
+use crate::rpc::store::RangeRequest;
+use crate::rpc::KeyValue;
+
+/// [NodeExpiryListener] periodically checks all node info in memory and removes
+/// expired node info to prevent memory leak.
+pub struct NodeExpiryListener {
+    handle: Mutex<Option<JoinHandle<()>>>,
+    max_idle_time: Duration,
+    in_memory: ResettableKvBackendRef,
+}
+
+impl Drop for NodeExpiryListener {
+    fn drop(&mut self) {
+        self.stop();
+    }
+}
+
+impl NodeExpiryListener {
+    pub fn new(max_idle_time: Duration, in_memory: ResettableKvBackendRef) -> Self {
+        Self {
+            handle: Mutex::new(None),
+            max_idle_time,
+            in_memory,
+        }
+    }
+
+    async fn start(&self) {
+        let mut handle = self.handle.lock().unwrap();
+        if handle.is_none() {
+            let in_memory = self.in_memory.clone();
+
+            let max_idle_time = self.max_idle_time;
+            let ticker_loop = tokio::spawn(async move {
+                // Run the clean task every minute; missed ticks are skipped instead of replayed.
+                let mut interval = interval(Duration::from_secs(60));
+                interval.set_missed_tick_behavior(MissedTickBehavior::Skip);
+                loop {
+                    interval.tick().await;
+                    if let Err(e) = Self::clean_expired_nodes(&in_memory, max_idle_time).await {
+                        error!(e; "Failed to clean expired node");
+                    }
+                }
+            });
+            *handle = Some(ticker_loop);
+        }
+    }
+
+    fn stop(&self) {
+        if let Some(handle) = self.handle.lock().unwrap().take() {
+            handle.abort();
+            info!("Node expiry listener stopped")
+        }
+    }
+
+    /// Cleans expired nodes from memory; a failed delete is logged and the sweep continues.
+    async fn clean_expired_nodes(
+        in_memory: &ResettableKvBackendRef,
+        max_idle_time: Duration,
+    ) -> error::Result<()> {
+        let node_keys = Self::list_expired_nodes(in_memory, max_idle_time).await?;
+        for key in node_keys {
+            let key_bytes: Vec<u8> = (&key).into();
+            if let Err(e) = in_memory.delete(&key_bytes, false).await {
+                warn!(e; "Failed to delete expired node: {:?}", key);
+            } else {
+                debug!("Deleted expired node key: {:?}", key);
+            }
+        }
+        Ok(())
+    }
+
+    /// Lists keys of nodes inactive for more than `max_idle_time`; undecodable entries are logged and skipped.
+    async fn list_expired_nodes(
+        in_memory: &ResettableKvBackendRef,
+        max_idle_time: Duration,
+    ) -> error::Result<impl Iterator<Item = NodeInfoKey>> {
+        let prefix = NodeInfoKey::key_prefix_with_cluster_id(0);
+        let req = RangeRequest::new().with_prefix(prefix);
+        let current_time_millis = common_time::util::current_time_millis();
+        let resp = in_memory.range(req).await?;
+        Ok(resp
+            .kvs
+            .into_iter()
+            .filter_map(move |KeyValue { key, value }| {
+                let Ok(info) = NodeInfo::try_from(value).inspect_err(|e| {
+                    warn!(e; "Unrecognized node info value");
+                }) else {
+                    return None;
+                };
+                if (current_time_millis - info.last_activity_ts) > max_idle_time.as_millis() as i64
+                {
+                    NodeInfoKey::try_from(key)
+                        .inspect_err(|e| {
+                            warn!(e; "Unrecognized node info key: {:?}", info.peer);
+                        })
+                        .ok()
+                        .inspect(|node_key| {
+                            debug!("Found expired node: {:?}", node_key);
+                        })
+                } else {
+                    None
+                }
+            }))
+    }
+}
+
+#[async_trait::async_trait]
+impl LeadershipChangeListener for NodeExpiryListener {
+    fn name(&self) -> &str {
+        "NodeExpiryListener"
+    }
+
+    async fn on_leader_start(&self) -> error::Result<()> {
+        self.start().await;
+        info!(
+            "On leader start, node expiry listener started with max idle time: {:?}",
+            self.max_idle_time
+        );
+        Ok(())
+    }
+
+    async fn on_leader_stop(&self) -> error::Result<()> {
+        self.stop();
+        info!("On leader stop, node expiry listener stopped");
+        Ok(())
+    }
+}
--- a/src/datanode/Cargo.toml
+++ b/src/datanode/Cargo.toml
@@ -39,7 +39,7 @@ datafusion-common.workspace = true
 datafusion-expr.workspace = true
 datatypes.workspace = true
 file-engine.workspace = true
-futures = "0.3"
+futures.workspace = true
 futures-util.workspace = true
 humantime-serde.workspace = true
 lazy_static.workspace = true
@@ -47,6 +47,7 @@ log-store.workspace = true
 meta-client.workspace = true
 metric-engine.workspace = true
 mito2.workspace = true
+num_cpus.workspace = true
 object-store.workspace = true
 prometheus.workspace = true
 prost.workspace = true
--- a/src/datanode/src/heartbeat.rs
+++ b/src/datanode/src/heartbeat.rs
@@ -224,6 +224,20 @@ impl HeartbeatTask {
        common_runtime::spawn_hb(async move {
            let sleep = tokio::time::sleep(Duration::from_millis(0));
            tokio::pin!(sleep);
+
+            let build_info = common_version::build_info();
+            let heartbeat_request = HeartbeatRequest {
+                peer: self_peer,
+                node_epoch,
+                info: Some(NodeInfo {
+                    version: build_info.version.to_string(),
+                    git_commit: build_info.commit_short.to_string(),
+                    start_time_ms: node_epoch,
+                    cpus: num_cpus::get() as u32,
+                }),
+                ..Default::default()
+            };
+
            loop {
                if !running.load(Ordering::Relaxed) {
                    info!("shutdown heartbeat task");
@@ -235,9 +249,8 @@ impl HeartbeatTask {
                            match outgoing_message_to_mailbox_message(message) {
                                Ok(message) => {
                                    let req = HeartbeatRequest {
-                                        peer: self_peer.clone(),
                                        mailbox_message: Some(message),
-                                        ..Default::default()
+                                        ..heartbeat_request.clone()
                                    };
                                    HEARTBEAT_RECV_COUNT.with_label_values(&["success"]).inc();
                                    Some(req)
@@ -253,22 +266,13 @@ impl HeartbeatTask {
                        }
                    }
                    _ = &mut sleep => {
-                        let build_info = common_version::build_info();
                        let region_stats = Self::load_region_stats(&region_server_clone);
                        let now = Instant::now();
                        let duration_since_epoch = (now - epoch).as_millis() as u64;
                        let req = HeartbeatRequest {
-                            peer: self_peer.clone(),
                            region_stats,
                            duration_since_epoch,
-                            node_epoch,
-                            info: Some(NodeInfo {
-                                version: build_info.version.to_string(),
-                                git_commit: build_info.commit_short.to_string(),
-                                // The start timestamp is the same as node_epoch currently.
-                                start_time_ms: node_epoch,
-                            }),
-                            ..Default::default()
+                            ..heartbeat_request.clone()
                        };
                        sleep.as_mut().reset(now + Duration::from_millis(interval));
                        Some(req)
--- a/src/datanode/src/region_server.rs
+++ b/src/datanode/src/region_server.rs
@@ -1218,7 +1218,10 @@ mod tests {
        );

        let response = mock_region_server
-            .handle_request(region_id, RegionRequest::Drop(RegionDropRequest {}))
+            .handle_request(
+                region_id,
+                RegionRequest::Drop(RegionDropRequest { fast_path: false }),
+            )
            .await
            .unwrap();
        assert_eq!(response.affected_rows, 0);
@@ -1310,7 +1313,10 @@ mod tests {
            .insert(region_id, RegionEngineWithStatus::Ready(engine.clone()));

        mock_region_server
-            .handle_request(region_id, RegionRequest::Drop(RegionDropRequest {}))
+            .handle_request(
+                region_id,
+                RegionRequest::Drop(RegionDropRequest { fast_path: false }),
+            )
            .await
            .unwrap_err();

--- a/src/datatypes/Cargo.toml
+++ b/src/datatypes/Cargo.toml
@@ -29,7 +29,7 @@ jsonb.workspace = true
 num = "0.4"
 num-traits = "0.2"
 ordered-float = { version = "3.0", features = ["serde"] }
-paste = "1.0"
+paste.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 snafu.workspace = true
--- a/src/datatypes/src/lib.rs
+++ b/src/datatypes/src/lib.rs
@@ -32,5 +32,5 @@ pub mod types;
 pub mod value;
 pub mod vectors;

-pub use arrow;
+pub use arrow::{self, compute};
 pub use error::{Error, Result};
--- a/src/file-engine/Cargo.toml
+++ b/src/file-engine/Cargo.toml
@@ -13,7 +13,7 @@ workspace = true

 [dependencies]
 api.workspace = true
-async-trait = "0.1"
+async-trait.workspace = true
 common-catalog.workspace = true
 common-datasource.workspace = true
 common-error.workspace = true
--- a/src/flow/Cargo.toml
+++ b/src/flow/Cargo.toml
@@ -16,6 +16,7 @@ async-trait.workspace = true
 bytes.workspace = true
 cache.workspace = true
 catalog.workspace = true
+chrono.workspace = true
 client.workspace = true
 common-base.workspace = true
 common-config.workspace = true
@@ -41,7 +42,7 @@ datafusion-substrait.workspace = true
 datatypes.workspace = true
 enum-as-inner = "0.6.0"
 enum_dispatch = "0.3"
-futures = "0.3"
+futures.workspace = true
 get-size2 = "0.1.2"
 greptime-proto.workspace = true
 # This fork of hydroflow is simply for keeping our dependency in our org, and pin the version
@@ -53,6 +54,7 @@ lazy_static.workspace = true
 meta-client.workspace = true
 nom = "7.1.3"
 num-traits = "0.2"
+num_cpus.workspace = true
 operator.workspace = true
 partition.workspace = true
 prometheus.workspace = true
--- a/src/flow/src/adapter.rs
+++ b/src/flow/src/adapter.rs
@@ -49,12 +49,13 @@ pub(crate) use crate::adapter::node_context::FlownodeContext;
 use crate::adapter::refill::RefillTask;
 use crate::adapter::table_source::ManagedTableSource;
 use crate::adapter::util::relation_desc_to_column_schemas_with_fallback;
-pub(crate) use crate::adapter::worker::{create_worker, Worker, WorkerHandle};
+pub(crate) use crate::adapter::worker::{create_worker, WorkerHandle};
 use crate::compute::ErrCollector;
 use crate::df_optimizer::sql_to_flow_plan;
 use crate::error::{EvalSnafu, ExternalSnafu, InternalSnafu, InvalidQuerySnafu, UnexpectedSnafu};
 use crate::expr::Batch;
 use crate::metrics::{METRIC_FLOW_INSERT_ELAPSED, METRIC_FLOW_ROWS, METRIC_FLOW_RUN_INTERVAL_MS};
+use crate::recording_rules::RecordingRuleEngine;
 use crate::repr::{self, DiffRow, RelationDesc, Row, BATCH_SIZE};

 mod flownode_impl;
@@ -63,7 +64,7 @@ pub(crate) mod refill;
 mod stat;
 #[cfg(test)]
 mod tests;
-mod util;
+pub(crate) mod util;
 mod worker;

 pub(crate) mod node_context;
@@ -171,6 +172,8 @@ pub struct FlowWorkerManager {
    flush_lock: RwLock<()>,
    /// receive a oneshot sender to send state size report
    state_report_handler: RwLock<Option<StateReportHandler>>,
+    /// engine for recording rule
+    rule_engine: RecordingRuleEngine,
 }

 /// Building FlownodeManager
@@ -185,6 +188,7 @@ impl FlowWorkerManager {
        node_id: Option<u32>,
        query_engine: Arc<dyn QueryEngine>,
        table_meta: TableMetadataManagerRef,
+        rule_engine: RecordingRuleEngine,
    ) -> Self {
        let srv_map = ManagedTableSource::new(
            table_meta.table_info_manager().clone(),
@@ -207,6 +211,7 @@ impl FlowWorkerManager {
            node_id,
            flush_lock: RwLock::new(()),
            state_report_handler: RwLock::new(None),
+            rule_engine,
        }
    }

@@ -215,25 +220,6 @@ impl FlowWorkerManager {
        self
    }

-    /// Create a flownode manager with one worker
-    pub fn new_with_workers<'s>(
-        node_id: Option<u32>,
-        query_engine: Arc<dyn QueryEngine>,
-        table_meta: TableMetadataManagerRef,
-        num_workers: usize,
-    ) -> (Self, Vec<Worker<'s>>) {
-        let mut zelf = Self::new(node_id, query_engine, table_meta);
-
-        let workers: Vec<_> = (0..num_workers)
-            .map(|_| {
-                let (handle, worker) = create_worker();
-                zelf.add_worker_handle(handle);
-                worker
-            })
-            .collect();
-        (zelf, workers)
-    }
-
    /// add a worker handler to manager, meaning this corresponding worker is under it's manage
    pub fn add_worker_handle(&mut self, handle: WorkerHandle) {
        self.worker_handles.push(handle);
@@ -751,7 +737,11 @@ pub struct CreateFlowArgs {
 /// Create&Remove flow
 impl FlowWorkerManager {
    /// remove a flow by it's id
+    #[allow(unreachable_code)]
    pub async fn remove_flow(&self, flow_id: FlowId) -> Result<(), Error> {
+        // TODO(discord9): reroute some back to streaming engine later
+        return self.rule_engine.remove_flow(flow_id).await;
+
        for handle in self.worker_handles.iter() {
            if handle.contains_flow(flow_id).await? {
                handle.remove_flow(flow_id).await?;
@@ -767,8 +757,10 @@ impl FlowWorkerManager {
    /// steps to create task:
    /// 1. parse query into typed plan(and optional parse expire_after expr)
    /// 2. render source/sink with output table id and used input table id
-    #[allow(clippy::too_many_arguments)]
+    #[allow(clippy::too_many_arguments, unreachable_code)]
    pub async fn create_flow(&self, args: CreateFlowArgs) -> Result<Option<FlowId>, Error> {
+        // TODO(discord9): reroute some back to streaming engine later
+        return self.rule_engine.create_flow(args).await;
        let CreateFlowArgs {
            flow_id,
            sink_table_name,
--- a/src/flow/src/adapter/flownode_impl.rs
+++ b/src/flow/src/adapter/flownode_impl.rs
@@ -153,7 +153,13 @@ impl Flownode for FlowWorkerManager {
        }
    }

+    #[allow(unreachable_code, unused)]
    async fn handle_inserts(&self, request: InsertRequests) -> Result<FlowResponse> {
+        return self
+            .rule_engine
+            .handle_inserts(request)
+            .await
+            .map_err(to_meta_err(snafu::location!()));
        // using try_read to ensure two things:
        // 1. flush wouldn't happen until inserts before it is inserted
        // 2. inserts happening concurrently with flush wouldn't be block by flush
@@ -206,15 +212,15 @@ impl Flownode for FlowWorkerManager {
                    .collect_vec();
                let table_col_names = table_schema.relation_desc.names;
                let table_col_names = table_col_names
-                    .iter().enumerate()
-                    .map(|(idx,name)| match name {
-                        Some(name) => Ok(name.clone()),
-                        None => InternalSnafu {
-                            reason: format!("Expect column {idx} of table id={table_id} to have name in table schema, found None"),
-                        }
-                        .fail().map_err(BoxedError::new).context(ExternalSnafu),
-                    })
-                    .collect::<Result<Vec<_>>>()?;
+                        .iter().enumerate()
+                        .map(|(idx,name)| match name {
+                            Some(name) => Ok(name.clone()),
+                            None => InternalSnafu {
+                                reason: format!("Expect column {idx} of table id={table_id} to have name in table schema, found None"),
+                            }
+                            .fail().map_err(BoxedError::new).context(ExternalSnafu),
+                        })
+                        .collect::<Result<Vec<_>>>()?;
                let name_to_col = HashMap::<_, _>::from_iter(
                    insert_schema
                        .iter()
--- a/src/flow/src/adapter/util.rs
+++ b/src/flow/src/adapter/util.rs
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+//! Some utility functions
+
 use std::sync::Arc;

 use api::helper::ColumnDataTypeWrapper;
--- a/src/flow/src/error.rs
+++ b/src/flow/src/error.rs
@@ -16,6 +16,7 @@

 use std::any::Any;

+use arrow_schema::ArrowError;
 use common_error::ext::BoxedError;
 use common_error::{define_into_tonic_status, from_err_code_msg_to_header};
 use common_macro::stack_trace_debug;
@@ -53,6 +54,13 @@ pub enum Error {
        location: Location,
    },

+    #[snafu(display("Time error"))]
+    Time {
+        source: common_time::error::Error,
+        #[snafu(implicit)]
+        location: Location,
+    },
+
    #[snafu(display("External error"))]
    External {
        source: BoxedError,
@@ -156,6 +164,15 @@ pub enum Error {
        location: Location,
    },

+    #[snafu(display("Arrow error: {raw:?} in context: {context}"))]
+    Arrow {
+        #[snafu(source)]
+        raw: ArrowError,
+        context: String,
+        #[snafu(implicit)]
+        location: Location,
+    },
+
    #[snafu(display("Datafusion error: {raw:?} in context: {context}"))]
    Datafusion {
        #[snafu(source)]
@@ -230,6 +247,7 @@ impl ErrorExt for Error {
        match self {
            Self::Eval { .. }
            | Self::JoinTask { .. }
+            | Self::Arrow { .. }
            | Self::Datafusion { .. }
            | Self::InsertIntoFlow { .. } => StatusCode::Internal,
            Self::FlowAlreadyExist { .. } => StatusCode::TableAlreadyExists,
@@ -238,7 +256,9 @@ impl ErrorExt for Error {
            | Self::FlowNotFound { .. }
            | Self::ListFlows { .. } => StatusCode::TableNotFound,
            Self::Plan { .. } | Self::Datatypes { .. } => StatusCode::PlanQuery,
-            Self::InvalidQuery { .. } | Self::CreateFlow { .. } => StatusCode::EngineExecuteQuery,
+            Self::InvalidQuery { .. } | Self::CreateFlow { .. } | Self::Time { .. } => {
+                StatusCode::EngineExecuteQuery
+            }
            Self::Unexpected { .. } => StatusCode::Unexpected,
            Self::NotImplemented { .. } | Self::UnsupportedTemporalFilter { .. } => {
                StatusCode::Unsupported
--- a/src/flow/src/expr/utils.rs
+++ b/src/flow/src/expr/utils.rs
@@ -238,6 +238,7 @@ mod test {

        for (sql, current, expected) in &testcases {
            let plan = sql_to_substrait(engine.clone(), sql).await;
+
            let mut ctx = create_test_ctx();
            let flow_plan = TypedPlan::from_substrait_plan(&mut ctx, &plan)
                .await
--- a/src/flow/src/heartbeat.rs
+++ b/src/flow/src/heartbeat.rs
@@ -60,12 +60,12 @@ async fn query_flow_state(
 #[derive(Clone)]
 pub struct HeartbeatTask {
    node_id: u64,
+    node_epoch: u64,
    peer_addr: String,
    meta_client: Arc<MetaClient>,
    report_interval: Duration,
    retry_interval: Duration,
    resp_handler_executor: HeartbeatResponseHandlerExecutorRef,
-    start_time_ms: u64,
    running: Arc<AtomicBool>,
    query_stat_size: Option<SizeReportSender>,
 }
@@ -83,12 +83,12 @@ impl HeartbeatTask {
    ) -> Self {
        Self {
            node_id: opts.node_id.unwrap_or(0),
+            node_epoch: common_time::util::current_time_millis() as u64,
            peer_addr: addrs::resolve_addr(&opts.grpc.bind_addr, Some(&opts.grpc.server_addr)),
            meta_client,
            report_interval: heartbeat_opts.interval,
            retry_interval: heartbeat_opts.retry_interval,
            resp_handler_executor,
-            start_time_ms: common_time::util::current_time_millis() as u64,
            running: Arc::new(AtomicBool::new(false)),
            query_stat_size: None,
        }
@@ -103,6 +103,11 @@ impl HeartbeatTask {
            warn!("Heartbeat task started multiple times");
            return Ok(());
        }
+
+        self.create_streams().await
+    }
+
+    async fn create_streams(&self) -> Result<(), Error> {
        info!("Start to establish the heartbeat connection to metasrv.");
        let (req_sender, resp_stream) = self
            .meta_client
@@ -125,19 +130,11 @@ impl HeartbeatTask {

    pub fn shutdown(&self) {
        info!("Close heartbeat task for flownode");
-        if self
-            .running
-            .compare_exchange(true, false, Ordering::AcqRel, Ordering::Acquire)
-            .is_err()
-        {
-            warn!("Call close heartbeat task multiple times");
-        }
    }

-    fn create_heartbeat_request(
+    fn new_heartbeat_request(
+        heartbeat_request: &HeartbeatRequest,
        message: Option<OutgoingMessage>,
-        peer: Option<Peer>,
-        start_time_ms: u64,
        latest_report: &Option<FlowStat>,
    ) -> Option<HeartbeatRequest> {
        let mailbox_message = match message.map(outgoing_message_to_mailbox_message) {
@@ -161,10 +158,8 @@ impl HeartbeatTask {

        Some(HeartbeatRequest {
            mailbox_message,
-            peer,
-            info: Self::build_node_info(start_time_ms),
            flow_stat,
-            ..Default::default()
+            ..heartbeat_request.clone()
        })
    }

@@ -174,6 +169,7 @@ impl HeartbeatTask {
            version: build_info.version.to_string(),
            git_commit: build_info.commit_short.to_string(),
            start_time_ms,
+            cpus: num_cpus::get() as u32,
        })
    }

@@ -183,7 +179,7 @@ impl HeartbeatTask {
        mut outgoing_rx: mpsc::Receiver<OutgoingMessage>,
    ) {
        let report_interval = self.report_interval;
-        let start_time_ms = self.start_time_ms;
+        let node_epoch = self.node_epoch;
        let self_peer = Some(Peer {
            id: self.node_id,
            addr: self.peer_addr.clone(),
@@ -198,18 +194,25 @@ impl HeartbeatTask {
            interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay);
            let mut latest_report = None;

+            let heartbeat_request = HeartbeatRequest {
+                peer: self_peer,
+                node_epoch,
+                info: Self::build_node_info(node_epoch),
+                ..Default::default()
+            };
+
            loop {
                let req = tokio::select! {
                    message = outgoing_rx.recv() => {
                        if let Some(message) = message {
-                            Self::create_heartbeat_request(Some(message), self_peer.clone(), start_time_ms, &latest_report)
+                            Self::new_heartbeat_request(&heartbeat_request, Some(message), &latest_report)
                        } else {
                            // Receives None that means Sender was dropped, we need to break the current loop
                            break
                        }
                    }
                    _ = interval.tick() => {
-                        Self::create_heartbeat_request(None, self_peer.clone(), start_time_ms, &latest_report)
+                        Self::new_heartbeat_request(&heartbeat_request, None, &latest_report)
                    }
                };

@@ -226,6 +229,8 @@ impl HeartbeatTask {
                // set the timeout to half of the report interval so that it wouldn't delay heartbeat if something went horribly wrong
                latest_report = query_flow_state(&query_stat_size, report_interval / 2).await;
            }
+
+            info!("flownode heartbeat task stopped.");
        });
    }

@@ -269,7 +274,7 @@ impl HeartbeatTask {

            info!("Try to re-establish the heartbeat connection to metasrv.");

-            if self.start().await.is_ok() {
+            if self.create_streams().await.is_ok() {
                break;
            }
        }
--- a/src/flow/src/lib.rs
+++ b/src/flow/src/lib.rs
@@ -33,6 +33,7 @@ mod expr;
 pub mod heartbeat;
 mod metrics;
 mod plan;
+mod recording_rules;
 mod repr;
 mod server;
 mod transform;
@@ -43,4 +44,5 @@ mod test_utils;

 pub use adapter::{FlowConfig, FlowWorkerManager, FlowWorkerManagerRef, FlownodeOptions};
 pub use error::{Error, Result};
+pub use recording_rules::FrontendClient;
 pub use server::{FlownodeBuilder, FlownodeInstance, FlownodeServer, FrontendInvoker};
--- a/src/flow/src/metrics.rs
+++ b/src/flow/src/metrics.rs
@@ -28,6 +28,32 @@ lazy_static! {
        &["table_id"]
    )
    .unwrap();
+    pub static ref METRIC_FLOW_RULE_ENGINE_QUERY_TIME: HistogramVec = register_histogram_vec!(
+        "greptime_flow_rule_engine_query_time",
+        "flow rule engine query time",
+        &["flow_id"],
+        vec![
+            0.0,
+            1.,
+            3.,
+            5.,
+            10.,
+            20.,
+            30.,
+            60.,
+            2. * 60.,
+            5. * 60.,
+            10. * 60.
+        ]
+    )
+    .unwrap();
+    pub static ref METRIC_FLOW_RULE_ENGINE_SLOW_QUERY: HistogramVec = register_histogram_vec!(
+        "greptime_flow_rule_engine_slow_query",
+        "flow rule engine slow query",
+        &["flow_id", "sql", "peer"],
+        vec![60., 2. * 60., 3. * 60., 5. * 60., 10. * 60.]
+    )
+    .unwrap();
    pub static ref METRIC_FLOW_RUN_INTERVAL_MS: IntGauge =
        register_int_gauge!("greptime_flow_run_interval_ms", "flow run interval in ms").unwrap();
    pub static ref METRIC_FLOW_ROWS: IntCounterVec = register_int_counter_vec!(
--- a/src/flow/src/recording_rules.rs
+++ b/src/flow/src/recording_rules.rs
@@ -0,0 +1,940 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Run a flow as a recording rule: a time-window-aware normal query that is triggered on every tick set by the user
+
+mod engine;
+mod frontend_client;
+
+use std::collections::BTreeSet;
+use std::sync::Arc;
+
+use api::helper::pb_value_to_value_ref;
+use catalog::CatalogManagerRef;
+use common_error::ext::BoxedError;
+use common_recordbatch::DfRecordBatch;
+use common_telemetry::warn;
+use common_time::timestamp::TimeUnit;
+use common_time::Timestamp;
+use datafusion::error::Result as DfResult;
+use datafusion::logical_expr::Expr;
+use datafusion::physical_planner::{DefaultPhysicalPlanner, PhysicalPlanner};
+use datafusion::prelude::SessionContext;
+use datafusion::sql::unparser::Unparser;
+use datafusion_common::tree_node::{Transformed, TreeNode, TreeNodeRecursion, TreeNodeRewriter};
+use datafusion_common::{DFSchema, TableReference};
+use datafusion_expr::{ColumnarValue, LogicalPlan};
+use datafusion_physical_expr::PhysicalExprRef;
+use datatypes::prelude::{ConcreteDataType, DataType};
+use datatypes::scalars::ScalarVector;
+use datatypes::schema::TIME_INDEX_KEY;
+use datatypes::value::Value;
+use datatypes::vectors::{
+    TimestampMicrosecondVector, TimestampMillisecondVector, TimestampNanosecondVector,
+    TimestampSecondVector, Vector,
+};
+pub use engine::RecordingRuleEngine;
+pub use frontend_client::FrontendClient;
+use itertools::Itertools;
+use query::parser::QueryLanguageParser;
+use query::QueryEngineRef;
+use session::context::QueryContextRef;
+use snafu::{ensure, OptionExt, ResultExt};
+
+use crate::adapter::util::from_proto_to_data_type;
+use crate::df_optimizer::apply_df_optimizer;
+use crate::error::{ArrowSnafu, DatafusionSnafu, DatatypesSnafu, ExternalSnafu, UnexpectedSnafu};
+use crate::expr::error::DataTypeSnafu;
+use crate::Error;
+
+#[derive(Debug, Clone)]
+pub struct TimeWindowExpr {
+    phy_expr: PhysicalExprRef,
+    column_name: String,
+    logical_expr: Expr,
+    df_schema: DFSchema,
+}
+
+impl TimeWindowExpr {
+    pub fn from_expr(expr: &Expr, column_name: &str, df_schema: &DFSchema) -> Result<Self, Error> {
+        let phy_planner = DefaultPhysicalPlanner::default();
+
+        let phy_expr: PhysicalExprRef = phy_planner
+            .create_physical_expr(expr, df_schema, &SessionContext::new().state())
+            .with_context(|_e| DatafusionSnafu {
+                context: format!(
+                    "Failed to create physical expression from {expr:?} using {df_schema:?}"
+                ),
+            })?;
+        Ok(Self {
+            phy_expr,
+            column_name: column_name.to_string(),
+            logical_expr: expr.clone(),
+            df_schema: df_schema.clone(),
+        })
+    }
+
+    pub fn eval(
+        &self,
+        current: Timestamp,
+    ) -> Result<(Option<Timestamp>, Option<Timestamp>), Error> {
+        let lower_bound =
+            find_expr_time_window_lower_bound(&self.logical_expr, &self.df_schema, current)?;
+        let upper_bound =
+            find_expr_time_window_upper_bound(&self.logical_expr, &self.df_schema, current)?;
+        Ok((lower_bound, upper_bound))
+    }
+
+    /// Find timestamps from rows using time window expr
+    pub async fn handle_rows(
+        &self,
+        rows_list: Vec<api::v1::Rows>,
+    ) -> Result<BTreeSet<Timestamp>, Error> {
+        let mut time_windows = BTreeSet::new();
+
+        for rows in rows_list {
+            // pick the time index column and use it to eval on `self.expr`
+            let ts_col_index = rows
+                .schema
+                .iter()
+                .map(|col| col.column_name.clone())
+                .position(|name| name == self.column_name);
+            let Some(ts_col_index) = ts_col_index else {
+                warn!("can't found time index column in schema: {:?}", rows.schema);
+                continue;
+            };
+            let col_schema = &rows.schema[ts_col_index];
+            let cdt = from_proto_to_data_type(col_schema)?;
+
+            let column_values = rows
+                .rows
+                .iter()
+                .map(|row| &row.values[ts_col_index])
+                .collect_vec();
+
+            let mut vector = cdt.create_mutable_vector(column_values.len());
+            for value in column_values {
+                let value = pb_value_to_value_ref(value, &None);
+                vector.try_push_value_ref(value).context(DataTypeSnafu {
+                    msg: "Failed to convert rows to columns",
+                })?;
+            }
+            let vector = vector.to_vector();
+
+            let df_schema = create_df_schema_for_ts_column(&self.column_name, cdt)?;
+
+            let rb =
+                DfRecordBatch::try_new(df_schema.inner().clone(), vec![vector.to_arrow_array()])
+                    .with_context(|_e| ArrowSnafu {
+                        context: format!(
+                            "Failed to create record batch from {df_schema:?} and {vector:?}"
+                        ),
+                    })?;
+
+            let eval_res = self
+                .phy_expr
+                .evaluate(&rb)
+                .with_context(|_| DatafusionSnafu {
+                    context: format!(
+                        "Failed to evaluate physical expression {:?} on {rb:?}",
+                        self.phy_expr
+                    ),
+                })?;
+
+            let res = columnar_to_ts_vector(&eval_res)?;
+
+            for ts in res.into_iter().flatten() {
+                time_windows.insert(ts);
+            }
+        }
+
+        Ok(time_windows)
+    }
+}
+
+fn create_df_schema_for_ts_column(name: &str, cdt: ConcreteDataType) -> Result<DFSchema, Error> {
+    let arrow_schema = Arc::new(arrow_schema::Schema::new(vec![arrow_schema::Field::new(
+        name,
+        cdt.as_arrow_type(),
+        false,
+    )]));
+
+    let df_schema = DFSchema::from_field_specific_qualified_schema(
+        vec![Some(TableReference::bare("TimeIndexOnlyTable"))],
+        &arrow_schema,
+    )
+    .with_context(|_e| DatafusionSnafu {
+        context: format!("Failed to create DFSchema from arrow schema {arrow_schema:?}"),
+    })?;
+
+    Ok(df_schema)
+}
+
+/// Convert `ColumnarValue` to `Vec<Option<Timestamp>>`
+fn columnar_to_ts_vector(columnar: &ColumnarValue) -> Result<Vec<Option<Timestamp>>, Error> {
+    let val = match columnar {
+        datafusion_expr::ColumnarValue::Array(array) => {
+            let ty = array.data_type();
+            let ty = ConcreteDataType::from_arrow_type(ty);
+            let time_unit = if let ConcreteDataType::Timestamp(ty) = ty {
+                ty.unit()
+            } else {
+                return UnexpectedSnafu {
+                    reason: format!("Non-timestamp type: {ty:?}"),
+                }
+                .fail();
+            };
+
+            match time_unit {
+                TimeUnit::Second => TimestampSecondVector::try_from_arrow_array(array.clone())
+                    .with_context(|_| DatatypesSnafu {
+                        extra: format!("Failed to create vector from arrow array {array:?}"),
+                    })?
+                    .iter_data()
+                    .map(|d| d.map(|d| d.0))
+                    .collect_vec(),
+                TimeUnit::Millisecond => {
+                    TimestampMillisecondVector::try_from_arrow_array(array.clone())
+                        .with_context(|_| DatatypesSnafu {
+                            extra: format!("Failed to create vector from arrow array {array:?}"),
+                        })?
+                        .iter_data()
+                        .map(|d| d.map(|d| d.0))
+                        .collect_vec()
+                }
+                TimeUnit::Microsecond => {
+                    TimestampMicrosecondVector::try_from_arrow_array(array.clone())
+                        .with_context(|_| DatatypesSnafu {
+                            extra: format!("Failed to create vector from arrow array {array:?}"),
+                        })?
+                        .iter_data()
+                        .map(|d| d.map(|d| d.0))
+                        .collect_vec()
+                }
+                TimeUnit::Nanosecond => {
+                    TimestampNanosecondVector::try_from_arrow_array(array.clone())
+                        .with_context(|_| DatatypesSnafu {
+                            extra: format!("Failed to create vector from arrow array {array:?}"),
+                        })?
+                        .iter_data()
+                        .map(|d| d.map(|d| d.0))
+                        .collect_vec()
+                }
+            }
+        }
+        datafusion_expr::ColumnarValue::Scalar(scalar) => {
+            let value = Value::try_from(scalar.clone()).with_context(|_| DatatypesSnafu {
+                extra: format!("Failed to convert scalar {scalar:?} to value"),
+            })?;
+            let ts = value.as_timestamp().context(UnexpectedSnafu {
+                reason: format!("Expect Timestamp, found {:?}", value),
+            })?;
+            vec![Some(ts)]
+        }
+    };
+    Ok(val)
+}
+
+/// Convert sql to datafusion logical plan
+pub async fn sql_to_df_plan(
+    query_ctx: QueryContextRef,
+    engine: QueryEngineRef,
+    sql: &str,
+    optimize: bool,
+) -> Result<LogicalPlan, Error> {
+    let stmt = QueryLanguageParser::parse_sql(sql, &query_ctx)
+        .map_err(BoxedError::new)
+        .context(ExternalSnafu)?;
+    let plan = engine
+        .planner()
+        .plan(&stmt, query_ctx)
+        .await
+        .map_err(BoxedError::new)
+        .context(ExternalSnafu)?;
+    let plan = if optimize {
+        apply_df_optimizer(plan).await?
+    } else {
+        plan
+    };
+    Ok(plan)
+}
+
+/// Return (the column name of time index column, the time window expr, the expected time unit of time index column, the expr's schema for evaluating the time window)
+async fn find_time_window_expr(
+    plan: &LogicalPlan,
+    catalog_man: CatalogManagerRef,
+    query_ctx: QueryContextRef,
+) -> Result<(String, Option<datafusion_expr::Expr>, TimeUnit, DFSchema), Error> {
+    // TODO(discord9): find the expr that do time window
+
+    let mut table_name = None;
+
+    // first find the table source in the logical plan
+    plan.apply(|plan| {
+        let LogicalPlan::TableScan(table_scan) = plan else {
+            return Ok(TreeNodeRecursion::Continue);
+        };
+        table_name = Some(table_scan.table_name.clone());
+        Ok(TreeNodeRecursion::Stop)
+    })
+    .with_context(|_| DatafusionSnafu {
+        context: format!("Can't find table source in plan {plan:?}"),
+    })?;
+    let Some(table_name) = table_name else {
+        UnexpectedSnafu {
+            reason: format!("Can't find table source in plan {plan:?}"),
+        }
+        .fail()?
+    };
+
+    let current_schema = query_ctx.current_schema();
+
+    let catalog_name = table_name.catalog().unwrap_or(query_ctx.current_catalog());
+    let schema_name = table_name.schema().unwrap_or(&current_schema);
+    let table_name = table_name.table();
+
+    let Some(table_ref) = catalog_man
+        .table(catalog_name, schema_name, table_name, Some(&query_ctx))
+        .await
+        .map_err(BoxedError::new)
+        .context(ExternalSnafu)?
+    else {
+        UnexpectedSnafu {
+            reason: format!(
+                "Can't find table {table_name:?} in catalog {catalog_name:?}/{schema_name:?}"
+            ),
+        }
+        .fail()?
+    };
+
+    let schema = &table_ref.table_info().meta.schema;
+
+    let ts_index = schema.timestamp_column().context(UnexpectedSnafu {
+        reason: format!("Can't find timestamp column in table {table_name:?}"),
+    })?;
+
+    let ts_col_name = ts_index.name.clone();
+
+    let expected_time_unit = ts_index.data_type.as_timestamp().with_context(|| UnexpectedSnafu {
+        reason: format!(
+            "Expected timestamp column {ts_col_name:?} in table {table_name:?} to be timestamp, but got {ts_index:?}"
+        ),
+    })?.unit();
+
+    let arrow_schema = Arc::new(arrow_schema::Schema::new(vec![arrow_schema::Field::new(
+        ts_col_name.clone(),
+        ts_index.data_type.as_arrow_type(),
+        false,
+    )]));
+
+    let df_schema = DFSchema::from_field_specific_qualified_schema(
+        vec![Some(TableReference::bare(table_name))],
+        &arrow_schema,
+    )
+    .with_context(|_e| DatafusionSnafu {
+        context: format!("Failed to create DFSchema from arrow schema {arrow_schema:?}"),
+    })?;
+
+    // find the time window expr which refers to the time index column
+    let mut aggr_expr = None;
+    let mut time_window_expr: Option<Expr> = None;
+
+    let find_inner_aggr_expr = |plan: &LogicalPlan| {
+        if let LogicalPlan::Aggregate(aggregate) = plan {
+            aggr_expr = Some(aggregate.clone());
+        };
+
+        Ok(TreeNodeRecursion::Continue)
+    };
+    plan.apply(find_inner_aggr_expr)
+        .with_context(|_| DatafusionSnafu {
+            context: format!("Can't find aggr expr in plan {plan:?}"),
+        })?;
+
+    if let Some(aggregate) = aggr_expr {
+        for group_expr in &aggregate.group_expr {
+            let refs = group_expr.column_refs();
+            if refs.len() != 1 {
+                continue;
+            }
+            let ref_col = refs.iter().next().unwrap();
+
+            let index = aggregate.input.schema().maybe_index_of_column(ref_col);
+            let Some(index) = index else {
+                continue;
+            };
+            let field = aggregate.input.schema().field(index);
+
+            let is_time_index = field.metadata().get(TIME_INDEX_KEY) == Some(&"true".to_string());
+
+            if is_time_index {
+                let rewrite_column = group_expr.clone();
+                let rewritten = rewrite_column
+                    .rewrite(&mut RewriteColumn {
+                        table_name: table_name.to_string(),
+                    })
+                    .with_context(|_| DatafusionSnafu {
+                        context: format!("Rewrite expr failed, expr={:?}", group_expr),
+                    })?
+                    .data;
+                struct RewriteColumn {
+                    table_name: String,
+                }
+
+                impl TreeNodeRewriter for RewriteColumn {
+                    type Node = Expr;
+                    fn f_down(&mut self, node: Self::Node) -> DfResult<Transformed<Self::Node>> {
+                        let Expr::Column(mut column) = node else {
+                            return Ok(Transformed::no(node));
+                        };
+
+                        column.relation = Some(TableReference::bare(self.table_name.clone()));
+
+                        Ok(Transformed::yes(Expr::Column(column)))
+                    }
+                }
+
+                time_window_expr = Some(rewritten);
+                break;
+            }
+        }
+        Ok((ts_col_name, time_window_expr, expected_time_unit, df_schema))
+    } else {
+        // can't found time window expr, return None
+        Ok((ts_col_name, None, expected_time_unit, df_schema))
+    }
+}
+
+/// Find the lower and upper bound of the time window that `current` falls in, using the time window expr found in `plan`.
+/// e.g. for time window expr being `date_bin(INTERVAL '5 minutes', ts) as time_window` and `current="2021-07-01 00:01:01.000"`,
+/// return `(time index column name, Some("2021-07-01 00:00:00.000"), Some("2021-07-01 00:05:00.000"))`
+/// if no time window expr is found in `plan`, both bounds are `None`
+///
+/// Time window expr is an expr that:
+/// 1. refers only to a time index column
+/// 2. is monotonically increasing
+/// 3. shows up in the GROUP BY clause
+///
+/// note this plan should only contain one TableScan
+pub async fn find_plan_time_window_bound(
+    plan: &LogicalPlan,
+    current: Timestamp,
+    query_ctx: QueryContextRef,
+    engine: QueryEngineRef,
+) -> Result<(String, Option<Timestamp>, Option<Timestamp>), Error> {
+    // TODO(discord9): find the expr that do time window
+    let catalog_man = engine.engine_state().catalog_manager();
+
+    let (ts_col_name, time_window_expr, expected_time_unit, df_schema) =
+        find_time_window_expr(plan, catalog_man.clone(), query_ctx).await?;
+    // cast `current` to the time unit of the time index column
+    let new_current = current
+        .convert_to(expected_time_unit)
+        .with_context(|| UnexpectedSnafu {
+            reason: format!("Failed to cast current timestamp {current:?} to {expected_time_unit}"),
+        })?;
+
+    // if no time_window_expr is found, return `None` for both bounds
+    if let Some(time_window_expr) = time_window_expr {
+        let lower_bound =
+            find_expr_time_window_lower_bound(&time_window_expr, &df_schema, new_current)?;
+        let upper_bound =
+            find_expr_time_window_upper_bound(&time_window_expr, &df_schema, new_current)?;
+        Ok((ts_col_name, lower_bound, upper_bound))
+    } else {
+        Ok((ts_col_name, None, None))
+    }
+}
+
+/// Find the lower bound of time window in given `expr` and `current` timestamp.
+///
+/// e.g. for `current="2021-07-01 00:01:01.000"` and `expr=date_bin(INTERVAL '5 minutes', ts) as time_window` and `ts_col=ts`,
+/// return `Some("2021-07-01 00:00:00.000")`: the value `expr` evaluates to for `current`
+/// is taken as the lower bound
+/// of the current time window given the current timestamp
+///
+/// if `None` is returned, this time window has no lower bound
+fn find_expr_time_window_lower_bound(
+    expr: &Expr,
+    df_schema: &DFSchema,
+    current: Timestamp,
+) -> Result<Option<Timestamp>, Error> {
+    let phy_planner = DefaultPhysicalPlanner::default();
+
+    let phy_expr: PhysicalExprRef = phy_planner
+        .create_physical_expr(expr, df_schema, &SessionContext::new().state())
+        .with_context(|_e| DatafusionSnafu {
+            context: format!(
+                "Failed to create physical expression from {expr:?} using {df_schema:?}"
+            ),
+        })?;
+    // NOTE(review): `input_time_unit` is the unit of the evaluated window itself, so the `convert_to` below looks like a no-op — confirm intent
+    let cur_time_window = eval_ts_to_ts(&phy_expr, df_schema, current)?;
+    let input_time_unit = cur_time_window.unit();
+    Ok(cur_time_window.convert_to(input_time_unit))
+}
+
+/// Find the first timestamp after `current` whose time window is later than the one `current` is in.
+///
+/// Probes `current + 1, + 2, + 4, ...` until `expr` yields a later window, then binary searches
+/// between the last probe still inside the current window and the first probe past it.
+/// Returns `Ok(None)` if probing overflows `i64` (no upper bound); errors if `expr` is not
+/// monotonically increasing. The returned timestamp is in the unit of `current`.
+fn find_expr_time_window_upper_bound(
+    expr: &Expr,
+    df_schema: &DFSchema,
+    current: Timestamp,
+) -> Result<Option<Timestamp>, Error> {
+    use std::cmp::Ordering;
+
+    let phy_planner = DefaultPhysicalPlanner::default();
+    let phy_expr: PhysicalExprRef = phy_planner
+        .create_physical_expr(expr, df_schema, &SessionContext::new().state())
+        .with_context(|_e| DatafusionSnafu {
+            context: format!(
+                "Failed to create physical expression from {expr:?} using {df_schema:?}"
+            ),
+        })?;
+
+    let cur_time_window = eval_ts_to_ts(&phy_expr, df_schema, current)?;
+
+    // search to find the upper bound
+    let mut offset: i64 = 1;
+    let mut lower_bound = Some(current);
+    let upper_bound;
+    // first exponential probe to find a range for binary search
+    loop {
+        let Some(next_val) = current.value().checked_add(offset) else {
+            // no upper bound if overflow
+            return Ok(None);
+        };
+        let next_time_probe = common_time::Timestamp::new(next_val, current.unit());
+        let next_time_window = eval_ts_to_ts(&phy_expr, df_schema, next_time_probe)?;
+
+        match next_time_window.cmp(&cur_time_window) {
+            Ordering::Less => UnexpectedSnafu {
+                reason: format!(
+                    "Unsupported time window expression, expect monotonic increasing for time window expression {expr:?}"
+                ),
+            }
+            .fail()?,
+            Ordering::Equal => {
+                lower_bound = Some(next_time_probe);
+            }
+            Ordering::Greater => {
+                upper_bound = Some(next_time_probe);
+                break;
+            }
+        }
+
+        let Some(new_offset) = offset.checked_mul(2) else {
+            // no upper bound if overflow
+            return Ok(None);
+        };
+        offset = new_offset;
+    }
+
+    // binary search for the exact upper bound
+    ensure!(lower_bound.map(|v|v.unit())==upper_bound.map(|v|v.unit()), UnexpectedSnafu{
+        reason: format!(" unit mismatch for time window expression {expr:?}, found {lower_bound:?} and {upper_bound:?}"),
+    });
+
+    let output_unit = upper_bound
+        .context(UnexpectedSnafu {
+            reason: "should have upper bound",
+        })?
+        .unit();
+
+    let mut low = lower_bound
+        .context(UnexpectedSnafu {
+            reason: "should have lower bound",
+        })?
+        .value();
+    let mut high = upper_bound
+        .context(UnexpectedSnafu {
+            reason: "should have upper bound",
+        })?
+        .value();
+    while low < high {
+        // `low + (high - low) / 2` cannot overflow and always rounds towards `low`, so `mid < high`
+        // holds for negative timestamps too; `(low + high) / 2` could yield `high` and never end
+        let mid = low + (high - low) / 2;
+        let mid_probe = common_time::Timestamp::new(mid, output_unit);
+        let mid_time_window = eval_ts_to_ts(&phy_expr, df_schema, mid_probe)?;
+        match mid_time_window.cmp(&cur_time_window) {
+            Ordering::Less => UnexpectedSnafu {
+                reason: format!("Binary search failed for time window expression {expr:?}"),
+            }
+            .fail()?,
+            Ordering::Equal => low = mid + 1,
+            Ordering::Greater => high = mid,
+        }
+    }
+
+    let final_upper_bound_for_time_window = common_time::Timestamp::new(high, output_unit);
+    Ok(Some(final_upper_bound_for_time_window))
+}
+
+fn eval_ts_to_ts(
+    phy: &PhysicalExprRef,
+    df_schema: &DFSchema,
+    input_value: Timestamp,
+) -> Result<Timestamp, Error> {
+    let schema_ty = df_schema.field(0).data_type();
+    let schema_cdt = ConcreteDataType::from_arrow_type(schema_ty);
+    let schema_unit = if let ConcreteDataType::Timestamp(ts) = schema_cdt {
+        ts.unit()
+    } else {
+        return UnexpectedSnafu {
+            reason: format!("Expect Timestamp, found {:?}", schema_cdt),
+        }
+        .fail();
+    };
+    let input_value = input_value
+        .convert_to(schema_unit)
+        .with_context(|| UnexpectedSnafu {
+            reason: format!("Failed to convert timestamp {input_value:?} to {schema_unit}"),
+        })?;
+    let ts_vector = match schema_unit {
+        TimeUnit::Second => {
+            TimestampSecondVector::from_vec(vec![input_value.value()]).to_arrow_array()
+        }
+        TimeUnit::Millisecond => {
+            TimestampMillisecondVector::from_vec(vec![input_value.value()]).to_arrow_array()
+        }
+        TimeUnit::Microsecond => {
+            TimestampMicrosecondVector::from_vec(vec![input_value.value()]).to_arrow_array()
+        }
+        TimeUnit::Nanosecond => {
+            TimestampNanosecondVector::from_vec(vec![input_value.value()]).to_arrow_array()
+        }
+    };
+
+    let rb = DfRecordBatch::try_new(df_schema.inner().clone(), vec![ts_vector.clone()])
+        .with_context(|_| ArrowSnafu {
+            context: format!("Failed to create record batch from {df_schema:?} and {ts_vector:?}"),
+        })?;
+
+    let eval_res = phy.evaluate(&rb).with_context(|_| DatafusionSnafu {
+        context: format!("Failed to evaluate physical expression {phy:?} on {rb:?}"),
+    })?;
+
+    if let Some(Some(ts)) = columnar_to_ts_vector(&eval_res)?.first() {
+        Ok(*ts)
+    } else {
+        UnexpectedSnafu {
+            reason: format!(
+                "Expected timestamp in expression {phy:?} but got {:?}",
+                eval_res
+            ),
+        }
+        .fail()?
+    }
+}
+
+// TODO(discord9): add a method to find out the precise time window
+
+/// Add `extra_filter` to a plan at most once: the first non-`HAVING` `Filter` reached by `f_up` gets it AND-ed in, or the first `TableScan` reached gets wrapped in a new `Filter`, whichever comes first
+#[derive(Debug)]
+pub struct AddFilterRewriter {
+    extra_filter: Expr,
+    is_rewritten: bool,
+}
+
+impl AddFilterRewriter {
+    fn new(filter: Expr) -> Self {
+        Self {
+            extra_filter: filter,
+            is_rewritten: false,
+        }
+    }
+}
+
+impl TreeNodeRewriter for AddFilterRewriter {
+    type Node = LogicalPlan;
+    fn f_up(&mut self, node: Self::Node) -> DfResult<Transformed<Self::Node>> {
+        if self.is_rewritten {
+            return Ok(Transformed::no(node));
+        }
+        match node {
+            LogicalPlan::Filter(mut filter) if !filter.having => {
+                filter.predicate = filter.predicate.and(self.extra_filter.clone());
+                self.is_rewritten = true;
+                Ok(Transformed::yes(LogicalPlan::Filter(filter)))
+            }
+            LogicalPlan::TableScan(_) => {
+                // wrap the scan in a new filter; NOTE(review): scans are leaves, so bottom-up this arm presumably fires before any `Filter` above the scan — confirm intended
+                let filter =
+                    datafusion_expr::Filter::try_new(self.extra_filter.clone(), Arc::new(node))?;
+                self.is_rewritten = true;
+                Ok(Transformed::yes(LogicalPlan::Filter(filter)))
+            }
+            _ => Ok(Transformed::no(node)),
+        }
+    }
+}
+
+fn df_plan_to_sql(plan: &LogicalPlan) -> Result<String, Error> {
+    /// A dialect that double-quotes an identifier only when it differs from its lowercase form (i.e. contains uppercase characters)
+    struct ForceQuoteIdentifiers;
+    impl datafusion::sql::unparser::dialect::Dialect for ForceQuoteIdentifiers {
+        fn identifier_quote_style(&self, identifier: &str) -> Option<char> {
+            if identifier.to_lowercase() != identifier {
+                Some('"')
+            } else {
+                None
+            }
+        }
+    }
+    let unparser = Unparser::new(&ForceQuoteIdentifiers);
+    // unparse the logical plan back into a SQL statement using the dialect above
+    let sql = unparser
+        .plan_to_sql(plan)
+        .with_context(|_e| DatafusionSnafu {
+            context: format!("Failed to unparse logical plan {plan:?}"),
+        })?;
+    Ok(sql.to_string())
+}
+
+#[cfg(test)]
+mod test {
+    use datafusion_common::tree_node::TreeNode;
+    use pretty_assertions::assert_eq;
+    use session::context::QueryContext;
+
+    use super::{sql_to_df_plan, *};
+    use crate::recording_rules::{df_plan_to_sql, AddFilterRewriter};
+    use crate::test_utils::create_test_query_engine;
+
+    #[tokio::test]
+    async fn test_sql_plan_convert() {
+        let query_engine = create_test_query_engine();
+        let ctx = QueryContext::arc();
+        let old = r#"SELECT "NUMBER" FROM "UPPERCASE_NUMBERS_WITH_TS""#;
+        let new = sql_to_df_plan(ctx.clone(), query_engine.clone(), old, false)
+            .await
+            .unwrap();
+        let new_sql = df_plan_to_sql(&new).unwrap();
+
+        assert_eq!(
+            r#"SELECT "UPPERCASE_NUMBERS_WITH_TS"."NUMBER" FROM "UPPERCASE_NUMBERS_WITH_TS""#,
+            new_sql
+        );
+    }
+
+    #[tokio::test]
+    async fn test_add_filter() {
+        let testcases = vec![
+            (
+                "SELECT number FROM numbers_with_ts GROUP BY number","SELECT numbers_with_ts.number FROM numbers_with_ts WHERE (number > 4) GROUP BY numbers_with_ts.number"
+            ),
+            (
+                "SELECT number FROM numbers_with_ts WHERE number < 2 OR number >10",
+                "SELECT numbers_with_ts.number FROM numbers_with_ts WHERE ((numbers_with_ts.number < 2) OR (numbers_with_ts.number > 10)) AND (number > 4)"
+            ),
+            (
+                "SELECT date_bin('5 minutes', ts) as time_window FROM numbers_with_ts GROUP BY time_window",
+                "SELECT date_bin('5 minutes', numbers_with_ts.ts) AS time_window FROM numbers_with_ts WHERE (number > 4) GROUP BY date_bin('5 minutes', numbers_with_ts.ts)"
+            )
+        ];
+        use datafusion_expr::{col, lit};
+        let query_engine = create_test_query_engine();
+        let ctx = QueryContext::arc();
+
+        for (before, after) in testcases {
+            let sql = before;
+            let plan = sql_to_df_plan(ctx.clone(), query_engine.clone(), sql, false)
+                .await
+                .unwrap();
+
+            let mut add_filter = AddFilterRewriter::new(col("number").gt(lit(4u32)));
+            let plan = plan.rewrite(&mut add_filter).unwrap().data;
+            let new_sql = df_plan_to_sql(&plan).unwrap();
+            assert_eq!(after, new_sql);
+        }
+    }
+
+    /// Checks `find_plan_time_window_bound` against a table of queries, and checks that
+    /// adding the found bounds as a filter (via `AddFilterRewriter`) unparses to the expected SQL.
+    #[tokio::test]
+    async fn test_plan_time_window_lower_bound() {
+        use datafusion_expr::{col, lit};
+        let query_engine = create_test_query_engine();
+        let ctx = QueryContext::arc();
+
+        // Each case is a tuple of:
+        //   (input SQL,
+        //    the `current` timestamp handed to `find_plan_time_window_bound`,
+        //    the expected `(column name, lower bound, upper bound)` result,
+        //    the expected SQL once the bound filter has been added to the plan)
+        let testcases = [
+            // same alias is not same column
+            (
+                "SELECT arrow_cast(date_bin(INTERVAL '1 MINS', numbers_with_ts.ts), 'Timestamp(Second, None)') AS ts FROM numbers_with_ts GROUP BY ts;",
+                Timestamp::new(1740394109, TimeUnit::Second),
+                (
+                    "ts".to_string(),
+                    Some(Timestamp::new(1740394109000, TimeUnit::Millisecond)),
+                    Some(Timestamp::new(1740394109001, TimeUnit::Millisecond)),
+                ),
+                r#"SELECT arrow_cast(date_bin(INTERVAL '1 MINS', numbers_with_ts.ts), 'Timestamp(Second, None)') AS ts FROM numbers_with_ts WHERE ((ts >= CAST('2025-02-24 10:48:29' AS TIMESTAMP)) AND (ts <= CAST('2025-02-24 10:48:29.001' AS TIMESTAMP))) GROUP BY numbers_with_ts.ts"#
+            ),
+            // complex time window index
+            (
+                "SELECT arrow_cast(date_bin(INTERVAL '1 MINS', numbers_with_ts.ts), 'Timestamp(Second, None)') AS time_window FROM numbers_with_ts GROUP BY time_window;",
+                Timestamp::new(1740394109, TimeUnit::Second),
+                (
+                    "ts".to_string(),
+                    Some(Timestamp::new(1740394080, TimeUnit::Second)),
+                    Some(Timestamp::new(1740394140, TimeUnit::Second)),
+                ),
+                "SELECT arrow_cast(date_bin(INTERVAL '1 MINS', numbers_with_ts.ts), 'Timestamp(Second, None)') AS time_window FROM numbers_with_ts WHERE ((ts >= CAST('2025-02-24 10:48:00' AS TIMESTAMP)) AND (ts <= CAST('2025-02-24 10:49:00' AS TIMESTAMP))) GROUP BY arrow_cast(date_bin(INTERVAL '1 MINS', numbers_with_ts.ts), 'Timestamp(Second, None)')"
+            ),
+            // no time index
+            // (no bounds are expected, so the loop below compares against the untouched input SQL)
+            (
+                "SELECT date_bin('5 minutes', ts) FROM numbers_with_ts;",
+                Timestamp::new(23, TimeUnit::Millisecond),
+                ("ts".to_string(), None, None),
+                "SELECT date_bin('5 minutes', ts) FROM numbers_with_ts;"
+            ),
+            // time index
+            (
+                "SELECT date_bin('5 minutes', ts) as time_window FROM numbers_with_ts GROUP BY time_window;",
+                Timestamp::new(23, TimeUnit::Nanosecond),
+                (
+                    "ts".to_string(),
+                    Some(Timestamp::new(0, TimeUnit::Millisecond)),
+                    Some(Timestamp::new(300000, TimeUnit::Millisecond)),
+                ),
+                "SELECT date_bin('5 minutes', numbers_with_ts.ts) AS time_window FROM numbers_with_ts WHERE ((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts <= CAST('1970-01-01 00:05:00' AS TIMESTAMP))) GROUP BY date_bin('5 minutes', numbers_with_ts.ts)"
+            ),
+            // on spot
+            (
+                "SELECT date_bin('5 minutes', ts) as time_window FROM numbers_with_ts GROUP BY time_window;",
+                Timestamp::new(0, TimeUnit::Nanosecond),
+                (
+                    "ts".to_string(),
+                    Some(Timestamp::new(0, TimeUnit::Millisecond)),
+                    Some(Timestamp::new(300000, TimeUnit::Millisecond)),
+                ),
+                "SELECT date_bin('5 minutes', numbers_with_ts.ts) AS time_window FROM numbers_with_ts WHERE ((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts <= CAST('1970-01-01 00:05:00' AS TIMESTAMP))) GROUP BY date_bin('5 minutes', numbers_with_ts.ts)"
+            ),
+            // different time unit
+            (
+                "SELECT date_bin('5 minutes', ts) as time_window FROM numbers_with_ts GROUP BY time_window;",
+                Timestamp::new(23_000_000, TimeUnit::Nanosecond),
+                (
+                    "ts".to_string(),
+                    Some(Timestamp::new(0, TimeUnit::Millisecond)),
+                    Some(Timestamp::new(300000, TimeUnit::Millisecond)),
+                ),
+                "SELECT date_bin('5 minutes', numbers_with_ts.ts) AS time_window FROM numbers_with_ts WHERE ((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts <= CAST('1970-01-01 00:05:00' AS TIMESTAMP))) GROUP BY date_bin('5 minutes', numbers_with_ts.ts)"
+            ),
+            // time index with other fields
+            (
+                "SELECT sum(number) as sum_up, date_bin('5 minutes', ts) as time_window FROM numbers_with_ts GROUP BY time_window;",
+                Timestamp::new(23, TimeUnit::Millisecond),
+                (
+                    "ts".to_string(),
+                    Some(Timestamp::new(0, TimeUnit::Millisecond)),
+                    Some(Timestamp::new(300000, TimeUnit::Millisecond)),
+                ),
+                "SELECT sum(numbers_with_ts.number) AS sum_up, date_bin('5 minutes', numbers_with_ts.ts) AS time_window FROM numbers_with_ts WHERE ((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts <= CAST('1970-01-01 00:05:00' AS TIMESTAMP))) GROUP BY date_bin('5 minutes', numbers_with_ts.ts)"
+            ),
+            // time index with other pks
+            (
+                "SELECT number, date_bin('5 minutes', ts) as time_window FROM numbers_with_ts GROUP BY time_window, number;",
+                Timestamp::new(23, TimeUnit::Millisecond),
+                (
+                    "ts".to_string(),
+                    Some(Timestamp::new(0, TimeUnit::Millisecond)),
+                    Some(Timestamp::new(300000, TimeUnit::Millisecond)),
+                ),
+                "SELECT numbers_with_ts.number, date_bin('5 minutes', numbers_with_ts.ts) AS time_window FROM numbers_with_ts WHERE ((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts <= CAST('1970-01-01 00:05:00' AS TIMESTAMP))) GROUP BY date_bin('5 minutes', numbers_with_ts.ts), numbers_with_ts.number"
+            ),
+            // subquery
+            (
+                "SELECT number, time_window FROM (SELECT number, date_bin('5 minutes', ts) as time_window FROM numbers_with_ts GROUP BY time_window, number);",
+                Timestamp::new(23, TimeUnit::Millisecond),
+                (
+                    "ts".to_string(),
+                    Some(Timestamp::new(0, TimeUnit::Millisecond)),
+                    Some(Timestamp::new(300000, TimeUnit::Millisecond)),
+                ),
+                "SELECT numbers_with_ts.number, time_window FROM (SELECT numbers_with_ts.number, date_bin('5 minutes', numbers_with_ts.ts) AS time_window FROM numbers_with_ts WHERE ((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts <= CAST('1970-01-01 00:05:00' AS TIMESTAMP))) GROUP BY date_bin('5 minutes', numbers_with_ts.ts), numbers_with_ts.number)"
+            ),
+            // cte
+            (
+                "with cte as (select number, date_bin('5 minutes', ts) as time_window from numbers_with_ts GROUP BY time_window, number) select number, time_window from cte;",
+                Timestamp::new(23, TimeUnit::Millisecond),
+                (
+                    "ts".to_string(),
+                    Some(Timestamp::new(0, TimeUnit::Millisecond)),
+                    Some(Timestamp::new(300000, TimeUnit::Millisecond)),
+                ),
+                "SELECT cte.number, cte.time_window FROM (SELECT numbers_with_ts.number, date_bin('5 minutes', numbers_with_ts.ts) AS time_window FROM numbers_with_ts WHERE ((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts <= CAST('1970-01-01 00:05:00' AS TIMESTAMP))) GROUP BY date_bin('5 minutes', numbers_with_ts.ts), numbers_with_ts.number) AS cte"
+            ),
+            // complex subquery without alias
+            (
+                "SELECT sum(number), number, date_bin('5 minutes', ts) as time_window, bucket_name FROM (SELECT number, ts, case when number < 5 THEN 'bucket_0_5' when number >= 5 THEN 'bucket_5_inf' END as bucket_name FROM numbers_with_ts) GROUP BY number, time_window, bucket_name;",
+                Timestamp::new(23, TimeUnit::Millisecond),
+                (
+                    "ts".to_string(),
+                    Some(Timestamp::new(0, TimeUnit::Millisecond)),
+                    Some(Timestamp::new(300000, TimeUnit::Millisecond)),
+                ),
+                "SELECT sum(numbers_with_ts.number), numbers_with_ts.number, date_bin('5 minutes', numbers_with_ts.ts) AS time_window, bucket_name FROM (SELECT numbers_with_ts.number, numbers_with_ts.ts, CASE WHEN (numbers_with_ts.number < 5) THEN 'bucket_0_5' WHEN (numbers_with_ts.number >= 5) THEN 'bucket_5_inf' END AS bucket_name FROM numbers_with_ts WHERE ((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts <= CAST('1970-01-01 00:05:00' AS TIMESTAMP)))) GROUP BY numbers_with_ts.number, date_bin('5 minutes', numbers_with_ts.ts), bucket_name"
+            ),
+            // complex subquery alias
+            (
+                "SELECT sum(number), number, date_bin('5 minutes', ts) as time_window, bucket_name FROM (SELECT number, ts, case when number < 5 THEN 'bucket_0_5' when number >= 5 THEN 'bucket_5_inf' END as bucket_name FROM numbers_with_ts) as cte GROUP BY number, time_window, bucket_name;",
+                Timestamp::new(23, TimeUnit::Millisecond),
+                (
+                    "ts".to_string(),
+                    Some(Timestamp::new(0, TimeUnit::Millisecond)),
+                    Some(Timestamp::new(300000, TimeUnit::Millisecond)),
+                ),
+                "SELECT sum(cte.number), cte.number, date_bin('5 minutes', cte.ts) AS time_window, cte.bucket_name FROM (SELECT numbers_with_ts.number, numbers_with_ts.ts, CASE WHEN (numbers_with_ts.number < 5) THEN 'bucket_0_5' WHEN (numbers_with_ts.number >= 5) THEN 'bucket_5_inf' END AS bucket_name FROM numbers_with_ts WHERE ((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts <= CAST('1970-01-01 00:05:00' AS TIMESTAMP)))) AS cte GROUP BY cte.number, date_bin('5 minutes', cte.ts), cte.bucket_name"
+            ),
+        ];
+
+        for (sql, current, expected, expected_unparsed) in testcases {
+            // First pass: plan with the last argument set to `true` and look up the window bounds
+            // for `current`.
+            // NOTE(review): the flag presumably enables plan optimization; confirm in `sql_to_df_plan`.
+            let plan = sql_to_df_plan(ctx.clone(), query_engine.clone(), sql, true)
+                .await
+                .unwrap();
+
+            let real =
+                find_plan_time_window_bound(&plan, current, ctx.clone(), query_engine.clone())
+                    .await
+                    .unwrap();
+            assert_eq!(expected, real);
+
+            // Second pass: re-plan with the flag set to `false`; this is the plan that gets
+            // rewritten and unparsed back to SQL.
+            let plan = sql_to_df_plan(ctx.clone(), query_engine.clone(), sql, false)
+                .await
+                .unwrap();
+            let (col_name, lower, upper) = real;
+            let new_sql = if lower.is_some() {
+                // Converts a `Timestamp` into a DataFusion scalar usable with `lit`.
+                let to_df_literal = |value| {
+                    let value = Value::from(value);
+
+                    value.try_to_scalar_value(&value.data_type()).unwrap()
+                };
+                let lower = to_df_literal(lower.unwrap());
+                // `upper` is unwrapped unconditionally: every case with a lower bound also has an upper bound.
+                let upper = to_df_literal(upper.unwrap());
+                // Filter is inclusive on both ends: `col >= lower AND col <= upper`.
+                let expr = col(&col_name)
+                    .gt_eq(lit(lower))
+                    .and(col(&col_name).lt_eq(lit(upper)));
+                let mut add_filter = AddFilterRewriter::new(expr);
+                let plan = plan.rewrite(&mut add_filter).unwrap().data;
+                df_plan_to_sql(&plan).unwrap()
+            } else {
+                // No bounds found: the query is expected to stay exactly as written.
+                sql.to_string()
+            };
+            assert_eq!(expected_unparsed, new_sql);
+        }
+    }
+}
--- a/src/flow/src/recording_rules/engine.rs
+++ b/src/flow/src/recording_rules/engine.rs
@@ -0,0 +1,815 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::collections::{BTreeMap, HashMap, HashSet};
+use std::sync::Arc;
+use std::time::{Duration, SystemTime, UNIX_EPOCH};
+
+use api::v1::flow::FlowResponse;
+use common_error::ext::BoxedError;
+use common_meta::ddl::create_flow::FlowType;
+use common_meta::key::flow::FlowMetadataManagerRef;
+use common_meta::key::table_info::TableInfoManager;
+use common_meta::key::TableMetadataManagerRef;
+use common_telemetry::tracing::warn;
+use common_telemetry::{debug, info};
+use common_time::Timestamp;
+use datafusion::sql::unparser::expr_to_sql;
+use datafusion_common::tree_node::TreeNode;
+use datatypes::value::Value;
+use query::QueryEngineRef;
+use session::context::QueryContextRef;
+use snafu::{ensure, OptionExt, ResultExt};
+use store_api::storage::RegionId;
+use table::metadata::TableId;
+use tokio::sync::oneshot::error::TryRecvError;
+use tokio::sync::{oneshot, RwLock};
+use tokio::time::Instant;
+
+use super::frontend_client::FrontendClient;
+use super::{df_plan_to_sql, AddFilterRewriter, TimeWindowExpr};
+use crate::adapter::{CreateFlowArgs, FlowId, TableName};
+use crate::error::{
+    DatafusionSnafu, DatatypesSnafu, ExternalSnafu, FlowAlreadyExistSnafu, InternalSnafu,
+    TimeSnafu, UnexpectedSnafu,
+};
+use crate::metrics::{METRIC_FLOW_RULE_ENGINE_QUERY_TIME, METRIC_FLOW_RULE_ENGINE_SLOW_QUERY};
+use crate::recording_rules::{find_time_window_expr, sql_to_df_plan};
+use crate::Error;
+
+/// TODO(discord9): make those constants configurable
+/// The default rule engine query timeout is 10 minutes
+pub const DEFAULT_RULE_ENGINE_QUERY_TIMEOUT: Duration = Duration::from_secs(10 * 60);
+
+/// Will output a warn log for any query that runs for more than 1 minute, and also every 1 minute while that query is still running
+pub const SLOW_QUERY_THRESHOLD: Duration = Duration::from_secs(60);
+
+/// Engine that owns the recording-rule flows: it keeps one [`RecordingRuleTask`] per flow id
+/// together with the channel used to ask that task to stop.
+///
+/// TODO(discord9): determine how to configure refresh rate
+pub struct RecordingRuleEngine {
+    /// Registered tasks keyed by flow id.
+    tasks: RwLock<BTreeMap<FlowId, RecordingRuleTask>>,
+    /// One shutdown sender per flow id; `remove_flow` sends on it to stop the task.
+    shutdown_txs: RwLock<BTreeMap<FlowId, oneshot::Sender<()>>>,
+    /// Client handed to every spawned task so it can run its queries on a frontend.
+    frontend_client: Arc<FrontendClient>,
+    /// Stored by `new`.
+    /// NOTE(review): no visible code in this part of the file reads it; confirm where it is used.
+    flow_metadata_manager: FlowMetadataManagerRef,
+    /// Used to resolve table ids to `[catalog, schema, table]` names.
+    table_meta: TableMetadataManagerRef,
+    /// Query engine used to turn a flow's SQL into a DataFusion plan.
+    engine: QueryEngineRef,
+}
+
+impl RecordingRuleEngine {
+    /// Creates an engine with no registered flows.
+    pub fn new(
+        frontend_client: Arc<FrontendClient>,
+        engine: QueryEngineRef,
+        flow_metadata_manager: FlowMetadataManagerRef,
+        table_meta: TableMetadataManagerRef,
+    ) -> Self {
+        let tasks = RwLock::new(BTreeMap::new());
+        let shutdown_txs = RwLock::new(BTreeMap::new());
+        Self {
+            tasks,
+            shutdown_txs,
+            frontend_client,
+            flow_metadata_manager,
+            table_meta,
+            engine,
+        }
+    }
+
+    /// Marks the time windows touched by the inserted rows as dirty on every task
+    /// that reads from the affected tables.
+    ///
+    /// Rows are first grouped by the name of the table they were written to; then, for
+    /// each task with a time window expression, the rows of each of its source tables
+    /// are evaluated and the resulting window lower bounds are recorded in the task state.
+    pub async fn handle_inserts(
+        &self,
+        request: api::v1::region::InsertRequests,
+    ) -> Result<FlowResponse, Error> {
+        let table_info_mgr = self.table_meta.table_info_manager();
+
+        // table name -> all row batches written to that table by this request
+        let mut rows_by_table: HashMap<TableName, Vec<api::v1::Rows>> = HashMap::new();
+        for req in request.requests {
+            let table_id = RegionId::from(req.region_id).table_id();
+            let table_name = get_table_name(table_info_mgr, &table_id).await?;
+            // the bucket is created even when the request carries no rows
+            let bucket = rows_by_table.entry(table_name).or_default();
+            bucket.extend(req.rows);
+        }
+
+        let tasks = self.tasks.read().await;
+        for task in tasks.values() {
+            // a task without a time window expression has nothing to mark dirty
+            let Some(expr) = &task.time_window_expr else {
+                continue;
+            };
+
+            let touched = task
+                .source_table_names
+                .iter()
+                .filter_map(|name| rows_by_table.get(name));
+            for rows in touched {
+                let involved_time_windows = expr.handle_rows(rows.clone()).await?;
+                task.state
+                    .write()
+                    .await
+                    .dirty_time_windows
+                    .add_lower_bounds(involved_time_windows.into_iter());
+            }
+        }
+
+        Ok(FlowResponse::default())
+    }
+}
+
+/// Looks up the table info for `table_id` and returns its fully qualified name as
+/// `[catalog, schema, table]`.
+///
+/// Fails with an external error when the lookup itself fails, and with an unexpected
+/// error when no table info exists for the id.
+async fn get_table_name(zelf: &TableInfoManager, table_id: &TableId) -> Result<TableName, Error> {
+    let lookup = zelf
+        .get(*table_id)
+        .await
+        .map_err(BoxedError::new)
+        .context(ExternalSnafu)?;
+
+    let table_info = lookup.with_context(|| UnexpectedSnafu {
+        reason: format!("Table id = {:?}, couldn't found table name", table_id),
+    })?;
+
+    let full_name = table_info.table_name();
+    Ok([
+        full_name.catalog_name,
+        full_name.schema_name,
+        full_name.table_name,
+    ])
+}
+
+/// How long (5 seconds) a task sleeps before retrying when no new query was generated.
+const MIN_REFRESH_DURATION: Duration = Duration::new(5, 0);
+
+impl RecordingRuleEngine {
+    /// Registers a flow and spawns the background task that keeps executing it.
+    ///
+    /// Returns `Ok(Some(flow_id))` when the flow was created (or replaced), and `Ok(None)`
+    /// when the flow already exists and `create_if_not_exists` is set without `or_replace`.
+    ///
+    /// # Errors
+    ///
+    /// * the flow already exists and neither `create_if_not_exists` nor `or_replace` is set
+    /// * the flow type option is present but is not `RecordingRule`
+    /// * the query context is missing
+    /// * a source table id can't be resolved to a table name
+    /// * the SQL can't be planned or its time window expression can't be built
+    pub async fn create_flow(&self, args: CreateFlowArgs) -> Result<Option<FlowId>, Error> {
+        let CreateFlowArgs {
+            flow_id,
+            sink_table_name,
+            source_table_ids,
+            create_if_not_exists,
+            or_replace,
+            expire_after,
+            comment: _,
+            sql,
+            flow_options,
+            query_ctx,
+        } = args;
+
+        // or replace logic
+        {
+            let is_exist = self.tasks.read().await.contains_key(&flow_id);
+            match (create_if_not_exists, or_replace, is_exist) {
+                // if replace, ignore that old flow exists
+                (_, true, true) => {
+                    info!("Replacing flow with id={}", flow_id);
+                }
+                (false, false, true) => FlowAlreadyExistSnafu { id: flow_id }.fail()?,
+                // already exists, and not replace, return None
+                (true, false, true) => {
+                    info!("Flow with id={} already exists, do nothing", flow_id);
+                    return Ok(None);
+                }
+
+                // continue as normal
+                (_, _, false) => (),
+            }
+        }
+
+        let flow_type = flow_options.get(FlowType::FLOW_TYPE_KEY);
+
+        ensure!(
+            flow_type == Some(&FlowType::RecordingRule.to_string()) || flow_type.is_none(),
+            UnexpectedSnafu {
+                reason: format!("Flow type is not RecordingRule nor None, got {flow_type:?}")
+            }
+        );
+
+        let Some(query_ctx) = query_ctx else {
+            UnexpectedSnafu {
+                reason: "Query context is None".to_string(),
+            }
+            .fail()?
+        };
+        let query_ctx = Arc::new(query_ctx);
+
+        // resolve source table ids through the same helper `handle_inserts` uses, so both
+        // paths produce identical names and identical errors
+        let table_info_mgr = self.table_meta.table_info_manager();
+        let mut source_table_names = Vec::new();
+        for src_id in source_table_ids {
+            let table_name = get_table_name(table_info_mgr, &src_id).await?;
+            source_table_names.push(table_name);
+        }
+
+        let (tx, rx) = oneshot::channel();
+
+        let plan = sql_to_df_plan(query_ctx.clone(), self.engine.clone(), &sql, true).await?;
+        let (column_name, time_window_expr, _, df_schema) = find_time_window_expr(
+            &plan,
+            self.engine.engine_state().catalog_manager().clone(),
+            query_ctx.clone(),
+        )
+        .await?;
+
+        let phy_expr = time_window_expr
+            .map(|expr| TimeWindowExpr::from_expr(&expr, &column_name, &df_schema))
+            .transpose()?;
+
+        info!("Flow id={}, found time window expr={:?}", flow_id, phy_expr);
+
+        let task = RecordingRuleTask::new(
+            flow_id,
+            &sql,
+            phy_expr,
+            expire_after,
+            sink_table_name,
+            source_table_names,
+            query_ctx,
+            rx,
+        );
+
+        let task_inner = task.clone();
+        let engine = self.engine.clone();
+        let frontend = self.frontend_client.clone();
+
+        // TODO(discord9): also save handle & use time wheel or what for better
+        let _handle = common_runtime::spawn_global(async move {
+            match task_inner.start_executing(engine, frontend).await {
+                Ok(()) => info!("Flow {} shutdown", task_inner.flow_id),
+                Err(err) => common_telemetry::error!(
+                    "Flow {} encounter unrecoverable error: {err:?}",
+                    task_inner.flow_id
+                ),
+            }
+        });
+
+        // TODO(discord9): deal with replace logic
+        let replaced_old_task_opt = self.tasks.write().await.insert(flow_id, task);
+        drop(replaced_old_task_opt);
+
+        // When replacing a flow, tell the old task to stop explicitly instead of just
+        // dropping its sender, which the old task would report as an unexpected shutdown.
+        let replaced_tx = self.shutdown_txs.write().await.insert(flow_id, tx);
+        if let Some(old_tx) = replaced_tx {
+            if old_tx.send(()).is_err() {
+                debug!("Replaced flow {flow_id} had already stopped before shutdown was sent");
+            }
+        }
+
+        Ok(Some(flow_id))
+    }
+
+    /// Unregisters the flow and signals its background task to shut down.
+    ///
+    /// A missing task entry is only logged; a missing shutdown sender is an error.
+    /// Failing to deliver the signal (receiver already gone) is logged, not returned.
+    pub async fn remove_flow(&self, flow_id: FlowId) -> Result<(), Error> {
+        let was_registered = self.tasks.write().await.remove(&flow_id).is_some();
+        if !was_registered {
+            warn!("Flow {flow_id} not found in tasks")
+        }
+
+        let shutdown_tx = self
+            .shutdown_txs
+            .write()
+            .await
+            .remove(&flow_id)
+            .with_context(|| UnexpectedSnafu {
+                reason: format!("Can't found shutdown tx for flow {flow_id}"),
+            })?;
+
+        match shutdown_tx.send(()) {
+            Ok(()) => {}
+            Err(()) => {
+                warn!("Fail to shutdown flow {flow_id} due to receiver already dropped, maybe flow {flow_id} is already dropped?")
+            }
+        }
+        Ok(())
+    }
+}
+
+/// A single recording-rule flow: the query to run, where it reads from and writes to,
+/// and the shared mutable state of its execution loop.
+///
+/// Cloning is cheap with respect to state: clones share the same `state` through the `Arc`.
+#[derive(Debug, Clone)]
+pub struct RecordingRuleTask {
+    pub flow_id: FlowId,
+    /// The flow's SQL as given at creation; time window filters are added to it at execution time.
+    query: String,
+    /// Time window expression found in the query, if any.
+    pub time_window_expr: Option<TimeWindowExpr>,
+    /// in seconds
+    pub expire_after: Option<i64>,
+    /// Target of the generated `INSERT INTO`, as `[catalog, schema, table]`.
+    sink_table_name: [String; 3],
+    /// Tables whose inserts mark this task's time windows dirty, each as `[catalog, schema, table]`.
+    source_table_names: HashSet<[String; 3]>,
+    /// Execution state (dirty windows, shutdown receiver, ...) shared between clones.
+    state: Arc<RwLock<RecordingRuleState>>,
+}
+
+impl RecordingRuleTask {
+    /// Builds a task for `flow_id`.
+    ///
+    /// `query` is copied into the task, `source_table_names` is deduplicated into a set,
+    /// and `query_ctx` together with `shutdown_rx` become the task's initial state.
+    #[allow(clippy::too_many_arguments)]
+    pub fn new(
+        flow_id: FlowId,
+        query: &str,
+        time_window_expr: Option<TimeWindowExpr>,
+        expire_after: Option<i64>,
+        sink_table_name: [String; 3],
+        source_table_names: Vec<[String; 3]>,
+        query_ctx: QueryContextRef,
+        shutdown_rx: oneshot::Receiver<()>,
+    ) -> Self {
+        let initial_state = RecordingRuleState::new(query_ctx, shutdown_rx);
+        let source_table_names: HashSet<[String; 3]> = source_table_names.into_iter().collect();
+
+        Self {
+            flow_id,
+            query: query.to_string(),
+            time_window_expr,
+            expire_after,
+            sink_table_name,
+            source_table_names,
+            state: Arc::new(RwLock::new(initial_state)),
+        }
+    }
+}
+impl RecordingRuleTask {
+    /// Runs the flow's execution loop until shutdown is requested.
+    ///
+    /// This should be called in a new tokio task.
+    ///
+    /// Each iteration:
+    /// 1. generates the query restricted to the dirty time windows;
+    /// 2. if there is nothing to run, checks for shutdown, sleeps `MIN_REFRESH_DURATION`
+    ///    and retries;
+    /// 3. otherwise runs `INSERT INTO <sink> <query>` on a frontend, records metrics and
+    ///    the execution result in the task state;
+    /// 4. checks for shutdown, then sleeps until the next start time given by the state.
+    ///
+    /// Returns `Ok(())` on shutdown (requested, or the sender was dropped).
+    ///
+    /// # Errors
+    ///
+    /// Returns an error when the query can't be generated or no database client can be
+    /// obtained. A failed query execution is logged and recorded, not returned.
+    pub async fn start_executing(
+        &self,
+        engine: QueryEngineRef,
+        frontend_client: Arc<FrontendClient>,
+    ) -> Result<(), Error> {
+        loop {
+            // FIXME(discord9): test if need upper bound also works
+            let new_query = self.gen_query_with_time_window(engine.clone()).await?;
+
+            let Some(new_query) = new_query else {
+                // Nothing to run. The shutdown signal must be polled here as well:
+                // otherwise an idle flow would loop forever and could never be removed.
+                if self.is_shutdown_requested().await {
+                    break Ok(());
+                }
+                tokio::time::sleep(MIN_REFRESH_DURATION).await;
+                continue;
+            };
+
+            let insert_into = format!(
+                "INSERT INTO {}.{}.{} {}",
+                self.sink_table_name[0],
+                self.sink_table_name[1],
+                self.sink_table_name[2],
+                new_query
+            );
+
+            let instant = Instant::now();
+            let flow_id = self.flow_id;
+            let db_client = frontend_client.get_database_client().await?;
+            let peer_addr = db_client.peer.addr;
+            debug!(
+                "Executing flow {flow_id}(expire_after={:?} secs) on {:?} with query {}",
+                self.expire_after, peer_addr, &insert_into
+            );
+
+            let timer = METRIC_FLOW_RULE_ENGINE_QUERY_TIME
+                .with_label_values(&[flow_id.to_string().as_str()])
+                .start_timer();
+
+            let res = db_client.database.sql(&insert_into).await;
+            // stop the timer right after the query returns so only the query itself is measured
+            drop(timer);
+
+            let elapsed = instant.elapsed();
+            match &res {
+                Ok(res1) => {
+                    debug!(
+                        "Flow {flow_id} executed, result: {res1:?}, elapsed: {:?}",
+                        elapsed
+                    );
+                }
+                Err(res) => {
+                    warn!(
+                        "Failed to execute Flow {flow_id} on frontend {}, result: {res:?}, elapsed: {:?} with query: {}",
+                        peer_addr, elapsed, &insert_into
+                    );
+                }
+            }
+
+            // record slow query
+            if elapsed >= SLOW_QUERY_THRESHOLD {
+                warn!(
+                    "Flow {flow_id} on frontend {} executed for {:?} before complete, query: {}",
+                    peer_addr, elapsed, &insert_into
+                );
+                METRIC_FLOW_RULE_ENGINE_SLOW_QUERY
+                    .with_label_values(&[flow_id.to_string().as_str(), &insert_into, &peer_addr])
+                    .observe(elapsed.as_secs_f64());
+            }
+
+            self.state
+                .write()
+                .await
+                .after_query_exec(elapsed, res.is_ok());
+            // drop the result to free client-related resources
+            drop(res);
+
+            if self.is_shutdown_requested().await {
+                break Ok(());
+            }
+            let sleep_until = self.state.write().await.get_next_start_query_time(None);
+            tokio::time::sleep_until(sleep_until).await;
+        }
+    }
+
+    /// Polls the shutdown channel without blocking.
+    ///
+    /// Returns `true` when a shutdown was requested, or when the sender was dropped
+    /// without sending (logged as unexpected); `false` when no signal is pending.
+    async fn is_shutdown_requested(&self) -> bool {
+        let flow_id = self.flow_id;
+        match self.state.write().await.shutdown_rx.try_recv() {
+            Ok(()) => true,
+            Err(TryRecvError::Closed) => {
+                warn!("Unexpected shutdown flow {flow_id}, shutdown anyway");
+                true
+            }
+            Err(TryRecvError::Empty) => false,
+        }
+    }
+
+    /// Generates the SQL to execute next, restricted to the dirty time windows.
+    ///
+    /// Dirty windows are merged and at most `DirtyTimeWindows::MAX_FILTER_NUM` of them are
+    /// turned into a filter that is added to the original query.
+    ///
+    /// Returns:
+    /// * `Ok(Some(sql))` with the filtered query, or with the original query unchanged when
+    ///   no time window lower/upper bound can be determined;
+    /// * `Ok(None)` when there is no dirty time window, i.e. nothing to update.
+    ///
+    /// # Errors
+    ///
+    /// Fails when the time window expression can't be evaluated, the filter can't be
+    /// generated, or the query can't be planned, rewritten or unparsed.
+    async fn gen_query_with_time_window(
+        &self,
+        engine: QueryEngineRef,
+    ) -> Result<Option<String>, Error> {
+        let query_ctx = self.state.read().await.query_ctx.clone();
+        let start = SystemTime::now();
+        let since_the_epoch = start
+            .duration_since(UNIX_EPOCH)
+            .expect("Time went backwards");
+        // Expire lower bound in seconds since the epoch. The subtraction saturates at zero:
+        // a plain `-` would underflow when `expire_after` is larger than the current time.
+        // A negative `expire_after` is treated as zero.
+        let low_bound = match self.expire_after {
+            Some(expire_after) => {
+                let expire_secs = u64::try_from(expire_after).unwrap_or(0);
+                since_the_epoch.as_secs().saturating_sub(expire_secs)
+            }
+            None => u64::MIN,
+        };
+
+        let low_bound = Timestamp::new_second(low_bound as i64);
+
+        // TODO(discord9): use time window expr to get the precise expire lower bound
+        let expire_time_window_bound = self
+            .time_window_expr
+            .as_ref()
+            .map(|expr| expr.eval(low_bound))
+            .transpose()?;
+
+        let new_sql = {
+            let expr = {
+                match expire_time_window_bound {
+                    Some((Some(l), Some(u))) => {
+                        // the distance between both bounds of one window is the window size
+                        let window_size = u.sub(&l).with_context(|| UnexpectedSnafu {
+                            reason: format!("Can't get window size from {u:?} - {l:?}"),
+                        })?;
+                        let col_name = self
+                            .time_window_expr
+                            .as_ref()
+                            .map(|expr| expr.column_name.clone())
+                            .with_context(|| UnexpectedSnafu {
+                                reason: format!(
+                                    "Flow id={:?}, Failed to get column name from time window expr",
+                                    self.flow_id
+                                ),
+                            })?;
+
+                        self.state
+                            .write()
+                            .await
+                            .dirty_time_windows
+                            .gen_filter_exprs(&col_name, Some(l), window_size, self)?
+                    }
+                    _ => {
+                        debug!(
+                            "Flow id = {:?}, can't get window size: precise_lower_bound={expire_time_window_bound:?}, using the same query", self.flow_id
+                        );
+                        // since no time window lower/upper bound is found, just return the original query
+                        return Ok(Some(self.query.clone()));
+                    }
+                }
+            };
+
+            debug!(
+                "Flow id={:?}, Generated filter expr: {:?}",
+                self.flow_id,
+                expr.as_ref()
+                    .map(|expr| expr_to_sql(expr).with_context(|_| DatafusionSnafu {
+                        context: format!("Failed to generate filter expr from {expr:?}"),
+                    }))
+                    .transpose()?
+                    .map(|s| s.to_string())
+            );
+
+            let Some(expr) = expr else {
+                // no new data, hence no need to update
+                debug!("Flow id={:?}, no new data, not update", self.flow_id);
+                return Ok(None);
+            };
+
+            let mut add_filter = AddFilterRewriter::new(expr);
+            // make a not optimized plan for clearer unparse
+            let plan =
+                sql_to_df_plan(query_ctx.clone(), engine.clone(), &self.query, false).await?;
+            let plan = plan
+                .clone()
+                .rewrite(&mut add_filter)
+                .with_context(|_| DatafusionSnafu {
+                    context: format!("Failed to rewrite plan {plan:?}"),
+                })?
+                .data;
+            df_plan_to_sql(&plan)?
+        };
+
+        Ok(Some(new_sql))
+    }
+}
+
+/// Mutable execution state of one [`RecordingRuleTask`], shared behind an `Arc<RwLock<_>>`.
+#[derive(Debug)]
+pub struct RecordingRuleState {
+    /// Query context used when planning the flow's SQL.
+    query_ctx: QueryContextRef,
+    /// last query complete time
+    last_update_time: Instant,
+    /// last time query duration
+    last_query_duration: Duration,
+    /// Dirty Time windows need to be updated
+    /// mapping of `start -> end` and non-overlapping
+    dirty_time_windows: DirtyTimeWindows,
+    /// NOTE(review): `ExecState` is defined outside this part of the file; confirm what it tracks.
+    exec_state: ExecState,
+    /// Receiving end of the shutdown signal; polled with `try_recv` by the execution loop.
+    shutdown_rx: oneshot::Receiver<()>,
+}
+
+/// Set of time windows that received new data and still have to be queried.
+#[derive(Debug, Clone, Default)]
+pub struct DirtyTimeWindows {
+    /// Window start -> optional window end, ordered by start.
+    /// An entry added through `add_lower_bounds` has no end (`None`) yet.
+    windows: BTreeMap<Timestamp, Option<Timestamp>>,
+}
+
+/// Converts a [`Timestamp`] into the DataFusion scalar of its own data type,
+/// so it can be used as a literal in a filter expression.
+fn to_df_literal(value: Timestamp) -> Result<datafusion_common::ScalarValue, Error> {
+    let timestamp_value = Value::from(value);
+    let target_type = timestamp_value.data_type();
+
+    timestamp_value
+        .try_to_scalar_value(&target_type)
+        .with_context(|_| DatatypesSnafu {
+            extra: format!("Failed to convert to scalar value: {}", timestamp_value),
+        })
+}
+
+impl DirtyTimeWindows {
+    /// Time window merge distance
+    /// NOTE(review): presumably counted in multiples of the window size; confirm in
+    /// `merge_dirty_time_windows`.
+    const MERGE_DIST: i32 = 3;
+
+    /// Maximum number of filters allowed in a single query
+    /// (`gen_filter_exprs` takes at most this many windows per call and keeps the rest).
+    const MAX_FILTER_NUM: usize = 20;
+
+    /// Registers each given lower bound as the start of a dirty time window.
+    ///
+    /// A window that is already tracked is left untouched; a new one is stored
+    /// without an upper bound.
+    ///
+    /// # Arguments
+    ///
+    /// * `lower_bounds` - An iterator of lower bounds to be added.
+    pub fn add_lower_bounds(&mut self, lower_bounds: impl Iterator<Item = Timestamp>) {
+        lower_bounds.for_each(|window_start| {
+            self.windows.entry(window_start).or_insert(None);
+        });
+    }
+
+    /// Builds one filter expression covering the earliest dirty time windows and removes
+    /// those windows from `self`.
+    ///
+    /// The windows are merged first; then at most `MAX_FILTER_NUM` windows (smallest start
+    /// first) are taken, and any remaining windows stay in `self` for a later call.
+    /// Each taken window becomes `col >= start AND col < end` (or just `col >= start` when
+    /// it has no end), and the per-window filters are joined with `OR`.
+    ///
+    /// Returns `Ok(None)` when there is no dirty window.
+    pub fn gen_filter_exprs(
+        &mut self,
+        col_name: &str,
+        expire_lower_bound: Option<Timestamp>,
+        window_size: chrono::Duration,
+        task_ctx: &RecordingRuleTask,
+    ) -> Result<Option<datafusion_expr::Expr>, Error> {
+        use datafusion_expr::{col, lit};
+
+        debug!(
+            "expire_lower_bound: {:?}, window_size: {:?}",
+            expire_lower_bound.map(|t| t.to_iso8601_string()),
+            window_size
+        );
+        self.merge_dirty_time_windows(window_size, expire_lower_bound)?;
+
+        let window_cnt = self.windows.len();
+        if window_cnt > Self::MAX_FILTER_NUM {
+            warn!(
+                "Flow id = {:?}, too many time windows: {}, only the first {} are taken for this query, the group by expression might be wrong. Time window expr={:?}, expire_after={:?}, first_time_window={:?}, last_time_window={:?}, the original query: {:?}",
+                task_ctx.flow_id,
+                window_cnt,
+                Self::MAX_FILTER_NUM,
+                task_ctx.time_window_expr,
+                task_ctx.expire_after,
+                self.windows.first_key_value(),
+                self.windows.last_key_value(),
+                task_ctx.query
+            );
+        }
+
+        // Start of the first window that does NOT fit into this query, if any.
+        let first_left_over = self.windows.keys().nth(Self::MAX_FILTER_NUM).copied();
+        let taken = match first_left_over {
+            Some(split_key) => {
+                // keep everything from `split_key` on, hand out everything before it
+                let left_over = self.windows.split_off(&split_key);
+                std::mem::replace(&mut self.windows, left_over)
+            }
+            None => std::mem::take(&mut self.windows),
+        };
+
+        let mut combined: Option<datafusion_expr::Expr> = None;
+        for (start, end) in taken {
+            debug!(
+                "Time window start: {:?}, end: {:?}",
+                start.to_iso8601_string(),
+                end.map(|t| t.to_iso8601_string())
+            );
+
+            let lower = to_df_literal(start)?;
+            let window_filter = match end {
+                Some(end) => {
+                    let upper = to_df_literal(end)?;
+                    col(col_name)
+                        .gt_eq(lit(lower))
+                        .and(col(col_name).lt(lit(upper)))
+                }
+                None => col(col_name).gt_eq(lit(lower)),
+            };
+
+            combined = Some(match combined {
+                Some(so_far) => so_far.or(window_filter),
+                None => window_filter,
+            });
+        }
+        Ok(combined)
+    }
+
+    /// Merge time windows that overlaps or get too close
+    pub fn merge_dirty_time_windows(
+        &mut self,
+        window_size: chrono::Duration,
+        expire_lower_bound: Option<Timestamp>,
+    ) -> Result<(), Error> {
+        let mut new_windows = BTreeMap::new();
+
+        let mut prev_tw = None;
+        for (lower_bound, upper_bound) in std::mem::take(&mut self.windows) {
+            // filter out expired time window
+            if let Some(expire_lower_bound) = expire_lower_bound {
+                if lower_bound <= expire_lower_bound {
+                    continue;
+                }
+            }
+
+            let Some(prev_tw) = &mut prev_tw else {
+                prev_tw = Some((lower_bound, upper_bound));
+                continue;
+            };
+
+            let std_window_size = window_size.to_std().map_err(|e| {
+                InternalSnafu {
+                    reason: e.to_string(),
+                }
+                .build()
+            })?;
+
+            // if cur.lower - prev.upper <= window_size * 2, merge
+            let prev_upper = prev_tw
+                .1
+                .unwrap_or(prev_tw.0.add_duration(std_window_size).context(TimeSnafu)?);
+            prev_tw.1 = Some(prev_upper);
+
+            let cur_upper = upper_bound.unwrap_or(
+                lower_bound
+                    .add_duration(std_window_size)
+                    .context(TimeSnafu)?,
+            );
+
+            if lower_bound
+                .sub(&prev_upper)
+                .map(|dist| dist <= window_size * Self::MERGE_DIST)
+                .unwrap_or(false)
+            {
+                prev_tw.1 = Some(cur_upper);
+            } else {
+                new_windows.insert(prev_tw.0, prev_tw.1);
+                *prev_tw = (lower_bound, Some(cur_upper));
+            }
+        }
+
+        if let Some(prev_tw) = prev_tw {
+            new_windows.insert(prev_tw.0, prev_tw.1);
+        }
+
+        self.windows = new_windows;
+
+        Ok(())
+    }
+}
+
+impl RecordingRuleState {
+    /// Build the initial state: idle, no dirty time windows, a zero last query duration and
+    /// `last_update_time` set to the moment of creation.
+    pub fn new(query_ctx: QueryContextRef, shutdown_rx: oneshot::Receiver<()>) -> Self {
+        let created_at = Instant::now();
+        let no_query_yet = Duration::from_secs(0);
+        Self {
+            query_ctx,
+            last_update_time: created_at,
+            last_query_duration: no_query_yet,
+            dirty_time_windows: Default::default(),
+            exec_state: ExecState::Idle,
+            shutdown_rx,
+        }
+    }
+
+    /// Record the end of a query: remember how long it took, stamp the update time and go
+    /// back to idle.
+    ///
+    /// `_is_succ` tells whether the query succeeded; it is currently unused.
+    pub fn after_query_exec(&mut self, elapsed: Duration, _is_succ: bool) {
+        self.last_query_duration = elapsed;
+        self.last_update_time = Instant::now();
+        self.exec_state = ExecState::Idle;
+    }
+
+    /// Compute when the next query may start.
+    ///
+    /// The delay after `last_update_time` is `last_query_duration`, capped by `max_timeout`
+    /// when one is given, and never shorter than `MIN_REFRESH_DURATION`.
+    pub fn get_next_start_query_time(&self, max_timeout: Option<Duration>) -> Instant {
+        let capped = match max_timeout {
+            Some(timeout) => timeout.min(self.last_query_duration),
+            None => self.last_query_duration,
+        };
+
+        self.last_update_time + capped.max(MIN_REFRESH_DURATION)
+    }
+}
+
+/// Execution state tracked in `RecordingRuleState::exec_state`.
+#[derive(Debug, Clone)]
+enum ExecState {
+    /// No query is in flight; this is the initial state and the state restored by
+    /// `after_query_exec`.
+    Idle,
+    /// Presumably set while a query is running; the code that sets it is outside this
+    /// part of the file. TODO confirm.
+    Executing,
+}
+
+#[cfg(test)]
+mod test {
+    use pretty_assertions::assert_eq;
+
+    use super::*;
+
+    /// Exercises `merge_dirty_time_windows` with 5-minute windows in four scenarios:
+    /// gap exactly at the merge threshold, gap just past it, windows well within the
+    /// threshold, and all windows expired.
+    #[test]
+    fn test_merge_dirty_time_windows() {
+        // gap between the first window's upper bound and the second lower bound is
+        // exactly `MERGE_DIST` windows
+        let mut dirty = DirtyTimeWindows::default();
+        dirty.add_lower_bounds(
+            vec![
+                Timestamp::new_second(0),
+                Timestamp::new_second((1 + DirtyTimeWindows::MERGE_DIST as i64) * 5 * 60),
+            ]
+            .into_iter(),
+        );
+        dirty
+            .merge_dirty_time_windows(chrono::Duration::seconds(5 * 60), None)
+            .unwrap();
+        // just enough to merge
+        assert_eq!(
+            dirty.windows,
+            BTreeMap::from([(
+                Timestamp::new_second(0),
+                Some(Timestamp::new_second(
+                    (2 + DirtyTimeWindows::MERGE_DIST as i64) * 5 * 60
+                ))
+            )])
+        );
+
+        // separate time window
+        let mut dirty = DirtyTimeWindows::default();
+        dirty.add_lower_bounds(
+            vec![
+                Timestamp::new_second(0),
+                Timestamp::new_second((2 + DirtyTimeWindows::MERGE_DIST as i64) * 5 * 60),
+            ]
+            .into_iter(),
+        );
+        dirty
+            .merge_dirty_time_windows(chrono::Duration::seconds(5 * 60), None)
+            .unwrap();
+        // one window too far apart: both windows are kept, each one window wide
+        assert_eq!(
+            BTreeMap::from([
+                (
+                    Timestamp::new_second(0),
+                    Some(Timestamp::new_second(5 * 60))
+                ),
+                (
+                    Timestamp::new_second((2 + DirtyTimeWindows::MERGE_DIST as i64) * 5 * 60),
+                    Some(Timestamp::new_second(
+                        (3 + DirtyTimeWindows::MERGE_DIST as i64) * 5 * 60
+                    ))
+                )
+            ]),
+            dirty.windows
+        );
+
+        // overlapping
+        let mut dirty = DirtyTimeWindows::default();
+        dirty.add_lower_bounds(
+            vec![
+                Timestamp::new_second(0),
+                Timestamp::new_second((DirtyTimeWindows::MERGE_DIST as i64) * 5 * 60),
+            ]
+            .into_iter(),
+        );
+        dirty
+            .merge_dirty_time_windows(chrono::Duration::seconds(5 * 60), None)
+            .unwrap();
+        // well within the merge distance: merged into a single window
+        assert_eq!(
+            BTreeMap::from([(
+                Timestamp::new_second(0),
+                Some(Timestamp::new_second(
+                    (1 + DirtyTimeWindows::MERGE_DIST as i64) * 5 * 60
+                ))
+            ),]),
+            dirty.windows
+        );
+
+        // expired
+        let mut dirty = DirtyTimeWindows::default();
+        dirty.add_lower_bounds(
+            vec![
+                Timestamp::new_second(0),
+                Timestamp::new_second((DirtyTimeWindows::MERGE_DIST as i64) * 5 * 60),
+            ]
+            .into_iter(),
+        );
+        dirty
+            .merge_dirty_time_windows(
+                chrono::Duration::seconds(5 * 60),
+                Some(Timestamp::new_second(
+                    (DirtyTimeWindows::MERGE_DIST as i64) * 6 * 60,
+                )),
+            )
+            .unwrap();
+        // both lower bounds are at or before the expire bound: nothing is left
+        assert_eq!(BTreeMap::from([]), dirty.windows);
+    }
+}
--- a/src/flow/src/recording_rules/frontend_client.rs
+++ b/src/flow/src/recording_rules/frontend_client.rs
@@ -0,0 +1,163 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Frontend client used to run a flow as a recording rule, i.e. a time-window-aware normal query that is triggered on every tick set by the user
+
+use std::sync::Arc;
+
+use client::{Client, Database, DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
+use common_error::ext::BoxedError;
+use common_grpc::channel_manager::{ChannelConfig, ChannelManager};
+use common_meta::cluster::{NodeInfo, NodeInfoKey, Role};
+use common_meta::peer::Peer;
+use common_meta::rpc::store::RangeRequest;
+use meta_client::client::MetaClient;
+use snafu::ResultExt;
+
+use crate::error::{ExternalSnafu, UnexpectedSnafu};
+use crate::recording_rules::engine::DEFAULT_RULE_ENGINE_QUERY_TIMEOUT;
+use crate::Error;
+
+/// Build a [`ChannelManager`] whose channel config uses
+/// `DEFAULT_RULE_ENGINE_QUERY_TIMEOUT` as its timeout.
+fn default_channel_mgr() -> ChannelManager {
+    ChannelManager::with_config(
+        ChannelConfig::new().timeout(DEFAULT_RULE_ENGINE_QUERY_TIMEOUT),
+    )
+}
+
+/// Create a [`Client`] for the given addresses, backed by a fresh default channel manager.
+fn client_from_urls(addrs: Vec<String>) -> Client {
+    let channel_mgr = default_channel_mgr();
+    Client::with_manager_and_urls(channel_mgr, addrs)
+}
+
+/// A simple frontend client able to execute sql using grpc protocol
+///
+/// The variant decides how the target frontend is found: looked up through the meta
+/// server (`Distributed`) or fixed at construction time (`Standalone`).
+#[derive(Debug)]
+pub enum FrontendClient {
+    Distributed {
+        /// Meta client used to list the frontend nodes registered in the cluster.
+        meta_client: Arc<MetaClient>,
+        /// Channel manager used when building a client for the frontend that was picked.
+        channel_mgr: ChannelManager,
+    },
+    Standalone {
+        /// For the sake of simplicity, grpc is still used even in standalone mode.
+        /// Note that the client here should be lazy, so that the connection is only made
+        /// after the frontend has booted.
+        /// TODO(discord9): not use grpc under standalone mode
+        database_client: DatabaseWithPeer,
+    },
+}
+
+/// A [`Database`] client together with the [`Peer`] whose address it was created for.
+#[derive(Debug, Clone)]
+pub struct DatabaseWithPeer {
+    /// Client used to talk to the frontend.
+    pub database: Database,
+    /// The frontend peer (id and address) the client points at.
+    pub peer: Peer,
+}
+
+impl DatabaseWithPeer {
+    /// Pair a database client with the peer it targets.
+    fn new(database: Database, peer: Peer) -> Self {
+        DatabaseWithPeer { database, peer }
+    }
+}
+
+impl FrontendClient {
+    /// Create a client for distributed mode; frontends are looked up through `meta_client`.
+    pub fn from_meta_client(meta_client: Arc<MetaClient>) -> Self {
+        let channel_mgr = default_channel_mgr();
+        Self::Distributed {
+            meta_client,
+            channel_mgr,
+        }
+    }
+
+    /// Create a client for standalone mode that always talks to the grpc address `addr`.
+    ///
+    /// The peer is recorded with id `0`.
+    pub fn from_static_grpc_addr(addr: String) -> Self {
+        let client = client_from_urls(vec![addr.clone()]);
+        let database = Database::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, client);
+        let peer = Peer { id: 0, addr };
+
+        Self::Standalone {
+            database_client: DatabaseWithPeer::new(database, peer),
+        }
+    }
+}
+
+impl FrontendClient {
+    /// List the frontend nodes known to the meta server.
+    ///
+    /// Returns an empty list in standalone mode. In distributed mode, issues a range
+    /// request over the frontend-role node info key prefix and decodes every key/value
+    /// pair; any client or decoding failure is returned as an external error.
+    async fn scan_for_frontend(&self) -> Result<Vec<(NodeInfoKey, NodeInfo)>, Error> {
+        let Self::Distributed { meta_client, .. } = self else {
+            return Ok(vec![]);
+        };
+        let cluster_client = meta_client
+            .cluster_client()
+            .map_err(BoxedError::new)
+            .context(ExternalSnafu)?;
+        let cluster_id = meta_client.id().0;
+        let prefix = NodeInfoKey::key_prefix_with_role(cluster_id, Role::Frontend);
+        let req = RangeRequest::new().with_prefix(prefix);
+        let resp = cluster_client
+            .range(req)
+            .await
+            .map_err(BoxedError::new)
+            .context(ExternalSnafu)?;
+        let mut res = Vec::with_capacity(resp.kvs.len());
+        for kv in resp.kvs {
+            let key = NodeInfoKey::try_from(kv.key)
+                .map_err(BoxedError::new)
+                .context(ExternalSnafu)?;
+
+            let val = NodeInfo::try_from(kv.value)
+                .map_err(BoxedError::new)
+                .context(ExternalSnafu)?;
+            res.push((key, val));
+        }
+        Ok(res)
+    }
+
+    /// Get the database with max `last_activity_ts`
+    ///
+    /// In standalone mode the fixed database client is returned. In distributed mode the
+    /// frontends are scanned and a new client is built for the one with the greatest
+    /// `last_activity_ts` (the first one wins on ties); an `Unexpected` error is returned
+    /// when no frontend qualifies.
+    async fn get_last_active_frontend(&self) -> Result<DatabaseWithPeer, Error> {
+        let channel_mgr = match self {
+            Self::Standalone { database_client } => return Ok(database_client.clone()),
+            Self::Distributed { channel_mgr, .. } => channel_mgr,
+        };
+
+        let frontends = self.scan_for_frontend().await?;
+        let mut last_activity_ts = i64::MIN;
+        let mut peer = None;
+        for (_key, val) in frontends.iter() {
+            if val.last_activity_ts > last_activity_ts {
+                last_activity_ts = val.last_activity_ts;
+                peer = Some(val.peer.clone());
+            }
+        }
+        let Some(peer) = peer else {
+            UnexpectedSnafu {
+                reason: format!("No frontend available: {:?}", frontends),
+            }
+            .fail()?
+        };
+        let client = Client::with_manager_and_urls(channel_mgr.clone(), vec![peer.addr.clone()]);
+        let database = Database::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, client);
+        Ok(DatabaseWithPeer::new(database, peer))
+    }
+
+    /// Get a database client.
+    ///
+    /// Standalone mode returns a clone of the fixed client; distributed mode looks up the
+    /// most recently active frontend on every call and builds a client for it.
+    pub async fn get_database_client(&self) -> Result<DatabaseWithPeer, Error> {
+        // `get_last_active_frontend` already handles both variants.
+        self.get_last_active_frontend().await
+    }
+}
--- a/src/flow/src/server.rs
+++ b/src/flow/src/server.rs
@@ -57,6 +57,7 @@ use crate::error::{
 };
 use crate::heartbeat::HeartbeatTask;
 use crate::metrics::{METRIC_FLOW_PROCESSING_TIME, METRIC_FLOW_ROWS};
+use crate::recording_rules::{FrontendClient, RecordingRuleEngine};
 use crate::transform::register_function_to_query_engine;
 use crate::utils::{SizeReportSender, StateReportHandler};
 use crate::{Error, FlowWorkerManager, FlownodeOptions};
@@ -245,6 +246,7 @@ impl FlownodeInstance {
        self.server.shutdown().await.context(ShutdownServerSnafu)?;

        if let Some(task) = &self.heartbeat_task {
+            info!("Close heartbeat task for flownode");
            task.shutdown();
        }

@@ -271,6 +273,8 @@ pub struct FlownodeBuilder {
    heartbeat_task: Option<HeartbeatTask>,
    /// receive a oneshot sender to send state size report
    state_report_handler: Option<StateReportHandler>,
+    /// Client to send sql to frontend
+    frontend_client: Arc<FrontendClient>,
 }

 impl FlownodeBuilder {
@@ -281,6 +285,7 @@ impl FlownodeBuilder {
        table_meta: TableMetadataManagerRef,
        catalog_manager: CatalogManagerRef,
        flow_metadata_manager: FlowMetadataManagerRef,
+        frontend_client: Arc<FrontendClient>,
    ) -> Self {
        Self {
            opts,
@@ -290,6 +295,7 @@ impl FlownodeBuilder {
            flow_metadata_manager,
            heartbeat_task: None,
            state_report_handler: None,
+            frontend_client,
        }
    }

@@ -447,7 +453,14 @@ impl FlownodeBuilder {

        let node_id = self.opts.node_id.map(|id| id as u32);

-        let mut man = FlowWorkerManager::new(node_id, query_engine, table_meta);
+        let rule_engine = RecordingRuleEngine::new(
+            self.frontend_client.clone(),
+            query_engine.clone(),
+            self.flow_metadata_manager.clone(),
+            table_meta.clone(),
+        );
+
+        let mut man = FlowWorkerManager::new(node_id, query_engine, table_meta, rule_engine);
        for worker_id in 0..num_workers {
            let (tx, rx) = oneshot::channel();

--- a/src/flow/src/test_utils.rs
+++ b/src/flow/src/test_utils.rs
@@ -86,7 +86,8 @@ pub fn create_test_query_engine() -> Arc<dyn QueryEngine> {

    let schema = vec![
        datatypes::schema::ColumnSchema::new("number", CDT::uint32_datatype(), false),
-        datatypes::schema::ColumnSchema::new("ts", CDT::timestamp_millisecond_datatype(), false),
+        datatypes::schema::ColumnSchema::new("ts", CDT::timestamp_millisecond_datatype(), false)
+            .with_time_index(true),
    ];
    let mut columns = vec![];
    let numbers = (1..=10).collect_vec();
@@ -114,6 +115,37 @@ pub fn create_test_query_engine() -> Arc<dyn QueryEngine> {
    };
    catalog_list.register_table_sync(req_with_ts).unwrap();

+    let schema = vec![
+        datatypes::schema::ColumnSchema::new("NUMBER", CDT::uint32_datatype(), false),
+        datatypes::schema::ColumnSchema::new("ts", CDT::timestamp_millisecond_datatype(), false)
+            .with_time_index(true),
+    ];
+    let mut columns = vec![];
+    let numbers = (1..=10).collect_vec();
+    let column: VectorRef = Arc::new(<u32 as Scalar>::VectorType::from_vec(numbers));
+    columns.push(column);
+
+    let ts = (1..=10).collect_vec();
+    let mut builder = TimestampMillisecondVectorBuilder::with_capacity(10);
+    ts.into_iter()
+        .map(|v| builder.push(Some(TimestampMillisecond::new(v))))
+        .count();
+    let column: VectorRef = builder.to_vector_cloned();
+    columns.push(column);
+
+    let schema = Arc::new(Schema::new(schema));
+    let recordbatch = common_recordbatch::RecordBatch::new(schema, columns).unwrap();
+    let table = MemTable::table("UPPERCASE_NUMBERS_WITH_TS", recordbatch);
+
+    let req_with_ts = RegisterTableRequest {
+        catalog: DEFAULT_CATALOG_NAME.to_string(),
+        schema: DEFAULT_SCHEMA_NAME.to_string(),
+        table_name: "UPPERCASE_NUMBERS_WITH_TS".to_string(),
+        table_id: 1025,
+        table,
+    };
+    catalog_list.register_table_sync(req_with_ts).unwrap();
+
    let factory = query::QueryEngineFactory::new(catalog_list, None, None, None, None, false);

    let engine = factory.query_engine();
--- a/src/frontend/Cargo.toml
+++ b/src/frontend/Cargo.toml
@@ -13,7 +13,7 @@ workspace = true
 [dependencies]
 api.workspace = true
 arc-swap = "1.0"
-async-trait = "0.1"
+async-trait.workspace = true
 auth.workspace = true
 cache.workspace = true
 catalog.workspace = true
@@ -44,6 +44,7 @@ lazy_static.workspace = true
 log-query.workspace = true
 log-store.workspace = true
 meta-client.workspace = true
+num_cpus.workspace = true
 opentelemetry-proto.workspace = true
 operator.workspace = true
 partition.workspace = true
@@ -70,7 +71,7 @@ catalog = { workspace = true, features = ["testing"] }
 common-test-util.workspace = true
 datanode.workspace = true
 datatypes.workspace = true
-futures = "0.3"
+futures.workspace = true
 serde_json.workspace = true
 strfmt = "0.2"
 tower.workspace = true
--- a/src/frontend/src/heartbeat.rs
+++ b/src/frontend/src/heartbeat.rs
@@ -118,10 +118,9 @@ impl HeartbeatTask {
        });
    }

-    fn create_heartbeat_request(
+    fn new_heartbeat_request(
+        heartbeat_request: &HeartbeatRequest,
        message: Option<OutgoingMessage>,
-        peer: Option<Peer>,
-        start_time_ms: u64,
    ) -> Option<HeartbeatRequest> {
        let mailbox_message = match message.map(outgoing_message_to_mailbox_message) {
            Some(Ok(message)) => Some(message),
@@ -134,9 +133,7 @@ impl HeartbeatTask {

        Some(HeartbeatRequest {
            mailbox_message,
-            peer,
-            info: Self::build_node_info(start_time_ms),
-            ..Default::default()
+            ..heartbeat_request.clone()
        })
    }

@@ -147,6 +144,7 @@ impl HeartbeatTask {
            version: build_info.version.to_string(),
            git_commit: build_info.commit_short.to_string(),
            start_time_ms,
+            cpus: num_cpus::get() as u32,
        })
    }

@@ -167,11 +165,17 @@ impl HeartbeatTask {
            let sleep = tokio::time::sleep(Duration::from_millis(0));
            tokio::pin!(sleep);

+            let heartbeat_request = HeartbeatRequest {
+                peer: self_peer,
+                info: Self::build_node_info(start_time_ms),
+                ..Default::default()
+            };
+
            loop {
                let req = tokio::select! {
                    message = outgoing_rx.recv() => {
                        if let Some(message) = message {
-                            Self::create_heartbeat_request(Some(message), self_peer.clone(), start_time_ms)
+                            Self::new_heartbeat_request(&heartbeat_request, Some(message))
                        } else {
                            // Receives None that means Sender was dropped, we need to break the current loop
                            break
@@ -179,7 +183,7 @@ impl HeartbeatTask {
                    }
                    _ = &mut sleep => {
                        sleep.as_mut().reset(Instant::now() + Duration::from_millis(report_interval));
-                       Self::create_heartbeat_request(None, self_peer.clone(), start_time_ms)
+                       Self::new_heartbeat_request(&heartbeat_request, None)
                    }
                };

--- a/src/frontend/src/instance.rs
+++ b/src/frontend/src/instance.rs
@@ -237,6 +237,13 @@ impl Instance {

        let output = match stmt {
            Statement::Query(_) | Statement::Explain(_) | Statement::Delete(_) => {
+                // TODO: remove this when format is supported in datafusion
+                if let Statement::Explain(explain) = &stmt {
+                    if let Some(format) = explain.format() {
+                        query_ctx.set_explain_format(format.to_string());
+                    }
+                }
+
                let stmt = QueryStatement::Sql(stmt);
                let plan = self
                    .statement_executor
--- a/src/index/src/fulltext_index/tests.rs
+++ b/src/index/src/fulltext_index/tests.rs
@@ -25,12 +25,12 @@ use crate::fulltext_index::create::{FulltextIndexCreator, TantivyFulltextIndexCr
 use crate::fulltext_index::search::{FulltextIndexSearcher, RowId, TantivyFulltextIndexSearcher};
 use crate::fulltext_index::{Analyzer, Config};

-async fn new_bounded_stager(prefix: &str) -> (TempDir, Arc<BoundedStager>) {
+async fn new_bounded_stager(prefix: &str) -> (TempDir, Arc<BoundedStager<String>>) {
    let staging_dir = create_temp_dir(prefix);
    let path = staging_dir.path().to_path_buf();
    (
        staging_dir,
-        Arc::new(BoundedStager::new(path, 102400, None).await.unwrap()),
+        Arc::new(BoundedStager::new(path, 102400, None, None).await.unwrap()),
    )
 }

@@ -68,13 +68,13 @@ async fn test_search(
    let file_accessor = Arc::new(MockFileAccessor::new(prefix));
    let puffin_manager = FsPuffinManager::new(stager, file_accessor);

-    let file_name = "fulltext_index";
-    let blob_key = "fulltext_index";
-    let mut writer = puffin_manager.writer(file_name).await.unwrap();
-    create_index(prefix, &mut writer, blob_key, texts, config).await;
+    let file_name = "fulltext_index".to_string();
+    let blob_key = "fulltext_index".to_string();
+    let mut writer = puffin_manager.writer(&file_name).await.unwrap();
+    create_index(prefix, &mut writer, &blob_key, texts, config).await;

-    let reader = puffin_manager.reader(file_name).await.unwrap();
-    let index_dir = reader.dir(blob_key).await.unwrap();
+    let reader = puffin_manager.reader(&file_name).await.unwrap();
+    let index_dir = reader.dir(&blob_key).await.unwrap();
    let searcher = TantivyFulltextIndexSearcher::new(index_dir.path()).unwrap();
    let results = searcher.search(query).await.unwrap();

--- a/src/log-query/src/log_query.rs
+++ b/src/log-query/src/log_query.rs
@@ -55,7 +55,7 @@ pub struct LogQuery {
 }

 /// Expression to calculate on log after filtering.
-#[derive(Debug, Serialize, Deserialize)]
+#[derive(Debug, Clone, Serialize, Deserialize)]
 pub enum LogExpr {
    NamedIdent(String),
    PositionalIdent(usize),
@@ -289,7 +289,7 @@ pub struct ColumnFilters {
    pub filters: Vec<ContentFilter>,
 }

-#[derive(Debug, Serialize, Deserialize)]
+#[derive(Clone, Debug, Serialize, Deserialize)]
 pub enum ContentFilter {
    // Search-based filters
    /// Only match the exact content.
@@ -310,14 +310,19 @@ pub enum ContentFilter {
    // Value-based filters
    /// Content exists, a.k.a. not null.
    Exist,
-    Between(String, String),
+    Between {
+        start: String,
+        end: String,
+        start_inclusive: bool,
+        end_inclusive: bool,
+    },
    // TODO(ruihang): arithmetic operations

    // Compound filters
    Compound(Vec<ContentFilter>, BinaryOperator),
 }

-#[derive(Debug, Serialize, Deserialize)]
+#[derive(Clone, Debug, Serialize, Deserialize)]
 pub enum BinaryOperator {
    And,
    Or,
--- a/src/meta-client/Cargo.toml
+++ b/src/meta-client/Cargo.toml
@@ -9,7 +9,7 @@ workspace = true

 [dependencies]
 api.workspace = true
-async-trait = "0.1"
+async-trait.workspace = true
 common-error.workspace = true
 common-grpc.workspace = true
 common-macro.workspace = true
@@ -27,7 +27,7 @@ tonic.workspace = true

 [dev-dependencies]
 datatypes.workspace = true
-futures = "0.3"
+futures.workspace = true
 meta-srv = { workspace = true, features = ["mock"] }
 tower.workspace = true
 tracing = "0.1"
--- a/src/meta-client/src/client.rs
+++ b/src/meta-client/src/client.rs
@@ -112,6 +112,7 @@ impl MetaClientBuilder {
            .enable_store()
            .enable_heartbeat()
            .enable_procedure()
+            .enable_access_cluster_info()
    }

    pub fn enable_heartbeat(self) -> Self {
--- a/src/meta-client/src/client/heartbeat.rs
+++ b/src/meta-client/src/client/heartbeat.rs
@@ -198,13 +198,13 @@ impl Inner {
            }
        );

-        let leader = self
+        let leader_addr = self
            .ask_leader
            .as_ref()
            .unwrap()
            .get_leader()
            .context(error::NoLeaderSnafu)?;
-        let mut leader = self.make_client(leader)?;
+        let mut leader = self.make_client(&leader_addr)?;

        let (sender, receiver) = mpsc::channel::<HeartbeatRequest>(128);

@@ -236,7 +236,11 @@ impl Inner {
            .await
            .map_err(error::Error::from)?
            .context(error::CreateHeartbeatStreamSnafu)?;
-        info!("Success to create heartbeat stream to server: {:#?}", res);
+
+        info!(
+            "Success to create heartbeat stream to server: {}, response: {:#?}",
+            leader_addr, res
+        );

        Ok((
            HeartbeatSender::new(self.id, self.role, sender),
--- a/src/meta-srv/Cargo.toml
+++ b/src/meta-srv/Cargo.toml
@@ -7,6 +7,7 @@ license.workspace = true
 [features]
 mock = []
 pg_kvbackend = ["dep:tokio-postgres", "common-meta/pg_kvbackend"]
+mysql_kvbackend = []                                              # placeholder features so CI can compile

 [lints]
 workspace = true
@@ -16,7 +17,7 @@ local-ip-address.workspace = true

 [dependencies]
 api.workspace = true
-async-trait = "0.1"
+async-trait.workspace = true
 bytes.workspace = true
 chrono.workspace = true
 clap.workspace = true
--- a/src/meta-srv/src/handler.rs
+++ b/src/meta-srv/src/handler.rs
@@ -44,6 +44,7 @@ use mailbox_handler::MailboxHandler;
 use on_leader_start_handler::OnLeaderStartHandler;
 use publish_heartbeat_handler::PublishHeartbeatHandler;
 use region_lease_handler::RegionLeaseHandler;
+use remap_flow_peer_handler::RemapFlowPeerHandler;
 use response_header_handler::ResponseHeaderHandler;
 use snafu::{OptionExt, ResultExt};
 use store_api::storage::RegionId;
@@ -71,6 +72,7 @@ pub mod mailbox_handler;
 pub mod on_leader_start_handler;
 pub mod publish_heartbeat_handler;
 pub mod region_lease_handler;
+pub mod remap_flow_peer_handler;
 pub mod response_header_handler;

 #[async_trait::async_trait]
@@ -573,6 +575,7 @@ impl HeartbeatHandlerGroupBuilder {
            self.add_handler_last(publish_heartbeat_handler);
        }
        self.add_handler_last(CollectStatsHandler::new(self.flush_stats_factor));
+        self.add_handler_last(RemapFlowPeerHandler::default());

        if let Some(flow_state_handler) = self.flow_state_handler.take() {
            self.add_handler_last(flow_state_handler);
@@ -853,7 +856,7 @@ mod tests {
            .unwrap();

        let handlers = group.handlers;
-        assert_eq!(12, handlers.len());
+        assert_eq!(13, handlers.len());

        let names = [
            "ResponseHeaderHandler",
@@ -868,6 +871,7 @@ mod tests {
            "MailboxHandler",
            "FilterInactiveRegionStatsHandler",
            "CollectStatsHandler",
+            "RemapFlowPeerHandler",
        ];

        for (handler, name) in handlers.iter().zip(names.into_iter()) {
@@ -888,7 +892,7 @@ mod tests {

        let group = builder.build().unwrap();
        let handlers = group.handlers;
-        assert_eq!(13, handlers.len());
+        assert_eq!(14, handlers.len());

        let names = [
            "ResponseHeaderHandler",
@@ -904,6 +908,7 @@ mod tests {
            "CollectStatsHandler",
            "FilterInactiveRegionStatsHandler",
            "CollectStatsHandler",
+            "RemapFlowPeerHandler",
        ];

        for (handler, name) in handlers.iter().zip(names.into_iter()) {
@@ -921,7 +926,7 @@ mod tests {

        let group = builder.build().unwrap();
        let handlers = group.handlers;
-        assert_eq!(13, handlers.len());
+        assert_eq!(14, handlers.len());

        let names = [
            "CollectStatsHandler",
@@ -937,6 +942,7 @@ mod tests {
            "MailboxHandler",
            "FilterInactiveRegionStatsHandler",
            "CollectStatsHandler",
+            "RemapFlowPeerHandler",
        ];

        for (handler, name) in handlers.iter().zip(names.into_iter()) {
@@ -954,7 +960,7 @@ mod tests {

        let group = builder.build().unwrap();
        let handlers = group.handlers;
-        assert_eq!(13, handlers.len());
+        assert_eq!(14, handlers.len());

        let names = [
            "ResponseHeaderHandler",
@@ -970,6 +976,7 @@ mod tests {
            "CollectStatsHandler",
            "FilterInactiveRegionStatsHandler",
            "CollectStatsHandler",
+            "RemapFlowPeerHandler",
        ];

        for (handler, name) in handlers.iter().zip(names.into_iter()) {
@@ -987,7 +994,7 @@ mod tests {

        let group = builder.build().unwrap();
        let handlers = group.handlers;
-        assert_eq!(13, handlers.len());
+        assert_eq!(14, handlers.len());

        let names = [
            "ResponseHeaderHandler",
@@ -1003,6 +1010,7 @@ mod tests {
            "FilterInactiveRegionStatsHandler",
            "CollectStatsHandler",
            "ResponseHeaderHandler",
+            "RemapFlowPeerHandler",
        ];

        for (handler, name) in handlers.iter().zip(names.into_iter()) {
@@ -1020,7 +1028,7 @@ mod tests {

        let group = builder.build().unwrap();
        let handlers = group.handlers;
-        assert_eq!(12, handlers.len());
+        assert_eq!(13, handlers.len());

        let names = [
            "ResponseHeaderHandler",
@@ -1035,6 +1043,7 @@ mod tests {
            "CollectStatsHandler",
            "FilterInactiveRegionStatsHandler",
            "CollectStatsHandler",
+            "RemapFlowPeerHandler",
        ];

        for (handler, name) in handlers.iter().zip(names.into_iter()) {
@@ -1052,7 +1061,7 @@ mod tests {

        let group = builder.build().unwrap();
        let handlers = group.handlers;
-        assert_eq!(12, handlers.len());
+        assert_eq!(13, handlers.len());

        let names = [
            "ResponseHeaderHandler",
@@ -1067,6 +1076,7 @@ mod tests {
            "MailboxHandler",
            "FilterInactiveRegionStatsHandler",
            "ResponseHeaderHandler",
+            "RemapFlowPeerHandler",
        ];

        for (handler, name) in handlers.iter().zip(names.into_iter()) {
@@ -1084,7 +1094,7 @@ mod tests {

        let group = builder.build().unwrap();
        let handlers = group.handlers;
-        assert_eq!(12, handlers.len());
+        assert_eq!(13, handlers.len());

        let names = [
            "CollectStatsHandler",
@@ -1099,6 +1109,7 @@ mod tests {
            "MailboxHandler",
            "FilterInactiveRegionStatsHandler",
            "CollectStatsHandler",
+            "RemapFlowPeerHandler",
        ];

        for (handler, name) in handlers.iter().zip(names.into_iter()) {
--- a/src/meta-srv/src/handler/check_leader_handler.rs
+++ b/src/meta-srv/src/handler/check_leader_handler.rs
@@ -23,8 +23,8 @@ pub struct CheckLeaderHandler;

 #[async_trait::async_trait]
 impl HeartbeatHandler for CheckLeaderHandler {
-    fn is_acceptable(&self, role: Role) -> bool {
-        role == Role::Datanode
+    fn is_acceptable(&self, _role: Role) -> bool {
+        true
    }

    async fn handle(
--- a/src/meta-srv/src/handler/collect_cluster_info_handler.rs
+++ b/src/meta-srv/src/handler/collect_cluster_info_handler.rs
@@ -13,7 +13,6 @@
 // limitations under the License.

 use api::v1::meta::{HeartbeatRequest, NodeInfo as PbNodeInfo, Role};
-use common_meta::cluster;
 use common_meta::cluster::{
    DatanodeStatus, FlownodeStatus, FrontendStatus, NodeInfo, NodeInfoKey, NodeStatus,
 };
@@ -42,7 +41,7 @@ impl HeartbeatHandler for CollectFrontendClusterInfoHandler {
        ctx: &mut Context,
        _acc: &mut HeartbeatAccumulator,
    ) -> Result<HandleControl> {
-        let Some((key, peer, info)) = extract_base_info(req, Role::Frontend) else {
+        let Some((key, peer, info)) = extract_base_info(req) else {
            return Ok(HandleControl::Continue);
        };

@@ -75,7 +74,7 @@ impl HeartbeatHandler for CollectFlownodeClusterInfoHandler {
        ctx: &mut Context,
        _acc: &mut HeartbeatAccumulator,
    ) -> Result<HandleControl> {
-        let Some((key, peer, info)) = extract_base_info(req, Role::Flownode) else {
+        let Some((key, peer, info)) = extract_base_info(req) else {
            return Ok(HandleControl::Continue);
        };

@@ -109,7 +108,7 @@ impl HeartbeatHandler for CollectDatanodeClusterInfoHandler {
        ctx: &mut Context,
        acc: &mut HeartbeatAccumulator,
    ) -> Result<HandleControl> {
-        let Some((key, peer, info)) = extract_base_info(req, Role::Datanode) else {
+        let Some((key, peer, info)) = extract_base_info(req) else {
            return Ok(HandleControl::Continue);
        };

@@ -144,16 +143,9 @@ impl HeartbeatHandler for CollectDatanodeClusterInfoHandler {
    }
 }

-fn extract_base_info(
-    req: &HeartbeatRequest,
-    role: Role,
-) -> Option<(NodeInfoKey, Peer, PbNodeInfo)> {
-    let HeartbeatRequest {
-        header, peer, info, ..
-    } = req;
-    let Some(header) = &header else {
-        return None;
-    };
+fn extract_base_info(request: &HeartbeatRequest) -> Option<(NodeInfoKey, Peer, PbNodeInfo)> {
+    let HeartbeatRequest { peer, info, .. } = request;
+    let key = NodeInfoKey::new(request)?;
    let Some(peer) = &peer else {
        return None;
    };
@@ -161,23 +153,11 @@ fn extract_base_info(
        return None;
    };

-    Some((
-        NodeInfoKey {
-            cluster_id: header.cluster_id,
-            role: match role {
-                Role::Datanode => cluster::Role::Datanode,
-                Role::Frontend => cluster::Role::Frontend,
-                Role::Flownode => cluster::Role::Flownode,
-            },
-            node_id: peer.id,
-        },
-        Peer::from(peer.clone()),
-        info.clone(),
-    ))
+    Some((key, Peer::from(peer.clone()), info.clone()))
 }

 async fn put_into_memory_store(ctx: &mut Context, key: NodeInfoKey, value: NodeInfo) -> Result<()> {
-    let key = key.into();
+    let key = (&key).into();
    let value = value.try_into().context(InvalidClusterInfoFormatSnafu)?;
    let put_req = PutRequest {
        key,
--- a/src/meta-srv/src/handler/collect_stats_handler.rs
+++ b/src/meta-srv/src/handler/collect_stats_handler.rs
@@ -21,7 +21,7 @@ use common_meta::key::node_address::{NodeAddressKey, NodeAddressValue};
 use common_meta::key::{MetadataKey, MetadataValue};
 use common_meta::peer::Peer;
 use common_meta::rpc::store::PutRequest;
-use common_telemetry::{error, warn};
+use common_telemetry::{error, info, warn};
 use dashmap::DashMap;
 use snafu::ResultExt;

@@ -185,6 +185,10 @@ async fn rewrite_node_address(ctx: &mut Context, stat: &Stat) {

        match ctx.leader_cached_kv_backend.put(put).await {
            Ok(_) => {
+                info!(
+                    "Successfully updated datanode `NodeAddressValue`: {:?}",
+                    peer
+                );
                // broadcast invalidating cache
                let cache_idents = stat
                    .table_ids()
@@ -200,11 +204,14 @@ async fn rewrite_node_address(ctx: &mut Context, stat: &Stat) {
                }
            }
            Err(e) => {
-                error!(e; "Failed to update NodeAddressValue: {:?}", peer);
+                error!(e; "Failed to update datanode `NodeAddressValue`: {:?}", peer);
            }
        }
    } else {
-        warn!("Failed to serialize NodeAddressValue: {:?}", peer);
+        warn!(
+            "Failed to serialize datanode `NodeAddressValue`: {:?}",
+            peer
+        );
    }
 }

--- a/src/meta-srv/src/handler/remap_flow_peer_handler.rs
+++ b/src/meta-srv/src/handler/remap_flow_peer_handler.rs
@@ -0,0 +1,92 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use api::v1::meta::{HeartbeatRequest, Peer, Role};
+use common_meta::key::node_address::{NodeAddressKey, NodeAddressValue};
+use common_meta::key::{MetadataKey, MetadataValue};
+use common_meta::rpc::store::PutRequest;
+use common_telemetry::{error, info, warn};
+use dashmap::DashMap;
+
+use crate::handler::{HandleControl, HeartbeatAccumulator, HeartbeatHandler};
+use crate::metasrv::Context;
+use crate::Result;
+
+#[derive(Debug, Default)]
+pub struct RemapFlowPeerHandler {
+    /// Maps a flownode id (`peer.id`) to the highest `node_epoch` seen in its heartbeats.
+    epoch_cache: DashMap<u64, u64>,
+}
+
+#[async_trait::async_trait]
+impl HeartbeatHandler for RemapFlowPeerHandler {
+    fn is_acceptable(&self, role: Role) -> bool {
+        role == Role::Flownode
+    }
+
+    async fn handle(
+        &self,
+        req: &HeartbeatRequest,
+        ctx: &mut Context,
+        _acc: &mut HeartbeatAccumulator,
+    ) -> Result<HandleControl> {
+        let Some(peer) = req.peer.as_ref() else {
+            return Ok(HandleControl::Continue);
+        };
+
+        let current_epoch = req.node_epoch;
+        let flow_node_id = peer.id;
+
+        let refresh = if let Some(mut epoch) = self.epoch_cache.get_mut(&flow_node_id) {
+            if current_epoch > *epoch.value() {
+                *epoch.value_mut() = current_epoch;
+                true
+            } else {
+                false
+            }
+        } else {
+            self.epoch_cache.insert(flow_node_id, current_epoch);
+            true
+        };
+
+        if refresh {
+            rewrite_node_address(ctx, peer).await;
+        }
+
+        Ok(HandleControl::Continue)
+    }
+}
+
+async fn rewrite_node_address(ctx: &mut Context, peer: &Peer) {
+    let key = NodeAddressKey::with_flownode(peer.id).to_bytes();
+    if let Ok(value) = NodeAddressValue::new(peer.clone().into()).try_as_raw_value() {
+        let put = PutRequest {
+            key,
+            value,
+            prev_kv: false,
+        };
+
+        match ctx.leader_cached_kv_backend.put(put).await {
+            Ok(_) => {
+                info!("Successfully updated flow `NodeAddressValue`: {:?}", peer);
+                // TODO(discord): broadcast invalidating cache to all frontends
+            }
+            Err(e) => {
+                error!(e; "Failed to update flow `NodeAddressValue`: {:?}", peer);
+            }
+        }
+    } else {
+        warn!("Failed to serialize flow `NodeAddressValue`: {:?}", peer);
+    }
+}
--- a/src/meta-srv/src/metasrv.rs
+++ b/src/meta-srv/src/metasrv.rs
@@ -32,6 +32,7 @@ use common_meta::kv_backend::{KvBackendRef, ResettableKvBackend, ResettableKvBac
 use common_meta::leadership_notifier::{
    LeadershipChangeNotifier, LeadershipChangeNotifierCustomizerRef,
 };
+use common_meta::node_expiry_listener::NodeExpiryListener;
 use common_meta::peer::Peer;
 use common_meta::region_keeper::MemoryRegionKeeperRef;
 use common_meta::wal_options_allocator::WalOptionsAllocatorRef;
@@ -151,6 +152,8 @@ pub struct MetasrvOptions {
    #[cfg(feature = "pg_kvbackend")]
    /// Lock id for meta kv election. Only effect when using pg_kvbackend.
    pub meta_election_lock_id: u64,
+    #[serde(with = "humantime_serde")]
+    pub node_max_idle_time: Duration,
 }

 const DEFAULT_METASRV_ADDR_PORT: &str = "3002";
@@ -192,6 +195,7 @@ impl Default for MetasrvOptions {
            meta_table_name: DEFAULT_META_TABLE_NAME.to_string(),
            #[cfg(feature = "pg_kvbackend")]
            meta_election_lock_id: DEFAULT_META_ELECTION_LOCK_ID,
+            node_max_idle_time: Duration::from_secs(24 * 60 * 60),
        }
    }
 }
@@ -442,6 +446,10 @@ impl Metasrv {
            leadership_change_notifier.add_listener(self.wal_options_allocator.clone());
            leadership_change_notifier
                .add_listener(Arc::new(ProcedureManagerListenerAdapter(procedure_manager)));
+            leadership_change_notifier.add_listener(Arc::new(NodeExpiryListener::new(
+                self.options.node_max_idle_time,
+                self.in_memory.clone(),
+            )));
            if let Some(region_supervisor_ticker) = &self.region_supervisor_ticker {
                leadership_change_notifier.add_listener(region_supervisor_ticker.clone() as _);
            }
--- a/src/meta-srv/src/service/heartbeat.rs
+++ b/src/meta-srv/src/service/heartbeat.rs
@@ -68,13 +68,15 @@ impl heartbeat_server::Heartbeat for Metasrv {
                        };

                        if pusher_id.is_none() {
-                            pusher_id = register_pusher(&handler_group, header, tx.clone()).await;
+                            pusher_id =
+                                Some(register_pusher(&handler_group, header, tx.clone()).await);
                        }
                        if let Some(k) = &pusher_id {
                            METRIC_META_HEARTBEAT_RECV.with_label_values(&[&k.to_string()]);
                        } else {
                            METRIC_META_HEARTBEAT_RECV.with_label_values(&["none"]);
                        }
+
                        let res = handler_group
                            .handle(req, ctx.clone())
                            .await
@@ -173,13 +175,13 @@ async fn register_pusher(
    handler_group: &HeartbeatHandlerGroup,
    header: &RequestHeader,
    sender: Sender<std::result::Result<HeartbeatResponse, tonic::Status>>,
-) -> Option<PusherId> {
+) -> PusherId {
    let role = header.role();
    let id = get_node_id(header);
    let pusher_id = PusherId::new(role, id);
    let pusher = Pusher::new(sender, header);
    handler_group.register_pusher(pusher_id, pusher).await;
-    Some(pusher_id)
+    pusher_id
 }

 #[cfg(test)]
--- a/src/meta-srv/src/service/procedure.rs
+++ b/src/meta-srv/src/service/procedure.rs
@@ -17,13 +17,15 @@ use std::time::Duration;

 use api::v1::meta::{
    procedure_service_server, DdlTaskRequest as PbDdlTaskRequest,
-    DdlTaskResponse as PbDdlTaskResponse, MigrateRegionRequest, MigrateRegionResponse,
+    DdlTaskResponse as PbDdlTaskResponse, Error, MigrateRegionRequest, MigrateRegionResponse,
    ProcedureDetailRequest, ProcedureDetailResponse, ProcedureStateResponse, QueryProcedureRequest,
+    ResponseHeader,
 };
 use common_meta::ddl::ExecutorContext;
 use common_meta::rpc::ddl::{DdlTask, SubmitDdlTaskRequest};
 use common_meta::rpc::procedure;
-use snafu::{ensure, OptionExt, ResultExt};
+use common_telemetry::warn;
+use snafu::{OptionExt, ResultExt};
 use tonic::{Request, Response};

 use super::GrpcResult;
@@ -37,6 +39,16 @@ impl procedure_service_server::ProcedureService for Metasrv {
        &self,
        request: Request<QueryProcedureRequest>,
    ) -> GrpcResult<ProcedureStateResponse> {
+        if !self.is_leader() {
+            let resp = ProcedureStateResponse {
+                header: Some(ResponseHeader::failed(0, Error::is_not_leader())),
+                ..Default::default()
+            };
+
+            warn!("The current meta is not leader, but a `query procedure state` request have reached the meta. Detail: {:?}.", request);
+            return Ok(Response::new(resp));
+        }
+
        let QueryProcedureRequest { header, pid, .. } = request.into_inner();
        let _header = header.context(error::MissingRequestHeaderSnafu)?;
        let pid = pid.context(error::MissingRequiredParameterSnafu { param: "pid" })?;
@@ -57,6 +69,16 @@ impl procedure_service_server::ProcedureService for Metasrv {
    }

    async fn ddl(&self, request: Request<PbDdlTaskRequest>) -> GrpcResult<PbDdlTaskResponse> {
+        if !self.is_leader() {
+            let resp = PbDdlTaskResponse {
+                header: Some(ResponseHeader::failed(0, Error::is_not_leader())),
+                ..Default::default()
+            };
+
+            warn!("The current meta is not leader, but a `ddl` request have reached the meta. Detail: {:?}.", request);
+            return Ok(Response::new(resp));
+        }
+
        let PbDdlTaskRequest {
            header,
            query_context,
@@ -99,12 +121,15 @@ impl procedure_service_server::ProcedureService for Metasrv {
        &self,
        request: Request<MigrateRegionRequest>,
    ) -> GrpcResult<MigrateRegionResponse> {
-        ensure!(
-            self.meta_peer_client().is_leader(),
-            error::UnexpectedSnafu {
-                violated: "Trying to submit a region migration procedure to non-leader meta server"
-            }
-        );
+        if !self.is_leader() {
+            let resp = MigrateRegionResponse {
+                header: Some(ResponseHeader::failed(0, Error::is_not_leader())),
+                ..Default::default()
+            };
+
+            warn!("The current meta is not leader, but a `migrate` request have reached the meta. Detail: {:?}.", request);
+            return Ok(Response::new(resp));
+        }

        let MigrateRegionRequest {
            header,
@@ -150,6 +175,16 @@ impl procedure_service_server::ProcedureService for Metasrv {
        &self,
        request: Request<ProcedureDetailRequest>,
    ) -> GrpcResult<ProcedureDetailResponse> {
+        if !self.is_leader() {
+            let resp = ProcedureDetailResponse {
+                header: Some(ResponseHeader::failed(0, Error::is_not_leader())),
+                ..Default::default()
+            };
+
+            warn!("The current meta is not leader, but a `procedure details` request have reached the meta. Detail: {:?}.", request);
+            return Ok(Response::new(resp));
+        }
+
        let ProcedureDetailRequest { header } = request.into_inner();
        let _header = header.context(error::MissingRequestHeaderSnafu)?;
        let metas = self
--- a/src/metric-engine/src/data_region.rs
+++ b/src/metric-engine/src/data_region.rs
@@ -142,6 +142,7 @@ impl DataRegion {
                c.column_id = new_column_id_start + delta as u32;
                c.column_schema.set_nullable();
                match index_options {
+                    IndexOptions::None => {}
                    IndexOptions::Inverted => {
                        c.column_schema.set_inverted_index(true);
                    }
--- a/src/metric-engine/src/engine/create.rs
+++ b/src/metric-engine/src/engine/create.rs
@@ -21,7 +21,7 @@ use api::v1::SemanticType;
 use common_telemetry::info;
 use common_time::{Timestamp, FOREVER};
 use datatypes::data_type::ConcreteDataType;
-use datatypes::schema::ColumnSchema;
+use datatypes::schema::{ColumnSchema, SkippingIndexOptions};
 use datatypes::value::Value;
 use mito2::engine::MITO_ENGINE_NAME;
 use object_store::util::join_dir;
@@ -55,6 +55,8 @@ use crate::error::{
 use crate::metrics::PHYSICAL_REGION_COUNT;
 use crate::utils::{self, to_data_region_id, to_metadata_region_id};

+const DEFAULT_TABLE_ID_SKIPPING_INDEX_GRANULARITY: u32 = 1024;
+
 impl MetricEngineInner {
    pub async fn create_regions(
        &self,
@@ -440,6 +442,7 @@ impl MetricEngineInner {
    ///
    /// Return `[table_id_col, tsid_col]`
    fn internal_column_metadata() -> [ColumnMetadata; 2] {
+        // Safety: BloomFilter is a valid skipping index type
        let metric_name_col = ColumnMetadata {
            column_id: ReservedColumnId::table_id(),
            semantic_type: SemanticType::Tag,
@@ -448,7 +451,11 @@ impl MetricEngineInner {
                ConcreteDataType::uint32_datatype(),
                false,
            )
-            .with_inverted_index(true),
+            .with_skipping_options(SkippingIndexOptions {
+                granularity: DEFAULT_TABLE_ID_SKIPPING_INDEX_GRANULARITY,
+                index_type: datatypes::schema::SkippingIndexType::BloomFilter,
+            })
+            .unwrap(),
        };
        let tsid_col = ColumnMetadata {
            column_id: ReservedColumnId::tsid(),
--- a/src/metric-engine/src/engine/drop.rs
+++ b/src/metric-engine/src/engine/drop.rs
@@ -30,9 +30,10 @@ impl MetricEngineInner {
    pub async fn drop_region(
        &self,
        region_id: RegionId,
-        _req: RegionDropRequest,
+        req: RegionDropRequest,
    ) -> Result<AffectedRows> {
        let data_region_id = utils::to_data_region_id(region_id);
+        let fast_path = req.fast_path;

        // enclose the guard in a block to prevent the guard from polluting the async context
        let (is_physical_region, is_physical_region_busy) = {
@@ -52,7 +53,7 @@ impl MetricEngineInner {

        if is_physical_region {
            // check if there is no logical region relates to this physical region
-            if is_physical_region_busy {
+            if is_physical_region_busy && !fast_path {
                // reject if there is any present logical region
                return Err(PhysicalRegionBusySnafu {
                    region_id: data_region_id,
@@ -60,9 +61,21 @@ impl MetricEngineInner {
                .build());
            }

-            self.drop_physical_region(data_region_id).await
+            return self.drop_physical_region(data_region_id).await;
+        }
+
+        if fast_path {
+            // For the fast path, we don't delete the metadata in the metadata region;
+            // we only remove the logical region from the engine state.
+            //
+            // The drop database procedure will ensure the metadata region and data region are dropped eventually.
+            self.state
+                .write()
+                .unwrap()
+                .remove_logical_region(region_id)?;
+
+            Ok(0)
        } else {
-            // cannot merge these two `if` otherwise the stupid type checker will complain
            let metadata_region_id = self
                .state
                .read()
@@ -87,13 +100,16 @@ impl MetricEngineInner {
        // Since the physical regions are going to be dropped, we don't need to
        // update the contents in metadata region.
        self.mito
-            .handle_request(data_region_id, RegionRequest::Drop(RegionDropRequest {}))
+            .handle_request(
+                data_region_id,
+                RegionRequest::Drop(RegionDropRequest { fast_path: false }),
+            )
            .await
            .with_context(|_| CloseMitoRegionSnafu { region_id })?;
        self.mito
            .handle_request(
                metadata_region_id,
-                RegionRequest::Drop(RegionDropRequest {}),
+                RegionRequest::Drop(RegionDropRequest { fast_path: false }),
            )
            .await
            .with_context(|_| CloseMitoRegionSnafu { region_id })?;
--- a/src/metric-engine/src/engine/options.rs
+++ b/src/metric-engine/src/engine/options.rs
@@ -40,6 +40,7 @@ pub struct PhysicalRegionOptions {
 #[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
 pub enum IndexOptions {
    #[default]
+    None,
    Inverted,
    Skipping {
        granularity: u32,
--- a/src/mito2/Cargo.toml
+++ b/src/mito2/Cargo.toml
@@ -16,7 +16,7 @@ api.workspace = true
 aquamarine.workspace = true
 async-channel = "1.9"
 async-stream.workspace = true
-async-trait = "0.1"
+async-trait.workspace = true
 bytemuck.workspace = true
 bytes.workspace = true
 common-base.workspace = true
--- a/src/mito2/src/access_layer.rs
+++ b/src/mito2/src/access_layer.rs
@@ -146,11 +146,14 @@ impl AccessLayer {
        } else {
            // Write cache is disabled.
            let store = self.object_store.clone();
+            let path_provider = RegionFilePathFactory::new(self.region_dir.clone());
            let indexer_builder = IndexerBuilderImpl {
                op_type: request.op_type,
                metadata: request.metadata.clone(),
                row_group_size: write_opts.row_group_size,
-                puffin_manager: self.puffin_manager_factory.build(store),
+                puffin_manager: self
+                    .puffin_manager_factory
+                    .build(store, path_provider.clone()),
                intermediate_manager: self.intermediate_manager.clone(),
                index_options: request.index_options,
                inverted_index_config: request.inverted_index_config,
@@ -161,9 +164,7 @@ impl AccessLayer {
                self.object_store.clone(),
                request.metadata,
                indexer_builder,
-                RegionFilePathFactory {
-                    region_dir: self.region_dir.clone(),
-                },
+                path_provider,
            )
            .await;
            writer
@@ -248,8 +249,18 @@ pub trait FilePathProvider: Send + Sync {
 /// Path provider that builds paths in local write cache.
 #[derive(Clone)]
 pub(crate) struct WriteCachePathProvider {
-    pub(crate) region_id: RegionId,
-    pub(crate) file_cache: FileCacheRef,
+    region_id: RegionId,
+    file_cache: FileCacheRef,
+}
+
+impl WriteCachePathProvider {
+    /// Creates a new `WriteCachePathProvider` instance.
+    pub fn new(region_id: RegionId, file_cache: FileCacheRef) -> Self {
+        Self {
+            region_id,
+            file_cache,
+        }
+    }
 }

 impl FilePathProvider for WriteCachePathProvider {
@@ -267,7 +278,14 @@ impl FilePathProvider for WriteCachePathProvider {
 /// Path provider that builds paths in region storage path.
 #[derive(Clone, Debug)]
 pub(crate) struct RegionFilePathFactory {
-    pub(crate) region_dir: String,
+    region_dir: String,
+}
+
+impl RegionFilePathFactory {
+    /// Creates a new `RegionFilePathFactory` instance.
+    pub fn new(region_dir: String) -> Self {
+        Self { region_dir }
+    }
 }

 impl FilePathProvider for RegionFilePathFactory {
--- a/src/mito2/src/cache/file_cache.rs
+++ b/src/mito2/src/cache/file_cache.rs
@@ -187,9 +187,12 @@ impl FileCache {
    }

    /// Removes a file from the cache explicitly.
+    /// It always tries to remove the file from the local store because the file may not be
+    /// in the memory index if the upload failed.
    pub(crate) async fn remove(&self, key: IndexKey) {
        let file_path = self.cache_file_path(key);
        self.memory_index.remove(&key).await;
+        // Always delete the file from the local store.
        if let Err(e) = self.local_store.delete(&file_path).await {
            warn!(e; "Failed to delete a cached file {}", file_path);
        }
--- a/src/mito2/src/cache/write_cache.rs
+++ b/src/mito2/src/cache/write_cache.rs
@@ -22,6 +22,7 @@ use common_telemetry::{debug, info};
 use futures::AsyncWriteExt;
 use object_store::ObjectStore;
 use snafu::ResultExt;
+use store_api::storage::RegionId;

 use crate::access_layer::{
    new_fs_cache_store, FilePathProvider, RegionFilePathFactory, SstInfoArray, SstWriteRequest,
@@ -114,15 +115,14 @@ impl WriteCache {
        let region_id = write_request.metadata.region_id;

        let store = self.file_cache.local_store();
-        let path_provider = WriteCachePathProvider {
-            file_cache: self.file_cache.clone(),
-            region_id,
-        };
+        let path_provider = WriteCachePathProvider::new(region_id, self.file_cache.clone());
        let indexer = IndexerBuilderImpl {
            op_type: write_request.op_type,
            metadata: write_request.metadata.clone(),
            row_group_size: write_opts.row_group_size,
-            puffin_manager: self.puffin_manager_factory.build(store),
+            puffin_manager: self
+                .puffin_manager_factory
+                .build(store, path_provider.clone()),
            intermediate_manager: self.intermediate_manager.clone(),
            index_options: write_request.index_options,
            inverted_index_config: write_request.inverted_index_config,
@@ -150,24 +150,41 @@ impl WriteCache {
            return Ok(sst_info);
        }

+        let mut upload_tracker = UploadTracker::new(region_id);
+        let mut err = None;
        let remote_store = &upload_request.remote_store;
        for sst in &sst_info {
            let parquet_key = IndexKey::new(region_id, sst.file_id, FileType::Parquet);
            let parquet_path = upload_request
                .dest_path_provider
                .build_sst_file_path(sst.file_id);
-            self.upload(parquet_key, &parquet_path, remote_store)
-                .await?;
+            if let Err(e) = self.upload(parquet_key, &parquet_path, remote_store).await {
+                err = Some(e);
+                break;
+            }
+            upload_tracker.push_uploaded_file(parquet_path);

            if sst.index_metadata.file_size > 0 {
                let puffin_key = IndexKey::new(region_id, sst.file_id, FileType::Puffin);
-                let puffin_path = &upload_request
+                let puffin_path = upload_request
                    .dest_path_provider
                    .build_index_file_path(sst.file_id);
-                self.upload(puffin_key, puffin_path, remote_store).await?;
+                if let Err(e) = self.upload(puffin_key, &puffin_path, remote_store).await {
+                    err = Some(e);
+                    break;
+                }
+                upload_tracker.push_uploaded_file(puffin_path);
            }
        }

+        if let Some(err) = err {
+            // Cleans files on failure.
+            upload_tracker
+                .clean(&sst_info, &self.file_cache, remote_store)
+                .await;
+            return Err(err);
+        }
+
        Ok(sst_info)
    }

@@ -333,6 +350,61 @@ pub struct SstUploadRequest {
    pub remote_store: ObjectStore,
 }

+/// A struct that tracks uploaded files and cleans them up if an upload fails.
+struct UploadTracker {
+    /// Id of the region to track.
+    region_id: RegionId,
+    /// Paths of files uploaded successfully.
+    files_uploaded: Vec<String>,
+}
+
+impl UploadTracker {
+    /// Creates a new instance of `UploadTracker` for a given region.
+    fn new(region_id: RegionId) -> Self {
+        Self {
+            region_id,
+            files_uploaded: Vec::new(),
+        }
+    }
+
+    /// Adds a file path to the list of successfully uploaded files.
+    fn push_uploaded_file(&mut self, path: String) {
+        self.files_uploaded.push(path);
+    }
+
+    /// Cleans up uploaded files and files in the file cache on a best-effort basis.
+    async fn clean(
+        &self,
+        sst_info: &SstInfoArray,
+        file_cache: &FileCacheRef,
+        remote_store: &ObjectStore,
+    ) {
+        common_telemetry::info!(
+            "Start cleaning files on upload failure, region: {}, num_ssts: {}",
+            self.region_id,
+            sst_info.len()
+        );
+
+        // Removes the parquet (and puffin, if any) files of every SST from the file cache first.
+        for sst in sst_info {
+            let parquet_key = IndexKey::new(self.region_id, sst.file_id, FileType::Parquet);
+            file_cache.remove(parquet_key).await;
+
+            if sst.index_metadata.file_size > 0 {
+                let puffin_key = IndexKey::new(self.region_id, sst.file_id, FileType::Puffin);
+                file_cache.remove(puffin_key).await;
+            }
+        }
+
+        // Deletes the files already uploaded to the remote store; failures are only logged.
+        for file_path in &self.files_uploaded {
+            if let Err(e) = remote_store.delete(file_path).await {
+                common_telemetry::error!(e; "Failed to delete file {}", file_path);
+            }
+        }
+    }
+}
+
 #[cfg(test)]
 mod tests {
    use common_test_util::temp_dir::create_temp_dir;
@@ -355,9 +427,7 @@ mod tests {
        // and now just use local file system to mock.
        let mut env = TestEnv::new();
        let mock_store = env.init_object_store_manager();
-        let path_provider = RegionFilePathFactory {
-            region_dir: "test".to_string(),
-        };
+        let path_provider = RegionFilePathFactory::new("test".to_string());

        let local_dir = create_temp_dir("");
        let local_store = new_fs_store(local_dir.path().to_str().unwrap());
@@ -488,9 +558,7 @@ mod tests {
            ..Default::default()
        };
        let upload_request = SstUploadRequest {
-            dest_path_provider: RegionFilePathFactory {
-                region_dir: data_home.clone(),
-            },
+            dest_path_provider: RegionFilePathFactory::new(data_home.clone()),
            remote_store: mock_store.clone(),
        };

--- a/src/mito2/src/compaction/compactor.rs
+++ b/src/mito2/src/compaction/compactor.rs
@@ -135,6 +135,7 @@ pub async fn open_compaction_region(
            &mito_config.index.aux_path,
            mito_config.index.staging_size.as_bytes(),
            Some(mito_config.index.write_buffer_size.as_bytes() as _),
+            mito_config.index.staging_ttl,
        )
        .await?;
        let intermediate_manager =
--- a/src/mito2/src/config.rs
+++ b/src/mito2/src/config.rs
@@ -299,6 +299,11 @@ pub struct IndexConfig {

    /// The max capacity of the staging directory.
    pub staging_size: ReadableSize,
+    /// The TTL of the staging directory.
+    /// Defaults to 7 days.
+    /// Set it to "0s" to disable the TTL.
+    #[serde(with = "humantime_serde")]
+    pub staging_ttl: Option<Duration>,

    /// Write buffer size for creating the index.
    pub write_buffer_size: ReadableSize,
@@ -316,6 +321,7 @@ impl Default for IndexConfig {
        Self {
            aux_path: String::new(),
            staging_size: ReadableSize::gb(2),
+            staging_ttl: Some(Duration::from_secs(7 * 24 * 60 * 60)),
            write_buffer_size: ReadableSize::mb(8),
            metadata_cache_size: ReadableSize::mb(64),
            content_cache_size: ReadableSize::mb(128),
@@ -352,6 +358,10 @@ impl IndexConfig {
            );
        }

+        if self.staging_ttl.map(|ttl| ttl.is_zero()).unwrap_or(false) {
+            self.staging_ttl = None;
+        }
+
        Ok(())
    }
 }
--- a/src/mito2/src/engine/drop_test.rs
+++ b/src/mito2/src/engine/drop_test.rs
@@ -56,7 +56,10 @@ async fn test_engine_drop_region() {

     // It's okay to drop a region that doesn't exist.
    engine
-        .handle_request(region_id, RegionRequest::Drop(RegionDropRequest {}))
+        .handle_request(
+            region_id,
+            RegionRequest::Drop(RegionDropRequest { fast_path: false }),
+        )
        .await
        .unwrap_err();

@@ -86,7 +89,10 @@ async fn test_engine_drop_region() {

    // drop the created region.
    engine
-        .handle_request(region_id, RegionRequest::Drop(RegionDropRequest {}))
+        .handle_request(
+            region_id,
+            RegionRequest::Drop(RegionDropRequest { fast_path: false }),
+        )
        .await
        .unwrap();
    assert!(!engine.is_region_exists(region_id));
@@ -192,7 +198,10 @@ async fn test_engine_drop_region_for_custom_store() {

    // Drop the custom region.
    engine
-        .handle_request(custom_region_id, RegionRequest::Drop(RegionDropRequest {}))
+        .handle_request(
+            custom_region_id,
+            RegionRequest::Drop(RegionDropRequest { fast_path: false }),
+        )
        .await
        .unwrap();
    assert!(!engine.is_region_exists(custom_region_id));
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
discord9	7f7b974e8a	fix: conn pool leak & placeholder feature so ci can compile	2025-04-10 15:01:07 +08:00
discord9	4875ace0d0	fix: placeholder feature so ci can compile	2025-04-08 14:37:55 +08:00
discord9	a847d96649	fix: time window filter expr use OR	2025-04-07 16:50:17 +08:00
discord9	23a0a54e18	fix: convert timestamp unit too	2025-04-07 16:50:17 +08:00
discord9	78eb8b53f6	fix: quote&more info when time window too many chore: even more warning fix: filter first warn later	2025-04-07 16:50:17 +08:00
discord9	2455f39e8e	fix: subquery&cte time window expr	2025-04-07 16:46:53 +08:00
discord9	7fe0074202	refactor: even finer&limit time window num	2025-04-07 16:46:53 +08:00
discord9	e16bc203d0	feat: basic time window aware	2025-04-07 16:46:53 +08:00
discord9	9a3c26bb0a	metrics: better bucket&longer timeout	2025-04-07 16:46:53 +08:00
discord9	e1ff398c32	fix: timeout	2025-04-07 16:46:53 +08:00
discord9	780e3000de	fix: heartbeat&expire_after unit	2025-04-07 16:46:53 +08:00
discord9	2b5ddf8427	feat: time window in df plan WIP test: found out time window expr chore: pub tests: also unparsed tests: rm dup code feat: frontend client for recording rule fix: bound edgecase WIP WIP feat: rule engine feat: add init options& tmp rerounte to rule fix: dist client get fix: also not handle mirror write in flownode chore: clippy	2025-04-07 16:46:47 +08:00
Weny Xu	904d560175	feat(promql-planner): introduce vector matching binary operation (#5578 ) * feat(promql-planner): support vector matching for binary operation * test: add sqlness tests	2025-02-27 07:39:19 +00:00
Lei, HUANG	765d1277ee	fix(metasrv): clean expired nodes in memory (#5592 ) * fix/frontend-node-state: Refactor NodeInfoKey and Context Handling in Meta Server • Removed unused cluster_id from NodeInfoKey struct. • Updated HeartbeatHandlerGroup to return Context alongside HeartbeatResponse. • Added current_node_info to Context for tracking node information. • Implemented on_node_disconnect in Context to handle node disconnection events, specifically for Frontend roles. • Adjusted register_pusher function to return PusherId directly. • Updated tests to accommodate changes in Context structure. * fix/frontend-node-state: Refactor Heartbeat Handler Context Management Refactored the HeartbeatHandlerGroup::handle method to use a mutable reference for Context instead of passing it by value. This change simplifies the context management by eliminating the need to return the context with the response. Updated the Metasrv implementation to align with this new context handling approach, improving code clarity and reducing unnecessary context cloning. * revert: clean cluster info on disconnect * fix/frontend-node-state: Add Frontend Expiry Listener and Update NodeInfoKey Conversion • Introduced FrontendExpiryListener to manage the expiration of frontend nodes, including its integration with leadership change notifications. • Modified NodeInfoKey conversion to use references, enhancing efficiency and consistency across the codebase. • Updated collect_cluster_info_handler and metasrv to incorporate the new listener and conversion changes. • Added frontend_expiry module to the project structure for better organization and maintainability. * chore: add config for node expiry * add some doc * fix: clippy * fix/frontend-node-state: ### Refactor Node Expiry Handling - Configuration Update: Removed `node_expiry_tick` from `metasrv.example.toml` and `MetasrvOptions` in `metasrv.rs`. 
- Module Renaming: Renamed `frontend_expiry.rs` to `node_expiry_listener.rs` and updated references in `lib.rs`. - Code Refactoring: Replaced `FrontendExpiryListener` with `NodeExpiryListener` in `node_expiry_listener.rs` and `metasrv.rs`, removing the tick interval and adjusting logic to use a fixed 60-second interval for node expiry checks. * fix/frontend-node-state: Improve logging in `node_expiry_listener.rs` - Enhanced warning message to include peer information when an unrecognized node info key is encountered in `node_expiry_listener.rs`. * docs: update config docs * fix/frontend-node-state: Refactor Context Handling in Heartbeat Services - Updated `HeartbeatHandlerGroup` in `handler.rs` to pass `Context` by value instead of by mutable reference, allowing for more flexible context management. - Modified `Metasrv` implementation in `heartbeat.rs` to clone `Context` when passing to `handle` method, ensuring thread safety and consistency in asynchronous operations.	2025-02-27 06:16:36 +00:00
discord9	ccf42a9d97	fix: flow heartbeat retry (#5600 ) * fix: flow heartbeat retry * fix?: not sure if fixed * chore: per review	2025-02-27 03:58:21 +00:00
Weny Xu	71e2fb895f	feat: introduce `prom_round` fn (#5604 ) * feat: introduce `prom_round` fn * test: add sqlness tests	2025-02-27 03:30:15 +00:00
Ruihang Xia	c9671fd669	feat(promql): implement subquery (#5606 ) * feat: initial implement for promql subquery Signed-off-by: Ruihang Xia <waynestxia@gmail.com> * impl and test Signed-off-by: Ruihang Xia <waynestxia@gmail.com> * refactor Signed-off-by: Ruihang Xia <waynestxia@gmail.com> * fix clippy Signed-off-by: Ruihang Xia <waynestxia@gmail.com> --------- Signed-off-by: Ruihang Xia <waynestxia@gmail.com>	2025-02-27 03:28:04 +00:00
Ruihang Xia	b5efc75aab	feat(promql): ignore invalid input in histogram plan (#5607 ) Signed-off-by: Ruihang Xia <waynestxia@gmail.com>	2025-02-27 03:18:20 +00:00
Weny Xu	c1d18d9980	fix(prom): preserve the order of series in `PromQueryResult` (#5601 ) fix(prom): keep the order of tags	2025-02-26 13:40:09 +00:00
Lei, HUANG	5d9faaaf39	fix(metasrv): reject ddl when metasrv is follower (#5599 ) * fix/reject-ddl-in-follower-metasrv: Add leader check and logging for gRPC requests in `procedure.rs` - Implemented leader verification for `query_procedure_state`, `ddl`, and `procedure_details` gRPC requests in `procedure.rs`. - Added logging with `warn` for requests reaching a non-leader node. - Introduced `ResponseHeader` and `Error::is_not_leader()` to handle non-leader responses. * fix/reject-ddl-in-follower-metasrv: Improve leader address handling in `heartbeat.rs` - Refactor leader address retrieval by renaming `leader` to `leader_addr` for clarity. - Update `make_client` function to use a reference to `leader_addr`. - Enhance logging to include the leader address in the success message for creating a heartbeat stream. * fmt * fix/reject-ddl-in-follower-metasrv: Enhance Leader Check in `procedure.rs` - Updated the leader verification logic in `procedure.rs` to return a failed `MigrateRegionResponse` when the server is not the leader. - Added logging to warn when a migrate request is received by a non-leader server.	2025-02-26 08:10:40 +00:00
ZonaHe	538875abee	feat: update dashboard to v0.7.11 (#5597 ) Co-authored-by: sunchanglong <sunchanglong@users.noreply.github.com>	2025-02-26 07:57:59 +00:00
jeremyhi	5ed09c4584	fix: all heartbeat channel need to check leader (#5593 )	2025-02-25 10:45:30 +00:00
Yingwen	3f6a41eac5	fix: update show create table output for fulltext index (#5591 ) * fix: update full index syntax in show create table * test: update fulltext sqlness result	2025-02-25 09:36:27 +00:00
yihong	ff0dcf12c5	perf: close issue 4974 by do not delete columns when drop logical region about 100 times faster (#5561 ) * perf: do not delete columns when drop logical region in drop database Signed-off-by: yihong0618 <zouzou0208@gmail.com> * fix: make ci happy Signed-off-by: yihong0618 <zouzou0208@gmail.com> * fix: address review comments Signed-off-by: yihong0618 <zouzou0208@gmail.com> * fix: address some comments Signed-off-by: yihong0618 <zouzou0208@gmail.com> * fix: drop stupid comments by copilot Signed-off-by: yihong0618 <zouzou0208@gmail.com> * chore: minor refactor * chore: minor refactor * chore: update grpetime-proto --------- Signed-off-by: yihong0618 <zouzou0208@gmail.com> Co-authored-by: WenyXu <wenymedia@gmail.com>	2025-02-25 09:00:49 +00:00
Yingwen	5b1fca825a	fix: remove cached and uploaded files on failure (#5590 )	2025-02-25 08:51:37 +00:00
Ruihang Xia	7bd108e2be	feat: impl `hll_state`, `hll_merge` and `hll_calc` for incremental distinct counting (#5579 ) * basic impl Signed-off-by: Ruihang Xia <waynestxia@gmail.com> * more tests Signed-off-by: Ruihang Xia <waynestxia@gmail.com> * sqlness test Signed-off-by: Ruihang Xia <waynestxia@gmail.com> * fix clippy Signed-off-by: Ruihang Xia <waynestxia@gmail.com> * update with more test and logs Signed-off-by: Ruihang Xia <waynestxia@gmail.com> * impl Signed-off-by: Ruihang Xia <waynestxia@gmail.com> * impl merge fn Signed-off-by: Ruihang Xia <waynestxia@gmail.com> * rename function names Signed-off-by: Ruihang Xia <waynestxia@gmail.com> --------- Signed-off-by: Ruihang Xia <waynestxia@gmail.com>	2025-02-24 19:07:37 +00:00
Weny Xu	286f225e50	fix: correct `inverted_indexed_column_ids` behavior (#5586 ) * fix: correct `inverted_indexed_column_ids` * fix: fix unit tests	2025-02-23 07:17:38 +00:00
Ruihang Xia	4f988b5ba9	feat: remove default inverted index for physical table (#5583 ) * feat: remove default inverted index for physical table Signed-off-by: Ruihang Xia <waynestxia@gmail.com> * update sqlness result Signed-off-by: Ruihang Xia <waynestxia@gmail.com> --------- Signed-off-by: Ruihang Xia <waynestxia@gmail.com>	2025-02-22 06:48:05 +00:00
Ruihang Xia	500d0852eb	fix: avoid run labeler job concurrently (#5584 ) Signed-off-by: Ruihang Xia <waynestxia@gmail.com>	2025-02-22 05:18:26 +00:00
Zhenchi	8d05fb3503	feat: unify puffin name passed to stager (#5564 ) * feat: purge a given puffin file in staging area Signed-off-by: Zhenchi <zhongzc_arch@outlook.com> * polish log Signed-off-by: Zhenchi <zhongzc_arch@outlook.com> * ttl set to 2d Signed-off-by: Zhenchi <zhongzc_arch@outlook.com> * feat: expose staging_ttl to index config * feat: unify puffin name passed to stager Signed-off-by: Zhenchi <zhongzc_arch@outlook.com> * fix test Signed-off-by: Zhenchi <zhongzc_arch@outlook.com> * address comments Signed-off-by: Zhenchi <zhongzc_arch@outlook.com> * fallback to remote index Signed-off-by: Zhenchi <zhongzc_arch@outlook.com> * fix Signed-off-by: Zhenchi <zhongzc_arch@outlook.com> * refactor Signed-off-by: Zhenchi <zhongzc_arch@outlook.com> --------- Signed-off-by: Zhenchi <zhongzc_arch@outlook.com> Co-authored-by: evenyag <realevenyag@gmail.com>	2025-02-21 09:27:03 +00:00
Ruihang Xia	d7b6718be0	feat: run sqlness in parallel (#5499 ) * define server mode Signed-off-by: Ruihang Xia <waynestxia@gmail.com> * bump sqlness Signed-off-by: Ruihang Xia <waynestxia@gmail.com> * all good Signed-off-by: Ruihang Xia <waynestxia@gmail.com> * clean up Signed-off-by: Ruihang Xia <waynestxia@gmail.com> * refactor: Move config generation logic from Env to ServerMode Signed-off-by: Ruihang Xia <waynestxia@gmail.com> * finalize Signed-off-by: Ruihang Xia <waynestxia@gmail.com> * change license header Signed-off-by: Ruihang Xia <waynestxia@gmail.com> * rename variables Signed-off-by: Ruihang Xia <waynestxia@gmail.com> * override parallelism Signed-off-by: Ruihang Xia <waynestxia@gmail.com> * rename more variables Signed-off-by: Ruihang Xia <waynestxia@gmail.com> --------- Signed-off-by: Ruihang Xia <waynestxia@gmail.com>	2025-02-21 07:05:19 +00:00
Ruihang Xia	6f0783e17e	fix: broken link in AUTHOR.md (#5581 )	2025-02-21 07:01:41 +00:00
Ruihang Xia	d69e93b91a	feat: support to generate json output for explain analyze in http api (#5567 ) * impl Signed-off-by: Ruihang Xia <waynestxia@gmail.com> * integration test Signed-off-by: Ruihang Xia <waynestxia@gmail.com> * Update src/servers/src/http/hints.rs Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * refactor: with FORMAT option for explain format * lift some well-known metrics Signed-off-by: Ruihang Xia <waynestxia@gmail.com> --------- Signed-off-by: Ruihang Xia <waynestxia@gmail.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: Ning Sun <sunning@greptime.com>	2025-02-21 05:13:09 +00:00
Ruihang Xia	76083892cd	feat: support UNNEST (#5580 ) * feat: support UNNEST Signed-off-by: Ruihang Xia <waynestxia@gmail.com> * fix clippy and sqlness Signed-off-by: Ruihang Xia <waynestxia@gmail.com> --------- Signed-off-by: Ruihang Xia <waynestxia@gmail.com>	2025-02-21 04:53:56 +00:00
Ruihang Xia	7981c06989	feat: implement uddsketch function to calculate percentile (#5574 ) * basic impl Signed-off-by: Ruihang Xia <waynestxia@gmail.com> * more tests Signed-off-by: Ruihang Xia <waynestxia@gmail.com> * sqlness test Signed-off-by: Ruihang Xia <waynestxia@gmail.com> * fix clippy Signed-off-by: Ruihang Xia <waynestxia@gmail.com> * update with more test and logs Signed-off-by: Ruihang Xia <waynestxia@gmail.com> --------- Signed-off-by: Ruihang Xia <waynestxia@gmail.com>	2025-02-20 18:59:20 +00:00
beryl678	97bb1519f8	docs: revise the author list (#5575 )	2025-02-20 18:04:23 +00:00
Weny Xu	1d8c9c1843	feat: enable gzip for prometheus query handlers and ignore NaN values in prometheus response (#5576 ) * feat: enable gzip for prometheus query handlers and ignore nan values in prometheus response * Apply suggestions from code review Co-authored-by: shuiyisong <113876041+shuiyisong@users.noreply.github.com> --------- Co-authored-by: shuiyisong <113876041+shuiyisong@users.noreply.github.com>	2025-02-20 11:34:32 +00:00
jeremyhi	71007e200c	feat: remap flow route address (#5565 ) * feat: remap fow peers * refactor: not stream * feat: remap flownode addr on FlowRoute and TableFlow * fix: unit test * Update src/meta-srv/src/handler/remap_flow_peer_handler.rs Co-authored-by: Lei, HUANG <6406592+v0y4g3r@users.noreply.github.com> * chore: by comment * Update src/meta-srv/src/handler/remap_flow_peer_handler.rs * Update src/common/meta/src/key/flow/table_flow.rs * Update src/common/meta/src/key/flow/flow_route.rs * chore: remove duplicate field --------- Co-authored-by: Lei, HUANG <6406592+v0y4g3r@users.noreply.github.com>	2025-02-20 08:21:32 +00:00
jeremyhi	a0ff9e751e	feat: flow type on creating procedure (#5572 ) feat: flow type on creating	2025-02-20 08:12:02 +00:00
LFC	f6f617d667	feat: submit node's cpu cores number to metasrv in heartbeat (#5571 ) * feat: submit node's cpu cores number to metasrv in heartbeat * update greptime-proto dep	2025-02-20 03:55:18 +00:00
Ruihang Xia	e8788088a8	feat(log-query): implement the first part of log query expr (#5548 ) * feat(log-query): implement the first part of log query expr Signed-off-by: Ruihang Xia <waynestxia@gmail.com> * fix clippy Signed-off-by: Ruihang Xia <waynestxia@gmail.com> --------- Signed-off-by: Ruihang Xia <waynestxia@gmail.com>	2025-02-19 18:25:41 +00:00
shuiyisong	53b25c04a2	chore: support Loki's structured metadata for ingestion (#5541 ) * chore: support loki's structured metadata * test: update test * chore: revert some code change * chore: address CR comment	2025-02-19 16:44:26 +00:00
dennis zhuang	62a8b8b9dc	feat(promql): supports sort, sort_desc etc. functions (#5542 ) * feat(promql): supports sort, sort_desc etc. functions * chore: fix toml format and tests * chore: update deps Co-authored-by: Weny Xu <wenymedia@gmail.com> * chore: remove fixme * fix: cargo lock * chore: style --------- Co-authored-by: Weny Xu <wenymedia@gmail.com>	2025-02-19 13:13:49 +00:00
Weny Xu	c8bdeaaa6a	fix(promql-planner): update ctx field columns of OR operator (#5556 ) * fix(promql-planner): update ctx field columns of OR operator * test: add sqlness test	2025-02-19 11:18:58 +00:00
Ning Sun	81da18e5df	refactor: use global type alias for pipeline input (#5568 ) * refactor: use global type alias for pipeline input * fmt: reformat	2025-02-19 10:41:33 +00:00
Weny Xu	7c65fddb30	fix(promql-planner): correct AND/UNLESS operator behavior (#5557 ) * fix(promql-planner): keep field column in left input for AND operator * test: add sqlness test * fix: fix unless operator	2025-02-19 09:07:39 +00:00
Zhenchi	421e38c481	feat: allow purging a given puffin file in staging area (#5558 ) * feat: purge a given puffin file in staging area Signed-off-by: Zhenchi <zhongzc_arch@outlook.com> * polish log Signed-off-by: Zhenchi <zhongzc_arch@outlook.com> * ttl set to 2d Signed-off-by: Zhenchi <zhongzc_arch@outlook.com> * feat: expose staging_ttl to index config * fix test Signed-off-by: Zhenchi <zhongzc_arch@outlook.com> * use `invalidate_entries_if` instead of maintaining map Signed-off-by: Zhenchi <zhongzc_arch@outlook.com> * run_pending_tasks after purging Signed-off-by: Zhenchi <zhongzc_arch@outlook.com> --------- Signed-off-by: Zhenchi <zhongzc_arch@outlook.com> Co-authored-by: evenyag <realevenyag@gmail.com>	2025-02-19 08:58:30 +00:00
Weny Xu	aada5c1706	fix(promql-planner): remove le tag in ctx (#5560 ) * fix(promql-planner): remove le tag in ctx * test: add sqlness test * chore: apply suggestions from CR	2025-02-19 03:51:27 +00:00
yihong	aa8f119bbb	chore: format all toml files (#5529 ) fix: format some cargo files Signed-off-by: yihong0618 <zouzou0208@gmail.com>	2025-02-18 12:09:01 +00:00
ZonaHe	19a6d15849	feat: update dashboard to v0.7.10 (#5562 ) Co-authored-by: ZonaHex <ZonaHex@users.noreply.github.com>	2025-02-18 12:06:22 +00:00
liyang	073aaefe65	chore: improve grafana dashboard (#5559 )	2025-02-18 11:36:27 +00:00
Yingwen	77223a0f3e	fix: window sort support alias time index (#5543 ) * fix: use alias expr to check commutativity * chore: debug sort * feat: consider alias in window sort optimizer * test: sqlness test * test: update sqlness result	2025-02-18 10:35:43 +00:00
Ruihang Xia	4ef038d098	fix: correct promql behavior on nonexistent columns (#5547 ) * Revert "fix(promql): ignore filters for non-existent labels (#5519)" This reverts commit `33a2485f54`. * reimplement Signed-off-by: Ruihang Xia <waynestxia@gmail.com> * state safety Signed-off-by: Ruihang Xia <waynestxia@gmail.com> --------- Signed-off-by: Ruihang Xia <waynestxia@gmail.com>	2025-02-17 18:43:50 +00:00
jeremyhi	deb9520970	fix: information_schema.cluster_info be covered by the same id (#5555 ) * fix: information_schema.cluster_info be coverd by the same id * chore: by comment	2025-02-17 11:51:02 +00:00
Yingwen	6bba5e0afa	feat: collect stager metrics (#5553 ) * feat: collect stager metrics * Apply suggestions from code review Co-authored-by: Zhenchi <zhongzc_arch@outlook.com> * Update src/mito2/src/metrics.rs --------- Co-authored-by: Weny Xu <wenymedia@gmail.com> Co-authored-by: Zhenchi <zhongzc_arch@outlook.com>	2025-02-17 07:09:15 +00:00
Ruihang Xia	f359eeb667	feat(log-query): support specifying exclusive/inclusive for between filter (#5546 ) Signed-off-by: Ruihang Xia <waynestxia@gmail.com>	2025-02-17 04:40:47 +00:00
liyang	009dbad581	ci: don't push nightly latest image (#5551 ) * ci: don't push nightly latest image * add push release latest image	2025-02-17 04:34:49 +00:00
liyang	a2047b096c	ci: use s5cmd upload artifacts (#5550 )	2025-02-17 02:57:13 +00:00