chore: Merge branch 'main' into chore/bench-metrics

evenyag
2024-12-19 16:07:43 +08:00
270 changed files with 10347 additions and 4455 deletions

View File

@@ -18,6 +18,8 @@ runs:
--set controller.replicaCount=${{ inputs.controller-replicas }} \
--set controller.resources.requests.cpu=50m \
--set controller.resources.requests.memory=128Mi \
--set controller.resources.limits.cpu=2000m \
--set controller.resources.limits.memory=2Gi \
--set listeners.controller.protocol=PLAINTEXT \
--set listeners.client.protocol=PLAINTEXT \
--create-namespace \

View File

@@ -323,8 +323,6 @@ jobs:
uses: ./.github/actions/setup-kafka-cluster
- name: Setup Etcd cluster
uses: ./.github/actions/setup-etcd-cluster
- name: Setup Postgres cluster
uses: ./.github/actions/setup-postgres-cluster
# Prepares for fuzz tests
- uses: arduino/setup-protoc@v3
with:
@@ -474,8 +472,6 @@ jobs:
uses: ./.github/actions/setup-kafka-cluster
- name: Setup Etcd cluster
uses: ./.github/actions/setup-etcd-cluster
- name: Setup Postgres cluster
uses: ./.github/actions/setup-postgres-cluster
# Prepares for fuzz tests
- uses: arduino/setup-protoc@v3
with:

View File

@@ -12,7 +12,7 @@ on:
linux_amd64_runner:
type: choice
description: The runner used to build linux-amd64 artifacts
default: ec2-c6i.2xlarge-amd64
default: ec2-c6i.4xlarge-amd64
options:
- ubuntu-20.04
- ubuntu-20.04-8-cores
@@ -27,7 +27,7 @@ on:
linux_arm64_runner:
type: choice
description: The runner used to build linux-arm64 artifacts
default: ec2-c6g.2xlarge-arm64
default: ec2-c6g.4xlarge-arm64
options:
- ec2-c6g.xlarge-arm64 # 4C8G
- ec2-c6g.2xlarge-arm64 # 8C16G

View File

@@ -114,6 +114,17 @@ jobs:
GT_S3_REGION: ${{ vars.AWS_CI_TEST_BUCKET_REGION }}
UNITTEST_LOG_DIR: "__unittest_logs"
cleanbuild-linux-nix:
runs-on: ubuntu-latest-8-cores
timeout-minutes: 60
needs: [coverage, fmt, clippy, check]
steps:
- uses: actions/checkout@v4
- uses: cachix/install-nix-action@v27
with:
nix_path: nixpkgs=channel:nixos-unstable
- run: nix-shell --pure --run "cargo build"
check-status:
name: Check status
needs: [sqlness-test, sqlness-windows, test-on-windows]

6
.gitignore vendored
View File

@@ -47,6 +47,10 @@ benchmarks/data
venv/
# Fuzz tests
tests-fuzz/artifacts/
tests-fuzz/corpus/
# Nix
.direnv
.envrc

270
Cargo.lock generated
View File

@@ -222,12 +222,6 @@ dependencies = [
"num-traits",
]
[[package]]
name = "approx_eq"
version = "0.1.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b3f9eb837c6a783fbf002e3e5cc7925a3aa6893d6d42f9169517528983777590"
[[package]]
name = "aquamarine"
version = "0.3.3"
@@ -872,18 +866,6 @@ dependencies = [
"rand",
]
[[package]]
name = "backon"
version = "0.4.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d67782c3f868daa71d3533538e98a8e13713231969def7536e8039606fc46bf0"
dependencies = [
"fastrand",
"futures-core",
"pin-project",
"tokio",
]
[[package]]
name = "backon"
version = "1.2.0"
@@ -1310,7 +1292,6 @@ dependencies = [
"common-meta",
"moka",
"snafu 0.8.5",
"substrait 0.12.0",
]
[[package]]
@@ -1349,7 +1330,6 @@ dependencies = [
"catalog",
"chrono",
"common-catalog",
"common-config",
"common-error",
"common-macro",
"common-meta",
@@ -1358,7 +1338,6 @@ dependencies = [
"common-recordbatch",
"common-runtime",
"common-telemetry",
"common-test-util",
"common-time",
"common-version",
"dashmap",
@@ -1369,7 +1348,6 @@ dependencies = [
"humantime",
"itertools 0.10.5",
"lazy_static",
"log-store",
"meta-client",
"moka",
"object-store",
@@ -1693,7 +1671,6 @@ dependencies = [
"common-grpc",
"common-macro",
"common-meta",
"common-options",
"common-procedure",
"common-query",
"common-recordbatch",
@@ -1722,7 +1699,6 @@ dependencies = [
"store-api",
"substrait 0.12.0",
"table",
"temp-env",
"tempfile",
"tokio",
"tracing-appender",
@@ -1746,8 +1722,6 @@ dependencies = [
"common-query",
"common-recordbatch",
"common-telemetry",
"datanode",
"derive-new 0.5.9",
"enum_dispatch",
"futures-util",
"lazy_static",
@@ -1918,6 +1892,7 @@ dependencies = [
"futures",
"paste",
"pin-project",
"rand",
"serde",
"snafu 0.8.5",
"tokio",
@@ -1928,13 +1903,6 @@ dependencies = [
[[package]]
name = "common-catalog"
version = "0.12.0"
dependencies = [
"chrono",
"common-error",
"common-macro",
"snafu 0.8.5",
"tokio",
]
[[package]]
name = "common-config"
@@ -1978,7 +1946,6 @@ dependencies = [
"datafusion",
"datatypes",
"derive_builder 0.12.0",
"dotenv",
"futures",
"lazy_static",
"object-store",
@@ -2022,15 +1989,10 @@ dependencies = [
name = "common-frontend"
version = "0.12.0"
dependencies = [
"api",
"async-trait",
"common-base",
"common-error",
"common-macro",
"common-query",
"session",
"snafu 0.8.5",
"sql",
]
[[package]]
@@ -2064,7 +2026,6 @@ dependencies = [
"num-traits",
"once_cell",
"paste",
"ron",
"s2",
"serde",
"serde_json",
@@ -2255,7 +2216,7 @@ version = "0.12.0"
dependencies = [
"async-stream",
"async-trait",
"backon 1.2.0",
"backon",
"common-base",
"common-error",
"common-macro",
@@ -2353,8 +2314,6 @@ dependencies = [
"snafu 0.8.5",
"tempfile",
"tokio",
"tokio-metrics",
"tokio-metrics-collector",
"tokio-test",
"tokio-util",
]
@@ -2834,16 +2793,6 @@ dependencies = [
"memchr",
]
[[package]]
name = "ctor"
version = "0.1.26"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6d2301688392eb071b0bf1a37be05c469d3cc4dbbd95df672fe28ab021e6a096"
dependencies = [
"quote",
"syn 1.0.109",
]
[[package]]
name = "darling"
version = "0.14.4"
@@ -3386,17 +3335,6 @@ dependencies = [
"syn 1.0.109",
]
[[package]]
name = "derive-new"
version = "0.5.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3418329ca0ad70234b9735dc4ceed10af4df60eff9c8e7b06cb5e520d92c3535"
dependencies = [
"proc-macro2",
"quote",
"syn 1.0.109",
]
[[package]]
name = "derive-new"
version = "0.7.0"
@@ -3885,6 +3823,18 @@ version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "95765f67b4b18863968b4a1bd5bb576f732b29a4a28c7cd84c09fa3e2875f33c"
[[package]]
name = "fastbloom"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b679f25009b51b71506296f95fb6362ba7d0151172fa7373a8d1611b8bc5d10f"
dependencies = [
"getrandom",
"rand",
"siphasher 1.0.1",
"wide",
]
[[package]]
name = "fastdivide"
version = "0.4.1"
@@ -3919,7 +3869,6 @@ dependencies = [
"common-error",
"common-macro",
"common-procedure",
"common-procedure-test",
"common-query",
"common-recordbatch",
"common-telemetry",
@@ -4067,7 +4016,6 @@ dependencies = [
"itertools 0.10.5",
"lazy_static",
"meta-client",
"minstant",
"nom",
"num-traits",
"operator",
@@ -4114,15 +4062,6 @@ dependencies = [
"percent-encoding",
]
[[package]]
name = "format_num"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "14ac05eb8d2eb4ed1eeff847911deae077b0b53332465de9d6a26b0ea9961bc8"
dependencies = [
"regex",
]
[[package]]
name = "fragile"
version = "2.0.0"
@@ -4145,7 +4084,6 @@ dependencies = [
"common-config",
"common-datasource",
"common-error",
"common-frontend",
"common-function",
"common-grpc",
"common-macro",
@@ -4167,7 +4105,6 @@ dependencies = [
"lazy_static",
"log-store",
"meta-client",
"meta-srv",
"opentelemetry-proto 0.5.0",
"operator",
"partition",
@@ -5277,6 +5214,7 @@ dependencies = [
"common-runtime",
"common-telemetry",
"common-test-util",
"fastbloom",
"fst",
"futures",
"greptime-proto",
@@ -5287,6 +5225,7 @@ dependencies = [
"regex",
"regex-automata 0.4.8",
"serde",
"serde_json",
"snafu 0.8.5",
"tantivy",
"tantivy-jieba",
@@ -6076,6 +6015,18 @@ version = "0.4.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89"
[[package]]
name = "local-ip-address"
version = "0.6.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3669cf5561f8d27e8fc84cc15e58350e70f557d4d65f70e3154e54cd2f8e1782"
dependencies = [
"libc",
"neli",
"thiserror 1.0.64",
"windows-sys 0.59.0",
]
[[package]]
name = "lock_api"
version = "0.4.12"
@@ -6600,16 +6551,6 @@ dependencies = [
"adler2",
]
[[package]]
name = "minstant"
version = "0.1.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1fb9b5c752f145ac5046bccc3c4f62892e3c950c1d1eab80c5949cd68a2078db"
dependencies = [
"ctor",
"web-time 1.1.0",
]
[[package]]
name = "mio"
version = "0.8.11"
@@ -6643,6 +6584,7 @@ dependencies = [
"async-channel 1.9.0",
"async-stream",
"async-trait",
"bytemuck",
"bytes",
"common-base",
"common-config",
@@ -6652,7 +6594,6 @@ dependencies = [
"common-function",
"common-macro",
"common-meta",
"common-procedure-test",
"common-query",
"common-recordbatch",
"common-runtime",
@@ -7052,6 +6993,31 @@ version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "27b02d87554356db9e9a873add8782d4ea6e3e58ea071a9adb9a2e8ddb884a8b"
[[package]]
name = "neli"
version = "0.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1100229e06604150b3becd61a4965d5c70f3be1759544ea7274166f4be41ef43"
dependencies = [
"byteorder",
"libc",
"log",
"neli-proc-macros",
]
[[package]]
name = "neli-proc-macros"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c168194d373b1e134786274020dae7fc5513d565ea2ebb9bc9ff17ffb69106d4"
dependencies = [
"either",
"proc-macro2",
"quote",
"serde",
"syn 1.0.109",
]
[[package]]
name = "new_debug_unreachable"
version = "1.0.6"
@@ -7408,13 +7374,13 @@ checksum = "b410bbe7e14ab526a0e86877eb47c6996a2bd7746f027ba551028c925390e4e9"
[[package]]
name = "opendal"
version = "0.49.2"
version = "0.50.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9b04d09b9822c2f75a1d2fc513a2c1279c70e91e7407936fffdf6a6976ec530a"
checksum = "cb28bb6c64e116ceaf8dd4e87099d3cfea4a58e85e62b104fef74c91afba0f44"
dependencies = [
"anyhow",
"async-trait",
"backon 0.4.4",
"backon",
"base64 0.22.1",
"bytes",
"chrono",
@@ -7427,6 +7393,7 @@ dependencies = [
"md-5",
"once_cell",
"percent-encoding",
"prometheus",
"quick-xml 0.36.2",
"reqsign",
"reqwest",
@@ -8089,7 +8056,7 @@ dependencies = [
"async-trait",
"bytes",
"chrono",
"derive-new 0.7.0",
"derive-new",
"futures",
"hex",
"lazy-regex",
@@ -8139,7 +8106,7 @@ version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096"
dependencies = [
"siphasher",
"siphasher 0.3.11",
]
[[package]]
@@ -8148,7 +8115,7 @@ version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "90fcb95eef784c2ac79119d1dd819e162b5da872ce6f3c3abe1e8ca1c082f72b"
dependencies = [
"siphasher",
"siphasher 0.3.11",
]
[[package]]
@@ -8229,7 +8196,6 @@ dependencies = [
"query",
"rayon",
"regex",
"ron",
"serde",
"serde_json",
"session",
@@ -8641,10 +8607,7 @@ dependencies = [
"greptime-proto",
"lazy_static",
"prometheus",
"promql-parser",
"prost 0.12.6",
"query",
"session",
"snafu 0.8.5",
"tokio",
]
@@ -8883,6 +8846,7 @@ dependencies = [
"lz4_flex 0.11.3",
"moka",
"pin-project",
"prometheus",
"serde",
"serde_json",
"sha2",
@@ -8991,7 +8955,6 @@ version = "0.12.0"
dependencies = [
"ahash 0.8.11",
"api",
"approx_eq",
"arc-swap",
"arrow",
"arrow-schema",
@@ -9023,7 +8986,6 @@ dependencies = [
"datafusion-sql",
"datatypes",
"fastrand",
"format_num",
"futures",
"futures-util",
"greptime-proto",
@@ -9051,9 +9013,7 @@ dependencies = [
"sql",
"sqlparser 0.45.0 (git+https://github.com/GreptimeTeam/sqlparser-rs.git?rev=54a267ac89c09b11c0c88934690530807185d3e7)",
"statrs",
"stats-cli",
"store-api",
"streaming-stats",
"substrait 0.12.0",
"table",
"tokio",
@@ -9416,9 +9376,9 @@ dependencies = [
[[package]]
name = "reqsign"
version = "0.16.0"
version = "0.16.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "03dd4ba7c3901dd43e6b8c7446a760d45bc1ea4301002e1a6fa48f97c3a796fa"
checksum = "eb0075a66c8bfbf4cc8b70dca166e722e1f55a3ea9250ecbb85f4d92a5f64149"
dependencies = [
"anyhow",
"async-trait",
@@ -9447,9 +9407,9 @@ dependencies = [
[[package]]
name = "reqwest"
version = "0.12.8"
version = "0.12.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f713147fbe92361e52392c73b8c9e48c04c6625bce969ef54dc901e58e042a7b"
checksum = "a77c62af46e79de0a562e1a9849205ffcb7fc1238876e9bd743357570e04046f"
dependencies = [
"base64 0.22.1",
"bytes",
@@ -10086,7 +10046,7 @@ dependencies = [
"once_cell",
"radium",
"rand",
"siphasher",
"siphasher 0.3.11",
"unic-ucd-category",
"volatile",
"widestring",
@@ -10544,7 +10504,6 @@ dependencies = [
"datatypes",
"futures",
"lazy_static",
"log-store",
"once_cell",
"operator",
"paste",
@@ -10567,7 +10526,6 @@ dependencies = [
"sql",
"table",
"tokio",
"tokio-test",
]
[[package]]
@@ -10909,7 +10867,6 @@ dependencies = [
"tokio-postgres-rustls",
"tokio-rustls 0.26.0",
"tokio-stream",
"tokio-test",
"tokio-util",
"tonic 0.11.0",
"tonic-reflection",
@@ -11100,6 +11057,12 @@ version = "0.3.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d"
[[package]]
name = "siphasher"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d"
[[package]]
name = "sketches-ddsketch"
version = "0.2.2"
@@ -11295,6 +11258,7 @@ dependencies = [
"jsonb",
"lazy_static",
"regex",
"serde",
"serde_json",
"snafu 0.8.5",
"sqlparser 0.45.0 (git+https://github.com/GreptimeTeam/sqlparser-rs.git?rev=54a267ac89c09b11c0c88934690530807185d3e7)",
@@ -11343,14 +11307,21 @@ dependencies = [
"common-recordbatch",
"common-time",
"datatypes",
"flate2",
"hex",
"local-ip-address",
"mysql",
"reqwest",
"serde",
"serde_json",
"sha2",
"sqlness",
"tar",
"tempfile",
"tinytemplate",
"tokio",
"tokio-postgres",
"tokio-stream",
]
[[package]]
@@ -11371,6 +11342,7 @@ dependencies = [
"lazy_static",
"log",
"regex",
"serde",
"sqlparser 0.45.0 (registry+https://github.com/rust-lang/crates.io-index)",
"sqlparser_derive 0.2.2 (git+https://github.com/GreptimeTeam/sqlparser-rs.git?rev=54a267ac89c09b11c0c88934690530807185d3e7)",
]
@@ -11541,16 +11513,6 @@ dependencies = [
"rand",
]
[[package]]
name = "stats-cli"
version = "3.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c8786c4fc8a91bc4fcd90aed33413f79e4dc9811f24ba14d1d59adf57cf1c871"
dependencies = [
"clap 2.34.0",
"num-traits",
]
[[package]]
name = "store-api"
version = "0.12.0"
@@ -11592,15 +11554,6 @@ version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9091b6114800a5f2141aee1d1b9d6ca3592ac062dc5decb3764ec5895a47b4eb"
[[package]]
name = "streaming-stats"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b0d670ce4e348a2081843569e0f79b21c99c91bb9028b3b3ecb0f050306de547"
dependencies = [
"num-traits",
]
[[package]]
name = "strfmt"
version = "0.2.4"
@@ -12124,6 +12077,17 @@ version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369"
[[package]]
name = "tar"
version = "0.4.43"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c65998313f8e17d0d553d28f91a0df93e4dbbbf770279c7bc21ca0f09ea1a1f6"
dependencies = [
"filetime",
"libc",
"xattr",
]
[[package]]
name = "target-lexicon"
version = "0.12.16"
@@ -12194,6 +12158,7 @@ dependencies = [
"arbitrary",
"async-trait",
"chrono",
"common-base",
"common-error",
"common-macro",
"common-query",
@@ -12557,30 +12522,6 @@ dependencies = [
"syn 2.0.90",
]
[[package]]
name = "tokio-metrics"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eace09241d62c98b7eeb1107d4c5c64ca3bd7da92e8c218c153ab3a78f9be112"
dependencies = [
"futures-util",
"pin-project-lite",
"tokio",
"tokio-stream",
]
[[package]]
name = "tokio-metrics-collector"
version = "0.2.1"
source = "git+https://github.com/MichaelScofield/tokio-metrics-collector.git?rev=89d692d5753d28564a7aac73c6ac5aba22243ba0#89d692d5753d28564a7aac73c6ac5aba22243ba0"
dependencies = [
"lazy_static",
"parking_lot 0.12.3",
"prometheus",
"tokio",
"tokio-metrics",
]
[[package]]
name = "tokio-postgres"
version = "0.7.12"
@@ -13007,7 +12948,7 @@ dependencies = [
"tracing-core",
"tracing-log 0.2.0",
"tracing-subscriber",
"web-time 0.2.4",
"web-time",
]
[[package]]
@@ -13778,16 +13719,6 @@ dependencies = [
"wasm-bindgen",
]
[[package]]
name = "web-time"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb"
dependencies = [
"js-sys",
"wasm-bindgen",
]
[[package]]
name = "webbrowser"
version = "0.8.15"
@@ -14282,6 +14213,17 @@ dependencies = [
"zeroize",
]
[[package]]
name = "xattr"
version = "1.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8da84f1a25939b27f6820d92aed108f83ff920fdf11a7b19366c27c4cda81d4f"
dependencies = [
"libc",
"linux-raw-sys",
"rustix",
]
[[package]]
name = "xml-rs"
version = "0.8.22"

View File

@@ -180,6 +180,7 @@ sysinfo = "0.30"
# on branch v0.44.x
sqlparser = { git = "https://github.com/GreptimeTeam/sqlparser-rs.git", rev = "54a267ac89c09b11c0c88934690530807185d3e7", features = [
"visitor",
"serde",
] }
strum = { version = "0.25", features = ["derive"] }
tempfile = "3"

View File

@@ -150,6 +150,7 @@
| `region_engine.mito.inverted_index.intermediate_path` | String | `""` | Deprecated, use `region_engine.mito.index.aux_path` instead. |
| `region_engine.mito.inverted_index.metadata_cache_size` | String | `64MiB` | Cache size for inverted index metadata. |
| `region_engine.mito.inverted_index.content_cache_size` | String | `128MiB` | Cache size for inverted index content. |
| `region_engine.mito.inverted_index.content_cache_page_size` | String | `8MiB` | Page size for inverted index content cache. |
| `region_engine.mito.fulltext_index` | -- | -- | The options for full-text index in Mito engine. |
| `region_engine.mito.fulltext_index.create_on_flush` | String | `auto` | Whether to create the index on flush.<br/>- `auto`: automatically (default)<br/>- `disable`: never |
| `region_engine.mito.fulltext_index.create_on_compaction` | String | `auto` | Whether to create the index on compaction.<br/>- `auto`: automatically (default)<br/>- `disable`: never |
@@ -475,6 +476,9 @@
| `region_engine.mito.inverted_index.apply_on_query` | String | `auto` | Whether to apply the index on query<br/>- `auto`: automatically (default)<br/>- `disable`: never |
| `region_engine.mito.inverted_index.mem_threshold_on_create` | String | `auto` | Memory threshold for performing an external sort during index creation.<br/>- `auto`: automatically determine the threshold based on the system memory size (default)<br/>- `unlimited`: no memory limit<br/>- `[size]` e.g. `64MB`: fixed memory threshold |
| `region_engine.mito.inverted_index.intermediate_path` | String | `""` | Deprecated, use `region_engine.mito.index.aux_path` instead. |
| `region_engine.mito.inverted_index.metadata_cache_size` | String | `64MiB` | Cache size for inverted index metadata. |
| `region_engine.mito.inverted_index.content_cache_size` | String | `128MiB` | Cache size for inverted index content. |
| `region_engine.mito.inverted_index.content_cache_page_size` | String | `8MiB` | Page size for inverted index content cache. |
| `region_engine.mito.fulltext_index` | -- | -- | The options for full-text index in Mito engine. |
| `region_engine.mito.fulltext_index.create_on_flush` | String | `auto` | Whether to create the index on flush.<br/>- `auto`: automatically (default)<br/>- `disable`: never |
| `region_engine.mito.fulltext_index.create_on_compaction` | String | `auto` | Whether to create the index on compaction.<br/>- `auto`: automatically (default)<br/>- `disable`: never |

View File

@@ -543,6 +543,15 @@ mem_threshold_on_create = "auto"
## Deprecated, use `region_engine.mito.index.aux_path` instead.
intermediate_path = ""
## Cache size for inverted index metadata.
metadata_cache_size = "64MiB"
## Cache size for inverted index content.
content_cache_size = "128MiB"
## Page size for inverted index content cache.
content_cache_page_size = "8MiB"
## The options for full-text index in Mito engine.
[region_engine.mito.fulltext_index]

View File

@@ -588,6 +588,9 @@ metadata_cache_size = "64MiB"
## Cache size for inverted index content.
content_cache_size = "128MiB"
## Page size for inverted index content cache.
content_cache_page_size = "8MiB"
## The options for full-text index in Mito engine.
[region_engine.mito.fulltext_index]

File diff suppressed because it is too large

View File

@@ -1,2 +1,3 @@
[toolchain]
channel = "nightly-2024-10-19"
components = ["rust-analyzer"]

View File

@@ -58,8 +58,10 @@ def main():
if not check_snafu_in_files(branch_name, other_rust_files)
]
for name in unused_snafu:
print(name)
if unused_snafu:
print("Unused error variants:")
for name in unused_snafu:
print(name)
if unused_snafu:
raise SystemExit(1)

27
shell.nix Normal file
View File

@@ -0,0 +1,27 @@
let
nixpkgs = fetchTarball "https://github.com/NixOS/nixpkgs/tarball/nixos-unstable";
fenix = import (fetchTarball "https://github.com/nix-community/fenix/archive/main.tar.gz") {};
pkgs = import nixpkgs { config = {}; overlays = []; };
in
pkgs.mkShell rec {
nativeBuildInputs = with pkgs; [
pkg-config
git
clang
gcc
protobuf
mold
(fenix.fromToolchainFile {
dir = ./.;
})
cargo-nextest
taplo
];
buildInputs = with pkgs; [
libgit2
];
LD_LIBRARY_PATH = pkgs.lib.makeLibraryPath buildInputs;
}

View File

@@ -16,7 +16,7 @@ use std::collections::HashMap;
use datatypes::schema::{
ColumnDefaultConstraint, ColumnSchema, FulltextAnalyzer, FulltextOptions, COMMENT_KEY,
FULLTEXT_KEY, INVERTED_INDEX_KEY,
FULLTEXT_KEY, INVERTED_INDEX_KEY, SKIPPING_INDEX_KEY,
};
use greptime_proto::v1::Analyzer;
use snafu::ResultExt;
@@ -29,6 +29,8 @@ use crate::v1::{ColumnDef, ColumnOptions, SemanticType};
const FULLTEXT_GRPC_KEY: &str = "fulltext";
/// Key used to store inverted index options in gRPC column options.
const INVERTED_INDEX_GRPC_KEY: &str = "inverted_index";
/// Key used to store skip index options in gRPC column options.
const SKIPPING_INDEX_GRPC_KEY: &str = "skipping_index";
/// Tries to construct a `ColumnSchema` from the given `ColumnDef`.
pub fn try_as_column_schema(column_def: &ColumnDef) -> Result<ColumnSchema> {
@@ -60,6 +62,9 @@ pub fn try_as_column_schema(column_def: &ColumnDef) -> Result<ColumnSchema> {
if let Some(inverted_index) = options.options.get(INVERTED_INDEX_GRPC_KEY) {
metadata.insert(INVERTED_INDEX_KEY.to_string(), inverted_index.clone());
}
if let Some(skipping_index) = options.options.get(SKIPPING_INDEX_GRPC_KEY) {
metadata.insert(SKIPPING_INDEX_KEY.to_string(), skipping_index.clone());
}
}
ColumnSchema::new(&column_def.name, data_type.into(), column_def.is_nullable)
@@ -84,6 +89,11 @@ pub fn options_from_column_schema(column_schema: &ColumnSchema) -> Option<Column
.options
.insert(INVERTED_INDEX_GRPC_KEY.to_string(), inverted_index.clone());
}
if let Some(skipping_index) = column_schema.metadata().get(SKIPPING_INDEX_KEY) {
options
.options
.insert(SKIPPING_INDEX_GRPC_KEY.to_string(), skipping_index.clone());
}
(!options.options.is_empty()).then_some(options)
}

View File

@@ -11,4 +11,3 @@ common-macro.workspace = true
common-meta.workspace = true
moka.workspace = true
snafu.workspace = true
substrait.workspace = true

View File

@@ -18,7 +18,6 @@ async-stream.workspace = true
async-trait = "0.1"
bytes.workspace = true
common-catalog.workspace = true
common-config.workspace = true
common-error.workspace = true
common-macro.workspace = true
common-meta.workspace = true
@@ -58,7 +57,5 @@ catalog = { workspace = true, features = ["testing"] }
chrono.workspace = true
common-meta = { workspace = true, features = ["testing"] }
common-query = { workspace = true, features = ["testing"] }
common-test-util.workspace = true
log-store.workspace = true
object-store.workspace = true
tokio.workspace = true

View File

@@ -54,6 +54,10 @@ const INIT_CAPACITY: usize = 42;
pub(crate) const PRI_CONSTRAINT_NAME: &str = "PRIMARY";
/// Time index constraint name
pub(crate) const TIME_INDEX_CONSTRAINT_NAME: &str = "TIME INDEX";
/// Inverted index constraint name
pub(crate) const INVERTED_INDEX_CONSTRAINT_NAME: &str = "INVERTED INDEX";
/// Fulltext index constraint name
pub(crate) const FULLTEXT_INDEX_CONSTRAINT_NAME: &str = "FULLTEXT INDEX";
/// The virtual table implementation for `information_schema.KEY_COLUMN_USAGE`.
pub(super) struct InformationSchemaKeyColumnUsage {
@@ -216,14 +220,13 @@ impl InformationSchemaKeyColumnUsageBuilder {
let mut stream = catalog_manager.tables(&catalog_name, &schema_name, None);
while let Some(table) = stream.try_next().await? {
let mut primary_constraints = vec![];
let table_info = table.table_info();
let table_name = &table_info.name;
let keys = &table_info.meta.primary_key_indices;
let schema = table.schema();
for (idx, column) in schema.column_schemas().iter().enumerate() {
let mut constraints = vec![];
if column.is_time_index() {
self.add_key_column_usage(
&predicates,
@@ -236,30 +239,31 @@ impl InformationSchemaKeyColumnUsageBuilder {
1, //always 1 for time index
);
}
if keys.contains(&idx) {
primary_constraints.push((
catalog_name.clone(),
schema_name.clone(),
table_name.to_string(),
column.name.clone(),
));
}
// TODO(dimbtp): foreign key constraint not supported yet
}
if keys.contains(&idx) {
constraints.push(PRI_CONSTRAINT_NAME);
}
if column.is_inverted_indexed() {
constraints.push(INVERTED_INDEX_CONSTRAINT_NAME);
}
for (i, (catalog_name, schema_name, table_name, column_name)) in
primary_constraints.into_iter().enumerate()
{
self.add_key_column_usage(
&predicates,
&schema_name,
PRI_CONSTRAINT_NAME,
&catalog_name,
&schema_name,
&table_name,
&column_name,
i as u32 + 1,
);
if column.has_fulltext_index_key() {
constraints.push(FULLTEXT_INDEX_CONSTRAINT_NAME);
}
if !constraints.is_empty() {
let aggregated_constraints = constraints.join(", ");
self.add_key_column_usage(
&predicates,
&schema_name,
&aggregated_constraints,
&catalog_name,
&schema_name,
table_name,
&column.name,
idx as u32 + 1,
);
}
}
}
}

View File

@@ -23,7 +23,6 @@ common-error.workspace = true
common-grpc.workspace = true
common-macro.workspace = true
common-meta.workspace = true
common-options.workspace = true
common-procedure.workspace = true
common-query.workspace = true
common-recordbatch.workspace = true
@@ -61,5 +60,4 @@ client = { workspace = true, features = ["testing"] }
common-test-util.workspace = true
common-version.workspace = true
serde.workspace = true
temp-env = "0.3"
tempfile.workspace = true

View File

@@ -42,8 +42,6 @@ tonic.workspace = true
[dev-dependencies]
common-grpc-expr.workspace = true
datanode.workspace = true
derive-new = "0.5"
tracing = "0.1"
[dev-dependencies.substrait_proto]

View File

@@ -59,10 +59,6 @@ impl Instance {
}
}
pub fn datanode_mut(&mut self) -> &mut Datanode {
&mut self.datanode
}
pub fn datanode(&self) -> &Datanode {
&self.datanode
}

View File

@@ -63,10 +63,6 @@ impl Instance {
}
}
pub fn flownode_mut(&mut self) -> &mut FlownodeInstance {
&mut self.flownode
}
pub fn flownode(&self) -> &FlownodeInstance {
&self.flownode
}

View File

@@ -17,6 +17,7 @@ common-macro.workspace = true
futures.workspace = true
paste = "1.0"
pin-project.workspace = true
rand.workspace = true
serde = { version = "1.0", features = ["derive"] }
snafu.workspace = true
tokio.workspace = true

View File

@@ -36,6 +36,11 @@ pub struct Metadata {
/// `RangeReader` reads a range of bytes from a source.
#[async_trait]
pub trait RangeReader: Send + Unpin {
/// Sets the file size hint for the reader.
///
/// It's used to optimize the reading process by reducing the number of remote requests.
fn with_file_size_hint(&mut self, file_size_hint: u64);
/// Returns the metadata of the source.
async fn metadata(&mut self) -> io::Result<Metadata>;
@@ -70,6 +75,10 @@ pub trait RangeReader: Send + Unpin {
#[async_trait]
impl<R: ?Sized + RangeReader> RangeReader for &mut R {
fn with_file_size_hint(&mut self, file_size_hint: u64) {
(*self).with_file_size_hint(file_size_hint)
}
async fn metadata(&mut self) -> io::Result<Metadata> {
(*self).metadata().await
}
@@ -186,15 +195,17 @@ impl<R: RangeReader + 'static> AsyncRead for AsyncReadAdapter<R> {
#[async_trait]
impl RangeReader for Vec<u8> {
fn with_file_size_hint(&mut self, _file_size_hint: u64) {
// do nothing
}
async fn metadata(&mut self) -> io::Result<Metadata> {
Ok(Metadata {
content_length: self.len() as u64,
})
}
async fn read(&mut self, mut range: Range<u64>) -> io::Result<Bytes> {
range.end = range.end.min(self.len() as u64);
async fn read(&mut self, range: Range<u64>) -> io::Result<Bytes> {
let bytes = Bytes::copy_from_slice(&self[range.start as usize..range.end as usize]);
Ok(bytes)
}
@@ -222,6 +233,10 @@ impl FileReader {
#[async_trait]
impl RangeReader for FileReader {
fn with_file_size_hint(&mut self, _file_size_hint: u64) {
// do nothing
}
async fn metadata(&mut self) -> io::Result<Metadata> {
Ok(Metadata {
content_length: self.content_length,
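
For reference, a minimal sketch of how a caller might use the new size hint (hedged: the `common_base::range_read` import path and the helper function are assumptions, not part of this change):

```rust
use bytes::Bytes;
use common_base::range_read::{Metadata, RangeReader}; // import path assumed

// Passing a known object size lets implementations skip an extra remote stat call.
async fn read_all<R: RangeReader>(mut reader: R, known_size: u64) -> std::io::Result<Bytes> {
    reader.with_file_size_hint(known_size);
    let meta: Metadata = reader.metadata().await?;
    reader.read(0..meta.content_length).await
}
```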

View File

@@ -19,7 +19,7 @@ pub const GIB: u64 = MIB * BINARY_DATA_MAGNITUDE;
pub const TIB: u64 = GIB * BINARY_DATA_MAGNITUDE;
pub const PIB: u64 = TIB * BINARY_DATA_MAGNITUDE;
#[derive(Clone, Copy, PartialEq, Eq, Ord, PartialOrd)]
#[derive(Clone, Copy, PartialEq, Eq, Ord, PartialOrd, Default)]
pub struct ReadableSize(pub u64);
impl ReadableSize {
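
The only change here is the added `Default` derive; a small sketch of what it enables (the import path and the config struct are assumptions):

```rust
use common_base::readable_size::ReadableSize; // import path assumed

// Structs embedding ReadableSize can now derive Default instead of writing a manual impl.
#[derive(Default)]
struct CacheConfig {
    content_cache_page_size: ReadableSize,
}

fn main() {
    let cfg = CacheConfig::default();
    assert!(cfg.content_cache_page_size.0 == 0); // defaults to 0 bytes
}
```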

View File

@@ -8,10 +8,5 @@ license.workspace = true
workspace = true
[dependencies]
common-error.workspace = true
common-macro.workspace = true
snafu.workspace = true
[dev-dependencies]
chrono.workspace = true
tokio.workspace = true

View File

@@ -48,5 +48,4 @@ url = "2.3"
[dev-dependencies]
common-telemetry.workspace = true
common-test-util.workspace = true
dotenv.workspace = true
uuid.workspace = true

View File

@@ -27,7 +27,7 @@ pub fn build_fs_backend(root: &str) -> Result<ObjectStore> {
DefaultLoggingInterceptor,
))
.layer(object_store::layers::TracingLayer)
.layer(object_store::layers::PrometheusMetricsLayer::new(true))
.layer(object_store::layers::build_prometheus_metrics_layer(true))
.finish();
Ok(object_store)
}

View File

@@ -89,7 +89,7 @@ pub fn build_s3_backend(
DefaultLoggingInterceptor,
))
.layer(object_store::layers::TracingLayer)
.layer(object_store::layers::PrometheusMetricsLayer::new(true))
.layer(object_store::layers::build_prometheus_metrics_layer(true))
.finish())
}

View File

@@ -5,12 +5,7 @@ edition.workspace = true
license.workspace = true
[dependencies]
api.workspace = true
async-trait.workspace = true
common-base.workspace = true
common-error.workspace = true
common-macro.workspace = true
common-query.workspace = true
session.workspace = true
snafu.workspace = true
sql.workspace = true

View File

@@ -51,6 +51,5 @@ wkt = { version = "0.11", optional = true }
[dev-dependencies]
approx = "0.5"
ron = "0.7"
serde = { version = "1.0", features = ["derive"] }
tokio.workspace = true

View File

@@ -15,6 +15,8 @@
mod convert;
mod distance;
pub(crate) mod impl_conv;
mod scalar_add;
mod scalar_mul;
use std::sync::Arc;
@@ -32,5 +34,9 @@ impl VectorFunction {
registry.register(Arc::new(distance::CosDistanceFunction));
registry.register(Arc::new(distance::DotProductFunction));
registry.register(Arc::new(distance::L2SqDistanceFunction));
// scalar calculation
registry.register(Arc::new(scalar_add::ScalarAddFunction));
registry.register(Arc::new(scalar_mul::ScalarMulFunction));
}
}

View File

@@ -109,7 +109,6 @@ pub fn parse_veclit_from_strlit(s: &str) -> Result<Vec<f32>> {
})
}
#[allow(unused)]
/// Convert a vector literal to a binary literal.
pub fn veclit_to_binlit(vec: &[f32]) -> Vec<u8> {
if cfg!(target_endian = "little") {

View File

@@ -0,0 +1,173 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::borrow::Cow;
use std::fmt::Display;
use common_query::error::{InvalidFuncArgsSnafu, Result};
use common_query::prelude::Signature;
use datatypes::prelude::ConcreteDataType;
use datatypes::scalars::ScalarVectorBuilder;
use datatypes::vectors::{BinaryVectorBuilder, MutableVector, VectorRef};
use nalgebra::DVectorView;
use snafu::ensure;
use crate::function::{Function, FunctionContext};
use crate::helper;
use crate::scalars::vector::impl_conv::{as_veclit, as_veclit_if_const, veclit_to_binlit};
const NAME: &str = "vec_scalar_add";
/// Adds a scalar to each element of a vector.
///
/// # Example
///
/// ```sql
/// SELECT vec_to_string(vec_scalar_add(1, "[1, 2, 3]")) as result;
///
/// +---------+
/// | result |
/// +---------+
/// | [2,3,4] |
/// +---------+
///
/// -- Negative scalar to simulate subtraction
/// SELECT vec_to_string(vec_scalar_add(-1, "[1, 2, 3]")) as result;
///
/// +---------+
/// | result |
/// +---------+
/// | [0,1,2] |
/// +---------+
/// ```
#[derive(Debug, Clone, Default)]
pub struct ScalarAddFunction;
impl Function for ScalarAddFunction {
fn name(&self) -> &str {
NAME
}
fn return_type(&self, _input_types: &[ConcreteDataType]) -> Result<ConcreteDataType> {
Ok(ConcreteDataType::binary_datatype())
}
fn signature(&self) -> Signature {
helper::one_of_sigs2(
vec![ConcreteDataType::float64_datatype()],
vec![
ConcreteDataType::string_datatype(),
ConcreteDataType::binary_datatype(),
],
)
}
fn eval(&self, _func_ctx: FunctionContext, columns: &[VectorRef]) -> Result<VectorRef> {
ensure!(
columns.len() == 2,
InvalidFuncArgsSnafu {
err_msg: format!(
"The length of the args is not correct, expect exactly two, have: {}",
columns.len()
),
}
);
let arg0 = &columns[0];
let arg1 = &columns[1];
let len = arg0.len();
let mut result = BinaryVectorBuilder::with_capacity(len);
if len == 0 {
return Ok(result.to_vector());
}
let arg1_const = as_veclit_if_const(arg1)?;
for i in 0..len {
let arg0 = arg0.get(i).as_f64_lossy();
let Some(arg0) = arg0 else {
result.push_null();
continue;
};
let arg1 = match arg1_const.as_ref() {
Some(arg1) => Some(Cow::Borrowed(arg1.as_ref())),
None => as_veclit(arg1.get_ref(i))?,
};
let Some(arg1) = arg1 else {
result.push_null();
continue;
};
let vec = DVectorView::from_slice(&arg1, arg1.len());
let vec_res = vec.add_scalar(arg0 as _);
let veclit = vec_res.as_slice();
let binlit = veclit_to_binlit(veclit);
result.push(Some(&binlit));
}
Ok(result.to_vector())
}
}
impl Display for ScalarAddFunction {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", NAME.to_ascii_uppercase())
}
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use datatypes::vectors::{Float32Vector, StringVector};
use super::*;
#[test]
fn test_scalar_add() {
let func = ScalarAddFunction;
let input0 = Arc::new(Float32Vector::from(vec![
Some(1.0),
Some(-1.0),
None,
Some(3.0),
]));
let input1 = Arc::new(StringVector::from(vec![
Some("[1.0,2.0,3.0]".to_string()),
Some("[4.0,5.0,6.0]".to_string()),
Some("[7.0,8.0,9.0]".to_string()),
None,
]));
let result = func
.eval(FunctionContext::default(), &[input0, input1])
.unwrap();
let result = result.as_ref();
assert_eq!(result.len(), 4);
assert_eq!(
result.get_ref(0).as_binary().unwrap(),
Some(veclit_to_binlit(&[2.0, 3.0, 4.0]).as_slice())
);
assert_eq!(
result.get_ref(1).as_binary().unwrap(),
Some(veclit_to_binlit(&[3.0, 4.0, 5.0]).as_slice())
);
assert!(result.get_ref(2).is_null());
assert!(result.get_ref(3).is_null());
}
}

View File

@@ -0,0 +1,173 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::borrow::Cow;
use std::fmt::Display;
use common_query::error::{InvalidFuncArgsSnafu, Result};
use common_query::prelude::Signature;
use datatypes::prelude::ConcreteDataType;
use datatypes::scalars::ScalarVectorBuilder;
use datatypes::vectors::{BinaryVectorBuilder, MutableVector, VectorRef};
use nalgebra::DVectorView;
use snafu::ensure;
use crate::function::{Function, FunctionContext};
use crate::helper;
use crate::scalars::vector::impl_conv::{as_veclit, as_veclit_if_const, veclit_to_binlit};
const NAME: &str = "vec_scalar_mul";
/// Multiplies each element of a vector by a scalar.
///
/// # Example
///
/// ```sql
/// SELECT vec_to_string(vec_scalar_mul(2, "[1, 2, 3]")) as result;
///
/// +---------+
/// | result |
/// +---------+
/// | [2,4,6] |
/// +---------+
///
/// -- 1/scalar to simulate division
/// SELECT vec_to_string(vec_scalar_mul(0.5, "[2, 4, 6]")) as result;
///
/// +---------+
/// | result |
/// +---------+
/// | [1,2,3] |
/// +---------+
/// ```
#[derive(Debug, Clone, Default)]
pub struct ScalarMulFunction;
impl Function for ScalarMulFunction {
fn name(&self) -> &str {
NAME
}
fn return_type(&self, _input_types: &[ConcreteDataType]) -> Result<ConcreteDataType> {
Ok(ConcreteDataType::binary_datatype())
}
fn signature(&self) -> Signature {
helper::one_of_sigs2(
vec![ConcreteDataType::float64_datatype()],
vec![
ConcreteDataType::string_datatype(),
ConcreteDataType::binary_datatype(),
],
)
}
fn eval(&self, _func_ctx: FunctionContext, columns: &[VectorRef]) -> Result<VectorRef> {
ensure!(
columns.len() == 2,
InvalidFuncArgsSnafu {
err_msg: format!(
"The length of the args is not correct, expect exactly two, have: {}",
columns.len()
),
}
);
let arg0 = &columns[0];
let arg1 = &columns[1];
let len = arg0.len();
let mut result = BinaryVectorBuilder::with_capacity(len);
if len == 0 {
return Ok(result.to_vector());
}
let arg1_const = as_veclit_if_const(arg1)?;
for i in 0..len {
let arg0 = arg0.get(i).as_f64_lossy();
let Some(arg0) = arg0 else {
result.push_null();
continue;
};
let arg1 = match arg1_const.as_ref() {
Some(arg1) => Some(Cow::Borrowed(arg1.as_ref())),
None => as_veclit(arg1.get_ref(i))?,
};
let Some(arg1) = arg1 else {
result.push_null();
continue;
};
let vec = DVectorView::from_slice(&arg1, arg1.len());
let vec_res = vec.scale(arg0 as _);
let veclit = vec_res.as_slice();
let binlit = veclit_to_binlit(veclit);
result.push(Some(&binlit));
}
Ok(result.to_vector())
}
}
impl Display for ScalarMulFunction {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", NAME.to_ascii_uppercase())
}
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use datatypes::vectors::{Float32Vector, StringVector};
use super::*;
#[test]
fn test_scalar_mul() {
let func = ScalarMulFunction;
let input0 = Arc::new(Float32Vector::from(vec![
Some(2.0),
Some(-0.5),
None,
Some(3.0),
]));
let input1 = Arc::new(StringVector::from(vec![
Some("[1.0,2.0,3.0]".to_string()),
Some("[8.0,10.0,12.0]".to_string()),
Some("[7.0,8.0,9.0]".to_string()),
None,
]));
let result = func
.eval(FunctionContext::default(), &[input0, input1])
.unwrap();
let result = result.as_ref();
assert_eq!(result.len(), 4);
assert_eq!(
result.get_ref(0).as_binary().unwrap(),
Some(veclit_to_binlit(&[2.0, 4.0, 6.0]).as_slice())
);
assert_eq!(
result.get_ref(1).as_binary().unwrap(),
Some(veclit_to_binlit(&[-4.0, -5.0, -6.0]).as_slice())
);
assert!(result.get_ref(2).is_null());
assert!(result.get_ref(3).is_null());
}
}

View File

@@ -49,14 +49,6 @@ impl TableRoute {
TableRoute::Logical(_) => None,
}
}
/// Returns [LogicalTableRouteValue] reference if it's [TableRoute::Logical]; Otherwise it returns [None].
pub fn as_logical_table_route_ref(&self) -> Option<&Arc<LogicalTableRouteValue>> {
match self {
TableRoute::Physical(_) => None,
TableRoute::Logical(table_route) => Some(table_route),
}
}
}
/// [TableRouteCache] caches the [TableId] to [TableRoute] mapping.

View File

@@ -290,28 +290,6 @@ impl TableRouteManager {
}
}
/// Returns the [`PhysicalTableRouteValue`] in the first level,
/// It won't follow the [`LogicalTableRouteValue`] to find the next level [`PhysicalTableRouteValue`].
///
/// Returns an error if the first level value is not a [`PhysicalTableRouteValue`].
pub async fn try_get_physical_table_route(
&self,
table_id: TableId,
) -> Result<Option<PhysicalTableRouteValue>> {
match self.storage.get(table_id).await? {
Some(route) => {
ensure!(
route.is_physical(),
UnexpectedLogicalRouteTableSnafu {
err_msg: format!("{route:?} is a non-physical TableRouteValue.")
}
);
Ok(Some(route.into_physical_table_route()))
}
None => Ok(None),
}
}
/// Returns the [TableId] recursively.
///
/// Returns a [TableRouteNotFound](crate::error::Error::TableRouteNotFound) Error if:
@@ -569,37 +547,6 @@ impl TableRouteStorage {
.transpose()
}
/// Returns the physical `DeserializedValueWithBytes<TableRouteValue>` recursively.
///
/// Returns a [TableRouteNotFound](crate::error::Error::TableRouteNotFound) Error if:
/// - the physical table(`logical_or_physical_table_id`) does not exist
/// - the corresponding physical table of the logical table(`logical_or_physical_table_id`) does not exist.
pub async fn get_physical_table_route_with_raw_bytes(
&self,
logical_or_physical_table_id: TableId,
) -> Result<(TableId, DeserializedValueWithBytes<TableRouteValue>)> {
let table_route = self
.get_with_raw_bytes(logical_or_physical_table_id)
.await?
.context(TableRouteNotFoundSnafu {
table_id: logical_or_physical_table_id,
})?;
match table_route.get_inner_ref() {
TableRouteValue::Physical(_) => Ok((logical_or_physical_table_id, table_route)),
TableRouteValue::Logical(x) => {
let physical_table_id = x.physical_table_id();
let physical_table_route = self
.get_with_raw_bytes(physical_table_id)
.await?
.context(TableRouteNotFoundSnafu {
table_id: physical_table_id,
})?;
Ok((physical_table_id, physical_table_route))
}
}
}
/// Returns batch of [`TableRouteValue`] that respects the order of `table_ids`.
pub async fn batch_get(&self, table_ids: &[TableId]) -> Result<Vec<Option<TableRouteValue>>> {
let mut table_routes = self.batch_get_inner(table_ids).await?;

View File

@@ -15,6 +15,7 @@
use std::any::Any;
use std::sync::Arc;
use common_telemetry::info;
use etcd_client::{
Client, DeleteOptions, GetOptions, PutOptions, Txn, TxnOp, TxnOpResponse, TxnResponse,
};
@@ -55,6 +56,7 @@ impl EtcdStore {
}
pub fn with_etcd_client(client: Client, max_txn_ops: usize) -> KvBackendRef {
info!("Connected to etcd");
Arc::new(Self {
client,
max_txn_ops,

View File

@@ -89,39 +89,6 @@ pub fn convert_to_region_leader_map(region_routes: &[RegionRoute]) -> HashMap<Re
.collect::<HashMap<_, _>>()
}
/// Returns the HashMap<[RegionNumber], HashSet<DatanodeId>>
pub fn convert_to_region_peer_map(
region_routes: &[RegionRoute],
) -> HashMap<RegionNumber, HashSet<u64>> {
region_routes
.iter()
.map(|x| {
let set = x
.follower_peers
.iter()
.map(|p| p.id)
.chain(x.leader_peer.as_ref().map(|p| p.id))
.collect::<HashSet<_>>();
(x.region.id.region_number(), set)
})
.collect::<HashMap<_, _>>()
}
/// Returns the HashMap<[RegionNumber], [LeaderState]>;
pub fn convert_to_region_leader_state_map(
region_routes: &[RegionRoute],
) -> HashMap<RegionNumber, LeaderState> {
region_routes
.iter()
.filter_map(|x| {
x.leader_state
.as_ref()
.map(|state| (x.region.id.region_number(), *state))
})
.collect::<HashMap<_, _>>()
}
pub fn find_region_leader(
region_routes: &[RegionRoute],
region_number: RegionNumber,
@@ -147,19 +114,6 @@ pub fn find_leader_regions(region_routes: &[RegionRoute], datanode: &Peer) -> Ve
.collect()
}
pub fn extract_all_peers(region_routes: &[RegionRoute]) -> Vec<Peer> {
let mut peers = region_routes
.iter()
.flat_map(|x| x.leader_peer.iter().chain(x.follower_peers.iter()))
.collect::<HashSet<_>>()
.into_iter()
.cloned()
.collect::<Vec<_>>();
peers.sort_by_key(|x| x.id);
peers
}
impl TableRoute {
pub fn new(table: Table, region_routes: Vec<RegionRoute>) -> Self {
let region_leaders = region_routes

View File

@@ -544,7 +544,7 @@ mod tests {
use common_test_util::temp_dir::create_temp_dir;
use futures_util::future::BoxFuture;
use futures_util::FutureExt;
use object_store::ObjectStore;
use object_store::{EntryMode, ObjectStore};
use tokio::sync::mpsc;
use super::*;
@@ -578,7 +578,11 @@ mod tests {
) {
let dir = proc_path!(procedure_store, "{procedure_id}/");
let lister = object_store.list(&dir).await.unwrap();
let mut files_in_dir: Vec<_> = lister.into_iter().map(|de| de.name().to_string()).collect();
let mut files_in_dir: Vec<_> = lister
.into_iter()
.filter(|x| x.metadata().mode() == EntryMode::FILE)
.map(|de| de.name().to_string())
.collect();
files_in_dir.sort_unstable();
assert_eq!(files, files_in_dir);
}

View File

@@ -26,7 +26,6 @@ use std::sync::Arc;
use adapter::RecordBatchMetrics;
use arc_swap::ArcSwapOption;
use datafusion::physical_plan::memory::MemoryStream;
pub use datafusion::physical_plan::SendableRecordBatchStream as DfSendableRecordBatchStream;
use datatypes::arrow::compute::SortOptions;
pub use datatypes::arrow::record_batch::RecordBatch as DfRecordBatch;
@@ -170,19 +169,6 @@ impl RecordBatches {
index: 0,
})
}
pub fn into_df_stream(self) -> DfSendableRecordBatchStream {
let df_record_batches = self
.batches
.into_iter()
.map(|batch| batch.into_df_record_batch())
.collect();
// unwrap safety: `MemoryStream::try_new` won't fail
Box::pin(
MemoryStream::try_new(df_record_batches, self.schema.arrow_schema().clone(), None)
.unwrap(),
)
}
}
impl IntoIterator for RecordBatches {

View File

@@ -35,8 +35,6 @@ serde_json.workspace = true
snafu.workspace = true
tempfile.workspace = true
tokio.workspace = true
tokio-metrics = "0.3"
tokio-metrics-collector = { git = "https://github.com/MichaelScofield/tokio-metrics-collector.git", rev = "89d692d5753d28564a7aac73c6ac5aba22243ba0" }
tokio-util.workspace = true
[dev-dependencies]

View File

@@ -29,10 +29,6 @@ pub fn format_utc_datetime(utc: &NaiveDateTime, pattern: &str) -> String {
}
}
pub fn system_datetime_to_utc(local: &NaiveDateTime) -> LocalResult<NaiveDateTime> {
datetime_to_utc(local, get_timezone(None))
}
/// Cast a [`NaiveDateTime`] with the given timezone.
pub fn datetime_to_utc(
datetime: &NaiveDateTime,

View File

@@ -193,6 +193,14 @@ pub enum Error {
location: Location,
},
#[snafu(display("Failed to build http client"))]
BuildHttpClient {
#[snafu(implicit)]
location: Location,
#[snafu(source)]
error: reqwest::Error,
},
#[snafu(display("Missing required field: {}", name))]
MissingRequiredField {
name: String,
@@ -406,9 +414,10 @@ impl ErrorExt for Error {
| MissingKvBackend { .. }
| TomlFormat { .. } => StatusCode::InvalidArguments,
PayloadNotExist { .. } | Unexpected { .. } | WatchAsyncTaskChange { .. } => {
StatusCode::Unexpected
}
PayloadNotExist { .. }
| Unexpected { .. }
| WatchAsyncTaskChange { .. }
| BuildHttpClient { .. } => StatusCode::Unexpected,
AsyncTaskExecute { source, .. } => source.status_code(),

View File

@@ -32,7 +32,7 @@ use object_store::{Access, Error, HttpClient, ObjectStore, ObjectStoreBuilder, O
use snafu::prelude::*;
use crate::config::{HttpClientConfig, ObjectStoreConfig, DEFAULT_OBJECT_STORE_CACHE_SIZE};
use crate::error::{self, CreateDirSnafu, Result};
use crate::error::{self, BuildHttpClientSnafu, CreateDirSnafu, Result};
pub(crate) async fn new_raw_object_store(
store: &ObjectStoreConfig,
@@ -236,7 +236,8 @@ pub(crate) fn build_http_client(config: &HttpClientConfig) -> Result<HttpClient>
builder.timeout(config.timeout)
};
HttpClient::build(http_builder).context(error::InitBackendSnafu)
let client = http_builder.build().context(BuildHttpClientSnafu)?;
Ok(HttpClient::with(client))
}
struct PrintDetailedError;

View File

@@ -370,6 +370,51 @@ impl ConcreteDataType {
_ => None,
}
}
/// Returns the datatype name in the PostgreSQL type system
pub fn postgres_datatype_name(&self) -> &'static str {
match self {
&ConcreteDataType::Null(_) => "UNKNOWN",
&ConcreteDataType::Boolean(_) => "BOOL",
&ConcreteDataType::Int8(_) | &ConcreteDataType::UInt8(_) => "CHAR",
&ConcreteDataType::Int16(_) | &ConcreteDataType::UInt16(_) => "INT2",
&ConcreteDataType::Int32(_) | &ConcreteDataType::UInt32(_) => "INT4",
&ConcreteDataType::Int64(_) | &ConcreteDataType::UInt64(_) => "INT8",
&ConcreteDataType::Float32(_) => "FLOAT4",
&ConcreteDataType::Float64(_) => "FLOAT8",
&ConcreteDataType::Binary(_) | &ConcreteDataType::Vector(_) => "BYTEA",
&ConcreteDataType::String(_) => "VARCHAR",
&ConcreteDataType::Date(_) => "DATE",
&ConcreteDataType::DateTime(_) | &ConcreteDataType::Timestamp(_) => "TIMESTAMP",
&ConcreteDataType::Time(_) => "TIME",
&ConcreteDataType::Interval(_) => "INTERVAL",
&ConcreteDataType::Decimal128(_) => "NUMERIC",
&ConcreteDataType::Json(_) => "JSON",
ConcreteDataType::List(list) => match list.item_type() {
&ConcreteDataType::Null(_) => "UNKNOWN",
&ConcreteDataType::Boolean(_) => "_BOOL",
&ConcreteDataType::Int8(_) | &ConcreteDataType::UInt8(_) => "_CHAR",
&ConcreteDataType::Int16(_) | &ConcreteDataType::UInt16(_) => "_INT2",
&ConcreteDataType::Int32(_) | &ConcreteDataType::UInt32(_) => "_INT4",
&ConcreteDataType::Int64(_) | &ConcreteDataType::UInt64(_) => "_INT8",
&ConcreteDataType::Float32(_) => "_FLOAT4",
&ConcreteDataType::Float64(_) => "_FLOAT8",
&ConcreteDataType::Binary(_) => "_BYTEA",
&ConcreteDataType::String(_) => "_VARCHAR",
&ConcreteDataType::Date(_) => "_DATE",
&ConcreteDataType::DateTime(_) | &ConcreteDataType::Timestamp(_) => "_TIMESTAMP",
&ConcreteDataType::Time(_) => "_TIME",
&ConcreteDataType::Interval(_) => "_INTERVAL",
&ConcreteDataType::Decimal128(_) => "_NUMERIC",
&ConcreteDataType::Json(_) => "_JSON",
&ConcreteDataType::Duration(_)
| &ConcreteDataType::Dictionary(_)
| &ConcreteDataType::Vector(_)
| &ConcreteDataType::List(_) => "UNKNOWN",
},
&ConcreteDataType::Duration(_) | &ConcreteDataType::Dictionary(_) => "UNKNOWN",
}
}
}
impl From<&ConcreteDataType> for ConcreteDataType {
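
A quick illustration of the new mapping, as a hedged sketch using only constructors that already appear elsewhere in this change set:

```rust
use datatypes::prelude::ConcreteDataType;

fn main() {
    assert_eq!(
        ConcreteDataType::float64_datatype().postgres_datatype_name(),
        "FLOAT8"
    );
    assert_eq!(
        ConcreteDataType::string_datatype().postgres_datatype_name(),
        "VARCHAR"
    );
}
```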

View File

@@ -232,6 +232,12 @@ pub enum Error {
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Invalid skipping index option: {}", msg))]
InvalidSkippingIndexOption {
msg: String,
#[snafu(implicit)]
location: Location,
},
}
impl ErrorExt for Error {
@@ -252,7 +258,8 @@ impl ErrorExt for Error {
| InvalidPrecisionOrScale { .. }
| InvalidJson { .. }
| InvalidVector { .. }
| InvalidFulltextOption { .. } => StatusCode::InvalidArguments,
| InvalidFulltextOption { .. }
| InvalidSkippingIndexOption { .. } => StatusCode::InvalidArguments,
ValueExceedsPrecision { .. }
| CastType { .. }

View File

@@ -28,10 +28,11 @@ use snafu::{ensure, ResultExt};
use crate::error::{self, DuplicateColumnSnafu, Error, ProjectArrowSchemaSnafu, Result};
use crate::prelude::ConcreteDataType;
pub use crate::schema::column_schema::{
ColumnSchema, FulltextAnalyzer, FulltextOptions, Metadata,
ColumnSchema, FulltextAnalyzer, FulltextOptions, Metadata, SkippingIndexOptions,
COLUMN_FULLTEXT_CHANGE_OPT_KEY_ENABLE, COLUMN_FULLTEXT_OPT_KEY_ANALYZER,
COLUMN_FULLTEXT_OPT_KEY_CASE_SENSITIVE, COMMENT_KEY, FULLTEXT_KEY, INVERTED_INDEX_KEY,
TIME_INDEX_KEY,
COLUMN_FULLTEXT_OPT_KEY_CASE_SENSITIVE, COLUMN_SKIPPING_INDEX_OPT_KEY_GRANULARITY,
COLUMN_SKIPPING_INDEX_OPT_KEY_TYPE, COMMENT_KEY, FULLTEXT_KEY, INVERTED_INDEX_KEY,
SKIPPING_INDEX_KEY, TIME_INDEX_KEY,
};
pub use crate::schema::constraint::ColumnDefaultConstraint;
pub use crate::schema::raw::RawSchema;

View File

@@ -39,12 +39,20 @@ const DEFAULT_CONSTRAINT_KEY: &str = "greptime:default_constraint";
pub const FULLTEXT_KEY: &str = "greptime:fulltext";
/// Key used to store whether the column has inverted index in arrow field's metadata.
pub const INVERTED_INDEX_KEY: &str = "greptime:inverted_index";
/// Key used to store skipping index options in arrow field's metadata.
pub const SKIPPING_INDEX_KEY: &str = "greptime:skipping_index";
/// Keys used in fulltext options
pub const COLUMN_FULLTEXT_CHANGE_OPT_KEY_ENABLE: &str = "enable";
pub const COLUMN_FULLTEXT_OPT_KEY_ANALYZER: &str = "analyzer";
pub const COLUMN_FULLTEXT_OPT_KEY_CASE_SENSITIVE: &str = "case_sensitive";
/// Keys used in skipping index options
pub const COLUMN_SKIPPING_INDEX_OPT_KEY_GRANULARITY: &str = "granularity";
pub const COLUMN_SKIPPING_INDEX_OPT_KEY_TYPE: &str = "type";
pub const DEFAULT_GRANULARITY: u32 = 10240;
/// Schema of a column, used as an immutable struct.
#[derive(Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct ColumnSchema {
@@ -156,6 +164,10 @@ impl ColumnSchema {
.unwrap_or(false)
}
pub fn has_fulltext_index_key(&self) -> bool {
self.metadata.contains_key(FULLTEXT_KEY)
}
pub fn has_inverted_index_key(&self) -> bool {
self.metadata.contains_key(INVERTED_INDEX_KEY)
}
@@ -298,6 +310,34 @@ impl ColumnSchema {
);
Ok(())
}
/// Retrieves the skipping index options for the column.
pub fn skipping_index_options(&self) -> Result<Option<SkippingIndexOptions>> {
match self.metadata.get(SKIPPING_INDEX_KEY) {
None => Ok(None),
Some(json) => {
let options =
serde_json::from_str(json).context(error::DeserializeSnafu { json })?;
Ok(Some(options))
}
}
}
pub fn with_skipping_options(mut self, options: SkippingIndexOptions) -> Result<Self> {
self.metadata.insert(
SKIPPING_INDEX_KEY.to_string(),
serde_json::to_string(&options).context(error::SerializeSnafu)?,
);
Ok(self)
}
pub fn set_skipping_options(&mut self, options: &SkippingIndexOptions) -> Result<()> {
self.metadata.insert(
SKIPPING_INDEX_KEY.to_string(),
serde_json::to_string(options).context(error::SerializeSnafu)?,
);
Ok(())
}
}
/// Column extended type set in column schema's metadata.
@@ -495,6 +535,76 @@ impl fmt::Display for FulltextAnalyzer {
}
}
/// Skipping options for a column.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Default, Visit, VisitMut)]
#[serde(rename_all = "kebab-case")]
pub struct SkippingIndexOptions {
/// The granularity of the skip index.
pub granularity: u32,
/// The type of the skip index.
#[serde(default)]
pub index_type: SkipIndexType,
}
impl fmt::Display for SkippingIndexOptions {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "granularity={}", self.granularity)?;
write!(f, ", index_type={}", self.index_type)?;
Ok(())
}
}
/// Skip index types.
#[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize, Visit, VisitMut)]
pub enum SkipIndexType {
#[default]
BloomFilter,
}
impl fmt::Display for SkipIndexType {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
SkipIndexType::BloomFilter => write!(f, "BLOOM"),
}
}
}
impl TryFrom<HashMap<String, String>> for SkippingIndexOptions {
type Error = Error;
fn try_from(options: HashMap<String, String>) -> Result<Self> {
// Parse granularity, falling back to DEFAULT_GRANULARITY when unset
let granularity = match options.get(COLUMN_SKIPPING_INDEX_OPT_KEY_GRANULARITY) {
Some(value) => value.parse::<u32>().map_err(|_| {
error::InvalidSkippingIndexOptionSnafu {
msg: format!("Invalid granularity: {value}, expected: positive integer"),
}
.build()
})?,
None => DEFAULT_GRANULARITY,
};
// Parse index type with default value BloomFilter
let index_type = match options.get(COLUMN_SKIPPING_INDEX_OPT_KEY_TYPE) {
Some(typ) => match typ.to_ascii_uppercase().as_str() {
"BLOOM" => SkipIndexType::BloomFilter,
_ => {
return error::InvalidSkippingIndexOptionSnafu {
msg: format!("Invalid index type: {typ}, expected: 'BLOOM'"),
}
.fail();
}
},
None => SkipIndexType::default(),
};
Ok(SkippingIndexOptions {
granularity,
index_type,
})
}
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
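
To make the new skipping-index options concrete, a hedged usage sketch (crate paths follow the re-exports shown earlier in this diff; the exact `ColumnSchema::new` signature is an assumption):

```rust
use std::collections::HashMap;

use datatypes::prelude::ConcreteDataType;
use datatypes::schema::{ColumnSchema, SkippingIndexOptions};

fn main() {
    // Parse options from the key/value form accepted by TryFrom<HashMap<_, _>>.
    let mut opts = HashMap::new();
    opts.insert("granularity".to_string(), "8192".to_string());
    opts.insert("type".to_string(), "BLOOM".to_string());
    let options = SkippingIndexOptions::try_from(opts).unwrap();
    assert_eq!(options.granularity, 8192);

    // Attach them to a column; they are stored under SKIPPING_INDEX_KEY in the metadata.
    let column = ColumnSchema::new("payload", ConcreteDataType::string_datatype(), true)
        .with_skipping_options(options)
        .unwrap();
    assert!(column.skipping_index_options().unwrap().is_some());
}
```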

View File

@@ -38,5 +38,4 @@ tokio.workspace = true
[dev-dependencies]
api.workspace = true
common-procedure-test.workspace = true
common-test-util.workspace = true

View File

@@ -46,7 +46,7 @@ impl FileRegionManifest {
pub async fn store(&self, region_dir: &str, object_store: &ObjectStore) -> Result<()> {
let path = &region_manifest_path(region_dir);
let exist = object_store
.is_exist(path)
.exists(path)
.await
.context(CheckObjectSnafu { path })?;
ensure!(!exist, ManifestExistsSnafu { path });

View File

@@ -130,7 +130,7 @@ mod tests {
assert_eq!(region.metadata.primary_key, vec![1]);
assert!(object_store
.is_exist("create_region_dir/manifest/_file_manifest")
.exists("create_region_dir/manifest/_file_manifest")
.await
.unwrap());
@@ -198,13 +198,13 @@ mod tests {
.unwrap();
assert!(object_store
.is_exist("drop_region_dir/manifest/_file_manifest")
.exists("drop_region_dir/manifest/_file_manifest")
.await
.unwrap());
FileRegion::drop(&region, &object_store).await.unwrap();
assert!(!object_store
.is_exist("drop_region_dir/manifest/_file_manifest")
.exists("drop_region_dir/manifest/_file_manifest")
.await
.unwrap());

View File

@@ -47,7 +47,6 @@ hydroflow = { git = "https://github.com/GreptimeTeam/hydroflow.git", branch = "m
itertools.workspace = true
lazy_static.workspace = true
meta-client.workspace = true
minstant = "0.1.7"
nom = "7.1.3"
num-traits = "0.2"
operator.workspace = true

View File

@@ -206,28 +206,6 @@ impl DiffRequest {
}
}
/// iterate through the diff row and form continuous diff row with same diff type
pub fn diff_row_to_request(rows: Vec<DiffRow>) -> Vec<DiffRequest> {
let mut reqs = Vec::new();
for (row, ts, diff) in rows {
let last = reqs.last_mut();
match (last, diff) {
(Some(DiffRequest::Insert(rows)), 1) => {
rows.push((row, ts));
}
(Some(DiffRequest::Insert(_)), -1) => reqs.push(DiffRequest::Delete(vec![(row, ts)])),
(Some(DiffRequest::Delete(rows)), -1) => {
rows.push((row, ts));
}
(Some(DiffRequest::Delete(_)), 1) => reqs.push(DiffRequest::Insert(vec![(row, ts)])),
(None, 1) => reqs.push(DiffRequest::Insert(vec![(row, ts)])),
(None, -1) => reqs.push(DiffRequest::Delete(vec![(row, ts)])),
_ => {}
}
}
reqs
}
pub fn batches_to_rows_req(batches: Vec<Batch>) -> Result<Vec<DiffRequest>, Error> {
let mut reqs = Vec::new();
for batch in batches {

View File

@@ -14,7 +14,7 @@
//! Source and Sink for the dataflow
use std::collections::{BTreeMap, VecDeque};
use std::collections::BTreeMap;
use common_telemetry::{debug, trace};
use hydroflow::scheduled::graph_ext::GraphExt;
@@ -28,7 +28,7 @@ use crate::compute::types::{Arranged, Collection, CollectionBundle, Toff};
use crate::error::{Error, PlanSnafu};
use crate::expr::error::InternalSnafu;
use crate::expr::{Batch, EvalError};
use crate::repr::{DiffRow, Row, BROADCAST_CAP};
use crate::repr::{DiffRow, Row};
#[allow(clippy::mutable_key_type)]
impl Context<'_, '_> {
@@ -242,44 +242,4 @@ impl Context<'_, '_> {
},
);
}
/// Render a sink which sends updates to the broadcast channel, with an internal buffer in case the broadcast channel is full
pub fn render_sink(&mut self, bundle: CollectionBundle, sender: broadcast::Sender<DiffRow>) {
let CollectionBundle {
collection,
arranged: _,
} = bundle;
let mut buf = VecDeque::with_capacity(1000);
let schd = self.compute_state.get_scheduler();
let inner_schd = schd.clone();
let now = self.compute_state.current_time_ref();
let sink = self
.df
.add_subgraph_sink("Sink", collection.into_inner(), move |_ctx, recv| {
let data = recv.take_inner();
buf.extend(data.into_iter().flat_map(|i| i.into_iter()));
if sender.len() >= BROADCAST_CAP {
return;
} else {
while let Some(row) = buf.pop_front() {
// if the sender is full, stop sending
if sender.len() >= BROADCAST_CAP {
break;
}
// TODO(discord9): handling tokio broadcast error
let _ = sender.send(row);
}
}
// if buffer is not empty, schedule the next run at next tick
// so the buffer can be drained as soon as possible
if !buf.is_empty() {
inner_schd.schedule_at(*now.borrow() + 1);
}
});
schd.set_cur_subgraph(sink);
}
}

View File

@@ -82,22 +82,6 @@ impl Arranged {
writer: self.writer.clone(),
})
}
/// Copy the full arrangement, including the future and the current updates.
///
/// Internally `Rc-ed` so it's cheap to copy
pub fn try_copy_full(&self) -> Option<Self> {
self.arrangement
.clone_full_arrange()
.map(|arrangement| Arranged {
arrangement,
readers: self.readers.clone(),
writer: self.writer.clone(),
})
}
pub fn add_reader(&self, id: SubgraphId) {
self.readers.borrow_mut().push(id)
}
}
/// A bundle of the various ways a collection can be represented.

View File

@@ -21,11 +21,6 @@ use datafusion_common::DataFusionError;
use datatypes::data_type::ConcreteDataType;
use snafu::{Location, Snafu};
fn is_send_sync() {
fn check<T: Send + Sync>() {}
check::<EvalError>();
}
/// EvalError is about errors happen on columnar evaluation
///
/// TODO(discord9): add detailed location of column/operator (instead of code) to errors to help identify related column

View File

@@ -359,14 +359,6 @@ impl MapFilterProject {
)
}
/// Convert the `MapFilterProject` into a staged evaluation plan.
///
/// The main behavior is to extract temporal predicates, which cannot be evaluated
/// using the standard machinery.
pub fn into_plan(self) -> Result<MfpPlan, Error> {
MfpPlan::create_from(self)
}
/// Lists input columns whose values are used in outputs.
///
/// It is entirely appropriate to determine the demand of an instance
@@ -602,26 +594,6 @@ impl SafeMfpPlan {
}
}
/// A version of `evaluate` which produces an iterator over `Datum`
/// as output.
///
/// This version can be useful when one wants to capture the resulting
/// datums without packing and then unpacking a row.
#[inline(always)]
pub fn evaluate_iter<'a>(
&'a self,
datums: &'a mut Vec<Value>,
) -> Result<Option<impl Iterator<Item = Value> + 'a>, EvalError> {
let passed_predicates = self.evaluate_inner(datums)?;
if !passed_predicates {
Ok(None)
} else {
Ok(Some(
self.mfp.projection.iter().map(move |i| datums[*i].clone()),
))
}
}
/// Populates `values` with `self.expressions` and tests `self.predicates`.
///
/// This does not apply `self.projection`, which is up to the calling method.

View File

@@ -18,10 +18,8 @@
mod join;
mod reduce;
use std::collections::BTreeSet;
use crate::error::Error;
use crate::expr::{GlobalId, Id, LocalId, MapFilterProject, SafeMfpPlan, TypedExpr};
use crate::expr::{Id, LocalId, MapFilterProject, SafeMfpPlan, TypedExpr};
use crate::plan::join::JoinPlan;
pub(crate) use crate::plan::reduce::{AccumulablePlan, AggrWithIndex, KeyValPlan, ReducePlan};
use crate::repr::{DiffRow, RelationDesc};
@@ -186,48 +184,6 @@ pub enum Plan {
},
}
impl Plan {
/// Find all the used collection in the plan
pub fn find_used_collection(&self) -> BTreeSet<GlobalId> {
fn recur_find_use(plan: &Plan, used: &mut BTreeSet<GlobalId>) {
match plan {
Plan::Get { id } => {
match id {
Id::Local(_) => (),
Id::Global(g) => {
used.insert(*g);
}
};
}
Plan::Let { value, body, .. } => {
recur_find_use(&value.plan, used);
recur_find_use(&body.plan, used);
}
Plan::Mfp { input, .. } => {
recur_find_use(&input.plan, used);
}
Plan::Reduce { input, .. } => {
recur_find_use(&input.plan, used);
}
Plan::Join { inputs, .. } => {
for input in inputs {
recur_find_use(&input.plan, used);
}
}
Plan::Union { inputs, .. } => {
for input in inputs {
recur_find_use(&input.plan, used);
}
}
_ => {}
}
}
let mut ret = Default::default();
recur_find_use(self, &mut ret);
ret
}
}
impl Plan {
pub fn with_types(self, schema: RelationDesc) -> TypedPlan {
TypedPlan { schema, plan: self }

View File

@@ -46,14 +46,6 @@ impl Key {
self.column_indices.push(col);
}
/// Add columns to Key
pub fn add_cols<I>(&mut self, cols: I)
where
I: IntoIterator<Item = usize>,
{
self.column_indices.extend(cols);
}
/// Remove a column from Key
pub fn remove_col(&mut self, col: usize) {
self.column_indices.retain(|&r| r != col);

View File

@@ -25,7 +25,6 @@ common-catalog.workspace = true
common-config.workspace = true
common-datasource.workspace = true
common-error.workspace = true
common-frontend.workspace = true
common-function.workspace = true
common-grpc.workspace = true
common-macro.workspace = true
@@ -71,7 +70,6 @@ common-test-util.workspace = true
datanode.workspace = true
datatypes.workspace = true
futures = "0.3"
meta-srv = { workspace = true, features = ["mock"] }
serde_json.workspace = true
strfmt = "0.2"
tower.workspace = true

View File

@@ -19,14 +19,16 @@ use async_trait::async_trait;
use auth::{PermissionChecker, PermissionCheckerRef, PermissionReq};
use client::Output;
use common_error::ext::BoxedError;
use pipeline::pipeline_operator::PipelineOperator;
use pipeline::{GreptimeTransformer, Pipeline, PipelineInfo, PipelineVersion};
use servers::error::{
AuthSnafu, Error as ServerError, ExecuteGrpcRequestSnafu, PipelineSnafu, Result as ServerResult,
};
use servers::interceptor::{LogIngestInterceptor, LogIngestInterceptorRef};
use servers::query_handler::PipelineHandler;
use session::context::QueryContextRef;
use session::context::{QueryContext, QueryContextRef};
use snafu::ResultExt;
use table::Table;
use crate::instance::Instance;
@@ -84,6 +86,22 @@ impl PipelineHandler for Instance {
.await
.context(PipelineSnafu)
}
async fn get_table(
&self,
table: &str,
query_ctx: &QueryContext,
) -> std::result::Result<Option<Arc<Table>>, catalog::error::Error> {
let catalog = query_ctx.current_catalog();
let schema = query_ctx.current_schema();
self.catalog_manager
.table(catalog, &schema, table, None)
.await
}
fn build_pipeline(&self, pipeline: &str) -> ServerResult<Pipeline<GreptimeTransformer>> {
PipelineOperator::build_pipeline(pipeline).context(PipelineSnafu)
}
}
impl Instance {

View File

@@ -17,6 +17,7 @@ common-error.workspace = true
common-macro.workspace = true
common-runtime.workspace = true
common-telemetry.workspace = true
fastbloom = "0.8"
fst.workspace = true
futures.workspace = true
greptime-proto.workspace = true
@@ -26,6 +27,7 @@ prost.workspace = true
regex.workspace = true
regex-automata.workspace = true
serde.workspace = true
serde_json.workspace = true
snafu.workspace = true
tantivy = { version = "0.22", features = ["zstd-compression"] }
tantivy-jieba = "0.11.0"

View File

@@ -0,0 +1,53 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use serde::{Deserialize, Serialize};
pub mod creator;
mod error;
pub type Bytes = Vec<u8>;
pub type BytesRef<'a> = &'a [u8];
/// The Meta information of the bloom filter stored in the file.
#[derive(Debug, Default, Serialize, Deserialize)]
pub struct BloomFilterMeta {
/// The number of rows per segment.
pub rows_per_segment: usize,
/// The number of segments.
pub seg_count: usize,
/// The number of total rows.
pub row_count: usize,
/// The size of the bloom filter excluding the meta information.
pub bloom_filter_segments_size: usize,
/// Offset and size of bloom filters in the file.
pub bloom_filter_segments: Vec<BloomFilterSegmentLocation>,
}
/// The location of the bloom filter segment in the file.
#[derive(Debug, Serialize, Deserialize)]
pub struct BloomFilterSegmentLocation {
/// The offset of the bloom filter segment in the file.
pub offset: u64,
/// The size of the bloom filter segment in the file.
pub size: u64,
/// The number of elements in the bloom filter segment.
pub elem_count: usize,
}

View File

@@ -0,0 +1,294 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::HashSet;
use fastbloom::BloomFilter;
use futures::{AsyncWrite, AsyncWriteExt};
use snafu::ResultExt;
use super::error::{IoSnafu, SerdeJsonSnafu};
use crate::bloom_filter::error::Result;
use crate::bloom_filter::{BloomFilterMeta, BloomFilterSegmentLocation, Bytes};
/// The seed used for the Bloom filter.
const SEED: u128 = 42;
/// The false positive rate of the Bloom filter.
const FALSE_POSITIVE_RATE: f64 = 0.01;
/// `BloomFilterCreator` is responsible for creating and managing bloom filters
/// for a set of elements. It divides the rows into segments and creates
/// bloom filters for each segment.
///
/// # Format
///
/// The bloom filter creator writes the following format to the writer:
///
/// ```text
/// +--------------------+--------------------+-----+----------------------+----------------------+
/// | Bloom filter 0 | Bloom filter 1 | ... | BloomFilterMeta | Meta size |
/// +--------------------+--------------------+-----+----------------------+----------------------+
/// |<- bytes (size 0) ->|<- bytes (size 1) ->| ... |<- json (meta size) ->|<- u32 LE (4 bytes) ->|
/// ```
///
pub struct BloomFilterCreator {
/// The number of rows per segment set by the user.
rows_per_segment: usize,
/// Number of rows added to the bloom filter so far.
accumulated_row_count: usize,
/// A set of distinct elements in the current segment.
cur_seg_distinct_elems: HashSet<Bytes>,
/// The memory usage of the current segment's distinct elements.
cur_seg_distinct_elems_mem_usage: usize,
/// Storage for finalized Bloom filters.
finalized_bloom_filters: FinalizedBloomFilterStorage,
}
impl BloomFilterCreator {
/// Creates a new `BloomFilterCreator` with the specified number of rows per segment.
///
/// # Panics
///
/// Panics if `rows_per_segment` is 0.
pub fn new(rows_per_segment: usize) -> Self {
assert!(
rows_per_segment > 0,
"rows_per_segment must be greater than 0"
);
Self {
rows_per_segment,
accumulated_row_count: 0,
cur_seg_distinct_elems: HashSet::default(),
cur_seg_distinct_elems_mem_usage: 0,
finalized_bloom_filters: FinalizedBloomFilterStorage::default(),
}
}
/// Adds a row of elements to the bloom filter. If the number of accumulated rows
/// reaches `rows_per_segment`, it finalizes the current segment.
pub fn push_row_elems(&mut self, elems: impl IntoIterator<Item = Bytes>) {
self.accumulated_row_count += 1;
for elem in elems.into_iter() {
let len = elem.len();
let is_new = self.cur_seg_distinct_elems.insert(elem);
if is_new {
self.cur_seg_distinct_elems_mem_usage += len;
}
}
if self.accumulated_row_count % self.rows_per_segment == 0 {
self.finalize_segment();
}
}
/// Finalizes any remaining segments and writes the bloom filters and metadata to the provided writer.
pub async fn finish(&mut self, mut writer: impl AsyncWrite + Unpin) -> Result<()> {
if !self.cur_seg_distinct_elems.is_empty() {
self.finalize_segment();
}
let mut meta = BloomFilterMeta {
rows_per_segment: self.rows_per_segment,
seg_count: self.finalized_bloom_filters.len(),
row_count: self.accumulated_row_count,
..Default::default()
};
let mut buf = Vec::new();
for segment in self.finalized_bloom_filters.drain() {
let slice = segment.bloom_filter.as_slice();
buf.clear();
write_u64_slice(&mut buf, slice);
writer.write_all(&buf).await.context(IoSnafu)?;
let size = buf.len();
meta.bloom_filter_segments.push(BloomFilterSegmentLocation {
offset: meta.bloom_filter_segments_size as _,
size: size as _,
elem_count: segment.element_count,
});
meta.bloom_filter_segments_size += size;
}
let meta_bytes = serde_json::to_vec(&meta).context(SerdeJsonSnafu)?;
writer.write_all(&meta_bytes).await.context(IoSnafu)?;
let meta_size = meta_bytes.len() as u32;
writer
.write_all(&meta_size.to_le_bytes())
.await
.context(IoSnafu)?;
writer.flush().await.context(IoSnafu)?;
Ok(())
}
/// Returns the memory usage of the bloom filter being created.
pub fn memory_usage(&self) -> usize {
self.cur_seg_distinct_elems_mem_usage + self.finalized_bloom_filters.memory_usage()
}
fn finalize_segment(&mut self) {
let elem_count = self.cur_seg_distinct_elems.len();
self.finalized_bloom_filters
.add(self.cur_seg_distinct_elems.drain(), elem_count);
self.cur_seg_distinct_elems_mem_usage = 0;
}
}
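For orientation, a minimal usage sketch of the creator (not part of this change; it mirrors the unit test further down and assumes an async context that can propagate this module's `Result`):

// Build a bloom filter blob for three rows, two rows per segment.
let mut creator = BloomFilterCreator::new(2);
creator.push_row_elems(vec![b"host-1".to_vec(), b"dc-east".to_vec()]);
creator.push_row_elems(vec![b"host-2".to_vec(), b"dc-east".to_vec()]); // segment 0 is finalized here
creator.push_row_elems(vec![b"host-3".to_vec(), b"dc-west".to_vec()]);

let mut blob = futures::io::Cursor::new(Vec::new());
// Writes: [bloom filter 0][bloom filter 1][JSON BloomFilterMeta][u32 LE meta size]
creator.finish(&mut blob).await?;
assert!(!blob.into_inner().is_empty());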
/// Storage for finalized Bloom filters.
///
/// TODO(zhongzc): Add support for storing intermediate bloom filters on disk to control memory usage.
#[derive(Debug, Default)]
struct FinalizedBloomFilterStorage {
/// Bloom filters that are stored in memory.
in_memory: Vec<FinalizedBloomFilterSegment>,
}
impl FinalizedBloomFilterStorage {
fn memory_usage(&self) -> usize {
self.in_memory.iter().map(|s| s.size).sum()
}
/// Adds a new finalized Bloom filter to the storage.
///
/// TODO(zhongzc): Add support for flushing to disk.
fn add(&mut self, elems: impl IntoIterator<Item = Bytes>, elem_count: usize) {
let mut bf = BloomFilter::with_false_pos(FALSE_POSITIVE_RATE)
.seed(&SEED)
.expected_items(elem_count);
for elem in elems.into_iter() {
bf.insert(&elem);
}
let cbf = FinalizedBloomFilterSegment::new(bf, elem_count);
self.in_memory.push(cbf);
}
fn len(&self) -> usize {
self.in_memory.len()
}
fn drain(&mut self) -> impl Iterator<Item = FinalizedBloomFilterSegment> + '_ {
self.in_memory.drain(..)
}
}
/// A finalized Bloom filter segment.
#[derive(Debug)]
struct FinalizedBloomFilterSegment {
/// The underlying Bloom filter.
bloom_filter: BloomFilter,
/// The number of elements in the Bloom filter.
element_count: usize,
/// The occupied memory size of the Bloom filter.
size: usize,
}
impl FinalizedBloomFilterSegment {
fn new(bloom_filter: BloomFilter, elem_count: usize) -> Self {
let memory_usage = std::mem::size_of_val(bloom_filter.as_slice());
Self {
bloom_filter,
element_count: elem_count,
size: memory_usage,
}
}
}
/// Writes a slice of `u64` to the buffer in little-endian order.
fn write_u64_slice(buf: &mut Vec<u8>, slice: &[u64]) {
buf.reserve(std::mem::size_of_val(slice));
for &x in slice {
buf.extend_from_slice(&x.to_le_bytes());
}
}
#[cfg(test)]
mod tests {
use futures::io::Cursor;
use super::*;
fn u64_vec_from_bytes(bytes: &[u8]) -> Vec<u64> {
bytes
.chunks_exact(std::mem::size_of::<u64>())
.map(|chunk| u64::from_le_bytes(chunk.try_into().unwrap()))
.collect()
}
#[tokio::test]
async fn test_bloom_filter_creator() {
let mut writer = Cursor::new(Vec::new());
let mut creator = BloomFilterCreator::new(2);
creator.push_row_elems(vec![b"a".to_vec(), b"b".to_vec()]);
assert!(creator.cur_seg_distinct_elems_mem_usage > 0);
assert!(creator.memory_usage() > 0);
creator.push_row_elems(vec![b"c".to_vec(), b"d".to_vec()]);
// Finalize the first segment
assert!(creator.cur_seg_distinct_elems_mem_usage == 0);
assert!(creator.memory_usage() > 0);
creator.push_row_elems(vec![b"e".to_vec(), b"f".to_vec()]);
assert!(creator.cur_seg_distinct_elems_mem_usage > 0);
assert!(creator.memory_usage() > 0);
creator.finish(&mut writer).await.unwrap();
let bytes = writer.into_inner();
let total_size = bytes.len();
let meta_size_offset = total_size - 4;
let meta_size = u32::from_le_bytes((&bytes[meta_size_offset..]).try_into().unwrap());
let meta_bytes = &bytes[total_size - meta_size as usize - 4..total_size - 4];
let meta: BloomFilterMeta = serde_json::from_slice(meta_bytes).unwrap();
assert_eq!(meta.rows_per_segment, 2);
assert_eq!(meta.seg_count, 2);
assert_eq!(meta.row_count, 3);
assert_eq!(
meta.bloom_filter_segments_size + meta_bytes.len() + 4,
total_size
);
let mut bfs = Vec::new();
for segment in meta.bloom_filter_segments {
let bloom_filter_bytes =
&bytes[segment.offset as usize..(segment.offset + segment.size) as usize];
let v = u64_vec_from_bytes(bloom_filter_bytes);
let bloom_filter = BloomFilter::from_vec(v)
.seed(&SEED)
.expected_items(segment.elem_count);
bfs.push(bloom_filter);
}
assert_eq!(bfs.len(), 2);
assert!(bfs[0].contains(&b"a"));
assert!(bfs[0].contains(&b"b"));
assert!(bfs[0].contains(&b"c"));
assert!(bfs[0].contains(&b"d"));
assert!(bfs[1].contains(&b"e"));
assert!(bfs[1].contains(&b"f"));
}
}

View File

@@ -0,0 +1,66 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::any::Any;
use common_error::ext::{BoxedError, ErrorExt};
use common_error::status_code::StatusCode;
use common_macro::stack_trace_debug;
use snafu::{Location, Snafu};
#[derive(Snafu)]
#[snafu(visibility(pub))]
#[stack_trace_debug]
pub enum Error {
#[snafu(display("IO error"))]
Io {
#[snafu(source)]
error: std::io::Error,
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Failed to serde json"))]
SerdeJson {
#[snafu(source)]
error: serde_json::error::Error,
#[snafu(implicit)]
location: Location,
},
#[snafu(display("External error"))]
External {
source: BoxedError,
#[snafu(implicit)]
location: Location,
},
}
impl ErrorExt for Error {
fn status_code(&self) -> StatusCode {
use Error::*;
match self {
Io { .. } | Self::SerdeJson { .. } => StatusCode::Unexpected,
External { source, .. } => source.status_code(),
}
}
fn as_any(&self) -> &dyn Any {
self
}
}
pub type Result<T> = std::result::Result<T, Error>;

View File

@@ -26,14 +26,6 @@ use crate::inverted_index::search::predicate::Predicate;
#[snafu(visibility(pub))]
#[stack_trace_debug]
pub enum Error {
#[snafu(display("Failed to seek"))]
Seek {
#[snafu(source)]
error: IoError,
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Failed to read"))]
Read {
#[snafu(source)]
@@ -76,6 +68,18 @@ pub enum Error {
location: Location,
},
#[snafu(display("Blob size too small"))]
BlobSizeTooSmall {
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Invalid footer payload size"))]
InvalidFooterPayloadSize {
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Unexpected inverted index footer payload size, max: {max_payload_size}, actual: {actual_payload_size}"))]
UnexpectedFooterPayloadSize {
max_payload_size: u64,
@@ -215,8 +219,7 @@ impl ErrorExt for Error {
fn status_code(&self) -> StatusCode {
use Error::*;
match self {
Seek { .. }
| Read { .. }
Read { .. }
| Write { .. }
| Flush { .. }
| Close { .. }
@@ -229,7 +232,9 @@ impl ErrorExt for Error {
| KeysApplierUnexpectedPredicates { .. }
| CommonIo { .. }
| UnknownIntermediateCodecMagic { .. }
| FstCompile { .. } => StatusCode::Unexpected,
| FstCompile { .. }
| InvalidFooterPayloadSize { .. }
| BlobSizeTooSmall { .. } => StatusCode::Unexpected,
ParseRegex { .. }
| ParseDFA { .. }

View File

@@ -12,9 +12,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::ops::Range;
use std::sync::Arc;
use async_trait::async_trait;
use bytes::Bytes;
use common_base::BitVec;
use greptime_proto::v1::index::InvertedIndexMetas;
use snafu::ResultExt;
@@ -30,23 +32,23 @@ mod footer;
#[mockall::automock]
#[async_trait]
pub trait InvertedIndexReader: Send {
/// Reads all data to dest.
async fn read_all(&mut self, dest: &mut Vec<u8>) -> Result<usize>;
/// Seeks to given offset and reads data with exact size as provided.
async fn seek_read(&mut self, offset: u64, size: u32) -> Result<Vec<u8>>;
async fn range_read(&mut self, offset: u64, size: u32) -> Result<Vec<u8>>;
/// Reads the bytes in the given ranges.
async fn read_vec(&mut self, ranges: &[Range<u64>]) -> Result<Vec<Bytes>>;
/// Retrieves metadata of all inverted indices stored within the blob.
async fn metadata(&mut self) -> Result<Arc<InvertedIndexMetas>>;
/// Retrieves the finite state transducer (FST) map from the given offset and size.
async fn fst(&mut self, offset: u64, size: u32) -> Result<FstMap> {
let fst_data = self.seek_read(offset, size).await?;
let fst_data = self.range_read(offset, size).await?;
FstMap::new(fst_data).context(DecodeFstSnafu)
}
/// Retrieves the bitmap from the given offset and size.
async fn bitmap(&mut self, offset: u64, size: u32) -> Result<BitVec> {
self.seek_read(offset, size).await.map(BitVec::from_vec)
self.range_read(offset, size).await.map(BitVec::from_vec)
}
}

View File

@@ -12,15 +12,18 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::ops::Range;
use std::sync::Arc;
use async_trait::async_trait;
use bytes::Bytes;
use common_base::range_read::RangeReader;
use greptime_proto::v1::index::InvertedIndexMetas;
use snafu::{ensure, ResultExt};
use super::footer::DEFAULT_PREFETCH_SIZE;
use crate::inverted_index::error::{CommonIoSnafu, Result, UnexpectedBlobSizeSnafu};
use crate::inverted_index::format::reader::footer::InvertedIndeFooterReader;
use crate::inverted_index::format::reader::footer::InvertedIndexFooterReader;
use crate::inverted_index::format::reader::InvertedIndexReader;
use crate::inverted_index::format::MIN_BLOB_SIZE;
@@ -49,16 +52,7 @@ impl<R> InvertedIndexBlobReader<R> {
#[async_trait]
impl<R: RangeReader> InvertedIndexReader for InvertedIndexBlobReader<R> {
async fn read_all(&mut self, dest: &mut Vec<u8>) -> Result<usize> {
let metadata = self.source.metadata().await.context(CommonIoSnafu)?;
self.source
.read_into(0..metadata.content_length, dest)
.await
.context(CommonIoSnafu)?;
Ok(metadata.content_length as usize)
}
async fn seek_read(&mut self, offset: u64, size: u32) -> Result<Vec<u8>> {
async fn range_read(&mut self, offset: u64, size: u32) -> Result<Vec<u8>> {
let buf = self
.source
.read(offset..offset + size as u64)
@@ -67,12 +61,17 @@ impl<R: RangeReader> InvertedIndexReader for InvertedIndexBlobReader<R> {
Ok(buf.into())
}
async fn read_vec(&mut self, ranges: &[Range<u64>]) -> Result<Vec<Bytes>> {
self.source.read_vec(ranges).await.context(CommonIoSnafu)
}
async fn metadata(&mut self) -> Result<Arc<InvertedIndexMetas>> {
let metadata = self.source.metadata().await.context(CommonIoSnafu)?;
let blob_size = metadata.content_length;
Self::validate_blob_size(blob_size)?;
let mut footer_reader = InvertedIndeFooterReader::new(&mut self.source, blob_size);
let mut footer_reader = InvertedIndexFooterReader::new(&mut self.source, blob_size)
.with_prefetch_size(DEFAULT_PREFETCH_SIZE);
footer_reader.metadata().await.map(Arc::new)
}
}

View File

@@ -18,53 +18,88 @@ use prost::Message;
use snafu::{ensure, ResultExt};
use crate::inverted_index::error::{
CommonIoSnafu, DecodeProtoSnafu, Result, UnexpectedFooterPayloadSizeSnafu,
UnexpectedOffsetSizeSnafu, UnexpectedZeroSegmentRowCountSnafu,
BlobSizeTooSmallSnafu, CommonIoSnafu, DecodeProtoSnafu, InvalidFooterPayloadSizeSnafu, Result,
UnexpectedFooterPayloadSizeSnafu, UnexpectedOffsetSizeSnafu,
UnexpectedZeroSegmentRowCountSnafu,
};
use crate::inverted_index::format::FOOTER_PAYLOAD_SIZE_SIZE;
/// InvertedIndeFooterReader is for reading the footer section of the blob.
pub struct InvertedIndeFooterReader<R> {
pub const DEFAULT_PREFETCH_SIZE: u64 = 1024; // 1KiB
/// InvertedIndexFooterReader is for reading the footer section of the blob.
pub struct InvertedIndexFooterReader<R> {
source: R,
blob_size: u64,
prefetch_size: Option<u64>,
}
impl<R> InvertedIndeFooterReader<R> {
impl<R> InvertedIndexFooterReader<R> {
pub fn new(source: R, blob_size: u64) -> Self {
Self { source, blob_size }
Self {
source,
blob_size,
prefetch_size: None,
}
}
/// Set the prefetch size for the footer reader.
pub fn with_prefetch_size(mut self, prefetch_size: u64) -> Self {
self.prefetch_size = Some(prefetch_size.max(FOOTER_PAYLOAD_SIZE_SIZE));
self
}
pub fn prefetch_size(&self) -> u64 {
self.prefetch_size.unwrap_or(FOOTER_PAYLOAD_SIZE_SIZE)
}
}
impl<R: RangeReader> InvertedIndeFooterReader<R> {
impl<R: RangeReader> InvertedIndexFooterReader<R> {
pub async fn metadata(&mut self) -> Result<InvertedIndexMetas> {
let payload_size = self.read_payload_size().await?;
let metas = self.read_payload(payload_size).await?;
Ok(metas)
}
ensure!(
self.blob_size >= FOOTER_PAYLOAD_SIZE_SIZE,
BlobSizeTooSmallSnafu
);
async fn read_payload_size(&mut self) -> Result<u64> {
let mut size_buf = [0u8; FOOTER_PAYLOAD_SIZE_SIZE as usize];
let end = self.blob_size;
let start = end - FOOTER_PAYLOAD_SIZE_SIZE;
self.source
.read_into(start..end, &mut &mut size_buf[..])
let footer_start = self.blob_size.saturating_sub(self.prefetch_size());
let suffix = self
.source
.read(footer_start..self.blob_size)
.await
.context(CommonIoSnafu)?;
let suffix_len = suffix.len();
let length = u32::from_le_bytes(Self::read_tailing_four_bytes(&suffix)?) as u64;
self.validate_payload_size(length)?;
let payload_size = u32::from_le_bytes(size_buf) as u64;
self.validate_payload_size(payload_size)?;
let footer_size = FOOTER_PAYLOAD_SIZE_SIZE;
Ok(payload_size)
// The initial read did not fetch the entire footer payload, so a second request is needed.
if length > suffix_len as u64 - footer_size {
let metadata_start = self.blob_size - length - footer_size;
let meta = self
.source
.read(metadata_start..self.blob_size - footer_size)
.await
.context(CommonIoSnafu)?;
self.parse_payload(&meta, length)
} else {
let metadata_start = self.blob_size - length - footer_size - footer_start;
let meta = &suffix[metadata_start as usize..suffix_len - footer_size as usize];
self.parse_payload(meta, length)
}
}
async fn read_payload(&mut self, payload_size: u64) -> Result<InvertedIndexMetas> {
let end = self.blob_size - FOOTER_PAYLOAD_SIZE_SIZE;
let start = end - payload_size;
let bytes = self.source.read(start..end).await.context(CommonIoSnafu)?;
fn read_tailing_four_bytes(suffix: &[u8]) -> Result<[u8; 4]> {
let suffix_len = suffix.len();
ensure!(suffix_len >= 4, InvalidFooterPayloadSizeSnafu);
let mut bytes = [0; 4];
bytes.copy_from_slice(&suffix[suffix_len - 4..suffix_len]);
let metas = InvertedIndexMetas::decode(&*bytes).context(DecodeProtoSnafu)?;
Ok(bytes)
}
fn parse_payload(&mut self, bytes: &[u8], payload_size: u64) -> Result<InvertedIndexMetas> {
let metas = InvertedIndexMetas::decode(bytes).context(DecodeProtoSnafu)?;
self.validate_metas(&metas, payload_size)?;
Ok(metas)
}
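A worked example of the offset arithmetic above, with illustrative numbers (`FOOTER_PAYLOAD_SIZE_SIZE` is the trailing 4-byte length field):

// blob_size = 10_000 bytes, prefetch_size = 1_024, footer size field = 4 bytes.
let blob_size: u64 = 10_000;
let prefetch: u64 = 1_024;
let footer_size: u64 = 4; // FOOTER_PAYLOAD_SIZE_SIZE
let footer_start = blob_size.saturating_sub(prefetch); // 8_976; the suffix read covers 8_976..10_000

// Case 1: a 300-byte payload fits in the suffix (300 <= 1_024 - 4), so it is sliced
// locally at [10_000 - 300 - 4 - 8_976 .. 1_024 - 4) = [720..1_020) with no extra I/O.
assert_eq!(blob_size - 300 - footer_size - footer_start, 720);

// Case 2: a 2_000-byte payload does not fit (2_000 > 1_020), so a second read fetches
// [10_000 - 2_000 - 4 .. 10_000 - 4) = [7_996..9_996) and the payload is decoded from it.
assert_eq!((blob_size - 2_000 - footer_size, blob_size - footer_size), (7_996, 9_996));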
@@ -113,9 +148,12 @@ impl<R: RangeReader> InvertedIndeFooterReader<R> {
#[cfg(test)]
mod tests {
use std::assert_matches::assert_matches;
use prost::Message;
use super::*;
use crate::inverted_index::error::Error;
fn create_test_payload(meta: InvertedIndexMeta) -> Vec<u8> {
let mut metas = InvertedIndexMetas {
@@ -141,14 +179,18 @@ mod tests {
let mut payload_buf = create_test_payload(meta);
let blob_size = payload_buf.len() as u64;
let mut reader = InvertedIndeFooterReader::new(&mut payload_buf, blob_size);
let payload_size = reader.read_payload_size().await.unwrap();
let metas = reader.read_payload(payload_size).await.unwrap();
for prefetch in [0, blob_size / 2, blob_size, blob_size + 10] {
let mut reader = InvertedIndexFooterReader::new(&mut payload_buf, blob_size);
if prefetch > 0 {
reader = reader.with_prefetch_size(prefetch);
}
assert_eq!(metas.metas.len(), 1);
let index_meta = &metas.metas.get("test").unwrap();
assert_eq!(index_meta.name, "test");
let metas = reader.metadata().await.unwrap();
assert_eq!(metas.metas.len(), 1);
let index_meta = &metas.metas.get("test").unwrap();
assert_eq!(index_meta.name, "test");
}
}
#[tokio::test]
@@ -157,14 +199,20 @@ mod tests {
name: "test".to_string(),
..Default::default()
};
let mut payload_buf = create_test_payload(meta);
payload_buf.push(0xff); // Add an extra byte to corrupt the footer
let blob_size = payload_buf.len() as u64;
let mut reader = InvertedIndeFooterReader::new(&mut payload_buf, blob_size);
let payload_size_result = reader.read_payload_size().await;
assert!(payload_size_result.is_err());
for prefetch in [0, blob_size / 2, blob_size, blob_size + 10] {
let blob_size = payload_buf.len() as u64;
let mut reader = InvertedIndexFooterReader::new(&mut payload_buf, blob_size);
if prefetch > 0 {
reader = reader.with_prefetch_size(prefetch);
}
let result = reader.metadata().await;
assert_matches!(result, Err(Error::UnexpectedFooterPayloadSize { .. }));
}
}
#[tokio::test]
@@ -178,10 +226,15 @@ mod tests {
let mut payload_buf = create_test_payload(meta);
let blob_size = payload_buf.len() as u64;
let mut reader = InvertedIndeFooterReader::new(&mut payload_buf, blob_size);
let payload_size = reader.read_payload_size().await.unwrap();
let payload_result = reader.read_payload(payload_size).await;
assert!(payload_result.is_err());
for prefetch in [0, blob_size / 2, blob_size, blob_size + 10] {
let mut reader = InvertedIndexFooterReader::new(&mut payload_buf, blob_size);
if prefetch > 0 {
reader = reader.with_prefetch_size(prefetch);
}
let result = reader.metadata().await;
assert_matches!(result, Err(Error::UnexpectedOffsetSize { .. }));
}
}
}

View File

@@ -13,6 +13,8 @@
// limitations under the License.
#![feature(iter_partition_in_place)]
#![feature(assert_matches)]
pub mod bloom_filter;
pub mod fulltext_index;
pub mod inverted_index;

View File

@@ -204,10 +204,6 @@ impl Context {
pub fn reset_in_memory(&self) {
self.in_memory.reset();
}
pub fn reset_leader_cached_kv_backend(&self) {
self.leader_cached_kv_backend.reset();
}
}
/// The value of the leader. It is used to store the leader's address.

View File

@@ -52,11 +52,6 @@ pub async fn mock_with_etcdstore(addr: &str) -> MockInfo {
mock(Default::default(), kv_backend, None, None, None).await
}
pub async fn mock_with_memstore_and_selector(selector: SelectorRef) -> MockInfo {
let kv_backend = Arc::new(MemoryKvBackend::new());
mock(Default::default(), kv_backend, Some(selector), None, None).await
}
pub async fn mock(
opts: MetasrvOptions,
kv_backend: KvBackendRef,

View File

@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
pub(crate) mod close_downgraded_region;
pub(crate) mod downgrade_leader_region;
pub(crate) mod manager;
pub(crate) mod migration_abort;
@@ -43,6 +44,7 @@ use common_procedure::error::{
Error as ProcedureError, FromJsonSnafu, Result as ProcedureResult, ToJsonSnafu,
};
use common_procedure::{Context as ProcedureContext, LockKey, Procedure, Status, StringKey};
use common_telemetry::info;
use manager::RegionMigrationProcedureGuard;
pub use manager::{
RegionMigrationManagerRef, RegionMigrationProcedureTask, RegionMigrationProcedureTracker,
@@ -91,7 +93,9 @@ impl PersistentContext {
let lock_key = vec![
CatalogLock::Read(&self.catalog).into(),
SchemaLock::read(&self.catalog, &self.schema).into(),
TableLock::Read(region_id.table_id()).into(),
// The optimistic updating of table route is not working very well,
// so we need to use the write lock here.
TableLock::Write(region_id.table_id()).into(),
RegionLock::Write(region_id).into(),
];
@@ -253,7 +257,7 @@ impl Context {
.await
.context(error::TableMetadataManagerSnafu)
.map_err(BoxedError::new)
.context(error::RetryLaterWithSourceSnafu {
.with_context(|_| error::RetryLaterWithSourceSnafu {
reason: format!("Failed to get TableRoute: {table_id}"),
})?
.context(error::TableRouteNotFoundSnafu { table_id })?;
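The switch from `context(...)` to `with_context(|_| ...)` in these retry paths is presumably about laziness: with snafu, the eager form builds the context selector, and therefore runs the `format!`, even when the result is `Ok`, while the closure form only runs on the error path. A schematic comparison (`lookup_a` and `lookup_b` are stand-in `Result`s, not names from this diff):

// Eager: the reason string is formatted on every call, success or not.
lookup_a.context(error::RetryLaterWithSourceSnafu {
    reason: format!("Failed to get TableRoute: {table_id}"),
})?;

// Lazy: the closure, and therefore the format!, only runs when the result is an Err.
lookup_b.with_context(|_| error::RetryLaterWithSourceSnafu {
    reason: format!("Failed to get TableRoute: {table_id}"),
})?;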
@@ -317,7 +321,7 @@ impl Context {
.await
.context(error::TableMetadataManagerSnafu)
.map_err(BoxedError::new)
.context(error::RetryLaterWithSourceSnafu {
.with_context(|_| error::RetryLaterWithSourceSnafu {
reason: format!("Failed to get TableInfo: {table_id}"),
})?
.context(error::TableInfoNotFoundSnafu { table_id })?;
@@ -350,7 +354,7 @@ impl Context {
.await
.context(error::TableMetadataManagerSnafu)
.map_err(BoxedError::new)
.context(error::RetryLaterWithSourceSnafu {
.with_context(|_| error::RetryLaterWithSourceSnafu {
reason: format!("Failed to get DatanodeTable: ({datanode_id},{table_id})"),
})?
.context(error::DatanodeTableNotFoundSnafu {
@@ -364,12 +368,6 @@ impl Context {
Ok(datanode_value.as_ref().unwrap())
}
/// Removes the `table_info` of [VolatileContext], returns true if any.
pub fn remove_table_info_value(&mut self) -> bool {
let value = self.volatile_ctx.table_info.take();
value.is_some()
}
/// Returns the [RegionId].
pub fn region_id(&self) -> RegionId {
self.persistent_ctx.region_id
@@ -474,6 +472,48 @@ impl RegionMigrationProcedure {
_guard: guard,
})
}
async fn rollback_inner(&mut self) -> Result<()> {
let _timer = METRIC_META_REGION_MIGRATION_EXECUTE
.with_label_values(&["rollback"])
.start_timer();
let table_id = self.context.region_id().table_id();
let region_id = self.context.region_id();
self.context.remove_table_route_value();
let table_metadata_manager = self.context.table_metadata_manager.clone();
let table_route = self.context.get_table_route_value().await?;
// Safety: It must be a physical table route.
let downgraded = table_route
.region_routes()
.unwrap()
.iter()
.filter(|route| route.region.id == region_id)
.any(|route| route.is_leader_downgrading());
if downgraded {
info!("Rollbacking downgraded region leader table route, region: {region_id}");
table_metadata_manager
.update_leader_region_status(table_id, table_route, |route| {
if route.region.id == region_id {
Some(None)
} else {
None
}
})
.await
.context(error::TableMetadataManagerSnafu)
.map_err(BoxedError::new)
.with_context(|_| error::RetryLaterWithSourceSnafu {
reason: format!("Failed to update the table route during the rollback downgraded leader region: {region_id}"),
})?;
}
self.context.register_failure_detectors().await;
Ok(())
}
}
#[async_trait::async_trait]
@@ -482,6 +522,16 @@ impl Procedure for RegionMigrationProcedure {
Self::TYPE_NAME
}
async fn rollback(&mut self, _ctx: &ProcedureContext) -> ProcedureResult<()> {
self.rollback_inner()
.await
.map_err(ProcedureError::external)
}
fn rollback_supported(&self) -> bool {
true
}
async fn execute(&mut self, _ctx: &ProcedureContext) -> ProcedureResult<Status> {
let state = &mut self.state;
@@ -707,6 +757,12 @@ mod tests {
Assertion::simple(assert_update_metadata_upgrade, assert_no_persist),
),
// UpdateMetadata::Upgrade
Step::next(
"Should be the close downgraded region",
None,
Assertion::simple(assert_close_downgraded_region, assert_no_persist),
),
// CloseDowngradedRegion
Step::next(
"Should be the region migration end",
None,
@@ -1077,6 +1133,12 @@ mod tests {
Assertion::simple(assert_update_metadata_upgrade, assert_no_persist),
),
// UpdateMetadata::Upgrade
Step::next(
"Should be the close downgraded region",
None,
Assertion::simple(assert_close_downgraded_region, assert_no_persist),
),
// CloseDowngradedRegion
Step::next(
"Should be the region migration end",
None,

View File

@@ -0,0 +1,138 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::any::Any;
use std::time::Duration;
use api::v1::meta::MailboxMessage;
use common_meta::distributed_time_constants::MAILBOX_RTT_SECS;
use common_meta::instruction::{Instruction, InstructionReply, SimpleReply};
use common_meta::key::datanode_table::RegionInfo;
use common_meta::RegionIdent;
use common_procedure::Status;
use common_telemetry::{info, warn};
use serde::{Deserialize, Serialize};
use snafu::ResultExt;
use crate::error::{self, Result};
use crate::handler::HeartbeatMailbox;
use crate::procedure::region_migration::migration_end::RegionMigrationEnd;
use crate::procedure::region_migration::{Context, State};
use crate::service::mailbox::Channel;
const CLOSE_DOWNGRADED_REGION_TIMEOUT: Duration = Duration::from_secs(MAILBOX_RTT_SECS);
#[derive(Debug, Serialize, Deserialize)]
pub struct CloseDowngradedRegion;
#[async_trait::async_trait]
#[typetag::serde]
impl State for CloseDowngradedRegion {
async fn next(&mut self, ctx: &mut Context) -> Result<(Box<dyn State>, Status)> {
if let Err(err) = self.close_downgraded_leader_region(ctx).await {
let downgrade_leader_datanode = &ctx.persistent_ctx.from_peer;
let region_id = ctx.region_id();
warn!(err; "Failed to close downgraded leader region: {region_id} on datanode {:?}", downgrade_leader_datanode);
}
Ok((Box::new(RegionMigrationEnd), Status::done()))
}
fn as_any(&self) -> &dyn Any {
self
}
}
impl CloseDowngradedRegion {
/// Builds close region instruction.
///
/// Abort(non-retry):
/// - Datanode Table is not found.
async fn build_close_region_instruction(&self, ctx: &mut Context) -> Result<Instruction> {
let pc = &ctx.persistent_ctx;
let downgrade_leader_datanode_id = pc.from_peer.id;
let cluster_id = pc.cluster_id;
let table_id = pc.region_id.table_id();
let region_number = pc.region_id.region_number();
let datanode_table_value = ctx.get_from_peer_datanode_table_value().await?;
let RegionInfo { engine, .. } = datanode_table_value.region_info.clone();
Ok(Instruction::CloseRegion(RegionIdent {
cluster_id,
datanode_id: downgrade_leader_datanode_id,
table_id,
region_number,
engine,
}))
}
/// Closes the downgraded leader region.
async fn close_downgraded_leader_region(&self, ctx: &mut Context) -> Result<()> {
let close_instruction = self.build_close_region_instruction(ctx).await?;
let region_id = ctx.region_id();
let pc = &ctx.persistent_ctx;
let downgrade_leader_datanode = &pc.from_peer;
let msg = MailboxMessage::json_message(
&format!("Close downgraded region: {}", region_id),
&format!("Meta@{}", ctx.server_addr()),
&format!(
"Datanode-{}@{}",
downgrade_leader_datanode.id, downgrade_leader_datanode.addr
),
common_time::util::current_time_millis(),
&close_instruction,
)
.with_context(|_| error::SerializeToJsonSnafu {
input: close_instruction.to_string(),
})?;
let ch = Channel::Datanode(downgrade_leader_datanode.id);
let receiver = ctx
.mailbox
.send(&ch, msg, CLOSE_DOWNGRADED_REGION_TIMEOUT)
.await?;
match receiver.await? {
Ok(msg) => {
let reply = HeartbeatMailbox::json_reply(&msg)?;
info!(
"Received close downgraded leade region reply: {:?}, region: {}",
reply, region_id
);
let InstructionReply::CloseRegion(SimpleReply { result, error }) = reply else {
return error::UnexpectedInstructionReplySnafu {
mailbox_message: msg.to_string(),
reason: "expect close region reply",
}
.fail();
};
if result {
Ok(())
} else {
error::UnexpectedSnafu {
violated: format!(
"Failed to close downgraded leader region: {region_id} on datanode {:?}, error: {error:?}",
downgrade_leader_datanode,
),
}
.fail()
}
}
Err(e) => Err(e),
}
}
}

View File

@@ -21,11 +21,11 @@ use serde::{Deserialize, Serialize};
use snafu::{OptionExt, ResultExt};
use store_api::storage::RegionId;
use super::migration_abort::RegionMigrationAbort;
use super::migration_end::RegionMigrationEnd;
use super::open_candidate_region::OpenCandidateRegion;
use super::update_metadata::UpdateMetadata;
use crate::error::{self, Result};
use crate::procedure::region_migration::migration_abort::RegionMigrationAbort;
use crate::procedure::region_migration::migration_end::RegionMigrationEnd;
use crate::procedure::region_migration::open_candidate_region::OpenCandidateRegion;
use crate::procedure::region_migration::update_metadata::UpdateMetadata;
use crate::procedure::region_migration::{Context, State};
/// The behaviors:

View File

@@ -25,9 +25,9 @@ use common_telemetry::info;
use serde::{Deserialize, Serialize};
use snafu::{OptionExt, ResultExt};
use super::update_metadata::UpdateMetadata;
use crate::error::{self, Result};
use crate::handler::HeartbeatMailbox;
use crate::procedure::region_migration::update_metadata::UpdateMetadata;
use crate::procedure::region_migration::{Context, State};
use crate::service::mailbox::Channel;
@@ -145,7 +145,10 @@ impl OpenCandidateRegion {
match receiver.await? {
Ok(msg) => {
let reply = HeartbeatMailbox::json_reply(&msg)?;
info!("Received open region reply: {:?}", reply);
info!(
"Received open region reply: {:?}, region: {}",
reply, region_id
);
let InstructionReply::OpenRegion(SimpleReply { result, error }) = reply else {
return error::UnexpectedInstructionReplySnafu {
mailbox_message: msg.to_string(),

View File

@@ -44,19 +44,21 @@ use store_api::storage::RegionId;
use table::metadata::RawTableInfo;
use tokio::sync::mpsc::{Receiver, Sender};
use super::manager::RegionMigrationProcedureTracker;
use super::migration_abort::RegionMigrationAbort;
use super::upgrade_candidate_region::UpgradeCandidateRegion;
use super::{Context, ContextFactory, DefaultContextFactory, State, VolatileContext};
use crate::cache_invalidator::MetasrvCacheInvalidator;
use crate::error::{self, Error, Result};
use crate::handler::{HeartbeatMailbox, Pusher, Pushers};
use crate::metasrv::MetasrvInfo;
use crate::procedure::region_migration::close_downgraded_region::CloseDowngradedRegion;
use crate::procedure::region_migration::downgrade_leader_region::DowngradeLeaderRegion;
use crate::procedure::region_migration::manager::RegionMigrationProcedureTracker;
use crate::procedure::region_migration::migration_abort::RegionMigrationAbort;
use crate::procedure::region_migration::migration_end::RegionMigrationEnd;
use crate::procedure::region_migration::open_candidate_region::OpenCandidateRegion;
use crate::procedure::region_migration::update_metadata::UpdateMetadata;
use crate::procedure::region_migration::PersistentContext;
use crate::procedure::region_migration::upgrade_candidate_region::UpgradeCandidateRegion;
use crate::procedure::region_migration::{
Context, ContextFactory, DefaultContextFactory, PersistentContext, State, VolatileContext,
};
use crate::service::mailbox::{Channel, MailboxRef};
pub type MockHeartbeatReceiver = Receiver<std::result::Result<HeartbeatResponse, tonic::Status>>;
@@ -569,6 +571,14 @@ pub(crate) fn assert_region_migration_end(next: &dyn State) {
let _ = next.as_any().downcast_ref::<RegionMigrationEnd>().unwrap();
}
/// Asserts the [State] should be [CloseDowngradedRegion].
pub(crate) fn assert_close_downgraded_region(next: &dyn State) {
let _ = next
.as_any()
.downcast_ref::<CloseDowngradedRegion>()
.unwrap();
}
/// Asserts the [State] should be [RegionMigrationAbort].
pub(crate) fn assert_region_migration_abort(next: &dyn State) {
let _ = next

View File

@@ -22,10 +22,10 @@ use common_procedure::Status;
use common_telemetry::warn;
use serde::{Deserialize, Serialize};
use super::migration_abort::RegionMigrationAbort;
use super::migration_end::RegionMigrationEnd;
use crate::error::Result;
use crate::procedure::region_migration::close_downgraded_region::CloseDowngradedRegion;
use crate::procedure::region_migration::downgrade_leader_region::DowngradeLeaderRegion;
use crate::procedure::region_migration::migration_abort::RegionMigrationAbort;
use crate::procedure::region_migration::{Context, State};
#[derive(Debug, Serialize, Deserialize)]
@@ -58,7 +58,7 @@ impl State for UpdateMetadata {
if let Err(err) = ctx.invalidate_table_cache().await {
warn!("Failed to broadcast the invalidate table cache message during the upgrade candidate, error: {err:?}");
};
Ok((Box::new(RegionMigrationEnd), Status::done()))
Ok((Box::new(CloseDowngradedRegion), Status::executing(false)))
}
UpdateMetadata::Rollback => {
self.rollback_downgraded_region(ctx).await?;

View File

@@ -195,7 +195,7 @@ mod tests {
use store_api::storage::RegionId;
use crate::error::Error;
use crate::procedure::region_migration::migration_end::RegionMigrationEnd;
use crate::procedure::region_migration::close_downgraded_region::CloseDowngradedRegion;
use crate::procedure::region_migration::test_util::{self, TestingEnv};
use crate::procedure::region_migration::update_metadata::UpdateMetadata;
use crate::procedure::region_migration::{ContextFactory, PersistentContext, State};
@@ -443,7 +443,7 @@ mod tests {
}
#[tokio::test]
async fn test_next_migration_end_state() {
async fn test_next_close_downgraded_region_state() {
let mut state = Box::new(UpdateMetadata::Upgrade);
let env = TestingEnv::new();
let persistent_context = new_persistent_context();
@@ -471,7 +471,10 @@ mod tests {
let (next, _) = state.next(&mut ctx).await.unwrap();
let _ = next.as_any().downcast_ref::<RegionMigrationEnd>().unwrap();
let _ = next
.as_any()
.downcast_ref::<CloseDowngradedRegion>()
.unwrap();
let table_route = table_metadata_manager
.table_route_manager()

View File

@@ -23,9 +23,9 @@ use serde::{Deserialize, Serialize};
use snafu::{ensure, OptionExt, ResultExt};
use tokio::time::{sleep, Instant};
use super::update_metadata::UpdateMetadata;
use crate::error::{self, Result};
use crate::handler::HeartbeatMailbox;
use crate::procedure::region_migration::update_metadata::UpdateMetadata;
use crate::procedure::region_migration::{Context, State};
use crate::service::mailbox::Channel;
@@ -155,7 +155,7 @@ impl UpgradeCandidateRegion {
exists,
error::UnexpectedSnafu {
violated: format!(
"Expected region {} doesn't exist on datanode {:?}",
"Candidate region {} doesn't exist on datanode {:?}",
region_id, candidate
)
}

View File

@@ -210,7 +210,6 @@ impl RegionEngine for MetricEngine {
for x in [
utils::to_metadata_region_id(region_id),
utils::to_data_region_id(region_id),
region_id,
] {
if let Err(e) = self.inner.mito.set_region_role(x, role)
&& e.status_code() != StatusCode::RegionNotFound
@@ -226,6 +225,13 @@ impl RegionEngine for MetricEngine {
region_id: RegionId,
region_role_state: SettableRegionRoleState,
) -> std::result::Result<SetRegionRoleStateResponse, BoxedError> {
self.inner
.mito
.set_region_role_state_gracefully(
utils::to_metadata_region_id(region_id),
region_role_state,
)
.await?;
self.inner
.mito
.set_region_role_state_gracefully(region_id, region_role_state)

View File

@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use common_telemetry::debug;
use snafu::ResultExt;
use store_api::region_engine::RegionEngine;
use store_api::region_request::{AffectedRows, RegionCatchupRequest, RegionRequest};
@@ -35,6 +36,7 @@ impl MetricEngineInner {
}
let metadata_region_id = utils::to_metadata_region_id(region_id);
// TODO(weny): improve the catchup; we can read the WAL entries only once.
debug!("Catchup metadata region {metadata_region_id}");
self.mito
.handle_request(
metadata_region_id,
@@ -48,6 +50,7 @@ impl MetricEngineInner {
.context(MitoCatchupOperationSnafu)?;
let data_region_id = utils::to_data_region_id(region_id);
debug!("Catchup data region {data_region_id}");
self.mito
.handle_request(
data_region_id,

View File

@@ -313,12 +313,12 @@ mod test {
let region_dir = "test_metric_region";
// assert metadata region's dir
let metadata_region_dir = join_dir(region_dir, METADATA_REGION_SUBDIR);
let exist = object_store.is_exist(&metadata_region_dir).await.unwrap();
let exist = object_store.exists(&metadata_region_dir).await.unwrap();
assert!(exist);
// assert data region's dir
let data_region_dir = join_dir(region_dir, DATA_REGION_SUBDIR);
let exist = object_store.is_exist(&data_region_dir).await.unwrap();
let exist = object_store.exists(&data_region_dir).await.unwrap();
assert!(exist);
// check mito engine

View File

@@ -17,6 +17,7 @@ aquamarine.workspace = true
async-channel = "1.9"
async-stream.workspace = true
async-trait = "0.1"
bytemuck.workspace = true
bytes.workspace = true
common-base.workspace = true
common-config.workspace = true
@@ -76,7 +77,6 @@ uuid.workspace = true
[dev-dependencies]
common-function.workspace = true
common-meta = { workspace = true, features = ["testing"] }
common-procedure-test.workspace = true
common-test-util.workspace = true
criterion = "0.4"
dotenv.workspace = true

View File

@@ -32,6 +32,7 @@ use moka::notification::RemovalCause;
use moka::sync::Cache;
use parquet::column::page::Page;
use parquet::file::metadata::ParquetMetaData;
use puffin::puffin_manager::cache::{PuffinMetadataCache, PuffinMetadataCacheRef};
use store_api::storage::{ConcreteDataType, RegionId, TimeSeriesRowSelector};
use crate::cache::cache_size::parquet_meta_size;
@@ -68,6 +69,8 @@ pub struct CacheManager {
write_cache: Option<WriteCacheRef>,
/// Cache for inverted index.
index_cache: Option<InvertedIndexCacheRef>,
/// Puffin metadata cache.
puffin_metadata_cache: Option<PuffinMetadataCacheRef>,
/// Cache for time series selectors.
selector_result_cache: Option<SelectorResultCache>,
}
@@ -217,6 +220,10 @@ impl CacheManager {
pub(crate) fn index_cache(&self) -> Option<&InvertedIndexCacheRef> {
self.index_cache.as_ref()
}
pub(crate) fn puffin_metadata_cache(&self) -> Option<&PuffinMetadataCacheRef> {
self.puffin_metadata_cache.as_ref()
}
}
/// Increases selector cache miss metrics.
@@ -237,6 +244,8 @@ pub struct CacheManagerBuilder {
page_cache_size: u64,
index_metadata_size: u64,
index_content_size: u64,
index_content_page_size: u64,
puffin_metadata_size: u64,
write_cache: Option<WriteCacheRef>,
selector_result_cache_size: u64,
}
@@ -278,6 +287,18 @@ impl CacheManagerBuilder {
self
}
/// Sets page size for index content.
pub fn index_content_page_size(mut self, bytes: u64) -> Self {
self.index_content_page_size = bytes;
self
}
/// Sets cache size for puffin metadata.
pub fn puffin_metadata_size(mut self, bytes: u64) -> Self {
self.puffin_metadata_size = bytes;
self
}
/// Sets selector result cache size.
pub fn selector_result_cache_size(mut self, bytes: u64) -> Self {
self.selector_result_cache_size = bytes;
@@ -338,8 +359,13 @@ impl CacheManagerBuilder {
})
.build()
});
let inverted_index_cache =
InvertedIndexCache::new(self.index_metadata_size, self.index_content_size);
let inverted_index_cache = InvertedIndexCache::new(
self.index_metadata_size,
self.index_content_size,
self.index_content_page_size,
);
let puffin_metadata_cache =
PuffinMetadataCache::new(self.puffin_metadata_size, &CACHE_BYTES);
let selector_result_cache = (self.selector_result_cache_size != 0).then(|| {
Cache::builder()
.max_capacity(self.selector_result_cache_size)
@@ -361,6 +387,7 @@ impl CacheManagerBuilder {
page_cache,
write_cache: self.write_cache,
index_cache: Some(Arc::new(inverted_index_cache)),
puffin_metadata_cache: Some(Arc::new(puffin_metadata_cache)),
selector_result_cache,
}
}

View File

@@ -286,7 +286,7 @@ impl FileCache {
}
async fn get_reader(&self, file_path: &str) -> object_store::Result<Option<Reader>> {
if self.local_store.is_exist(file_path).await? {
if self.local_store.exists(file_path).await? {
Ok(Some(self.local_store.reader(file_path).await?))
} else {
Ok(None)
@@ -480,7 +480,7 @@ mod tests {
cache.memory_index.run_pending_tasks().await;
// The file also not exists.
assert!(!local_store.is_exist(&file_path).await.unwrap());
assert!(!local_store.exists(&file_path).await.unwrap());
assert_eq!(0, cache.memory_index.weighted_size());
}

View File

@@ -12,14 +12,17 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::ops::Range;
use std::sync::Arc;
use api::v1::index::InvertedIndexMetas;
use async_trait::async_trait;
use bytes::Bytes;
use common_base::BitVec;
use index::inverted_index::error::DecodeFstSnafu;
use index::inverted_index::format::reader::InvertedIndexReader;
use index::inverted_index::FstMap;
use object_store::Buffer;
use prost::Message;
use snafu::ResultExt;
@@ -34,14 +37,16 @@ const INDEX_CONTENT_TYPE: &str = "index_content";
/// Inverted index blob reader with cache.
pub struct CachedInvertedIndexBlobReader<R> {
file_id: FileId,
file_size: u64,
inner: R,
cache: InvertedIndexCacheRef,
}
impl<R> CachedInvertedIndexBlobReader<R> {
pub fn new(file_id: FileId, inner: R, cache: InvertedIndexCacheRef) -> Self {
pub fn new(file_id: FileId, file_size: u64, inner: R, cache: InvertedIndexCacheRef) -> Self {
Self {
file_id,
file_size,
inner,
cache,
}
@@ -59,43 +64,71 @@ where
offset: u64,
size: u32,
) -> index::inverted_index::error::Result<Vec<u8>> {
let range = offset as usize..(offset + size as u64) as usize;
if let Some(cached) = self.cache.get_index(IndexKey {
file_id: self.file_id,
}) {
CACHE_HIT.with_label_values(&[INDEX_CONTENT_TYPE]).inc();
Ok(cached[range].to_vec())
} else {
let mut all_data = Vec::with_capacity(1024 * 1024);
self.inner.read_all(&mut all_data).await?;
let result = all_data[range].to_vec();
self.cache.put_index(
IndexKey {
file_id: self.file_id,
},
Arc::new(all_data),
);
CACHE_MISS.with_label_values(&[INDEX_CONTENT_TYPE]).inc();
Ok(result)
let keys =
IndexDataPageKey::generate_page_keys(self.file_id, offset, size, self.cache.page_size);
// Size is 0, return empty data.
if keys.is_empty() {
return Ok(Vec::new());
}
let mut data = Vec::with_capacity(keys.len());
data.resize(keys.len(), Bytes::new());
let mut cache_miss_range = vec![];
let mut cache_miss_idx = vec![];
let last_index = keys.len() - 1;
// TODO: Avoid copy as much as possible.
for (i, index) in keys.iter().enumerate() {
match self.cache.get_index(index) {
Some(page) => {
CACHE_HIT.with_label_values(&[INDEX_CONTENT_TYPE]).inc();
data[i] = page;
}
None => {
CACHE_MISS.with_label_values(&[INDEX_CONTENT_TYPE]).inc();
let base_offset = index.page_id * self.cache.page_size;
let pruned_size = if i == last_index {
prune_size(&keys, self.file_size, self.cache.page_size)
} else {
self.cache.page_size
};
cache_miss_range.push(base_offset..base_offset + pruned_size);
cache_miss_idx.push(i);
}
}
}
if !cache_miss_range.is_empty() {
let pages = self.inner.read_vec(&cache_miss_range).await?;
for (i, page) in cache_miss_idx.into_iter().zip(pages.into_iter()) {
let key = keys[i].clone();
data[i] = page.clone();
self.cache.put_index(key, page.clone());
}
}
let buffer = Buffer::from_iter(data.into_iter());
Ok(buffer
.slice(IndexDataPageKey::calculate_range(
offset,
size,
self.cache.page_size,
))
.to_vec())
}
}
#[async_trait]
impl<R: InvertedIndexReader> InvertedIndexReader for CachedInvertedIndexBlobReader<R> {
async fn read_all(
&mut self,
dest: &mut Vec<u8>,
) -> index::inverted_index::error::Result<usize> {
self.inner.read_all(dest).await
}
async fn seek_read(
async fn range_read(
&mut self,
offset: u64,
size: u32,
) -> index::inverted_index::error::Result<Vec<u8>> {
self.inner.seek_read(offset, size).await
self.inner.range_read(offset, size).await
}
async fn read_vec(
&mut self,
ranges: &[Range<u64>],
) -> index::inverted_index::error::Result<Vec<Bytes>> {
self.inner.read_vec(ranges).await
}
async fn metadata(&mut self) -> index::inverted_index::error::Result<Arc<InvertedIndexMetas>> {
@@ -130,22 +163,69 @@ impl<R: InvertedIndexReader> InvertedIndexReader for CachedInvertedIndexBlobRead
}
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct IndexKey {
pub struct IndexMetadataKey {
file_id: FileId,
}
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct IndexDataPageKey {
file_id: FileId,
page_id: u64,
}
impl IndexDataPageKey {
/// Converts an offset to a page ID based on the page size.
fn calculate_page_id(offset: u64, page_size: u64) -> u64 {
offset / page_size
}
/// Calculates the total number of pages that a given size spans, starting from a specific offset.
fn calculate_page_count(offset: u64, size: u32, page_size: u64) -> u32 {
let start_page = Self::calculate_page_id(offset, page_size);
let end_page = Self::calculate_page_id(offset + (size as u64) - 1, page_size);
(end_page + 1 - start_page) as u32
}
/// Calculates the byte range for data retrieval based on the specified offset and size.
///
/// This function determines the starting and ending byte positions required for reading data.
/// For example, with an offset of 5000 and a size of 5000, using a `page_size` of 4096,
/// the resulting byte range will be 904..5904. This indicates that:
/// - The reader will first access fixed-size pages [4096, 8192) and [8192, 12288).
/// - To read the range [5000..10000), it only needs to fetch bytes within the range [904, 5904) across two pages.
fn calculate_range(offset: u64, size: u32, page_size: u64) -> Range<usize> {
let start = (offset % page_size) as usize;
let end = start + size as usize;
start..end
}
/// Generates a vector of `IndexDataPageKey` instances for the pages that a given offset and size span.
fn generate_page_keys(file_id: FileId, offset: u64, size: u32, page_size: u64) -> Vec<Self> {
let start_page = Self::calculate_page_id(offset, page_size);
let total_pages = Self::calculate_page_count(offset, size, page_size);
(0..total_pages)
.map(|i| Self {
file_id,
page_id: start_page + i as u64,
})
.collect()
}
}
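To make the page arithmetic above concrete, here is a minimal standalone sketch (not part of the crate) that mirrors `calculate_page_id`, `calculate_page_count` and `calculate_range` as free functions and replays the documented example of reading 5000 bytes at offset 5000 with a 4096-byte page size; the function names are illustrative only.
// Minimal standalone sketch of the page arithmetic above; the free functions
// mirror IndexDataPageKey's private helpers and are illustrative only.
fn page_id(offset: u64, page_size: u64) -> u64 {
    offset / page_size
}
fn page_count(offset: u64, size: u32, page_size: u64) -> u32 {
    let start = page_id(offset, page_size);
    let end = page_id(offset + size as u64 - 1, page_size);
    (end + 1 - start) as u32
}
fn range_in_pages(offset: u64, size: u32, page_size: u64) -> std::ops::Range<usize> {
    let start = (offset % page_size) as usize;
    start..start + size as usize
}
fn main() {
    // Reading 5000 bytes at offset 5000 with 4096-byte pages touches pages 1 and 2,
    // i.e. file ranges [4096, 8192) and [8192, 12288).
    assert_eq!(page_id(5000, 4096), 1);
    assert_eq!(page_count(5000, 5000, 4096), 2);
    // Within the two concatenated pages, the requested bytes sit at [904, 5904).
    assert_eq!(range_in_pages(5000, 5000, 4096), 904..5904);
}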
pub type InvertedIndexCacheRef = Arc<InvertedIndexCache>;
pub struct InvertedIndexCache {
/// Cache for inverted index metadata
index_metadata: moka::sync::Cache<IndexKey, Arc<InvertedIndexMetas>>,
index_metadata: moka::sync::Cache<IndexMetadataKey, Arc<InvertedIndexMetas>>,
/// Cache for inverted index content.
index: moka::sync::Cache<IndexKey, Arc<Vec<u8>>>,
index: moka::sync::Cache<IndexDataPageKey, Bytes>,
/// Page size for index content.
page_size: u64,
}
impl InvertedIndexCache {
/// Creates `InvertedIndexCache` with the provided `index_metadata_cap`, `index_content_cap` and `page_size`.
pub fn new(index_metadata_cap: u64, index_content_cap: u64) -> Self {
pub fn new(index_metadata_cap: u64, index_content_cap: u64, page_size: u64) -> Self {
common_telemetry::debug!("Building InvertedIndexCache with metadata size: {index_metadata_cap}, content size: {index_content_cap}");
let index_metadata = moka::sync::CacheBuilder::new(index_metadata_cap)
.name("inverted_index_metadata")
@@ -170,29 +250,29 @@ impl InvertedIndexCache {
Self {
index_metadata,
index: index_cache,
page_size,
}
}
}
impl InvertedIndexCache {
pub fn get_index_metadata(&self, file_id: FileId) -> Option<Arc<InvertedIndexMetas>> {
self.index_metadata.get(&IndexKey { file_id })
self.index_metadata.get(&IndexMetadataKey { file_id })
}
pub fn put_index_metadata(&self, file_id: FileId, metadata: Arc<InvertedIndexMetas>) {
let key = IndexKey { file_id };
let key = IndexMetadataKey { file_id };
CACHE_BYTES
.with_label_values(&[INDEX_METADATA_TYPE])
.add(index_metadata_weight(&key, &metadata).into());
self.index_metadata.insert(key, metadata)
}
// todo(hl): align index file content to pages with size like 4096 bytes.
pub fn get_index(&self, key: IndexKey) -> Option<Arc<Vec<u8>>> {
self.index.get(&key)
pub fn get_index(&self, key: &IndexDataPageKey) -> Option<Bytes> {
self.index.get(key)
}
pub fn put_index(&self, key: IndexKey, value: Arc<Vec<u8>>) {
pub fn put_index(&self, key: IndexDataPageKey, value: Bytes) {
CACHE_BYTES
.with_label_values(&[INDEX_CONTENT_TYPE])
.add(index_content_weight(&key, &value).into());
@@ -201,11 +281,229 @@ impl InvertedIndexCache {
}
/// Calculates weight for index metadata.
fn index_metadata_weight(k: &IndexKey, v: &Arc<InvertedIndexMetas>) -> u32 {
fn index_metadata_weight(k: &IndexMetadataKey, v: &Arc<InvertedIndexMetas>) -> u32 {
(k.file_id.as_bytes().len() + v.encoded_len()) as u32
}
/// Calculates weight for index content.
fn index_content_weight(k: &IndexKey, v: &Arc<Vec<u8>>) -> u32 {
fn index_content_weight(k: &IndexDataPageKey, v: &Bytes) -> u32 {
(k.file_id.as_bytes().len() + v.len()) as u32
}
/// Prunes the size of the last page based on the indexes.
/// We have the following cases:
/// 1. The remaining file size is less than the page size: read to the end of the file.
/// 2. Otherwise, read a full page.
fn prune_size(indexes: &[IndexDataPageKey], file_size: u64, page_size: u64) -> u64 {
let last_page_start = indexes.last().map(|i| i.page_id * page_size).unwrap_or(0);
page_size.min(file_size - last_page_start)
}
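A short worked example of the truncation rule above, as a hedged standalone sketch with plain integers; `last_page_start` stands in for `indexes.last().page_id * page_size`.
// Sketch of prune_size with the page-key lookup replaced by a plain offset.
fn prune_size(last_page_start: u64, file_size: u64, page_size: u64) -> u64 {
    page_size.min(file_size - last_page_start)
}
fn main() {
    // Case 2: the last page lies fully inside the file, so a whole page is read.
    assert_eq!(prune_size(4096, 16_384, 4096), 4096);
    // Case 1: a 10_000-byte file whose last page starts at 8192 has only 1808 bytes left.
    assert_eq!(prune_size(8192, 10_000, 4096), 1808);
}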
#[cfg(test)]
mod test {
use std::num::NonZeroUsize;
use common_base::BitVec;
use futures::stream;
use index::inverted_index::format::reader::{InvertedIndexBlobReader, InvertedIndexReader};
use index::inverted_index::format::writer::{InvertedIndexBlobWriter, InvertedIndexWriter};
use index::inverted_index::Bytes;
use prometheus::register_int_counter_vec;
use rand::{Rng, RngCore};
use super::*;
use crate::sst::index::store::InstrumentedStore;
use crate::test_util::TestEnv;
// Repeat count for the following small fuzz tests.
const FUZZ_REPEAT_TIMES: usize = 100;
// Fuzz test for index data page key
#[test]
fn fuzz_index_calculation() {
// randomly generate a large u8 array
let mut rng = rand::thread_rng();
let mut data = vec![0u8; 1024 * 1024];
rng.fill_bytes(&mut data);
let file_id = FileId::random();
for _ in 0..FUZZ_REPEAT_TIMES {
let offset = rng.gen_range(0..data.len() as u64);
let size = rng.gen_range(0..data.len() as u32 - offset as u32);
let page_size: usize = rng.gen_range(1..1024);
let indexes =
IndexDataPageKey::generate_page_keys(file_id, offset, size, page_size as u64);
let page_num = indexes.len();
let mut read = Vec::with_capacity(size as usize);
for key in indexes.into_iter() {
let start = key.page_id as usize * page_size;
let page = if start + page_size < data.len() {
&data[start..start + page_size]
} else {
&data[start..]
};
read.extend_from_slice(page);
}
let expected_range = offset as usize..(offset + size as u64) as usize;
let read =
read[IndexDataPageKey::calculate_range(offset, size, page_size as u64)].to_vec();
if read != data.get(expected_range).unwrap() {
panic!(
"fuzz_read_index failed, offset: {}, size: {}, page_size: {}\nread len: {}, expected len: {}\nrange: {:?}, page num: {}",
offset, size, page_size, read.len(), size as usize,
IndexDataPageKey::calculate_range(offset, size, page_size as u64),
page_num
);
}
}
}
fn unpack(fst_value: u64) -> [u32; 2] {
bytemuck::cast::<u64, [u32; 2]>(fst_value)
}
async fn create_inverted_index_blob() -> Vec<u8> {
let mut blob = Vec::new();
let mut writer = InvertedIndexBlobWriter::new(&mut blob);
writer
.add_index(
"tag0".to_string(),
BitVec::from_slice(&[0b0000_0001, 0b0000_0000]),
Box::new(stream::iter(vec![
Ok((Bytes::from("a"), BitVec::from_slice(&[0b0000_0001]))),
Ok((Bytes::from("b"), BitVec::from_slice(&[0b0010_0000]))),
Ok((Bytes::from("c"), BitVec::from_slice(&[0b0000_0001]))),
])),
)
.await
.unwrap();
writer
.add_index(
"tag1".to_string(),
BitVec::from_slice(&[0b0000_0001, 0b0000_0000]),
Box::new(stream::iter(vec![
Ok((Bytes::from("x"), BitVec::from_slice(&[0b0000_0001]))),
Ok((Bytes::from("y"), BitVec::from_slice(&[0b0010_0000]))),
Ok((Bytes::from("z"), BitVec::from_slice(&[0b0000_0001]))),
])),
)
.await
.unwrap();
writer
.finish(8, NonZeroUsize::new(1).unwrap())
.await
.unwrap();
blob
}
#[tokio::test]
async fn test_inverted_index_cache() {
let blob = create_inverted_index_blob().await;
// Init a test range reader in local fs.
let mut env = TestEnv::new();
let file_size = blob.len() as u64;
let store = env.init_object_store_manager();
let temp_path = "data";
store.write(temp_path, blob).await.unwrap();
let store = InstrumentedStore::new(store);
let metric =
register_int_counter_vec!("test_bytes", "a counter for test", &["test"]).unwrap();
let counter = metric.with_label_values(&["test"]);
let range_reader = store
.range_reader("data", &counter, &counter)
.await
.unwrap();
let reader = InvertedIndexBlobReader::new(range_reader);
let mut cached_reader = CachedInvertedIndexBlobReader::new(
FileId::random(),
file_size,
reader,
Arc::new(InvertedIndexCache::new(8192, 8192, 50)),
);
let metadata = cached_reader.metadata().await.unwrap();
assert_eq!(metadata.total_row_count, 8);
assert_eq!(metadata.segment_row_count, 1);
assert_eq!(metadata.metas.len(), 2);
// tag0
let tag0 = metadata.metas.get("tag0").unwrap();
let stats0 = tag0.stats.as_ref().unwrap();
assert_eq!(stats0.distinct_count, 3);
assert_eq!(stats0.null_count, 1);
assert_eq!(stats0.min_value, Bytes::from("a"));
assert_eq!(stats0.max_value, Bytes::from("c"));
let fst0 = cached_reader
.fst(
tag0.base_offset + tag0.relative_fst_offset as u64,
tag0.fst_size,
)
.await
.unwrap();
assert_eq!(fst0.len(), 3);
let [offset, size] = unpack(fst0.get(b"a").unwrap());
let bitmap = cached_reader
.bitmap(tag0.base_offset + offset as u64, size)
.await
.unwrap();
assert_eq!(bitmap, BitVec::from_slice(&[0b0000_0001]));
let [offset, size] = unpack(fst0.get(b"b").unwrap());
let bitmap = cached_reader
.bitmap(tag0.base_offset + offset as u64, size)
.await
.unwrap();
assert_eq!(bitmap, BitVec::from_slice(&[0b0010_0000]));
let [offset, size] = unpack(fst0.get(b"c").unwrap());
let bitmap = cached_reader
.bitmap(tag0.base_offset + offset as u64, size)
.await
.unwrap();
assert_eq!(bitmap, BitVec::from_slice(&[0b0000_0001]));
// tag1
let tag1 = metadata.metas.get("tag1").unwrap();
let stats1 = tag1.stats.as_ref().unwrap();
assert_eq!(stats1.distinct_count, 3);
assert_eq!(stats1.null_count, 1);
assert_eq!(stats1.min_value, Bytes::from("x"));
assert_eq!(stats1.max_value, Bytes::from("z"));
let fst1 = cached_reader
.fst(
tag1.base_offset + tag1.relative_fst_offset as u64,
tag1.fst_size,
)
.await
.unwrap();
assert_eq!(fst1.len(), 3);
let [offset, size] = unpack(fst1.get(b"x").unwrap());
let bitmap = cached_reader
.bitmap(tag1.base_offset + offset as u64, size)
.await
.unwrap();
assert_eq!(bitmap, BitVec::from_slice(&[0b0000_0001]));
let [offset, size] = unpack(fst1.get(b"y").unwrap());
let bitmap = cached_reader
.bitmap(tag1.base_offset + offset as u64, size)
.await
.unwrap();
assert_eq!(bitmap, BitVec::from_slice(&[0b0010_0000]));
let [offset, size] = unpack(fst1.get(b"z").unwrap());
let bitmap = cached_reader
.bitmap(tag1.base_offset + offset as u64, size)
.await
.unwrap();
assert_eq!(bitmap, BitVec::from_slice(&[0b0000_0001]));
// fuzz test
let mut rng = rand::thread_rng();
for _ in 0..FUZZ_REPEAT_TIMES {
let offset = rng.gen_range(0..file_size);
let size = rng.gen_range(0..file_size as u32 - offset as u32);
let expected = cached_reader.range_read(offset, size).await.unwrap();
let read = cached_reader.get_or_load(offset, size).await.unwrap();
assert_eq!(read, expected);
}
}
}

View File

@@ -44,7 +44,7 @@ use tokio::sync::mpsc::{self, Sender};
use crate::access_layer::AccessLayerRef;
use crate::cache::CacheManagerRef;
use crate::compaction::compactor::{CompactionRegion, DefaultCompactor};
use crate::compaction::compactor::{CompactionRegion, CompactionVersion, DefaultCompactor};
use crate::compaction::picker::{new_picker, CompactionTask};
use crate::compaction::task::CompactionTaskImpl;
use crate::config::MitoConfig;
@@ -59,7 +59,7 @@ use crate::read::scan_region::ScanInput;
use crate::read::seq_scan::SeqScan;
use crate::read::BoxedBatchReader;
use crate::region::options::MergeMode;
use crate::region::version::{VersionControlRef, VersionRef};
use crate::region::version::VersionControlRef;
use crate::region::ManifestContextRef;
use crate::request::{OptionOutputTx, OutputTx, WorkerRequest};
use crate::schedule::remote_job_scheduler::{
@@ -73,7 +73,7 @@ use crate::worker::WorkerListener;
/// Region compaction request.
pub struct CompactionRequest {
pub(crate) engine_config: Arc<MitoConfig>,
pub(crate) current_version: VersionRef,
pub(crate) current_version: CompactionVersion,
pub(crate) access_layer: AccessLayerRef,
/// Sender to send notification to the region worker.
pub(crate) request_sender: mpsc::Sender<WorkerRequest>,
@@ -522,7 +522,7 @@ impl CompactionStatus {
listener: WorkerListener,
schema_metadata_manager: SchemaMetadataManagerRef,
) -> CompactionRequest {
let current_version = self.version_control.current().version;
let current_version = CompactionVersion::from(self.version_control.current().version);
let start_time = Instant::now();
let mut req = CompactionRequest {
engine_config,

View File

@@ -35,12 +35,10 @@ use crate::error::{EmptyRegionDirSnafu, JoinSnafu, ObjectStoreNotFoundSnafu, Res
use crate::manifest::action::{RegionEdit, RegionMetaAction, RegionMetaActionList};
use crate::manifest::manager::{RegionManifestManager, RegionManifestOptions};
use crate::manifest::storage::manifest_compress_type;
use crate::memtable::time_partition::TimePartitions;
use crate::memtable::MemtableBuilderProvider;
use crate::read::Source;
use crate::region::opener::new_manifest_dir;
use crate::region::options::RegionOptions;
use crate::region::version::{VersionBuilder, VersionRef};
use crate::region::version::VersionRef;
use crate::region::{ManifestContext, RegionLeaderState, RegionRoleState};
use crate::schedule::scheduler::LocalScheduler;
use crate::sst::file::{FileMeta, IndexType};
@@ -48,6 +46,34 @@ use crate::sst::file_purger::LocalFilePurger;
use crate::sst::index::intermediate::IntermediateManager;
use crate::sst::index::puffin_manager::PuffinManagerFactory;
use crate::sst::parquet::WriteOptions;
use crate::sst::version::{SstVersion, SstVersionRef};
/// Region version for compaction that does not hold memtables.
#[derive(Clone)]
pub struct CompactionVersion {
/// Metadata of the region.
///
/// Altering metadata isn't frequent, so the metadata is stored in an `Arc` to allow
/// sharing and reusing it when creating a new `Version`.
pub(crate) metadata: RegionMetadataRef,
/// Options of the region.
pub(crate) options: RegionOptions,
/// SSTs of the region.
pub(crate) ssts: SstVersionRef,
/// Inferred compaction time window.
pub(crate) compaction_time_window: Option<Duration>,
}
impl From<VersionRef> for CompactionVersion {
fn from(value: VersionRef) -> Self {
Self {
metadata: value.metadata.clone(),
options: value.options.clone(),
ssts: value.ssts.clone(),
compaction_time_window: value.compaction_time_window,
}
}
}
/// CompactionRegion represents a region that needs to be compacted.
/// It's a subset of MitoRegion.
@@ -62,7 +88,7 @@ pub struct CompactionRegion {
pub(crate) cache_manager: CacheManagerRef,
pub(crate) access_layer: AccessLayerRef,
pub(crate) manifest_ctx: Arc<ManifestContext>,
pub(crate) current_version: VersionRef,
pub(crate) current_version: CompactionVersion,
pub(crate) file_purger: Option<Arc<LocalFilePurger>>,
pub(crate) ttl: Option<TimeToLive>,
}
@@ -147,30 +173,14 @@ pub async fn open_compaction_region(
};
let current_version = {
let memtable_builder = MemtableBuilderProvider::new(None, Arc::new(mito_config.clone()))
.builder_for_options(
req.region_options.memtable.as_ref(),
req.region_options.need_dedup(),
req.region_options.merge_mode(),
);
// Initial memtable id is 0.
let mutable = Arc::new(TimePartitions::new(
region_metadata.clone(),
memtable_builder.clone(),
0,
req.region_options.compaction.time_window(),
));
let version = VersionBuilder::new(region_metadata.clone(), mutable)
.add_files(file_purger.clone(), manifest.files.values().cloned())
.flushed_entry_id(manifest.flushed_entry_id)
.flushed_sequence(manifest.flushed_sequence)
.truncated_entry_id(manifest.truncated_entry_id)
.compaction_time_window(manifest.compaction_time_window)
.options(req.region_options.clone())
.build();
Arc::new(version)
let mut ssts = SstVersion::new();
ssts.add_files(file_purger.clone(), manifest.files.values().cloned());
CompactionVersion {
metadata: region_metadata.clone(),
options: req.region_options.clone(),
ssts: Arc::new(ssts),
compaction_time_window: manifest.compaction_time_window,
}
};
let ttl = find_ttl(

View File

@@ -23,10 +23,9 @@ use common_time::Timestamp;
use store_api::storage::RegionId;
use crate::compaction::buckets::infer_time_bucket;
use crate::compaction::compactor::CompactionRegion;
use crate::compaction::compactor::{CompactionRegion, CompactionVersion};
use crate::compaction::picker::{Picker, PickerOutput};
use crate::compaction::{get_expired_ssts, CompactionOutput};
use crate::region::version::VersionRef;
use crate::sst::file::{FileHandle, FileId};
/// Compaction picker that splits the time range of all involved files to windows, and merges
@@ -48,7 +47,11 @@ impl WindowedCompactionPicker {
// use the persisted window. If the persisted window is not present, we check the time window
// provided when creating the table. If all of those are absent, we infer the window
// from files in level0.
fn calculate_time_window(&self, region_id: RegionId, current_version: &VersionRef) -> i64 {
fn calculate_time_window(
&self,
region_id: RegionId,
current_version: &CompactionVersion,
) -> i64 {
self.compaction_time_window_seconds
.or(current_version
.compaction_time_window
@@ -67,7 +70,7 @@ impl WindowedCompactionPicker {
fn pick_inner(
&self,
region_id: RegionId,
current_version: &VersionRef,
current_version: &CompactionVersion,
current_time: Timestamp,
) -> (Vec<CompactionOutput>, Vec<FileHandle>, i64) {
let time_window = self.calculate_time_window(region_id, current_version);
@@ -205,28 +208,19 @@ mod tests {
use common_time::Timestamp;
use store_api::storage::RegionId;
use crate::compaction::compactor::CompactionVersion;
use crate::compaction::window::{file_time_bucket_span, WindowedCompactionPicker};
use crate::memtable::partition_tree::{PartitionTreeConfig, PartitionTreeMemtableBuilder};
use crate::memtable::time_partition::TimePartitions;
use crate::memtable::version::MemtableVersion;
use crate::region::options::RegionOptions;
use crate::region::version::{Version, VersionRef};
use crate::sst::file::{FileId, FileMeta, Level};
use crate::sst::version::SstVersion;
use crate::test_util::memtable_util::metadata_for_test;
use crate::test_util::NoopFilePurger;
fn build_version(files: &[(FileId, i64, i64, Level)], ttl: Option<Duration>) -> VersionRef {
fn build_version(
files: &[(FileId, i64, i64, Level)],
ttl: Option<Duration>,
) -> CompactionVersion {
let metadata = metadata_for_test();
let memtables = Arc::new(MemtableVersion::new(Arc::new(TimePartitions::new(
metadata.clone(),
Arc::new(PartitionTreeMemtableBuilder::new(
PartitionTreeConfig::default(),
None,
)),
0,
None,
))));
let file_purger_ref = Arc::new(NoopFilePurger);
let mut ssts = SstVersion::new();
@@ -244,14 +238,9 @@ mod tests {
}),
);
Arc::new(Version {
CompactionVersion {
metadata,
memtables,
ssts: Arc::new(ssts),
flushed_entry_id: 0,
flushed_sequence: 0,
truncated_entry_id: None,
compaction_time_window: None,
options: RegionOptions {
ttl: ttl.map(|t| t.into()),
compaction: Default::default(),
@@ -262,7 +251,8 @@ mod tests {
memtable: None,
merge_mode: None,
},
})
compaction_time_window: None,
}
}
#[test]

View File

@@ -304,6 +304,9 @@ pub struct IndexConfig {
/// Write buffer size for creating the index.
pub write_buffer_size: ReadableSize,
/// Cache size for metadata of puffin files. Setting it to 0 disables the cache.
pub metadata_cache_size: ReadableSize,
}
impl Default for IndexConfig {
@@ -312,6 +315,7 @@ impl Default for IndexConfig {
aux_path: String::new(),
staging_size: ReadableSize::gb(2),
write_buffer_size: ReadableSize::mb(8),
metadata_cache_size: ReadableSize::mb(64),
}
}
}
@@ -412,6 +416,8 @@ pub struct InvertedIndexConfig {
pub metadata_cache_size: ReadableSize,
/// Cache size for inverted index content. Setting it to 0 disables the cache.
pub content_cache_size: ReadableSize,
/// Page size for inverted index content.
pub content_cache_page_size: ReadableSize,
}
impl InvertedIndexConfig {
@@ -437,6 +443,7 @@ impl Default for InvertedIndexConfig {
intermediate_path: String::new(),
metadata_cache_size: ReadableSize::mb(64),
content_cache_size: ReadableSize::mb(128),
content_cache_page_size: ReadableSize::mb(8),
};
if let Some(sys_memory) = common_config::utils::get_sys_total_memory() {
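For orientation, a hedged sketch of how these defaults would plausibly feed the extended `InvertedIndexCache::new(metadata_cap, content_cap, page_size)` constructor shown earlier; the module path and the wiring inside `CacheManagerBuilder` are assumptions, since this diff only shows the constructor signature and the default values.
// Hedged sketch only: the path and visibility of InvertedIndexCache are assumed.
use std::sync::Arc;
use mito2::cache::index::InvertedIndexCache; // assumed path
fn build_index_content_cache() -> Arc<InvertedIndexCache> {
    // Defaults above: 64 MiB metadata cache, 128 MiB content cache, 8 MiB pages.
    Arc::new(InvertedIndexCache::new(64 << 20, 128 << 20, 8 << 20))
}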

View File

@@ -192,12 +192,12 @@ async fn test_engine_create_with_custom_store() {
assert!(object_store_manager
.find("Gcs")
.unwrap()
.is_exist(region_dir)
.exists(region_dir)
.await
.unwrap());
assert!(!object_store_manager
.default_object_store()
.is_exist(region_dir)
.exists(region_dir)
.await
.unwrap());
}

View File

@@ -71,7 +71,7 @@ async fn test_engine_drop_region() {
assert!(!env
.get_object_store()
.unwrap()
.is_exist(&join_path(&region_dir, DROPPING_MARKER_FILE))
.exists(&join_path(&region_dir, DROPPING_MARKER_FILE))
.await
.unwrap());
@@ -93,7 +93,7 @@ async fn test_engine_drop_region() {
listener.wait().await;
let object_store = env.get_object_store().unwrap();
assert!(!object_store.is_exist(&region_dir).await.unwrap());
assert!(!object_store.exists(&region_dir).await.unwrap());
}
#[tokio::test]
@@ -167,13 +167,13 @@ async fn test_engine_drop_region_for_custom_store() {
assert!(object_store_manager
.find("Gcs")
.unwrap()
.is_exist(&custom_region_dir)
.exists(&custom_region_dir)
.await
.unwrap());
assert!(object_store_manager
.find("default")
.unwrap()
.is_exist(&global_region_dir)
.exists(&global_region_dir)
.await
.unwrap());
@@ -190,13 +190,13 @@ async fn test_engine_drop_region_for_custom_store() {
assert!(!object_store_manager
.find("Gcs")
.unwrap()
.is_exist(&custom_region_dir)
.exists(&custom_region_dir)
.await
.unwrap());
assert!(object_store_manager
.find("default")
.unwrap()
.is_exist(&global_region_dir)
.exists(&global_region_dir)
.await
.unwrap());
}

View File

@@ -228,13 +228,13 @@ async fn test_engine_region_open_with_custom_store() {
let object_store_manager = env.get_object_store_manager().unwrap();
assert!(!object_store_manager
.default_object_store()
.is_exist(region.access_layer.region_dir())
.exists(region.access_layer.region_dir())
.await
.unwrap());
assert!(object_store_manager
.find("Gcs")
.unwrap()
.is_exist(region.access_layer.region_dir())
.exists(region.access_layer.region_dir())
.await
.unwrap());
}

View File

@@ -756,13 +756,6 @@ pub enum Error {
location: Location,
},
#[snafu(display("Failed to build time range filters for value: {:?}", timestamp))]
BuildTimeRangeFilter {
timestamp: Timestamp,
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Failed to open region"))]
OpenRegion {
#[snafu(implicit)]
@@ -893,6 +886,14 @@ pub enum Error {
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Failed to read file metadata"))]
Metadata {
#[snafu(source)]
error: std::io::Error,
#[snafu(implicit)]
location: Location,
},
}
pub type Result<T, E = Error> = std::result::Result<T, E>;
@@ -965,7 +966,8 @@ impl ErrorExt for Error {
| CreateDir { .. }
| ReadDataPart { .. }
| CorruptedEntry { .. }
| BuildEntry { .. } => StatusCode::Internal,
| BuildEntry { .. }
| Metadata { .. } => StatusCode::Internal,
OpenRegion { source, .. } => source.status_code(),
@@ -1014,7 +1016,6 @@ impl ErrorExt for Error {
ChecksumMismatch { .. } => StatusCode::Unexpected,
RegionStopped { .. } => StatusCode::RegionNotReady,
TimeRangePredicateOverflow { .. } => StatusCode::InvalidArguments,
BuildTimeRangeFilter { .. } => StatusCode::Unexpected,
UnsupportedOperation { .. } => StatusCode::Unsupported,
RemoteCompaction { .. } => StatusCode::Unexpected,

View File

@@ -18,7 +18,7 @@ use std::collections::HashMap;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
use common_telemetry::{debug, error, info};
use common_telemetry::{debug, error, info, trace};
use smallvec::SmallVec;
use snafu::ResultExt;
use store_api::storage::RegionId;
@@ -32,7 +32,10 @@ use crate::error::{
Error, FlushRegionSnafu, RegionClosedSnafu, RegionDroppedSnafu, RegionTruncatedSnafu, Result,
};
use crate::manifest::action::{RegionEdit, RegionMetaAction, RegionMetaActionList};
use crate::metrics::{FLUSH_BYTES_TOTAL, FLUSH_ELAPSED, FLUSH_ERRORS_TOTAL, FLUSH_REQUESTS_TOTAL, INFLIGHT_FLUSH_COUNT};
use crate::metrics::{
FLUSH_BYTES_TOTAL, FLUSH_ELAPSED, FLUSH_ERRORS_TOTAL, FLUSH_REQUESTS_TOTAL,
INFLIGHT_FLUSH_COUNT,
};
use crate::read::Source;
use crate::region::options::IndexOptions;
use crate::region::version::{VersionControlData, VersionControlRef};
@@ -138,17 +141,22 @@ impl WriteBufferManager for WriteBufferManagerImpl {
// If the memory exceeds the buffer size, we trigger a more aggressive
// flush. But if more than half of the memory is already being flushed,
// triggering more flushes may not help, so we hold off instead.
if memory_usage >= self.global_write_buffer_size
&& mutable_memtable_memory_usage >= self.global_write_buffer_size / 2
{
debug!(
if memory_usage >= self.global_write_buffer_size {
if mutable_memtable_memory_usage >= self.global_write_buffer_size / 2 {
debug!(
"Engine should flush (over total limit), memory_usage: {}, global_write_buffer_size: {}, \
mutable_usage: {}.",
memory_usage,
self.global_write_buffer_size,
mutable_memtable_memory_usage,
);
return true;
mutable_memtable_memory_usage);
return true;
} else {
trace!(
"Engine won't flush, memory_usage: {}, global_write_buffer_size: {}, mutable_usage: {}.",
memory_usage,
self.global_write_buffer_size,
mutable_memtable_memory_usage);
}
}
false
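As a quick sanity check of the restructured over-limit branch, a self-contained sketch of the same rule with illustrative numbers; the 1 GiB budget below is an assumption, not a value from this diff.
// Standalone sketch of the over-total-limit branch above: flush aggressively only
// when total usage exceeds the global budget AND mutable memtables still hold at
// least half of it; otherwise hold, which is what the new trace! branch logs.
fn over_limit_should_flush(memory_usage: usize, mutable_usage: usize, global_limit: usize) -> bool {
    memory_usage >= global_limit && mutable_usage >= global_limit / 2
}
fn main() {
    let global = 1usize << 30; // 1 GiB global write buffer (illustrative)
    // 1.2 GiB in use but only 300 MiB still mutable: most memory is already being
    // flushed, so triggering another flush would not help.
    assert!(!over_limit_should_flush((1 << 30) + (200 << 20), 300 << 20, global));
    // 1.2 GiB in use and 600 MiB still mutable: flush.
    assert!(over_limit_should_flush((1 << 30) + (200 << 20), 600 << 20, global));
}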

View File

@@ -84,6 +84,7 @@ async fn manager_without_checkpoint() {
// check files
let mut expected = vec![
"/",
"00000000000000000010.json",
"00000000000000000009.json",
"00000000000000000008.json",
@@ -130,6 +131,7 @@ async fn manager_with_checkpoint_distance_1() {
// check files
let mut expected = vec![
"/",
"00000000000000000009.checkpoint",
"00000000000000000010.checkpoint",
"00000000000000000010.json",

View File

@@ -110,6 +110,15 @@ impl MemtableStats {
pub type BoxedBatchIterator = Box<dyn Iterator<Item = Result<Batch>> + Send>;
/// Ranges in a memtable.
#[derive(Default)]
pub struct MemtableRanges {
/// Range IDs and ranges.
pub ranges: BTreeMap<usize, MemtableRange>,
/// Statistics of the memtable at query time.
pub stats: MemtableStats,
}
/// In memory write buffer.
pub trait Memtable: Send + Sync + fmt::Debug {
/// Returns the id of this memtable.
@@ -139,7 +148,7 @@ pub trait Memtable: Send + Sync + fmt::Debug {
&self,
projection: Option<&[ColumnId]>,
predicate: Option<Predicate>,
) -> BTreeMap<usize, MemtableRange>;
) -> MemtableRanges;
/// Returns true if the memtable is empty.
fn is_empty(&self) -> bool;

View File

@@ -14,7 +14,6 @@
//! Memtable implementation for bulk load
use std::collections::BTreeMap;
use std::sync::{Arc, RwLock};
use store_api::metadata::RegionMetadataRef;
@@ -25,7 +24,7 @@ use crate::error::Result;
use crate::memtable::bulk::part::BulkPart;
use crate::memtable::key_values::KeyValue;
use crate::memtable::{
BoxedBatchIterator, KeyValues, Memtable, MemtableId, MemtableRange, MemtableRef, MemtableStats,
BoxedBatchIterator, KeyValues, Memtable, MemtableId, MemtableRanges, MemtableRef, MemtableStats,
};
#[allow(unused)]
@@ -68,7 +67,7 @@ impl Memtable for BulkMemtable {
&self,
_projection: Option<&[ColumnId]>,
_predicate: Option<Predicate>,
) -> BTreeMap<usize, MemtableRange> {
) -> MemtableRanges {
todo!()
}

Some files were not shown because too many files have changed in this diff