Compare commits

...

17 Commits

Author SHA1 Message Date
evenyag
5fc0c5706c chore: bump version to v0.15.4
Signed-off-by: evenyag <realevenyag@gmail.com>
2025-08-04 22:19:40 +08:00
Ning Sun
4d768b2c31 feat: schema/database support for label_values (#6631)
* feat: initial support for __schema__ in label values

* feat: filter database with matches

* refactor: skip unnecessary check

* fix: resolve schema matcher in label values

* test: add a test case for table not exists

* refactor: add matchop check on db label

* chore: merge main

Signed-off-by: evenyag <realevenyag@gmail.com>
2025-08-04 22:19:40 +08:00
Yingwen
b62f219810 feat: Add option to limit the files reading simultaneously (#6635)
* feat: limits the max number of files to scan at the same time

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: make max_concurrent_scan_files configurable

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: reduce concurrent scan files to 128

Signed-off-by: evenyag <realevenyag@gmail.com>

* docs: update config example

Signed-off-by: evenyag <realevenyag@gmail.com>

* test: add test for max_concurrent_scan_files

Signed-off-by: evenyag <realevenyag@gmail.com>

* style: fix clippy

Signed-off-by: evenyag <realevenyag@gmail.com>

* test: update config test

Signed-off-by: evenyag <realevenyag@gmail.com>

---------

Signed-off-by: evenyag <realevenyag@gmail.com>
2025-08-04 22:19:40 +08:00
Ruihang Xia
5d330fad17 feat: absent function in PromQL (#6618)
* feat: absent function in PromQL

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* impl serde

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* sqlness test

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* ai suggests

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* resolve PR comments

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* comment out some tests

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
Signed-off-by: evenyag <realevenyag@gmail.com>
2025-08-04 22:19:40 +08:00
Ruihang Xia
dfdfae1a7b feat: support __schema__ and __database__ in Prom Remote Read (#6610)
* feat: support __schema__ and __database__ in Prom remote R/W

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix integration test

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* revert remote write changes

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* check matcher type

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
Signed-off-by: evenyag <realevenyag@gmail.com>
2025-08-04 22:19:40 +08:00
Ruihang Xia
822f0caf4b fix: only return the __name__ label when there is one (#6629)
Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
Signed-off-by: evenyag <realevenyag@gmail.com>
2025-08-04 22:19:40 +08:00
yihong
09f3d72d2d fix: close issue #6555 return empty result (#6569)
* fix: close issue #6555 return empty result

Signed-off-by: yihong0618 <zouzou0208@gmail.com>

* fix: only start one instance for one regex sqlness test (#6570)

Signed-off-by: yihong0618 <zouzou0208@gmail.com>

* refactor: refactor partition mod to use PartitionExpr instead of PartitionDef (#6554)

* refactor: refactor partition mod to use PartitionExpr instead of PartitionDef

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* fix snafu

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* Puts expression into PbPartition

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* address comments

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* fix compile

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* update proto

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* add serde test

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* add serde test

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

---------

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* fix: address comments

Signed-off-by: yihong0618 <zouzou0208@gmail.com>

---------

Signed-off-by: yihong0618 <zouzou0208@gmail.com>
Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>
Co-authored-by: Zhenchi <zhongzc_arch@outlook.com>
Signed-off-by: evenyag <realevenyag@gmail.com>
2025-07-24 15:00:32 +08:00
Yingwen
ca0c1282ed chore: bump version to 0.15.3 (#6580)
Signed-off-by: evenyag <realevenyag@gmail.com>
2025-07-24 11:24:07 +08:00
Yingwen
b719c020ba chore: cherry pick #6540, #6550, #6551, #6556, #6563, #6534 to v0.15 branch (#6577)
* feat: add metrics for request wait time and adjust stall metrics (#6540)

* feat: add metric greptime_mito_request_wait_time to observe wait time

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: add worker to wait time metric

Signed-off-by: evenyag <realevenyag@gmail.com>

* refactor: rename stall gauge to greptime_mito_write_stalling_count

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: change greptime_mito_write_stall_total to total stalled requests

Signed-off-by: evenyag <realevenyag@gmail.com>

* refactor: merge lazy static blocks

Signed-off-by: evenyag <realevenyag@gmail.com>

---------

Signed-off-by: evenyag <realevenyag@gmail.com>

* fix: estimate mem size for bulk ingester (#6550)

Signed-off-by: evenyag <realevenyag@gmail.com>

* fix: flow mirror cache (#6551)

* fix: invalid cache when flownode change address

Signed-off-by: discord9 <discord9@163.com>

* update comments

Signed-off-by: discord9 <discord9@163.com>

* fix

Signed-off-by: discord9 <discord9@163.com>

* refactor: add log&rename

Signed-off-by: discord9 <discord9@163.com>

* stuff

Signed-off-by: discord9 <discord9@163.com>

---------

Signed-off-by: discord9 <discord9@163.com>
Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: impl timestamp function for promql (#6556)

* feat: impl timestamp function for promql

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>

* chore: style and typo

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>

* fix: test

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>

* docs: update comments

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>

* chore: comment

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>

---------

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>
Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: MergeScan print input (#6563)

* feat: MergeScan print input

Signed-off-by: discord9 <discord9@163.com>

* test: fix ut

Signed-off-by: discord9 <discord9@163.com>

---------

Signed-off-by: discord9 <discord9@163.com>
Signed-off-by: evenyag <realevenyag@gmail.com>

* fix: aggr group by all partition cols use partial commutative (#6534)

* fix: aggr group by all partition cols use partial commutative

Signed-off-by: discord9 <discord9@163.com>

* test: bugged case

Signed-off-by: discord9 <discord9@163.com>

* test: sqlness fix

Signed-off-by: discord9 <discord9@163.com>

* test: more redacted

Signed-off-by: discord9 <discord9@163.com>

* more cases

Signed-off-by: discord9 <discord9@163.com>

* even more test cases

Signed-off-by: discord9 <discord9@163.com>

* join testcase

Signed-off-by: discord9 <discord9@163.com>

* fix: column requirement added in correct location

Signed-off-by: discord9 <discord9@163.com>

* fix test

Signed-off-by: discord9 <discord9@163.com>

* chore: clippy

Signed-off-by: discord9 <discord9@163.com>

* track col reqs per stack

Signed-off-by: discord9 <discord9@163.com>

* fix: continue

Signed-off-by: discord9 <discord9@163.com>

* chore: clippy

Signed-off-by: discord9 <discord9@163.com>

* refactor: test mod

Signed-off-by: discord9 <discord9@163.com>

* test utils

Signed-off-by: discord9 <discord9@163.com>

* test: better test

Signed-off-by: discord9 <discord9@163.com>

* more testcases

Signed-off-by: discord9 <discord9@163.com>

* test limit push down

Signed-off-by: discord9 <discord9@163.com>

* more testcases

Signed-off-by: discord9 <discord9@163.com>

* more testcase

Signed-off-by: discord9 <discord9@163.com>

* more test

Signed-off-by: discord9 <discord9@163.com>

* chore: update sqlness

Signed-off-by: discord9 <discord9@163.com>

* chore: update comments

Signed-off-by: discord9 <discord9@163.com>

* fix: check col reqs from bottom to upper

Signed-off-by: discord9 <discord9@163.com>

* chore: more comment

Signed-off-by: discord9 <discord9@163.com>

* docs: more todo

Signed-off-by: discord9 <discord9@163.com>

* chore: comments

Signed-off-by: discord9 <discord9@163.com>

* test: a new failing test that should be fixed

Signed-off-by: discord9 <discord9@163.com>

* fix: part col alias tracking

Signed-off-by: discord9 <discord9@163.com>

* chore: unused

Signed-off-by: discord9 <discord9@163.com>

* chore: clippy

Signed-off-by: discord9 <discord9@163.com>

* docs: comment

Signed-off-by: discord9 <discord9@163.com>

* more testcases

Signed-off-by: discord9 <discord9@163.com>

* more testcase for step/part aggr combine

Signed-off-by: discord9 <discord9@163.com>

* FIXME: a new bug

Signed-off-by: discord9 <discord9@163.com>

* literally unfixable

Signed-off-by: discord9 <discord9@163.com>

* chore: remove some debug print

Signed-off-by: discord9 <discord9@163.com>

---------

Signed-off-by: discord9 <discord9@163.com>
Signed-off-by: evenyag <realevenyag@gmail.com>

---------

Signed-off-by: evenyag <realevenyag@gmail.com>
Signed-off-by: discord9 <discord9@163.com>
Signed-off-by: Dennis Zhuang <killme2008@gmail.com>
Co-authored-by: fys <40801205+fengys1996@users.noreply.github.com>
Co-authored-by: discord9 <55937128+discord9@users.noreply.github.com>
Co-authored-by: dennis zhuang <killme2008@gmail.com>
2025-07-23 22:29:14 +08:00
Ruihang Xia
717c1d1807 feat: update partial execution metrics (#6499)
* feat: update partial execution metrics

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* send data with metrics in distributed mode

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix clippy

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* only send partial metrics under VERBOSE flag

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* loop to while

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
Signed-off-by: evenyag <realevenyag@gmail.com>
2025-07-23 20:54:33 +08:00
Zhenchi
291f3c89fe fix: row selection intersection removes trailing rows (#6539)
* fix: row selection intersection removes trailing rows

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* fix typos

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

---------

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>
Signed-off-by: evenyag <realevenyag@gmail.com>
2025-07-23 20:54:33 +08:00
discord9
602cc38056 fix: breaking loop when not retryable (#6538)
fix: breaking when not retryable

Signed-off-by: discord9 <discord9@163.com>
Signed-off-by: evenyag <realevenyag@gmail.com>
2025-07-23 20:54:33 +08:00
Lei, HUANG
46b3593021 fix(grpc): check grpc client unavailable (#6488)
* fix/check-grpc-client-unavailable:
 Improve async handling in `greptime_handler.rs`

 - Updated the `DoPut` response handling to use `await` with `result_sender.send` for better asynchronous operation.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* fix/check-grpc-client-unavailable:
 ### Improve Error Handling in `greptime_handler.rs`

 - Enhanced error handling for the `DoPut` operation by switching from `send` to `try_send` for the `result_sender`.
 - Added specific logging for unreachable clients, including `request_id` in the warning message.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

---------

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>
Signed-off-by: evenyag <realevenyag@gmail.com>
2025-07-23 20:54:33 +08:00
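A minimal standalone sketch of the `send` vs. `try_send` distinction this commit describes (illustrative only; the tokio channel setup and messages below are not the handler's actual code):

```rust
// Illustrative only, not the greptime_handler code. It shows why `try_send`
// helps when the receiving side (an unreachable client) is gone: it returns
// an error immediately instead of awaiting channel capacity forever.
use tokio::sync::mpsc;

#[tokio::main]
async fn main() {
    let (result_sender, receiver) = mpsc::channel::<&'static str>(1);
    drop(receiver); // simulate a client that has become unreachable

    match result_sender.try_send("DoPut response") {
        Ok(()) => println!("response delivered"),
        Err(mpsc::error::TrySendError::Full(_)) => {
            println!("channel full, client too slow");
        }
        Err(mpsc::error::TrySendError::Closed(_)) => {
            // The real handler logs a warning including the request_id here.
            println!("client unreachable, dropping response");
        }
    }
}
```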
Yan Tingwang
ff402fd6f6 test: add sqlness test for max execution time (#6517)
* add sqlness test for max_execution_time

Signed-off-by: codephage. <tingwangyan2020@163.com>

* add Pre-line comments SQLNESS PROTOCOL MYSQL

Signed-off-by: codephage. <tingwangyan2020@163.com>

* fix(mysql): support max_execution_time variable

Co-authored-by: evenyag <realevenyag@gmail.com>
Signed-off-by: codephage. <tingwangyan2020@163.com>

* fix: test::test_check & sqlness test mysql

Signed-off-by: codephage. <tingwangyan2020@163.com>

* add sqlness test for max_execution_time

Signed-off-by: codephage. <tingwangyan2020@163.com>

* add Pre-line comments SQLNESS PROTOCOL MYSQL

Signed-off-by: codephage. <tingwangyan2020@163.com>

* fix(mysql): support max_execution_time variable

Co-authored-by: evenyag <realevenyag@gmail.com>
Signed-off-by: codephage. <tingwangyan2020@163.com>

* fix: test::test_check & sqlness test mysql

Signed-off-by: codephage. <tingwangyan2020@163.com>

* chore: Unify the sql style

Signed-off-by: codephage. <tingwangyan2020@163.com>

---------

Signed-off-by: codephage. <tingwangyan2020@163.com>
Co-authored-by: evenyag <realevenyag@gmail.com>
Signed-off-by: evenyag <realevenyag@gmail.com>
2025-07-23 20:54:33 +08:00
Yan Tingwang
b83e6e2b18 fix: add system variable max_execution_time (#6511)
add system variable: max_execution_time

Signed-off-by: codephage. <tingwangyan2020@163.com>
Signed-off-by: evenyag <realevenyag@gmail.com>
2025-07-23 20:54:33 +08:00
discord9
cb74337dbe refactor(flow): faster time window expr (#6495)
* refactor: faster window expr

Signed-off-by: discord9 <discord9@163.com>

* docs: explain fast path

Signed-off-by: discord9 <discord9@163.com>

* chore: rm unwrap

Signed-off-by: discord9 <discord9@163.com>

---------

Signed-off-by: discord9 <discord9@163.com>
Signed-off-by: evenyag <realevenyag@gmail.com>
2025-07-23 20:54:33 +08:00
shuiyisong
32bffbb668 feat: add filter processor to v0.15 (#6516)
feat: add filter processor

Signed-off-by: shuiyisong <xixing.sys@gmail.com>
2025-07-14 17:43:49 +08:00
90 changed files with 6795 additions and 611 deletions

154
Cargo.lock generated
View File

@@ -211,7 +211,7 @@ checksum = "d301b3b94cb4b2f23d7917810addbbaff90738e0ca2be692bd027e70d7e0330c"
[[package]]
name = "api"
version = "0.15.2"
version = "0.15.4"
dependencies = [
"common-base",
"common-decimal",
@@ -944,7 +944,7 @@ dependencies = [
[[package]]
name = "auth"
version = "0.15.2"
version = "0.15.4"
dependencies = [
"api",
"async-trait",
@@ -1586,7 +1586,7 @@ dependencies = [
[[package]]
name = "cache"
version = "0.15.2"
version = "0.15.4"
dependencies = [
"catalog",
"common-error",
@@ -1621,7 +1621,7 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
[[package]]
name = "catalog"
version = "0.15.2"
version = "0.15.4"
dependencies = [
"api",
"arrow 54.2.1",
@@ -1959,7 +1959,7 @@ checksum = "1462739cb27611015575c0c11df5df7601141071f07518d56fcc1be504cbec97"
[[package]]
name = "cli"
version = "0.15.2"
version = "0.15.4"
dependencies = [
"async-stream",
"async-trait",
@@ -2004,7 +2004,7 @@ dependencies = [
"session",
"snafu 0.8.5",
"store-api",
"substrait 0.15.2",
"substrait 0.15.4",
"table",
"tempfile",
"tokio",
@@ -2013,7 +2013,7 @@ dependencies = [
[[package]]
name = "client"
version = "0.15.2"
version = "0.15.4"
dependencies = [
"api",
"arc-swap",
@@ -2043,7 +2043,7 @@ dependencies = [
"rand 0.9.0",
"serde_json",
"snafu 0.8.5",
"substrait 0.15.2",
"substrait 0.15.4",
"substrait 0.37.3",
"tokio",
"tokio-stream",
@@ -2084,7 +2084,7 @@ dependencies = [
[[package]]
name = "cmd"
version = "0.15.2"
version = "0.15.4"
dependencies = [
"async-trait",
"auth",
@@ -2145,7 +2145,7 @@ dependencies = [
"snafu 0.8.5",
"stat",
"store-api",
"substrait 0.15.2",
"substrait 0.15.4",
"table",
"temp-env",
"tempfile",
@@ -2192,7 +2192,7 @@ checksum = "55b672471b4e9f9e95499ea597ff64941a309b2cdbffcc46f2cc5e2d971fd335"
[[package]]
name = "common-base"
version = "0.15.2"
version = "0.15.4"
dependencies = [
"anymap2",
"async-trait",
@@ -2214,11 +2214,11 @@ dependencies = [
[[package]]
name = "common-catalog"
version = "0.15.2"
version = "0.15.4"
[[package]]
name = "common-config"
version = "0.15.2"
version = "0.15.4"
dependencies = [
"common-base",
"common-error",
@@ -2243,7 +2243,7 @@ dependencies = [
[[package]]
name = "common-datasource"
version = "0.15.2"
version = "0.15.4"
dependencies = [
"arrow 54.2.1",
"arrow-schema 54.3.1",
@@ -2280,7 +2280,7 @@ dependencies = [
[[package]]
name = "common-decimal"
version = "0.15.2"
version = "0.15.4"
dependencies = [
"bigdecimal 0.4.8",
"common-error",
@@ -2293,7 +2293,7 @@ dependencies = [
[[package]]
name = "common-error"
version = "0.15.2"
version = "0.15.4"
dependencies = [
"common-macro",
"http 1.1.0",
@@ -2304,7 +2304,7 @@ dependencies = [
[[package]]
name = "common-frontend"
version = "0.15.2"
version = "0.15.4"
dependencies = [
"async-trait",
"common-error",
@@ -2320,7 +2320,7 @@ dependencies = [
[[package]]
name = "common-function"
version = "0.15.2"
version = "0.15.4"
dependencies = [
"ahash 0.8.11",
"api",
@@ -2373,7 +2373,7 @@ dependencies = [
[[package]]
name = "common-greptimedb-telemetry"
version = "0.15.2"
version = "0.15.4"
dependencies = [
"async-trait",
"common-runtime",
@@ -2390,7 +2390,7 @@ dependencies = [
[[package]]
name = "common-grpc"
version = "0.15.2"
version = "0.15.4"
dependencies = [
"api",
"arrow-flight",
@@ -2422,7 +2422,7 @@ dependencies = [
[[package]]
name = "common-grpc-expr"
version = "0.15.2"
version = "0.15.4"
dependencies = [
"api",
"common-base",
@@ -2441,7 +2441,7 @@ dependencies = [
[[package]]
name = "common-macro"
version = "0.15.2"
version = "0.15.4"
dependencies = [
"arc-swap",
"common-query",
@@ -2455,7 +2455,7 @@ dependencies = [
[[package]]
name = "common-mem-prof"
version = "0.15.2"
version = "0.15.4"
dependencies = [
"anyhow",
"common-error",
@@ -2471,7 +2471,7 @@ dependencies = [
[[package]]
name = "common-meta"
version = "0.15.2"
version = "0.15.4"
dependencies = [
"anymap2",
"api",
@@ -2536,7 +2536,7 @@ dependencies = [
[[package]]
name = "common-options"
version = "0.15.2"
version = "0.15.4"
dependencies = [
"common-grpc",
"humantime-serde",
@@ -2545,11 +2545,11 @@ dependencies = [
[[package]]
name = "common-plugins"
version = "0.15.2"
version = "0.15.4"
[[package]]
name = "common-pprof"
version = "0.15.2"
version = "0.15.4"
dependencies = [
"common-error",
"common-macro",
@@ -2561,7 +2561,7 @@ dependencies = [
[[package]]
name = "common-procedure"
version = "0.15.2"
version = "0.15.4"
dependencies = [
"async-stream",
"async-trait",
@@ -2588,7 +2588,7 @@ dependencies = [
[[package]]
name = "common-procedure-test"
version = "0.15.2"
version = "0.15.4"
dependencies = [
"async-trait",
"common-procedure",
@@ -2597,7 +2597,7 @@ dependencies = [
[[package]]
name = "common-query"
version = "0.15.2"
version = "0.15.4"
dependencies = [
"api",
"async-trait",
@@ -2623,7 +2623,7 @@ dependencies = [
[[package]]
name = "common-recordbatch"
version = "0.15.2"
version = "0.15.4"
dependencies = [
"arc-swap",
"common-error",
@@ -2643,7 +2643,7 @@ dependencies = [
[[package]]
name = "common-runtime"
version = "0.15.2"
version = "0.15.4"
dependencies = [
"async-trait",
"clap 4.5.19",
@@ -2673,14 +2673,14 @@ dependencies = [
[[package]]
name = "common-session"
version = "0.15.2"
version = "0.15.4"
dependencies = [
"strum 0.27.1",
]
[[package]]
name = "common-telemetry"
version = "0.15.2"
version = "0.15.4"
dependencies = [
"backtrace",
"common-error",
@@ -2708,7 +2708,7 @@ dependencies = [
[[package]]
name = "common-test-util"
version = "0.15.2"
version = "0.15.4"
dependencies = [
"client",
"common-grpc",
@@ -2721,7 +2721,7 @@ dependencies = [
[[package]]
name = "common-time"
version = "0.15.2"
version = "0.15.4"
dependencies = [
"arrow 54.2.1",
"chrono",
@@ -2739,7 +2739,7 @@ dependencies = [
[[package]]
name = "common-version"
version = "0.15.2"
version = "0.15.4"
dependencies = [
"build-data",
"cargo-manifest",
@@ -2750,7 +2750,7 @@ dependencies = [
[[package]]
name = "common-wal"
version = "0.15.2"
version = "0.15.4"
dependencies = [
"common-base",
"common-error",
@@ -2773,7 +2773,7 @@ dependencies = [
[[package]]
name = "common-workload"
version = "0.15.2"
version = "0.15.4"
dependencies = [
"api",
"common-telemetry",
@@ -3729,7 +3729,7 @@ dependencies = [
[[package]]
name = "datanode"
version = "0.15.2"
version = "0.15.4"
dependencies = [
"api",
"arrow-flight",
@@ -3782,7 +3782,7 @@ dependencies = [
"session",
"snafu 0.8.5",
"store-api",
"substrait 0.15.2",
"substrait 0.15.4",
"table",
"tokio",
"toml 0.8.19",
@@ -3791,7 +3791,7 @@ dependencies = [
[[package]]
name = "datatypes"
version = "0.15.2"
version = "0.15.4"
dependencies = [
"arrow 54.2.1",
"arrow-array 54.2.1",
@@ -4451,7 +4451,7 @@ checksum = "e8c02a5121d4ea3eb16a80748c74f5549a5665e4c21333c6098f283870fbdea6"
[[package]]
name = "file-engine"
version = "0.15.2"
version = "0.15.4"
dependencies = [
"api",
"async-trait",
@@ -4588,7 +4588,7 @@ checksum = "8bf7cc16383c4b8d58b9905a8509f02926ce3058053c056376248d958c9df1e8"
[[package]]
name = "flow"
version = "0.15.2"
version = "0.15.4"
dependencies = [
"api",
"arrow 54.2.1",
@@ -4653,7 +4653,7 @@ dependencies = [
"sql",
"store-api",
"strum 0.27.1",
"substrait 0.15.2",
"substrait 0.15.4",
"table",
"tokio",
"tonic 0.12.3",
@@ -4708,7 +4708,7 @@ checksum = "6c2141d6d6c8512188a7891b4b01590a45f6dac67afb4f255c4124dbb86d4eaa"
[[package]]
name = "frontend"
version = "0.15.2"
version = "0.15.4"
dependencies = [
"api",
"arc-swap",
@@ -4768,7 +4768,7 @@ dependencies = [
"sqlparser 0.54.0 (git+https://github.com/GreptimeTeam/sqlparser-rs.git?rev=0cf6c04490d59435ee965edd2078e8855bd8471e)",
"store-api",
"strfmt",
"substrait 0.15.2",
"substrait 0.15.4",
"table",
"tokio",
"tokio-util",
@@ -5158,7 +5158,7 @@ dependencies = [
[[package]]
name = "greptime-proto"
version = "0.1.0"
source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=96c733f8472284d3c83a4c011dc6de9cf830c353#96c733f8472284d3c83a4c011dc6de9cf830c353"
source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=a5d256ba4abb7393e0859ffbf7fca1e38f3433dc#a5d256ba4abb7393e0859ffbf7fca1e38f3433dc"
dependencies = [
"prost 0.13.5",
"serde",
@@ -5929,7 +5929,7 @@ dependencies = [
[[package]]
name = "index"
version = "0.15.2"
version = "0.15.4"
dependencies = [
"async-trait",
"asynchronous-codec",
@@ -6814,7 +6814,7 @@ checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24"
[[package]]
name = "log-query"
version = "0.15.2"
version = "0.15.4"
dependencies = [
"chrono",
"common-error",
@@ -6826,7 +6826,7 @@ dependencies = [
[[package]]
name = "log-store"
version = "0.15.2"
version = "0.15.4"
dependencies = [
"async-stream",
"async-trait",
@@ -7124,7 +7124,7 @@ dependencies = [
[[package]]
name = "meta-client"
version = "0.15.2"
version = "0.15.4"
dependencies = [
"api",
"async-trait",
@@ -7152,7 +7152,7 @@ dependencies = [
[[package]]
name = "meta-srv"
version = "0.15.2"
version = "0.15.4"
dependencies = [
"api",
"async-trait",
@@ -7243,7 +7243,7 @@ dependencies = [
[[package]]
name = "metric-engine"
version = "0.15.2"
version = "0.15.4"
dependencies = [
"api",
"aquamarine",
@@ -7333,7 +7333,7 @@ dependencies = [
[[package]]
name = "mito-codec"
version = "0.15.2"
version = "0.15.4"
dependencies = [
"api",
"bytes",
@@ -7356,7 +7356,7 @@ dependencies = [
[[package]]
name = "mito2"
version = "0.15.2"
version = "0.15.4"
dependencies = [
"api",
"aquamarine",
@@ -8106,7 +8106,7 @@ dependencies = [
[[package]]
name = "object-store"
version = "0.15.2"
version = "0.15.4"
dependencies = [
"anyhow",
"bytes",
@@ -8420,7 +8420,7 @@ dependencies = [
[[package]]
name = "operator"
version = "0.15.2"
version = "0.15.4"
dependencies = [
"ahash 0.8.11",
"api",
@@ -8475,7 +8475,7 @@ dependencies = [
"sql",
"sqlparser 0.54.0 (git+https://github.com/GreptimeTeam/sqlparser-rs.git?rev=0cf6c04490d59435ee965edd2078e8855bd8471e)",
"store-api",
"substrait 0.15.2",
"substrait 0.15.4",
"table",
"tokio",
"tokio-util",
@@ -8742,7 +8742,7 @@ dependencies = [
[[package]]
name = "partition"
version = "0.15.2"
version = "0.15.4"
dependencies = [
"api",
"async-trait",
@@ -9030,7 +9030,7 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
[[package]]
name = "pipeline"
version = "0.15.2"
version = "0.15.4"
dependencies = [
"ahash 0.8.11",
"api",
@@ -9173,7 +9173,7 @@ dependencies = [
[[package]]
name = "plugins"
version = "0.15.2"
version = "0.15.4"
dependencies = [
"auth",
"clap 4.5.19",
@@ -9486,7 +9486,7 @@ dependencies = [
[[package]]
name = "promql"
version = "0.15.2"
version = "0.15.4"
dependencies = [
"ahash 0.8.11",
"async-trait",
@@ -9768,7 +9768,7 @@ dependencies = [
[[package]]
name = "puffin"
version = "0.15.2"
version = "0.15.4"
dependencies = [
"async-compression 0.4.13",
"async-trait",
@@ -9810,7 +9810,7 @@ dependencies = [
[[package]]
name = "query"
version = "0.15.2"
version = "0.15.4"
dependencies = [
"ahash 0.8.11",
"api",
@@ -9876,7 +9876,7 @@ dependencies = [
"sqlparser 0.54.0 (git+https://github.com/GreptimeTeam/sqlparser-rs.git?rev=0cf6c04490d59435ee965edd2078e8855bd8471e)",
"statrs",
"store-api",
"substrait 0.15.2",
"substrait 0.15.4",
"table",
"tokio",
"tokio-stream",
@@ -11162,7 +11162,7 @@ dependencies = [
[[package]]
name = "servers"
version = "0.15.2"
version = "0.15.4"
dependencies = [
"ahash 0.8.11",
"api",
@@ -11283,7 +11283,7 @@ dependencies = [
[[package]]
name = "session"
version = "0.15.2"
version = "0.15.4"
dependencies = [
"api",
"arc-swap",
@@ -11622,7 +11622,7 @@ dependencies = [
[[package]]
name = "sql"
version = "0.15.2"
version = "0.15.4"
dependencies = [
"api",
"chrono",
@@ -11677,7 +11677,7 @@ dependencies = [
[[package]]
name = "sqlness-runner"
version = "0.15.2"
version = "0.15.4"
dependencies = [
"async-trait",
"clap 4.5.19",
@@ -11977,7 +11977,7 @@ dependencies = [
[[package]]
name = "stat"
version = "0.15.2"
version = "0.15.4"
dependencies = [
"nix 0.30.1",
]
@@ -12003,7 +12003,7 @@ dependencies = [
[[package]]
name = "store-api"
version = "0.15.2"
version = "0.15.4"
dependencies = [
"api",
"aquamarine",
@@ -12164,7 +12164,7 @@ dependencies = [
[[package]]
name = "substrait"
version = "0.15.2"
version = "0.15.4"
dependencies = [
"async-trait",
"bytes",
@@ -12344,7 +12344,7 @@ dependencies = [
[[package]]
name = "table"
version = "0.15.2"
version = "0.15.4"
dependencies = [
"api",
"async-trait",
@@ -12605,7 +12605,7 @@ checksum = "3369f5ac52d5eb6ab48c6b4ffdc8efbcad6b89c765749064ba298f2c68a16a76"
[[package]]
name = "tests-fuzz"
version = "0.15.2"
version = "0.15.4"
dependencies = [
"arbitrary",
"async-trait",
@@ -12649,7 +12649,7 @@ dependencies = [
[[package]]
name = "tests-integration"
version = "0.15.2"
version = "0.15.4"
dependencies = [
"api",
"arrow-flight",
@@ -12716,7 +12716,7 @@ dependencies = [
"sql",
"sqlx",
"store-api",
"substrait 0.15.2",
"substrait 0.15.4",
"table",
"tempfile",
"time",

View File

@@ -71,7 +71,7 @@ members = [
resolver = "2"
[workspace.package]
version = "0.15.2"
version = "0.15.4"
edition = "2021"
license = "Apache-2.0"
@@ -134,7 +134,7 @@ etcd-client = "0.14"
fst = "0.4.7"
futures = "0.3"
futures-util = "0.3"
greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "96c733f8472284d3c83a4c011dc6de9cf830c353" }
greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "a5d256ba4abb7393e0859ffbf7fca1e38f3433dc" }
hex = "0.4"
http = "1"
humantime = "2.1"

View File

@@ -147,6 +147,7 @@
| `region_engine.mito.write_cache_ttl` | String | Unset | TTL for write cache. |
| `region_engine.mito.sst_write_buffer_size` | String | `8MB` | Buffer size for SST writing. |
| `region_engine.mito.parallel_scan_channel_size` | Integer | `32` | Capacity of the channel to send data from parallel scan tasks to the main task. |
| `region_engine.mito.max_concurrent_scan_files` | Integer | `128` | Maximum number of SST files to scan concurrently. |
| `region_engine.mito.allow_stale_entries` | Bool | `false` | Whether to allow stale WAL entries read during replay. |
| `region_engine.mito.min_compaction_interval` | String | `0m` | Minimum time interval between two compactions.<br/>To align with the old behavior, the default value is 0 (no restrictions). |
| `region_engine.mito.index` | -- | -- | The options for index in Mito engine. |
@@ -496,6 +497,7 @@
| `region_engine.mito.write_cache_ttl` | String | Unset | TTL for write cache. |
| `region_engine.mito.sst_write_buffer_size` | String | `8MB` | Buffer size for SST writing. |
| `region_engine.mito.parallel_scan_channel_size` | Integer | `32` | Capacity of the channel to send data from parallel scan tasks to the main task. |
| `region_engine.mito.max_concurrent_scan_files` | Integer | `128` | Maximum number of SST files to scan concurrently. |
| `region_engine.mito.allow_stale_entries` | Bool | `false` | Whether to allow stale WAL entries read during replay. |
| `region_engine.mito.min_compaction_interval` | String | `0m` | Minimum time interval between two compactions.<br/>To align with the old behavior, the default value is 0 (no restrictions). |
| `region_engine.mito.index` | -- | -- | The options for index in Mito engine. |

View File

@@ -474,6 +474,9 @@ sst_write_buffer_size = "8MB"
## Capacity of the channel to send data from parallel scan tasks to the main task.
parallel_scan_channel_size = 32
## Maximum number of SST files to scan concurrently.
max_concurrent_scan_files = 128
## Whether to allow stale WAL entries read during replay.
allow_stale_entries = false

View File

@@ -565,6 +565,9 @@ sst_write_buffer_size = "8MB"
## Capacity of the channel to send data from parallel scan tasks to the main task.
parallel_scan_channel_size = 32
## Maximum number of SST files to scan concurrently.
max_concurrent_scan_files = 128
## Whether to allow stale WAL entries read during replay.
allow_stale_entries = false
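The same limit can be set in code. A minimal sketch, assuming it is compiled inside the GreptimeDB workspace where the `mito2` crate is available, using the public `MitoConfig` fields shown in this changeset (the helper name `scan_limited_config` is illustrative):

```rust
// Sketch only: mirrors the TOML settings above in code, using the MitoConfig
// fields added in this changeset (the engine test further below constructs
// the config the same way).
use mito2::config::MitoConfig;

fn scan_limited_config() -> MitoConfig {
    MitoConfig {
        // Capacity of the channel from parallel scan tasks to the main task.
        parallel_scan_channel_size: 32,
        // Maximum number of SST files a single scan may read concurrently.
        max_concurrent_scan_files: 128,
        ..Default::default()
    }
}
```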

View File

@@ -211,12 +211,18 @@ impl Database {
retries += 1;
warn!("Retrying {} times with error = {:?}", retries, err);
continue;
} else {
error!(
err; "Failed to send request to grpc handle, retries = {}, not retryable error, aborting",
retries
);
return Err(err.into());
}
}
(Err(err), false) => {
error!(
"Failed to send request to grpc handle after {} retries, error = {:?}",
retries, err
err; "Failed to send request to grpc handle after {} retries",
retries,
);
return Err(err.into());
}

View File

@@ -163,19 +163,70 @@ impl RegionRequester {
let _span = tracing_context.attach(common_telemetry::tracing::info_span!(
"poll_flight_data_stream"
));
while let Some(flight_message) = flight_message_stream.next().await {
let flight_message = flight_message
.map_err(BoxedError::new)
.context(ExternalSnafu)?;
let mut buffered_message: Option<FlightMessage> = None;
let mut stream_ended = false;
while !stream_ended {
// get the next message from the buffered message or read from the flight message stream
let flight_message_item = if let Some(msg) = buffered_message.take() {
Some(Ok(msg))
} else {
flight_message_stream.next().await
};
let flight_message = match flight_message_item {
Some(Ok(message)) => message,
Some(Err(e)) => {
yield Err(BoxedError::new(e)).context(ExternalSnafu);
break;
}
None => break,
};
match flight_message {
FlightMessage::RecordBatch(record_batch) => {
yield RecordBatch::try_from_df_record_batch(
let result_to_yield = RecordBatch::try_from_df_record_batch(
schema_cloned.clone(),
record_batch,
)
);
// get the next message from the stream. normally it should be a metrics message.
if let Some(next_flight_message_result) = flight_message_stream.next().await
{
match next_flight_message_result {
Ok(FlightMessage::Metrics(s)) => {
let m = serde_json::from_str(&s).ok().map(Arc::new);
metrics_ref.swap(m);
}
Ok(FlightMessage::RecordBatch(rb)) => {
// for some reason it's not a metrics message, so we need to buffer this record batch
// and yield it in the next iteration.
buffered_message = Some(FlightMessage::RecordBatch(rb));
}
Ok(_) => {
yield IllegalFlightMessagesSnafu {
reason: "A RecordBatch message can only be succeeded by a Metrics message or another RecordBatch message"
}
.fail()
.map_err(BoxedError::new)
.context(ExternalSnafu);
break;
}
Err(e) => {
yield Err(BoxedError::new(e)).context(ExternalSnafu);
break;
}
}
} else {
// the stream has ended
stream_ended = true;
}
yield result_to_yield;
}
FlightMessage::Metrics(s) => {
// just a branch in case of some metrics message comes after other things.
let m = serde_json::from_str(&s).ok().map(Arc::new);
metrics_ref.swap(m);
break;

View File

@@ -15,6 +15,7 @@
use std::collections::HashMap;
use std::sync::Arc;
use common_telemetry::info;
use futures::future::BoxFuture;
use moka::future::Cache;
use moka::ops::compute::Op;
@@ -89,6 +90,12 @@ fn init_factory(table_flow_manager: TableFlowManagerRef) -> Initializer<TableId,
// we have a corresponding cache invalidation mechanism to invalidate `(Key, EmptyHashSet)`.
.map(Arc::new)
.map(Some)
.inspect(|set| {
info!(
"Initialized table_flownode cache for table_id: {}, set: {:?}",
table_id, set
);
})
})
})
}
@@ -167,6 +174,13 @@ fn invalidator<'a>(
match ident {
CacheIdent::CreateFlow(create_flow) => handle_create_flow(cache, create_flow).await,
CacheIdent::DropFlow(drop_flow) => handle_drop_flow(cache, drop_flow).await,
CacheIdent::FlowNodeAddressChange(node_id) => {
info!(
"Invalidate flow node cache for node_id in table_flownode: {}",
node_id
);
cache.invalidate_all();
}
_ => {}
}
Ok(())
@@ -174,7 +188,10 @@ fn invalidator<'a>(
}
fn filter(ident: &CacheIdent) -> bool {
matches!(ident, CacheIdent::CreateFlow(_) | CacheIdent::DropFlow(_))
matches!(
ident,
CacheIdent::CreateFlow(_) | CacheIdent::DropFlow(_) | CacheIdent::FlowNodeAddressChange(_)
)
}
#[cfg(test)]

View File

@@ -22,6 +22,7 @@ use crate::key::flow::flow_name::FlowNameKey;
use crate::key::flow::flow_route::FlowRouteKey;
use crate::key::flow::flownode_flow::FlownodeFlowKey;
use crate::key::flow::table_flow::TableFlowKey;
use crate::key::node_address::NodeAddressKey;
use crate::key::schema_name::SchemaNameKey;
use crate::key::table_info::TableInfoKey;
use crate::key::table_name::TableNameKey;
@@ -53,6 +54,10 @@ pub struct Context {
#[async_trait::async_trait]
pub trait CacheInvalidator: Send + Sync {
async fn invalidate(&self, ctx: &Context, caches: &[CacheIdent]) -> Result<()>;
fn name(&self) -> &'static str {
std::any::type_name::<Self>()
}
}
pub type CacheInvalidatorRef = Arc<dyn CacheInvalidator>;
@@ -137,6 +142,13 @@ where
let key = FlowInfoKey::new(*flow_id);
self.invalidate_key(&key.to_bytes()).await;
}
CacheIdent::FlowNodeAddressChange(node_id) => {
// other caches doesn't need to be invalidated
// since this is only for flownode address change not id change
common_telemetry::info!("Invalidate flow node cache for node_id: {}", node_id);
let key = NodeAddressKey::with_flownode(*node_id);
self.invalidate_key(&key.to_bytes()).await;
}
}
}
Ok(())

View File

@@ -174,6 +174,8 @@ pub struct UpgradeRegion {
/// The identifier of cache.
pub enum CacheIdent {
FlowId(FlowId),
/// Indicate change of address of flownode.
FlowNodeAddressChange(u64),
FlowName(FlowName),
TableId(TableId),
TableName(TableName),

View File

@@ -222,6 +222,7 @@ pub struct RecordBatchStreamAdapter {
enum Metrics {
Unavailable,
Unresolved(Arc<dyn ExecutionPlan>),
PartialResolved(Arc<dyn ExecutionPlan>, RecordBatchMetrics),
Resolved(RecordBatchMetrics),
}
@@ -275,7 +276,9 @@ impl RecordBatchStream for RecordBatchStreamAdapter {
fn metrics(&self) -> Option<RecordBatchMetrics> {
match &self.metrics_2 {
Metrics::Resolved(metrics) => Some(metrics.clone()),
Metrics::Resolved(metrics) | Metrics::PartialResolved(_, metrics) => {
Some(metrics.clone())
}
Metrics::Unavailable | Metrics::Unresolved(_) => None,
}
}
@@ -299,13 +302,25 @@ impl Stream for RecordBatchStreamAdapter {
Poll::Pending => Poll::Pending,
Poll::Ready(Some(df_record_batch)) => {
let df_record_batch = df_record_batch?;
if let Metrics::Unresolved(df_plan) | Metrics::PartialResolved(df_plan, _) =
&self.metrics_2
{
let mut metric_collector = MetricCollector::new(self.explain_verbose);
accept(df_plan.as_ref(), &mut metric_collector).unwrap();
self.metrics_2 = Metrics::PartialResolved(
df_plan.clone(),
metric_collector.record_batch_metrics,
);
}
Poll::Ready(Some(RecordBatch::try_from_df_record_batch(
self.schema(),
df_record_batch,
)))
}
Poll::Ready(None) => {
if let Metrics::Unresolved(df_plan) = &self.metrics_2 {
if let Metrics::Unresolved(df_plan) | Metrics::PartialResolved(df_plan, _) =
&self.metrics_2
{
let mut metric_collector = MetricCollector::new(self.explain_verbose);
accept(df_plan.as_ref(), &mut metric_collector).unwrap();
self.metrics_2 = Metrics::Resolved(metric_collector.record_batch_metrics);

View File

@@ -19,7 +19,8 @@ use datafusion::execution::registry::SerializerRegistry;
use datafusion_common::DataFusionError;
use datafusion_expr::UserDefinedLogicalNode;
use promql::extension_plan::{
EmptyMetric, InstantManipulate, RangeManipulate, ScalarCalculate, SeriesDivide, SeriesNormalize,
Absent, EmptyMetric, InstantManipulate, RangeManipulate, ScalarCalculate, SeriesDivide,
SeriesNormalize,
};
#[derive(Debug)]
@@ -65,6 +66,13 @@ impl SerializerRegistry for ExtensionSerializer {
.expect("Failed to downcast to SeriesDivide");
Ok(series_divide.serialize())
}
name if name == Absent::name() => {
let absent = node
.as_any()
.downcast_ref::<Absent>()
.expect("Failed to downcast to Absent");
Ok(absent.serialize())
}
name if name == EmptyMetric::name() => Err(DataFusionError::Substrait(
"EmptyMetric should not be serialized".to_string(),
)),
@@ -103,6 +111,10 @@ impl SerializerRegistry for ExtensionSerializer {
let scalar_calculate = ScalarCalculate::deserialize(bytes)?;
Ok(Arc::new(scalar_calculate))
}
name if name == Absent::name() => {
let absent = Absent::deserialize(bytes)?;
Ok(Arc::new(absent))
}
name if name == EmptyMetric::name() => Err(DataFusionError::Substrait(
"EmptyMetric should not be deserialized".to_string(),
)),

View File

@@ -51,7 +51,7 @@ use servers::error::{self as servers_error, ExecuteGrpcRequestSnafu, Result as S
use servers::grpc::flight::{FlightCraft, FlightRecordBatchStream, TonicStream};
use servers::grpc::region_server::RegionServerHandler;
use servers::grpc::FlightCompression;
use session::context::{QueryContextBuilder, QueryContextRef};
use session::context::{QueryContext, QueryContextBuilder, QueryContextRef};
use snafu::{ensure, OptionExt, ResultExt};
use store_api::metric_engine_consts::{
FILE_ENGINE_NAME, LOGICAL_TABLE_METADATA_KEY, METRIC_ENGINE_NAME,
@@ -194,6 +194,7 @@ impl RegionServer {
pub async fn handle_remote_read(
&self,
request: api::v1::region::QueryRequest,
query_ctx: QueryContextRef,
) -> Result<SendableRecordBatchStream> {
let _permit = if let Some(p) = &self.inner.parallelism {
Some(p.acquire().await?)
@@ -201,12 +202,6 @@ impl RegionServer {
None
};
let query_ctx: QueryContextRef = request
.header
.as_ref()
.map(|h| Arc::new(h.into()))
.unwrap_or_else(|| Arc::new(QueryContextBuilder::default().build()));
let region_id = RegionId::from_u64(request.region_id);
let provider = self.table_provider(region_id, Some(&query_ctx)).await?;
let catalog_list = Arc::new(DummyCatalogList::with_table_provider(provider));
@@ -214,7 +209,7 @@ impl RegionServer {
let decoder = self
.inner
.query_engine
.engine_context(query_ctx)
.engine_context(query_ctx.clone())
.new_plan_decoder()
.context(NewPlanDecoderSnafu)?;
@@ -224,11 +219,14 @@ impl RegionServer {
.context(DecodeLogicalPlanSnafu)?;
self.inner
.handle_read(QueryRequest {
header: request.header,
region_id,
plan,
})
.handle_read(
QueryRequest {
header: request.header,
region_id,
plan,
},
query_ctx,
)
.await
}
@@ -243,6 +241,7 @@ impl RegionServer {
let ctx: Option<session::context::QueryContext> = request.header.as_ref().map(|h| h.into());
let provider = self.table_provider(request.region_id, ctx.as_ref()).await?;
let query_ctx = Arc::new(ctx.unwrap_or_else(|| QueryContextBuilder::default().build()));
struct RegionDataSourceInjector {
source: Arc<dyn TableSource>,
@@ -271,7 +270,7 @@ impl RegionServer {
.data;
self.inner
.handle_read(QueryRequest { plan, ..request })
.handle_read(QueryRequest { plan, ..request }, query_ctx)
.await
}
@@ -536,9 +535,14 @@ impl FlightCraft for RegionServer {
.as_ref()
.map(|h| TracingContext::from_w3c(&h.tracing_context))
.unwrap_or_default();
let query_ctx = request
.header
.as_ref()
.map(|h| Arc::new(QueryContext::from(h)))
.unwrap_or(QueryContext::arc());
let result = self
.handle_remote_read(request)
.handle_remote_read(request, query_ctx.clone())
.trace(tracing_context.attach(info_span!("RegionServer::handle_read")))
.await?;
@@ -546,6 +550,7 @@ impl FlightCraft for RegionServer {
result,
tracing_context,
self.flight_compression,
query_ctx,
));
Ok(Response::new(stream))
}
@@ -1123,16 +1128,13 @@ impl RegionServerInner {
Ok(())
}
pub async fn handle_read(&self, request: QueryRequest) -> Result<SendableRecordBatchStream> {
pub async fn handle_read(
&self,
request: QueryRequest,
query_ctx: QueryContextRef,
) -> Result<SendableRecordBatchStream> {
// TODO(ruihang): add metrics and set trace id
// Build query context from gRPC header
let query_ctx: QueryContextRef = request
.header
.as_ref()
.map(|h| Arc::new(h.into()))
.unwrap_or_else(|| QueryContextBuilder::default().build().into());
let result = self
.query_engine
.execute(request.plan, query_ctx)

View File

@@ -14,7 +14,7 @@
//! Batching mode engine
use std::collections::{BTreeMap, HashMap};
use std::collections::{BTreeMap, HashMap, HashSet};
use std::sync::Arc;
use api::v1::flow::{DirtyWindowRequests, FlowResponse};
@@ -142,7 +142,7 @@ impl BatchingEngine {
let handle: JoinHandle<Result<(), Error>> = tokio::spawn(async move {
let src_table_names = &task.config.source_table_names;
let mut all_dirty_windows = vec![];
let mut all_dirty_windows = HashSet::new();
for src_table_name in src_table_names {
if let Some((timestamps, unit)) = group_by_table_name.get(src_table_name) {
let Some(expr) = &task.config.time_window_expr else {
@@ -155,7 +155,7 @@ impl BatchingEngine {
.context(UnexpectedSnafu {
reason: "Failed to eval start value",
})?;
all_dirty_windows.push(align_start);
all_dirty_windows.insert(align_start);
}
}
}

View File

@@ -50,7 +50,8 @@ use snafu::{ensure, OptionExt, ResultExt};
use crate::adapter::util::from_proto_to_data_type;
use crate::error::{
ArrowSnafu, DatafusionSnafu, DatatypesSnafu, ExternalSnafu, PlanSnafu, UnexpectedSnafu,
ArrowSnafu, DatafusionSnafu, DatatypesSnafu, ExternalSnafu, PlanSnafu, TimeSnafu,
UnexpectedSnafu,
};
use crate::expr::error::DataTypeSnafu;
use crate::Error;
@@ -74,6 +75,7 @@ pub struct TimeWindowExpr {
logical_expr: Expr,
df_schema: DFSchema,
eval_time_window_size: Option<std::time::Duration>,
eval_time_original: Option<Timestamp>,
}
impl std::fmt::Display for TimeWindowExpr {
@@ -106,10 +108,11 @@ impl TimeWindowExpr {
logical_expr: expr.clone(),
df_schema: df_schema.clone(),
eval_time_window_size: None,
eval_time_original: None,
};
let test_ts = DEFAULT_TEST_TIMESTAMP;
let (l, u) = zelf.eval(test_ts)?;
let time_window_size = match (l, u) {
let (lower, upper) = zelf.eval(test_ts)?;
let time_window_size = match (lower, upper) {
(Some(l), Some(u)) => u.sub(&l).map(|r| r.to_std()).transpose().map_err(|_| {
UnexpectedSnafu {
reason: format!(
@@ -121,13 +124,59 @@ impl TimeWindowExpr {
_ => None,
};
zelf.eval_time_window_size = time_window_size;
zelf.eval_time_original = lower;
Ok(zelf)
}
/// TODO(discord9): add `eval_batch` too
pub fn eval(
&self,
current: Timestamp,
) -> Result<(Option<Timestamp>, Option<Timestamp>), Error> {
fn compute_distance(time_diff_ns: i64, stride_ns: i64) -> i64 {
if stride_ns == 0 {
return time_diff_ns;
}
// a - (a % n) impl ceil to nearest n * stride
let time_delta = time_diff_ns - (time_diff_ns % stride_ns);
if time_diff_ns < 0 && time_delta != time_diff_ns {
// The origin is later than the source timestamp, round down to the previous bin
time_delta - stride_ns
} else {
time_delta
}
}
// FAST PATH: if we have eval_time_original and eval_time_window_size,
// we can compute the bounds directly
if let (Some(original), Some(window_size)) =
(self.eval_time_original, self.eval_time_window_size)
{
// date_bin align current to lower bound
let time_diff_ns = current.sub(&original).and_then(|s|s.num_nanoseconds()).with_context(||UnexpectedSnafu {
reason: format!(
"Failed to compute time difference between current {current:?} and original {original:?}"
),
})?;
let window_size_ns = window_size.as_nanos() as i64;
let distance_ns = compute_distance(time_diff_ns, window_size_ns);
let lower_bound = if distance_ns >= 0 {
original.add_duration(std::time::Duration::from_nanos(distance_ns as u64))
} else {
original.sub_duration(std::time::Duration::from_nanos((-distance_ns) as u64))
}
.context(TimeSnafu)?;
let upper_bound = lower_bound.add_duration(window_size).context(TimeSnafu)?;
return Ok((Some(lower_bound), Some(upper_bound)));
}
let lower_bound =
calc_expr_time_window_lower_bound(&self.phy_expr, &self.df_schema, current)?;
let upper_bound =
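The fast path above hinges on `compute_distance`. The following standalone copy of that helper, with two worked cases, shows how a timestamp is aligned to its window's lower bound (the `main` function and the concrete numbers are illustrative additions):

```rust
/// Standalone copy of the alignment helper from the fast path above: it rounds
/// `time_diff_ns` down to a multiple of `stride_ns`, stepping back one extra
/// stride when the diff is negative and not already on a bin boundary.
fn compute_distance(time_diff_ns: i64, stride_ns: i64) -> i64 {
    if stride_ns == 0 {
        return time_diff_ns;
    }
    let time_delta = time_diff_ns - (time_diff_ns % stride_ns);
    if time_diff_ns < 0 && time_delta != time_diff_ns {
        // The origin is later than the source timestamp, round down to the previous bin.
        time_delta - stride_ns
    } else {
        time_delta
    }
}

fn main() {
    let stride_ns = 60_000_000_000; // a 60 s time window in nanoseconds
    // 90 s after the origin falls into the bin starting 60 s after the origin.
    assert_eq!(compute_distance(90_000_000_000, stride_ns), 60_000_000_000);
    // 10 s before the origin rounds down to the bin starting 60 s before it.
    assert_eq!(compute_distance(-10_000_000_000, stride_ns), -60_000_000_000);
    println!("ok");
}
```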

View File

@@ -21,9 +21,10 @@ use common_catalog::format_full_table_name;
use common_recordbatch::util;
use common_telemetry::tracing;
use datatypes::prelude::Value;
use promql_parser::label::{Matcher, Matchers};
use promql_parser::label::{MatchOp, Matcher, Matchers};
use query::promql;
use query::promql::planner::PromPlanner;
use servers::prom_store::{DATABASE_LABEL, SCHEMA_LABEL};
use servers::prometheus;
use session::context::QueryContextRef;
use snafu::{OptionExt, ResultExt};
@@ -114,7 +115,17 @@ impl Instance {
end: SystemTime,
ctx: &QueryContextRef,
) -> Result<Vec<String>> {
let table_schema = ctx.current_schema();
let table_schema = matchers
.iter()
.find_map(|m| {
if (m.name == SCHEMA_LABEL || m.name == DATABASE_LABEL) && m.op == MatchOp::Equal {
Some(m.value.clone())
} else {
None
}
})
.unwrap_or_else(|| ctx.current_schema());
let table = self
.catalog_manager
.table(ctx.current_catalog(), &table_schema, &metric, Some(ctx))

View File

@@ -13,6 +13,7 @@
// limitations under the License.
use api::v1::meta::{HeartbeatRequest, Peer, Role};
use common_meta::instruction::CacheIdent;
use common_meta::key::node_address::{NodeAddressKey, NodeAddressValue};
use common_meta::key::{MetadataKey, MetadataValue};
use common_meta::rpc::store::PutRequest;
@@ -80,7 +81,19 @@ async fn rewrite_node_address(ctx: &mut Context, peer: &Peer) {
match ctx.leader_cached_kv_backend.put(put).await {
Ok(_) => {
info!("Successfully updated flow `NodeAddressValue`: {:?}", peer);
// TODO(discord): broadcast invalidating cache to all frontends
// broadcast invalidating cache to all frontends
let cache_idents = vec![CacheIdent::FlowNodeAddressChange(peer.id)];
info!(
"Invalidate flow node cache for new address with cache idents: {:?}",
cache_idents
);
if let Err(e) = ctx
.cache_invalidator
.invalidate(&Default::default(), &cache_idents)
.await
{
error!(e; "Failed to invalidate {} `NodeAddressKey` cache, peer: {:?}", cache_idents.len(), peer);
}
}
Err(e) => {
error!(e; "Failed to update flow `NodeAddressValue`: {:?}", peer);

View File

@@ -62,7 +62,7 @@ use crate::read::BoxedBatchReader;
use crate::region::options::MergeMode;
use crate::region::version::VersionControlRef;
use crate::region::ManifestContextRef;
use crate::request::{OptionOutputTx, OutputTx, WorkerRequest};
use crate::request::{OptionOutputTx, OutputTx, WorkerRequestWithTime};
use crate::schedule::remote_job_scheduler::{
CompactionJob, DefaultNotifier, RemoteJob, RemoteJobSchedulerRef,
};
@@ -77,7 +77,7 @@ pub struct CompactionRequest {
pub(crate) current_version: CompactionVersion,
pub(crate) access_layer: AccessLayerRef,
/// Sender to send notification to the region worker.
pub(crate) request_sender: mpsc::Sender<WorkerRequest>,
pub(crate) request_sender: mpsc::Sender<WorkerRequestWithTime>,
/// Waiters of the compaction request.
pub(crate) waiters: Vec<OutputTx>,
/// Start time of compaction task.
@@ -101,7 +101,7 @@ pub(crate) struct CompactionScheduler {
/// Compacting regions.
region_status: HashMap<RegionId, CompactionStatus>,
/// Request sender of the worker that this scheduler belongs to.
request_sender: Sender<WorkerRequest>,
request_sender: Sender<WorkerRequestWithTime>,
cache_manager: CacheManagerRef,
engine_config: Arc<MitoConfig>,
listener: WorkerListener,
@@ -112,7 +112,7 @@ pub(crate) struct CompactionScheduler {
impl CompactionScheduler {
pub(crate) fn new(
scheduler: SchedulerRef,
request_sender: Sender<WorkerRequest>,
request_sender: Sender<WorkerRequestWithTime>,
cache_manager: CacheManagerRef,
engine_config: Arc<MitoConfig>,
listener: WorkerListener,
@@ -559,7 +559,7 @@ impl CompactionStatus {
#[allow(clippy::too_many_arguments)]
fn new_compaction_request(
&mut self,
request_sender: Sender<WorkerRequest>,
request_sender: Sender<WorkerRequestWithTime>,
mut waiter: OptionOutputTx,
engine_config: Arc<MitoConfig>,
cache_manager: CacheManagerRef,

View File

@@ -27,6 +27,7 @@ use crate::manifest::action::RegionEdit;
use crate::metrics::{COMPACTION_FAILURE_COUNT, COMPACTION_STAGE_ELAPSED};
use crate::request::{
BackgroundNotify, CompactionFailed, CompactionFinished, OutputTx, WorkerRequest,
WorkerRequestWithTime,
};
use crate::worker::WorkerListener;
use crate::{error, metrics};
@@ -37,7 +38,7 @@ pub const MAX_PARALLEL_COMPACTION: usize = 1;
pub(crate) struct CompactionTaskImpl {
pub compaction_region: CompactionRegion,
/// Request sender to notify the worker.
pub(crate) request_sender: mpsc::Sender<WorkerRequest>,
pub(crate) request_sender: mpsc::Sender<WorkerRequestWithTime>,
/// Senders that are used to notify waiters waiting for pending compaction tasks.
pub waiters: Vec<OutputTx>,
/// Start time of compaction task
@@ -135,7 +136,11 @@ impl CompactionTaskImpl {
/// Notifies region worker to handle post-compaction tasks.
async fn send_to_worker(&self, request: WorkerRequest) {
if let Err(e) = self.request_sender.send(request).await {
if let Err(e) = self
.request_sender
.send(WorkerRequestWithTime::new(request))
.await
{
error!(
"Failed to notify compaction job status for region {}, request: {:?}",
self.compaction_region.region_id, e.0
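`WorkerRequestWithTime` is used throughout these hunks but defined outside them. The following is only a plausible minimal shape, offered as a reading aid for how the wait time (see the `greptime_mito_request_wait_time` histogram added later in this changeset) could be measured; it is hypothetical, not the crate's actual definition:

```rust
use std::time::{Duration, Instant};

// Hypothetical reading aid: a wrapper that timestamps a request when it is
// enqueued so the worker can report how long it waited in the channel.
// The real `WorkerRequestWithTime` in mito2 may differ.
struct WorkerRequestWithTime<R> {
    request: R,
    enqueued_at: Instant,
}

impl<R> WorkerRequestWithTime<R> {
    fn new(request: R) -> Self {
        Self {
            request,
            enqueued_at: Instant::now(),
        }
    }

    /// Returns the wrapped request and how long it waited before handling.
    fn into_parts(self) -> (R, Duration) {
        (self.request, self.enqueued_at.elapsed())
    }
}

fn main() {
    let wrapped = WorkerRequestWithTime::new("flush region 42");
    let (request, waited) = wrapped.into_parts();
    println!("handling {request:?} after waiting {waited:?}");
}
```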

View File

@@ -30,6 +30,8 @@ use crate::sst::DEFAULT_WRITE_BUFFER_SIZE;
const MULTIPART_UPLOAD_MINIMUM_SIZE: ReadableSize = ReadableSize::mb(5);
/// Default channel size for parallel scan task.
pub(crate) const DEFAULT_SCAN_CHANNEL_SIZE: usize = 32;
/// Default maximum number of SST files to scan concurrently.
pub(crate) const DEFAULT_MAX_CONCURRENT_SCAN_FILES: usize = 128;
// Use `1/GLOBAL_WRITE_BUFFER_SIZE_FACTOR` of OS memory as global write buffer size in default mode
const GLOBAL_WRITE_BUFFER_SIZE_FACTOR: u64 = 8;
@@ -107,6 +109,8 @@ pub struct MitoConfig {
pub sst_write_buffer_size: ReadableSize,
/// Capacity of the channel to send data from parallel scan tasks to the main task (default 32).
pub parallel_scan_channel_size: usize,
/// Maximum number of SST files to scan concurrently (default 128).
pub max_concurrent_scan_files: usize,
/// Whether to allow stale entries read during replay.
pub allow_stale_entries: bool,
@@ -152,6 +156,7 @@ impl Default for MitoConfig {
write_cache_ttl: None,
sst_write_buffer_size: DEFAULT_WRITE_BUFFER_SIZE,
parallel_scan_channel_size: DEFAULT_SCAN_CHANNEL_SIZE,
max_concurrent_scan_files: DEFAULT_MAX_CONCURRENT_SCAN_FILES,
allow_stale_entries: false,
index: IndexConfig::default(),
inverted_index: InvertedIndexConfig::default(),

View File

@@ -506,6 +506,7 @@ impl EngineInner {
CacheStrategy::EnableAll(cache_manager),
)
.with_parallel_scan_channel_size(self.config.parallel_scan_channel_size)
.with_max_concurrent_scan_files(self.config.max_concurrent_scan_files)
.with_ignore_inverted_index(self.config.inverted_index.apply_on_query.disabled())
.with_ignore_fulltext_index(self.config.fulltext_index.apply_on_query.disabled())
.with_ignore_bloom_filter(self.config.bloom_filter_index.apply_on_query.disabled())

View File

@@ -13,6 +13,8 @@
// limitations under the License.
use api::v1::Rows;
use common_error::ext::ErrorExt;
use common_error::status_code::StatusCode;
use common_recordbatch::RecordBatches;
use datafusion::physical_plan::metrics::ExecutionPlanMetricsSet;
use futures::TryStreamExt;
@@ -151,6 +153,58 @@ async fn test_scan_with_min_sst_sequence() {
.await;
}
#[tokio::test]
async fn test_max_concurrent_scan_files() {
let mut env = TestEnv::with_prefix("test_max_concurrent_scan_files").await;
let config = MitoConfig {
max_concurrent_scan_files: 2,
..Default::default()
};
let engine = env.create_engine(config).await;
let region_id = RegionId::new(1, 1);
let request = CreateRequestBuilder::new().build();
let column_schemas = test_util::rows_schema(&request);
engine
.handle_request(region_id, RegionRequest::Create(request))
.await
.unwrap();
let put_and_flush = async |start, end| {
let rows = Rows {
schema: column_schemas.clone(),
rows: test_util::build_rows(start, end),
};
test_util::put_rows(&engine, region_id, rows).await;
test_util::flush_region(&engine, region_id, None).await;
};
// Write overlapping files.
put_and_flush(0, 4).await;
put_and_flush(3, 7).await;
put_and_flush(6, 9).await;
let request = ScanRequest::default();
let scanner = engine.scanner(region_id, request).await.unwrap();
let Scanner::Seq(scanner) = scanner else {
panic!("Scanner should be seq scan");
};
let error = scanner.check_scan_limit().unwrap_err();
assert_eq!(StatusCode::RateLimited, error.status_code());
let request = ScanRequest {
distribution: Some(TimeSeriesDistribution::PerSeries),
..Default::default()
};
let scanner = engine.scanner(region_id, request).await.unwrap();
let Scanner::Series(scanner) = scanner else {
panic!("Scanner should be series scan");
};
let error = scanner.check_scan_limit().unwrap_err();
assert_eq!(StatusCode::RateLimited, error.status_code());
}
#[tokio::test]
async fn test_series_scan() {
let mut env = TestEnv::with_prefix("test_series_scan").await;

View File

@@ -1032,6 +1032,18 @@ pub enum Error {
#[snafu(implicit)]
location: Location,
},
#[snafu(display(
"Too many files to read concurrently: {}, max allowed: {}",
actual,
max
))]
TooManyFilesToRead {
actual: usize,
max: usize,
#[snafu(implicit)]
location: Location,
},
}
pub type Result<T, E = Error> = std::result::Result<T, E>;
@@ -1189,6 +1201,8 @@ impl ErrorExt for Error {
Encode { source, .. } | Decode { source, .. } => source.status_code(),
InconsistentTimestampLength { .. } => StatusCode::InvalidArguments,
TooManyFilesToRead { .. } => StatusCode::RateLimited,
}
}
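The new variant maps to `StatusCode::RateLimited`, which the engine test shown earlier asserts. The check that raises it is not part of these hunks; a simplified, illustrative stand-in (plain `String` error instead of the crate's snafu type, names assumed):

```rust
// Illustrative only: a simplified stand-in for the scanner's limit check that
// would surface the TooManyFilesToRead error above. The real check lives in
// the mito2 scanner and uses the crate's snafu-based error type.
fn check_scan_limit(num_files: usize, max_concurrent_scan_files: usize) -> Result<(), String> {
    if num_files > max_concurrent_scan_files {
        return Err(format!(
            "Too many files to read concurrently: {}, max allowed: {}",
            num_files, max_concurrent_scan_files
        ));
    }
    Ok(())
}

fn main() {
    // Mirrors the earlier test: three overlapping SST files with a limit of 2.
    assert!(check_scan_limit(3, 2).is_err());
    assert!(check_scan_limit(2, 2).is_ok());
}
```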

View File

@@ -42,7 +42,7 @@ use crate::region::version::{VersionControlData, VersionControlRef};
use crate::region::{ManifestContextRef, RegionLeaderState};
use crate::request::{
BackgroundNotify, FlushFailed, FlushFinished, OptionOutputTx, OutputTx, SenderBulkRequest,
SenderDdlRequest, SenderWriteRequest, WorkerRequest,
SenderDdlRequest, SenderWriteRequest, WorkerRequest, WorkerRequestWithTime,
};
use crate::schedule::scheduler::{Job, SchedulerRef};
use crate::sst::file::FileMeta;
@@ -223,7 +223,7 @@ pub(crate) struct RegionFlushTask {
/// Flush result senders.
pub(crate) senders: Vec<OutputTx>,
/// Request sender to notify the worker.
pub(crate) request_sender: mpsc::Sender<WorkerRequest>,
pub(crate) request_sender: mpsc::Sender<WorkerRequestWithTime>,
pub(crate) access_layer: AccessLayerRef,
pub(crate) listener: WorkerListener,
@@ -441,7 +441,11 @@ impl RegionFlushTask {
/// Notify flush job status.
async fn send_worker_request(&self, request: WorkerRequest) {
if let Err(e) = self.request_sender.send(request).await {
if let Err(e) = self
.request_sender
.send(WorkerRequestWithTime::new(request))
.await
{
error!(
"Failed to notify flush job status for region {}, request: {:?}",
self.region_id, e.0

View File

@@ -126,7 +126,12 @@ impl From<&BulkPart> for BulkWalEntry {
impl BulkPart {
pub(crate) fn estimated_size(&self) -> usize {
self.batch.get_array_memory_size()
self.batch
.columns()
.iter()
// If can not get slice memory size, assume 0 here.
.map(|c| c.to_data().get_slice_memory_size().unwrap_or(0))
.sum()
}
/// Converts [BulkPart] to [Mutation] for fallback `write_bulk` implementation.

View File

@@ -94,12 +94,7 @@ lazy_static! {
// ------ Write related metrics
/// Number of stalled write requests in each worker.
pub static ref WRITE_STALL_TOTAL: IntGaugeVec = register_int_gauge_vec!(
"greptime_mito_write_stall_total",
"mito stalled write request in each worker",
&[WORKER_LABEL]
).unwrap();
//
/// Counter of rejected write requests.
pub static ref WRITE_REJECT_TOTAL: IntCounter =
register_int_counter!("greptime_mito_write_reject_total", "mito write reject total").unwrap();
@@ -402,6 +397,7 @@ lazy_static! {
}
// Use another block to avoid reaching the recursion limit.
lazy_static! {
/// Counter for compaction input file size.
pub static ref COMPACTION_INPUT_BYTES: Counter = register_counter!(
@@ -426,6 +422,27 @@ lazy_static! {
"greptime_mito_memtable_field_builder_count",
"active field builder count in TimeSeriesMemtable",
).unwrap();
/// Number of stalling write requests in each worker.
pub static ref WRITE_STALLING: IntGaugeVec = register_int_gauge_vec!(
"greptime_mito_write_stalling_count",
"mito stalled write request in each worker",
&[WORKER_LABEL]
).unwrap();
/// Total number of stalled write requests.
pub static ref WRITE_STALL_TOTAL: IntCounter = register_int_counter!(
"greptime_mito_write_stall_total",
"Total number of stalled write requests"
).unwrap();
/// Time waiting for requests to be handled by the region worker.
pub static ref REQUEST_WAIT_TIME: HistogramVec = register_histogram_vec!(
"greptime_mito_request_wait_time",
"mito request wait time before being handled by region worker",
&[WORKER_LABEL],
// 0.001 ~ 10000
exponential_buckets(0.001, 10.0, 8).unwrap(),
)
.unwrap();
}
/// Stager notifier to collect metrics.

View File

@@ -39,7 +39,7 @@ use tokio_stream::wrappers::ReceiverStream;
use crate::access_layer::AccessLayerRef;
use crate::cache::CacheStrategy;
use crate::config::DEFAULT_SCAN_CHANNEL_SIZE;
use crate::config::{DEFAULT_MAX_CONCURRENT_SCAN_FILES, DEFAULT_SCAN_CHANNEL_SIZE};
use crate::error::Result;
use crate::memtable::MemtableRange;
use crate::metrics::READ_SST_COUNT;
@@ -187,6 +187,8 @@ pub(crate) struct ScanRegion {
cache_strategy: CacheStrategy,
/// Capacity of the channel to send data from parallel scan tasks to the main task.
parallel_scan_channel_size: usize,
/// Maximum number of SST files to scan concurrently.
max_concurrent_scan_files: usize,
/// Whether to ignore inverted index.
ignore_inverted_index: bool,
/// Whether to ignore fulltext index.
@@ -214,6 +216,7 @@ impl ScanRegion {
request,
cache_strategy,
parallel_scan_channel_size: DEFAULT_SCAN_CHANNEL_SIZE,
max_concurrent_scan_files: DEFAULT_MAX_CONCURRENT_SCAN_FILES,
ignore_inverted_index: false,
ignore_fulltext_index: false,
ignore_bloom_filter: false,
@@ -232,6 +235,16 @@ impl ScanRegion {
self
}
/// Sets maximum number of SST files to scan concurrently.
#[must_use]
pub(crate) fn with_max_concurrent_scan_files(
mut self,
max_concurrent_scan_files: usize,
) -> Self {
self.max_concurrent_scan_files = max_concurrent_scan_files;
self
}
/// Sets whether to ignore inverted index.
#[must_use]
pub(crate) fn with_ignore_inverted_index(mut self, ignore: bool) -> Self {
@@ -421,6 +434,7 @@ impl ScanRegion {
.with_bloom_filter_index_applier(bloom_filter_applier)
.with_fulltext_index_applier(fulltext_index_applier)
.with_parallel_scan_channel_size(self.parallel_scan_channel_size)
.with_max_concurrent_scan_files(self.max_concurrent_scan_files)
.with_start_time(self.start_time)
.with_append_mode(self.version.options.append_mode)
.with_filter_deleted(self.filter_deleted)
@@ -597,6 +611,8 @@ pub struct ScanInput {
ignore_file_not_found: bool,
/// Capacity of the channel to send data from parallel scan tasks to the main task.
pub(crate) parallel_scan_channel_size: usize,
/// Maximum number of SST files to scan concurrently.
pub(crate) max_concurrent_scan_files: usize,
/// Index appliers.
inverted_index_applier: Option<InvertedIndexApplierRef>,
bloom_filter_index_applier: Option<BloomFilterIndexApplierRef>,
@@ -629,6 +645,7 @@ impl ScanInput {
cache_strategy: CacheStrategy::Disabled,
ignore_file_not_found: false,
parallel_scan_channel_size: DEFAULT_SCAN_CHANNEL_SIZE,
max_concurrent_scan_files: DEFAULT_MAX_CONCURRENT_SCAN_FILES,
inverted_index_applier: None,
bloom_filter_index_applier: None,
fulltext_index_applier: None,
@@ -693,6 +710,16 @@ impl ScanInput {
self
}
/// Sets maximum number of SST files to scan concurrently.
#[must_use]
pub(crate) fn with_max_concurrent_scan_files(
mut self,
max_concurrent_scan_files: usize,
) -> Self {
self.max_concurrent_scan_files = max_concurrent_scan_files;
self
}
/// Sets the inverted index applier.
#[must_use]
pub(crate) fn with_inverted_index_applier(

View File

@@ -33,11 +33,11 @@ use store_api::region_engine::{PartitionRange, PrepareRequest, RegionScanner, Sc
use store_api::storage::TimeSeriesRowSelector;
use tokio::sync::Semaphore;
use crate::error::{PartitionOutOfRangeSnafu, Result};
use crate::error::{PartitionOutOfRangeSnafu, Result, TooManyFilesToReadSnafu};
use crate::read::dedup::{DedupReader, LastNonNull, LastRow};
use crate::read::last_row::LastRowReader;
use crate::read::merge::MergeReaderBuilder;
use crate::read::range::RangeBuilderList;
use crate::read::range::{RangeBuilderList, RangeMeta};
use crate::read::scan_region::{ScanInput, StreamContext};
use crate::read::scan_util::{
scan_file_ranges, scan_mem_ranges, PartitionMetrics, PartitionMetricsList,
@@ -347,6 +347,40 @@ impl SeqScan {
metrics
}
/// Finds the maximum number of files to read in a single partition range.
fn max_files_in_partition(ranges: &[RangeMeta], partition_ranges: &[PartitionRange]) -> usize {
partition_ranges
.iter()
.map(|part_range| {
let range_meta = &ranges[part_range.identifier];
range_meta.indices.len()
})
.max()
.unwrap_or(0)
}
/// Checks resource limit for the scanner.
pub(crate) fn check_scan_limit(&self) -> Result<()> {
// Check max file count limit for all partitions since we scan them in parallel.
let total_max_files: usize = self
.properties
.partitions
.iter()
.map(|partition| Self::max_files_in_partition(&self.stream_ctx.ranges, partition))
.sum();
let max_concurrent_files = self.stream_ctx.input.max_concurrent_scan_files;
if total_max_files > max_concurrent_files {
return TooManyFilesToReadSnafu {
actual: total_max_files,
max: max_concurrent_files,
}
.fail();
}
Ok(())
}
}
impl RegionScanner for SeqScan {
@@ -372,6 +406,9 @@ impl RegionScanner for SeqScan {
fn prepare(&mut self, request: PrepareRequest) -> Result<(), BoxedError> {
self.properties.prepare(request);
self.check_scan_limit().map_err(BoxedError::new)?;
Ok(())
}
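For `SeqScan`, the check above takes, for each partition, the maximum number of files any single partition range in that partition reads, sums those maxima across partitions (they are scanned in parallel), and compares the sum with `max_concurrent_scan_files`. A rough, self-contained sketch of that arithmetic using plain vectors; the names and numbers here are hypothetical stand-ins, not the engine's types:

/// Hypothetical stand-in: each inner Vec lists the file ids one partition range reads.
fn max_files_in_partition(part_ranges: &[Vec<u64>]) -> usize {
    part_ranges.iter().map(|files| files.len()).max().unwrap_or(0)
}

/// Sums the per-partition maxima and compares the sum against the configured limit.
fn check_scan_limit(partitions: &[Vec<Vec<u64>>], max_concurrent_scan_files: usize) -> Result<(), String> {
    let total: usize = partitions.iter().map(|p| max_files_in_partition(p)).sum();
    if total > max_concurrent_scan_files {
        return Err(format!(
            "Too many files to read concurrently: {total}, max allowed: {max_concurrent_scan_files}"
        ));
    }
    Ok(())
}

fn main() {
    let partitions = vec![
        vec![vec![1, 2, 3], vec![4]],        // at most 3 files in a single range
        vec![vec![5, 6], vec![7, 8, 9, 10]], // at most 4 files in a single range
    ];
    assert!(check_scan_limit(&partitions, 8).is_ok()); // 3 + 4 = 7 <= 8
    assert!(check_scan_limit(&partitions, 6).is_err()); // 7 > 6
}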

View File

@@ -37,7 +37,7 @@ use tokio::sync::Semaphore;
use crate::error::{
ComputeArrowSnafu, Error, InvalidSenderSnafu, PartitionOutOfRangeSnafu, Result,
ScanMultiTimesSnafu, ScanSeriesSnafu,
ScanMultiTimesSnafu, ScanSeriesSnafu, TooManyFilesToReadSnafu,
};
use crate::read::range::RangeBuilderList;
use crate::read::scan_region::{ScanInput, StreamContext};
@@ -201,6 +201,32 @@ impl SeriesScan {
let chained_stream = ChainedRecordBatchStream::new(streams).map_err(BoxedError::new)?;
Ok(Box::pin(chained_stream))
}
/// Checks resource limit for the scanner.
pub(crate) fn check_scan_limit(&self) -> Result<()> {
// Sum the total number of files across all partitions
let total_files: usize = self
.properties
.partitions
.iter()
.flat_map(|partition| partition.iter())
.map(|part_range| {
let range_meta = &self.stream_ctx.ranges[part_range.identifier];
range_meta.indices.len()
})
.sum();
let max_concurrent_files = self.stream_ctx.input.max_concurrent_scan_files;
if total_files > max_concurrent_files {
return TooManyFilesToReadSnafu {
actual: total_files,
max: max_concurrent_files,
}
.fail();
}
Ok(())
}
}
fn new_channel_list(num_partitions: usize) -> (SenderList, ReceiverList) {
@@ -236,6 +262,9 @@ impl RegionScanner for SeriesScan {
fn prepare(&mut self, request: PrepareRequest) -> Result<(), BoxedError> {
self.properties.prepare(request);
self.check_scan_limit().map_err(BoxedError::new)?;
Ok(())
}

View File

@@ -242,6 +242,7 @@ impl RegionScanner for UnorderedScan {
fn prepare(&mut self, request: PrepareRequest) -> Result<(), BoxedError> {
self.properties.prepare(request);
// UnorderedScan only scans one row group per partition, so the resource requirement won't be too high.
Ok(())
}

View File

@@ -542,6 +542,22 @@ pub(crate) struct SenderBulkRequest {
pub(crate) region_metadata: RegionMetadataRef,
}
/// Request sent to a worker, tagged with its creation time
#[derive(Debug)]
pub(crate) struct WorkerRequestWithTime {
pub(crate) request: WorkerRequest,
pub(crate) created_at: Instant,
}
impl WorkerRequestWithTime {
pub(crate) fn new(request: WorkerRequest) -> Self {
Self {
request,
created_at: Instant::now(),
}
}
}
/// Request sent to a worker
#[derive(Debug)]
pub(crate) enum WorkerRequest {
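The wrapper above enables measuring how long a request waits in the worker channel before being handled. A minimal sketch of the same pattern, assuming a tokio mpsc channel and a plain string payload instead of the engine's real request types:

use std::time::Instant;
use tokio::sync::mpsc;

#[derive(Debug)]
struct WithTime<T> {
    request: T,
    created_at: Instant,
}

impl<T> WithTime<T> {
    fn new(request: T) -> Self {
        Self { request, created_at: Instant::now() }
    }
}

#[tokio::main]
async fn main() {
    let (tx, mut rx) = mpsc::channel(8);
    // Stamp the request when it is enqueued.
    tx.send(WithTime::new("flush region 42".to_string()))
        .await
        .unwrap();
    if let Some(msg) = rx.recv().await {
        // This is the point where a histogram such as REQUEST_WAIT_TIME would observe the wait.
        let wait = msg.created_at.elapsed();
        println!("handling {:?} after waiting {:?}", msg.request, wait);
    }
}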

View File

@@ -30,6 +30,7 @@ use crate::manifest::action::RegionEdit;
use crate::metrics::{COMPACTION_FAILURE_COUNT, INFLIGHT_COMPACTION_COUNT};
use crate::request::{
BackgroundNotify, CompactionFailed, CompactionFinished, OutputTx, WorkerRequest,
WorkerRequestWithTime,
};
pub type RemoteJobSchedulerRef = Arc<dyn RemoteJobScheduler>;
@@ -130,7 +131,7 @@ pub struct CompactionJobResult {
/// DefaultNotifier is a default implementation of Notifier that sends WorkerRequest to the mito engine.
pub(crate) struct DefaultNotifier {
/// The sender to send WorkerRequest to the mito engine. This is used to notify the mito engine when a remote job is completed.
pub(crate) request_sender: Sender<WorkerRequest>,
pub(crate) request_sender: Sender<WorkerRequestWithTime>,
}
impl DefaultNotifier {
@@ -173,10 +174,10 @@ impl Notifier for DefaultNotifier {
if let Err(e) = self
.request_sender
.send(WorkerRequest::Background {
.send(WorkerRequestWithTime::new(WorkerRequest::Background {
region_id: result.region_id,
notify,
})
}))
.await
{
error!(

View File

@@ -294,7 +294,7 @@ impl RowGroupSelection {
let Some(y) = self.selection_in_rg.get(rg_id) else {
continue;
};
let selection = x.selection.intersection(&y.selection);
let selection = intersect_row_selections(&x.selection, &y.selection);
let row_count = selection.row_count();
let selector_len = selector_len(&selection);
if row_count > 0 {
@@ -423,6 +423,68 @@ impl RowGroupSelection {
}
}
/// Ported from `parquet`, but trailing rows are removed.
///
/// Combines two `RowSelection`s and returns their intersection.
/// For example:
/// left:  NNYYYYNNYYNYN
/// right: NYNNNNNNY
///
/// returned: NNNNNNNNY (modified: trailing rows removed)
/// NNNNNNNNYYNYN (original `parquet` behavior)
fn intersect_row_selections(left: &RowSelection, right: &RowSelection) -> RowSelection {
let mut l_iter = left.iter().copied().peekable();
let mut r_iter = right.iter().copied().peekable();
let iter = std::iter::from_fn(move || {
loop {
let l = l_iter.peek_mut();
let r = r_iter.peek_mut();
match (l, r) {
(Some(a), _) if a.row_count == 0 => {
l_iter.next().unwrap();
}
(_, Some(b)) if b.row_count == 0 => {
r_iter.next().unwrap();
}
(Some(l), Some(r)) => {
return match (l.skip, r.skip) {
// Keep both ranges
(false, false) => {
if l.row_count < r.row_count {
r.row_count -= l.row_count;
l_iter.next()
} else {
l.row_count -= r.row_count;
r_iter.next()
}
}
// skip at least one
_ => {
if l.row_count < r.row_count {
let skip = l.row_count;
r.row_count -= l.row_count;
l_iter.next();
Some(RowSelector::skip(skip))
} else {
let skip = r.row_count;
l.row_count -= skip;
r_iter.next();
Some(RowSelector::skip(skip))
}
}
};
}
(None, _) => return None,
(_, None) => return None,
}
}
});
iter.collect()
}
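A usage sketch for the helper above, written as a would-be unit test; the function is crate-private, so this only makes sense inside the same module. `RowSelection` and `RowSelector` come from the `parquet` crate, and the expected result mirrors the doc-comment example: only row 8 is selected on both sides, and rows beyond the shorter input are dropped.

use parquet::arrow::arrow_reader::{RowSelection, RowSelector};

#[test]
fn intersect_drops_trailing_rows() {
    // Assumes this test lives in the same module as the crate-private
    // `intersect_row_selections` above.
    // left:  NNYYYYNNYYNYN (13 rows)
    let left = RowSelection::from(vec![
        RowSelector::skip(2),
        RowSelector::select(4),
        RowSelector::skip(2),
        RowSelector::select(2),
        RowSelector::skip(1),
        RowSelector::select(1),
        RowSelector::skip(1),
    ]);
    // right: NYNNNNNNY (9 rows)
    let right = RowSelection::from(vec![
        RowSelector::skip(1),
        RowSelector::select(1),
        RowSelector::skip(6),
        RowSelector::select(1),
    ]);
    // Only row 8 is selected on both sides; the 4 trailing rows of `left` are dropped.
    let expected = RowSelection::from(vec![RowSelector::skip(8), RowSelector::select(1)]);
    assert_eq!(intersect_row_selections(&left, &right), expected);
}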
/// Converts an iterator of row ranges into a `RowSelection` by creating a sequence of `RowSelector`s.
///
/// This function processes each range in the input and either creates a new selector or merges
@@ -448,10 +510,6 @@ pub(crate) fn row_selection_from_row_ranges(
last_processed_end = end;
}
if last_processed_end < total_row_count {
add_or_merge_selector(&mut selectors, total_row_count - last_processed_end, true);
}
RowSelection::from(selectors)
}
@@ -546,7 +604,6 @@ mod tests {
RowSelector::select(2),
RowSelector::skip(2),
RowSelector::select(3),
RowSelector::skip(2),
]);
assert_eq!(selection, expected);
}
@@ -555,7 +612,7 @@ mod tests {
fn test_empty_range() {
let ranges = [];
let selection = row_selection_from_row_ranges(ranges.iter().cloned(), 10);
let expected = RowSelection::from(vec![RowSelector::skip(10)]);
let expected = RowSelection::from(vec![]);
assert_eq!(selection, expected);
}
@@ -563,11 +620,7 @@ mod tests {
fn test_adjacent_ranges() {
let ranges = [1..2, 2..3];
let selection = row_selection_from_row_ranges(ranges.iter().cloned(), 10);
let expected = RowSelection::from(vec![
RowSelector::skip(1),
RowSelector::select(2),
RowSelector::skip(7),
]);
let expected = RowSelection::from(vec![RowSelector::skip(1), RowSelector::select(2)]);
assert_eq!(selection, expected);
}
@@ -580,7 +633,6 @@ mod tests {
RowSelector::select(1),
RowSelector::skip(98),
RowSelector::select(1),
RowSelector::skip(10139),
]);
assert_eq!(selection, expected);
}

View File

@@ -32,7 +32,7 @@ use crate::error::Result;
use crate::flush::FlushScheduler;
use crate::manifest::manager::{RegionManifestManager, RegionManifestOptions};
use crate::region::{ManifestContext, ManifestContextRef, RegionLeaderState, RegionRoleState};
use crate::request::WorkerRequest;
use crate::request::{WorkerRequest, WorkerRequestWithTime};
use crate::schedule::scheduler::{Job, LocalScheduler, Scheduler, SchedulerRef};
use crate::sst::index::intermediate::IntermediateManager;
use crate::sst::index::puffin_manager::PuffinManagerFactory;
@@ -85,7 +85,7 @@ impl SchedulerEnv {
/// Creates a new compaction scheduler.
pub(crate) fn mock_compaction_scheduler(
&self,
request_sender: Sender<WorkerRequest>,
request_sender: Sender<WorkerRequestWithTime>,
) -> CompactionScheduler {
let scheduler = self.get_scheduler();

View File

@@ -39,7 +39,7 @@ use common_runtime::JoinHandle;
use common_telemetry::{error, info, warn};
use futures::future::try_join_all;
use object_store::manager::ObjectStoreManagerRef;
use prometheus::IntGauge;
use prometheus::{Histogram, IntGauge};
use rand::{rng, Rng};
use snafu::{ensure, ResultExt};
use store_api::logstore::LogStore;
@@ -58,11 +58,11 @@ use crate::error;
use crate::error::{CreateDirSnafu, JoinSnafu, Result, WorkerStoppedSnafu};
use crate::flush::{FlushScheduler, WriteBufferManagerImpl, WriteBufferManagerRef};
use crate::memtable::MemtableBuilderProvider;
use crate::metrics::{REGION_COUNT, WRITE_STALL_TOTAL};
use crate::metrics::{REGION_COUNT, REQUEST_WAIT_TIME, WRITE_STALLING};
use crate::region::{MitoRegionRef, OpeningRegions, OpeningRegionsRef, RegionMap, RegionMapRef};
use crate::request::{
BackgroundNotify, DdlRequest, SenderBulkRequest, SenderDdlRequest, SenderWriteRequest,
WorkerRequest,
WorkerRequest, WorkerRequestWithTime,
};
use crate::schedule::scheduler::{LocalScheduler, SchedulerRef};
use crate::sst::file::FileId;
@@ -469,8 +469,9 @@ impl<S: LogStore> WorkerStarter<S> {
last_periodical_check_millis: now,
flush_sender: self.flush_sender,
flush_receiver: self.flush_receiver,
stalled_count: WRITE_STALL_TOTAL.with_label_values(&[&id_string]),
stalling_count: WRITE_STALLING.with_label_values(&[&id_string]),
region_count: REGION_COUNT.with_label_values(&[&id_string]),
request_wait_time: REQUEST_WAIT_TIME.with_label_values(&[&id_string]),
region_edit_queues: RegionEditQueues::default(),
schema_metadata_manager: self.schema_metadata_manager,
};
@@ -498,7 +499,7 @@ pub(crate) struct RegionWorker {
/// The opening regions.
opening_regions: OpeningRegionsRef,
/// Request sender.
sender: Sender<WorkerRequest>,
sender: Sender<WorkerRequestWithTime>,
/// Handle to the worker thread.
handle: Mutex<Option<JoinHandle<()>>>,
/// Whether to run the worker thread.
@@ -509,7 +510,8 @@ impl RegionWorker {
/// Submits request to background worker thread.
async fn submit_request(&self, request: WorkerRequest) -> Result<()> {
ensure!(self.is_running(), WorkerStoppedSnafu { id: self.id });
if self.sender.send(request).await.is_err() {
let request_with_time = WorkerRequestWithTime::new(request);
if self.sender.send(request_with_time).await.is_err() {
warn!(
"Worker {} is already exited but the running flag is still true",
self.id
@@ -531,7 +533,12 @@ impl RegionWorker {
info!("Stop region worker {}", self.id);
self.set_running(false);
if self.sender.send(WorkerRequest::Stop).await.is_err() {
if self
.sender
.send(WorkerRequestWithTime::new(WorkerRequest::Stop))
.await
.is_err()
{
warn!("Worker {} is already exited before stop", self.id);
}
@@ -669,9 +676,9 @@ struct RegionWorkerLoop<S> {
/// Regions that are opening.
opening_regions: OpeningRegionsRef,
/// Request sender.
sender: Sender<WorkerRequest>,
sender: Sender<WorkerRequestWithTime>,
/// Request receiver.
receiver: Receiver<WorkerRequest>,
receiver: Receiver<WorkerRequestWithTime>,
/// WAL of the engine.
wal: Wal<S>,
/// Manages object stores for manifest and SSTs.
@@ -706,10 +713,12 @@ struct RegionWorkerLoop<S> {
flush_sender: watch::Sender<()>,
/// Watch channel receiver to wait for background flush job.
flush_receiver: watch::Receiver<()>,
/// Gauge of stalled request count.
stalled_count: IntGauge,
/// Gauge of stalling request count.
stalling_count: IntGauge,
/// Gauge of regions in the worker.
region_count: IntGauge,
/// Histogram of request wait time for this worker.
request_wait_time: Histogram,
/// Queues for region edit requests.
region_edit_queues: RegionEditQueues,
/// Database level metadata manager.
@@ -749,10 +758,16 @@ impl<S: LogStore> RegionWorkerLoop<S> {
tokio::select! {
request_opt = self.receiver.recv() => {
match request_opt {
Some(request) => match request {
WorkerRequest::Write(sender_req) => write_req_buffer.push(sender_req),
WorkerRequest::Ddl(sender_req) => ddl_req_buffer.push(sender_req),
_ => general_req_buffer.push(request),
Some(request_with_time) => {
// Observe the wait time
let wait_time = request_with_time.created_at.elapsed();
self.request_wait_time.observe(wait_time.as_secs_f64());
match request_with_time.request {
WorkerRequest::Write(sender_req) => write_req_buffer.push(sender_req),
WorkerRequest::Ddl(sender_req) => ddl_req_buffer.push(sender_req),
req => general_req_buffer.push(req),
}
},
// The channel is disconnected.
None => break,
@@ -791,11 +806,17 @@ impl<S: LogStore> RegionWorkerLoop<S> {
for _ in 1..self.config.worker_request_batch_size {
// We have received one request so we start from 1.
match self.receiver.try_recv() {
Ok(req) => match req {
WorkerRequest::Write(sender_req) => write_req_buffer.push(sender_req),
WorkerRequest::Ddl(sender_req) => ddl_req_buffer.push(sender_req),
_ => general_req_buffer.push(req),
},
Ok(request_with_time) => {
// Observe the wait time
let wait_time = request_with_time.created_at.elapsed();
self.request_wait_time.observe(wait_time.as_secs_f64());
match request_with_time.request {
WorkerRequest::Write(sender_req) => write_req_buffer.push(sender_req),
WorkerRequest::Ddl(sender_req) => ddl_req_buffer.push(sender_req),
req => general_req_buffer.push(req),
}
}
// We still need to handle remaining requests.
Err(_) => break,
}

View File

@@ -34,7 +34,7 @@ use crate::region::version::VersionBuilder;
use crate::region::{MitoRegionRef, RegionLeaderState, RegionRoleState};
use crate::request::{
BackgroundNotify, OptionOutputTx, RegionChangeResult, RegionEditRequest, RegionEditResult,
RegionSyncRequest, TruncateResult, WorkerRequest,
RegionSyncRequest, TruncateResult, WorkerRequest, WorkerRequestWithTime,
};
use crate::sst::location;
use crate::worker::{RegionWorkerLoop, WorkerListener};
@@ -230,7 +230,10 @@ impl<S> RegionWorkerLoop<S> {
}),
};
// We don't set state back as the worker loop is already exited.
if let Err(res) = request_sender.send(notify).await {
if let Err(res) = request_sender
.send(WorkerRequestWithTime::new(notify))
.await
{
warn!(
"Failed to send region edit result back to the worker, region_id: {}, res: {:?}",
region_id, res
@@ -318,10 +321,10 @@ impl<S> RegionWorkerLoop<S> {
truncated_sequence: truncate.truncated_sequence,
};
let _ = request_sender
.send(WorkerRequest::Background {
.send(WorkerRequestWithTime::new(WorkerRequest::Background {
region_id: truncate.region_id,
notify: BackgroundNotify::Truncate(truncate_result),
})
}))
.await
.inspect_err(|_| warn!("failed to send truncate result"));
});
@@ -364,7 +367,10 @@ impl<S> RegionWorkerLoop<S> {
.on_notify_region_change_result_begin(region.region_id)
.await;
if let Err(res) = request_sender.send(notify).await {
if let Err(res) = request_sender
.send(WorkerRequestWithTime::new(notify))
.await
{
warn!(
"Failed to send region change result back to the worker, region_id: {}, res: {:?}",
region.region_id, res

View File

@@ -27,7 +27,9 @@ use store_api::storage::RegionId;
use crate::error::{InvalidRequestSnafu, RegionStateSnafu, RejectWriteSnafu, Result};
use crate::metrics;
use crate::metrics::{WRITE_REJECT_TOTAL, WRITE_ROWS_TOTAL, WRITE_STAGE_ELAPSED};
use crate::metrics::{
WRITE_REJECT_TOTAL, WRITE_ROWS_TOTAL, WRITE_STAGE_ELAPSED, WRITE_STALL_TOTAL,
};
use crate::region::{RegionLeaderState, RegionRoleState};
use crate::region_write_ctx::RegionWriteCtx;
use crate::request::{SenderBulkRequest, SenderWriteRequest, WriteRequest};
@@ -57,8 +59,9 @@ impl<S: LogStore> RegionWorkerLoop<S> {
}
if self.write_buffer_manager.should_stall() && allow_stall {
self.stalled_count
.add((write_requests.len() + bulk_requests.len()) as i64);
let stalled_count = (write_requests.len() + bulk_requests.len()) as i64;
self.stalling_count.add(stalled_count);
WRITE_STALL_TOTAL.inc_by(stalled_count as u64);
self.stalled_requests.append(write_requests, bulk_requests);
self.listener.on_write_stall();
return;
@@ -161,7 +164,7 @@ impl<S: LogStore> RegionWorkerLoop<S> {
pub(crate) async fn handle_stalled_requests(&mut self) {
// Handle stalled requests.
let stalled = std::mem::take(&mut self.stalled_requests);
self.stalled_count.sub(stalled.stalled_count() as i64);
self.stalling_count.sub(stalled.stalled_count() as i64);
// We already stalled these requests, don't stall them again.
for (_, (_, mut requests, mut bulk)) in stalled.requests {
self.handle_write_requests(&mut requests, &mut bulk, false)
@@ -172,7 +175,7 @@ impl<S: LogStore> RegionWorkerLoop<S> {
/// Rejects all stalled requests.
pub(crate) fn reject_stalled_requests(&mut self) {
let stalled = std::mem::take(&mut self.stalled_requests);
self.stalled_count.sub(stalled.stalled_count() as i64);
self.stalling_count.sub(stalled.stalled_count() as i64);
for (_, (_, mut requests, mut bulk)) in stalled.requests {
reject_write_requests(&mut requests, &mut bulk);
}
@@ -182,7 +185,8 @@ impl<S: LogStore> RegionWorkerLoop<S> {
pub(crate) fn reject_region_stalled_requests(&mut self, region_id: &RegionId) {
debug!("Rejects stalled requests for region {}", region_id);
let (mut requests, mut bulk) = self.stalled_requests.remove(region_id);
self.stalled_count.sub((requests.len() + bulk.len()) as i64);
self.stalling_count
.sub((requests.len() + bulk.len()) as i64);
reject_write_requests(&mut requests, &mut bulk);
}
@@ -190,7 +194,8 @@ impl<S: LogStore> RegionWorkerLoop<S> {
pub(crate) async fn handle_region_stalled_requests(&mut self, region_id: &RegionId) {
debug!("Handles stalled requests for region {}", region_id);
let (mut requests, mut bulk) = self.stalled_requests.remove(region_id);
self.stalled_count.sub((requests.len() + bulk.len()) as i64);
self.stalling_count
.sub((requests.len() + bulk.len()) as i64);
self.handle_write_requests(&mut requests, &mut bulk, true)
.await;
}
@@ -251,7 +256,8 @@ impl<S> RegionWorkerLoop<S> {
"Region {} is altering, add request to pending writes",
region.region_id
);
self.stalled_count.add(1);
self.stalling_count.add(1);
WRITE_STALL_TOTAL.inc();
self.stalled_requests.push(sender_req);
continue;
}
@@ -353,7 +359,8 @@ impl<S> RegionWorkerLoop<S> {
"Region {} is altering, add request to pending writes",
region.region_id
);
self.stalled_count.add(1);
self.stalling_count.add(1);
WRITE_STALL_TOTAL.inc();
self.stalled_requests.push_bulk(bulk_req);
continue;
}

View File

@@ -229,6 +229,7 @@ impl DispatchedTo {
pub enum PipelineExecOutput {
Transformed(TransformedOutput),
DispatchedTo(DispatchedTo, Value),
Filtered,
}
#[derive(Debug)]
@@ -309,6 +310,10 @@ impl Pipeline {
// process
for processor in self.processors.iter() {
val = processor.exec_mut(val)?;
if val.is_null() {
// line is filtered
return Ok(PipelineExecOutput::Filtered);
}
}
// dispatch, fast return if matched
@@ -525,9 +530,6 @@ transform:
.into_transformed()
.unwrap();
// println!("[DEBUG]schema_info: {:?}", schema_info.schema);
// println!("[DEBUG]re: {:?}", result.0.values);
assert_eq!(schema_info.schema.len(), result.0.values.len());
let test = vec![
(

View File

@@ -19,6 +19,7 @@ pub mod decolorize;
pub mod digest;
pub mod dissect;
pub mod epoch;
pub mod filter;
pub mod gsub;
pub mod join;
pub mod json_parse;
@@ -54,6 +55,7 @@ use crate::error::{
Result, UnsupportedProcessorSnafu,
};
use crate::etl::field::{Field, Fields};
use crate::etl::processor::filter::FilterProcessor;
use crate::etl::processor::json_parse::JsonParseProcessor;
use crate::etl::processor::select::SelectProcessor;
use crate::etl::processor::simple_extract::SimpleExtractProcessor;
@@ -146,6 +148,7 @@ pub enum ProcessorKind {
Digest(DigestProcessor),
Select(SelectProcessor),
Vrl(VrlProcessor),
Filter(FilterProcessor),
}
#[derive(Debug, Default)]
@@ -226,6 +229,7 @@ fn parse_processor(doc: &yaml_rust::Yaml) -> Result<ProcessorKind> {
}
vrl::PROCESSOR_VRL => ProcessorKind::Vrl(VrlProcessor::try_from(value)?),
select::PROCESSOR_SELECT => ProcessorKind::Select(SelectProcessor::try_from(value)?),
filter::PROCESSOR_FILTER => ProcessorKind::Filter(FilterProcessor::try_from(value)?),
_ => return UnsupportedProcessorSnafu { processor: str_key }.fail(),
};

View File

@@ -0,0 +1,242 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use ahash::{HashSet, HashSetExt};
use snafu::OptionExt;
use crate::error::{
Error, KeyMustBeStringSnafu, ProcessorExpectStringSnafu, ProcessorMissingFieldSnafu, Result,
ValueMustBeMapSnafu,
};
use crate::etl::field::Fields;
use crate::etl::processor::{
yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, yaml_strings, FIELDS_NAME, FIELD_NAME,
};
use crate::{Processor, Value};
pub(crate) const PROCESSOR_FILTER: &str = "filter";
const MATCH_MODE_NAME: &str = "mode";
const MATCH_OP_NAME: &str = "match_op";
const CASE_INSENSITIVE_NAME: &str = "case_insensitive";
const TARGETS_NAME: &str = "targets";
#[derive(Debug)]
enum MatchMode {
SimpleMatch(MatchOp),
}
impl Default for MatchMode {
fn default() -> Self {
Self::SimpleMatch(MatchOp::default())
}
}
#[derive(Debug, Default)]
enum MatchOp {
#[default]
In,
NotIn,
}
/// Filters out the whole line if it matches.
/// Ultimately this is a condition check; VRL could be used for more complex checks.
/// Only simple string matching is implemented for now; it can be extended later.
#[derive(Debug, Default)]
pub struct FilterProcessor {
fields: Fields,
mode: MatchMode,
case_insensitive: bool,
targets: HashSet<String>,
}
impl TryFrom<&yaml_rust::yaml::Hash> for FilterProcessor {
type Error = Error;
// match mode can be extended in the future
#[allow(clippy::single_match)]
fn try_from(value: &yaml_rust::yaml::Hash) -> std::result::Result<Self, Self::Error> {
let mut fields = Fields::default();
let mut mode = MatchMode::default();
let mut op = MatchOp::default();
let mut case_insensitive = true;
let mut targets = HashSet::new();
for (k, v) in value.iter() {
let key = k
.as_str()
.with_context(|| KeyMustBeStringSnafu { k: k.clone() })?;
match key {
FIELD_NAME => fields = Fields::one(yaml_new_field(v, FIELD_NAME)?),
FIELDS_NAME => fields = yaml_new_fields(v, FIELDS_NAME)?,
MATCH_MODE_NAME => match yaml_string(v, MATCH_MODE_NAME)?.as_str() {
"simple" => mode = MatchMode::SimpleMatch(MatchOp::In),
_ => {}
},
MATCH_OP_NAME => match yaml_string(v, MATCH_OP_NAME)?.as_str() {
"in" => op = MatchOp::In,
"not_in" => op = MatchOp::NotIn,
_ => {}
},
CASE_INSENSITIVE_NAME => case_insensitive = yaml_bool(v, CASE_INSENSITIVE_NAME)?,
TARGETS_NAME => {
yaml_strings(v, TARGETS_NAME)?
.into_iter()
.filter(|s| !s.is_empty())
.for_each(|s| {
targets.insert(s);
});
}
_ => {}
}
}
if matches!(mode, MatchMode::SimpleMatch(_)) {
mode = MatchMode::SimpleMatch(op);
}
if targets.is_empty() {
return ProcessorMissingFieldSnafu {
processor: PROCESSOR_FILTER,
field: TARGETS_NAME.to_string(),
}
.fail();
}
if case_insensitive {
targets = targets.into_iter().map(|s| s.to_lowercase()).collect();
}
Ok(FilterProcessor {
fields,
mode,
case_insensitive,
targets,
})
}
}
impl FilterProcessor {
fn match_target(&self, input: String) -> bool {
let input = if self.case_insensitive {
input.to_lowercase()
} else {
input
};
match &self.mode {
MatchMode::SimpleMatch(op) => match op {
MatchOp::In => self.targets.contains(&input),
MatchOp::NotIn => !self.targets.contains(&input),
},
}
}
}
impl Processor for FilterProcessor {
fn kind(&self) -> &str {
PROCESSOR_FILTER
}
fn ignore_missing(&self) -> bool {
true
}
fn exec_mut(&self, mut val: Value) -> Result<Value> {
let v_map = val.as_map_mut().context(ValueMustBeMapSnafu)?;
for field in self.fields.iter() {
let index = field.input_field();
match v_map.get(index) {
Some(Value::String(s)) => {
if self.match_target(s.clone()) {
return Ok(Value::Null);
}
}
Some(v) => {
return ProcessorExpectStringSnafu {
processor: self.kind(),
v: v.clone(),
}
.fail();
}
None => {}
}
}
Ok(val)
}
}
#[cfg(test)]
mod test {
use ahash::HashSet;
use crate::etl::field::{Field, Fields};
use crate::etl::processor::filter::{FilterProcessor, MatchMode, MatchOp};
use crate::{Map, Processor, Value};
#[test]
fn test_eq() {
let processor = FilterProcessor {
fields: Fields::one(Field::new("name", None)),
mode: MatchMode::SimpleMatch(MatchOp::In),
case_insensitive: false,
targets: HashSet::from_iter(vec!["John".to_string()]),
};
let val = Value::Map(Map::one("name", Value::String("John".to_string())));
let result = processor.exec_mut(val).unwrap();
assert_eq!(result, Value::Null);
let val = Value::Map(Map::one("name", Value::String("Wick".to_string())));
let expect = val.clone();
let result = processor.exec_mut(val).unwrap();
assert_eq!(result, expect);
}
#[test]
fn test_ne() {
let processor = FilterProcessor {
fields: Fields::one(Field::new("name", None)),
mode: MatchMode::SimpleMatch(MatchOp::NotIn),
case_insensitive: false,
targets: HashSet::from_iter(vec!["John".to_string()]),
};
let val = Value::Map(Map::one("name", Value::String("John".to_string())));
let expect = val.clone();
let result = processor.exec_mut(val).unwrap();
assert_eq!(result, expect);
let val = Value::Map(Map::one("name", Value::String("Wick".to_string())));
let result = processor.exec_mut(val).unwrap();
assert_eq!(result, Value::Null);
}
#[test]
fn test_case() {
let processor = FilterProcessor {
fields: Fields::one(Field::new("name", None)),
mode: MatchMode::SimpleMatch(MatchOp::In),
case_insensitive: true,
targets: HashSet::from_iter(vec!["john".to_string()]),
};
let val = Value::Map(Map::one("name", Value::String("JoHN".to_string())));
let result = processor.exec_mut(val).unwrap();
assert_eq!(result, Value::Null);
}
}
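For context, a sketch of how such a filter processor might be configured, based only on the keys visible above (`mode`, `match_op`, `case_insensitive`, `targets`); the `field` key name and the constructor call are assumptions rather than documented usage:

use yaml_rust::YamlLoader;

// Hypothetical pipeline snippet; "field" stands in for whatever FIELD_NAME resolves to.
const FILTER_CONFIG: &str = r#"
field: level
mode: simple
match_op: in
case_insensitive: true
targets:
  - debug
  - trace
"#;

fn main() {
    let docs = YamlLoader::load_from_str(FILTER_CONFIG).unwrap();
    let hash = docs[0].as_hash().unwrap();
    // Inside the pipeline crate this hash would feed FilterProcessor::try_from(hash),
    // and a row whose `level` is "debug" or "trace" (case-insensitively) would be
    // dropped, surfacing as PipelineExecOutput::Filtered.
    println!("parsed {} filter options", hash.len());
}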

View File

@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
mod absent;
mod empty_metric;
mod histogram_fold;
mod instant_manipulate;
@@ -24,6 +25,7 @@ mod series_divide;
mod test_util;
mod union_distinct_on;
pub use absent::{Absent, AbsentExec, AbsentStream};
use datafusion::arrow::datatypes::{ArrowPrimitiveType, TimestampMillisecondType};
pub use empty_metric::{build_special_time_expr, EmptyMetric, EmptyMetricExec, EmptyMetricStream};
pub use histogram_fold::{HistogramFold, HistogramFoldExec, HistogramFoldStream};

View File

@@ -0,0 +1,654 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::any::Any;
use std::cmp::Ordering;
use std::collections::HashMap;
use std::pin::Pin;
use std::sync::Arc;
use std::task::{Context, Poll};
use datafusion::arrow::array::Array;
use datafusion::common::{DFSchemaRef, Result as DataFusionResult};
use datafusion::execution::context::TaskContext;
use datafusion::logical_expr::{Expr, LogicalPlan, UserDefinedLogicalNodeCore};
use datafusion::physical_expr::{EquivalenceProperties, LexRequirement, PhysicalSortRequirement};
use datafusion::physical_plan::execution_plan::{Boundedness, EmissionType};
use datafusion::physical_plan::expressions::Column as ColumnExpr;
use datafusion::physical_plan::metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet};
use datafusion::physical_plan::{
DisplayAs, DisplayFormatType, Distribution, ExecutionPlan, Partitioning, PlanProperties,
RecordBatchStream, SendableRecordBatchStream,
};
use datafusion_common::DFSchema;
use datafusion_expr::EmptyRelation;
use datatypes::arrow;
use datatypes::arrow::array::{ArrayRef, Float64Array, TimestampMillisecondArray};
use datatypes::arrow::datatypes::{DataType, Field, SchemaRef, TimeUnit};
use datatypes::arrow::record_batch::RecordBatch;
use datatypes::arrow_array::StringArray;
use datatypes::compute::SortOptions;
use futures::{ready, Stream, StreamExt};
use greptime_proto::substrait_extension as pb;
use prost::Message;
use snafu::ResultExt;
use crate::error::DeserializeSnafu;
use crate::extension_plan::Millisecond;
/// Maximum number of rows per output batch
const ABSENT_BATCH_SIZE: usize = 8192;
#[derive(Debug, PartialEq, Eq, Hash)]
pub struct Absent {
start: Millisecond,
end: Millisecond,
step: Millisecond,
time_index_column: String,
value_column: String,
fake_labels: Vec<(String, String)>,
input: LogicalPlan,
output_schema: DFSchemaRef,
}
impl PartialOrd for Absent {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
// compare on fields except schema and input
(
self.start,
self.end,
self.step,
&self.time_index_column,
&self.value_column,
&self.fake_labels,
)
.partial_cmp(&(
other.start,
other.end,
other.step,
&other.time_index_column,
&other.value_column,
&other.fake_labels,
))
}
}
impl UserDefinedLogicalNodeCore for Absent {
fn name(&self) -> &str {
Self::name()
}
fn inputs(&self) -> Vec<&LogicalPlan> {
vec![&self.input]
}
fn schema(&self) -> &DFSchemaRef {
&self.output_schema
}
fn expressions(&self) -> Vec<Expr> {
vec![]
}
fn fmt_for_explain(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
write!(
f,
"PromAbsent: start={}, end={}, step={}",
self.start, self.end, self.step
)
}
fn with_exprs_and_inputs(
&self,
_exprs: Vec<Expr>,
inputs: Vec<LogicalPlan>,
) -> DataFusionResult<Self> {
if inputs.is_empty() {
return Err(datafusion::error::DataFusionError::Internal(
"Absent must have at least one input".to_string(),
));
}
Ok(Self {
start: self.start,
end: self.end,
step: self.step,
time_index_column: self.time_index_column.clone(),
value_column: self.value_column.clone(),
fake_labels: self.fake_labels.clone(),
input: inputs[0].clone(),
output_schema: self.output_schema.clone(),
})
}
}
impl Absent {
pub fn try_new(
start: Millisecond,
end: Millisecond,
step: Millisecond,
time_index_column: String,
value_column: String,
fake_labels: Vec<(String, String)>,
input: LogicalPlan,
) -> DataFusionResult<Self> {
let mut fields = vec![
Field::new(
&time_index_column,
DataType::Timestamp(TimeUnit::Millisecond, None),
true,
),
Field::new(&value_column, DataType::Float64, true),
];
// remove duplicate fake labels
let mut fake_labels = fake_labels
.into_iter()
.collect::<HashMap<String, String>>()
.into_iter()
.collect::<Vec<_>>();
fake_labels.sort_unstable_by(|a, b| a.0.cmp(&b.0));
for (name, _) in fake_labels.iter() {
fields.push(Field::new(name, DataType::Utf8, true));
}
let output_schema = Arc::new(DFSchema::from_unqualified_fields(
fields.into(),
HashMap::new(),
)?);
Ok(Self {
start,
end,
step,
time_index_column,
value_column,
fake_labels,
input,
output_schema,
})
}
pub const fn name() -> &'static str {
"prom_absent"
}
pub fn to_execution_plan(&self, exec_input: Arc<dyn ExecutionPlan>) -> Arc<dyn ExecutionPlan> {
let output_schema = Arc::new(self.output_schema.as_arrow().clone());
let properties = PlanProperties::new(
EquivalenceProperties::new(output_schema.clone()),
Partitioning::UnknownPartitioning(1),
EmissionType::Incremental,
Boundedness::Bounded,
);
Arc::new(AbsentExec {
start: self.start,
end: self.end,
step: self.step,
time_index_column: self.time_index_column.clone(),
value_column: self.value_column.clone(),
fake_labels: self.fake_labels.clone(),
output_schema: output_schema.clone(),
input: exec_input,
properties,
metric: ExecutionPlanMetricsSet::new(),
})
}
pub fn serialize(&self) -> Vec<u8> {
pb::Absent {
start: self.start,
end: self.end,
step: self.step,
time_index_column: self.time_index_column.clone(),
value_column: self.value_column.clone(),
fake_labels: self
.fake_labels
.iter()
.map(|(name, value)| pb::LabelPair {
key: name.clone(),
value: value.clone(),
})
.collect(),
}
.encode_to_vec()
}
pub fn deserialize(bytes: &[u8]) -> DataFusionResult<Self> {
let pb_absent = pb::Absent::decode(bytes).context(DeserializeSnafu)?;
let placeholder_plan = LogicalPlan::EmptyRelation(EmptyRelation {
produce_one_row: false,
schema: Arc::new(DFSchema::empty()),
});
Self::try_new(
pb_absent.start,
pb_absent.end,
pb_absent.step,
pb_absent.time_index_column,
pb_absent.value_column,
pb_absent
.fake_labels
.iter()
.map(|label| (label.key.clone(), label.value.clone()))
.collect(),
placeholder_plan,
)
}
}
#[derive(Debug)]
pub struct AbsentExec {
start: Millisecond,
end: Millisecond,
step: Millisecond,
time_index_column: String,
value_column: String,
fake_labels: Vec<(String, String)>,
output_schema: SchemaRef,
input: Arc<dyn ExecutionPlan>,
properties: PlanProperties,
metric: ExecutionPlanMetricsSet,
}
impl ExecutionPlan for AbsentExec {
fn as_any(&self) -> &dyn Any {
self
}
fn schema(&self) -> SchemaRef {
self.output_schema.clone()
}
fn properties(&self) -> &PlanProperties {
&self.properties
}
fn required_input_distribution(&self) -> Vec<Distribution> {
vec![Distribution::SinglePartition]
}
fn required_input_ordering(&self) -> Vec<Option<LexRequirement>> {
vec![Some(LexRequirement::new(vec![PhysicalSortRequirement {
expr: Arc::new(
ColumnExpr::new_with_schema(&self.time_index_column, &self.input.schema()).unwrap(),
),
options: Some(SortOptions {
descending: false,
nulls_first: false,
}),
}]))]
}
fn maintains_input_order(&self) -> Vec<bool> {
vec![false]
}
fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
vec![&self.input]
}
fn with_new_children(
self: Arc<Self>,
children: Vec<Arc<dyn ExecutionPlan>>,
) -> DataFusionResult<Arc<dyn ExecutionPlan>> {
assert!(!children.is_empty());
Ok(Arc::new(Self {
start: self.start,
end: self.end,
step: self.step,
time_index_column: self.time_index_column.clone(),
value_column: self.value_column.clone(),
fake_labels: self.fake_labels.clone(),
output_schema: self.output_schema.clone(),
input: children[0].clone(),
properties: self.properties.clone(),
metric: self.metric.clone(),
}))
}
fn execute(
&self,
partition: usize,
context: Arc<TaskContext>,
) -> DataFusionResult<SendableRecordBatchStream> {
let baseline_metric = BaselineMetrics::new(&self.metric, partition);
let input = self.input.execute(partition, context)?;
Ok(Box::pin(AbsentStream {
end: self.end,
step: self.step,
time_index_column_index: self
.input
.schema()
.column_with_name(&self.time_index_column)
.unwrap() // Safety: we have checked the column name in `try_new`
.0,
output_schema: self.output_schema.clone(),
fake_labels: self.fake_labels.clone(),
input,
metric: baseline_metric,
// Buffer for streaming output timestamps
output_timestamps: Vec::new(),
// Current timestamp in the output range
output_ts_cursor: self.start,
input_finished: false,
}))
}
fn metrics(&self) -> Option<MetricsSet> {
Some(self.metric.clone_inner())
}
fn name(&self) -> &str {
"AbsentExec"
}
}
impl DisplayAs for AbsentExec {
fn fmt_as(&self, t: DisplayFormatType, f: &mut std::fmt::Formatter) -> std::fmt::Result {
match t {
DisplayFormatType::Default | DisplayFormatType::Verbose => {
write!(
f,
"PromAbsentExec: start={}, end={}, step={}",
self.start, self.end, self.step
)
}
}
}
}
pub struct AbsentStream {
end: Millisecond,
step: Millisecond,
time_index_column_index: usize,
output_schema: SchemaRef,
fake_labels: Vec<(String, String)>,
input: SendableRecordBatchStream,
metric: BaselineMetrics,
// Buffer for streaming output timestamps
output_timestamps: Vec<Millisecond>,
// Current timestamp in the output range
output_ts_cursor: Millisecond,
input_finished: bool,
}
impl RecordBatchStream for AbsentStream {
fn schema(&self) -> SchemaRef {
self.output_schema.clone()
}
}
impl Stream for AbsentStream {
type Item = DataFusionResult<RecordBatch>;
fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
loop {
if !self.input_finished {
match ready!(self.input.poll_next_unpin(cx)) {
Some(Ok(batch)) => {
let timer = std::time::Instant::now();
if let Err(e) = self.process_input_batch(&batch) {
return Poll::Ready(Some(Err(e)));
}
self.metric.elapsed_compute().add_elapsed(timer);
// If we have enough data for a batch, output it
if self.output_timestamps.len() >= ABSENT_BATCH_SIZE {
let timer = std::time::Instant::now();
let result = self.flush_output_batch();
self.metric.elapsed_compute().add_elapsed(timer);
match result {
Ok(Some(batch)) => return Poll::Ready(Some(Ok(batch))),
Ok(None) => continue,
Err(e) => return Poll::Ready(Some(Err(e))),
}
}
}
Some(Err(e)) => return Poll::Ready(Some(Err(e))),
None => {
self.input_finished = true;
let timer = std::time::Instant::now();
// Process any remaining absent timestamps
if let Err(e) = self.process_remaining_absent_timestamps() {
return Poll::Ready(Some(Err(e)));
}
let result = self.flush_output_batch();
self.metric.elapsed_compute().add_elapsed(timer);
return Poll::Ready(result.transpose());
}
}
} else {
return Poll::Ready(None);
}
}
}
}
impl AbsentStream {
fn process_input_batch(&mut self, batch: &RecordBatch) -> DataFusionResult<()> {
// Extract timestamps from this batch
let timestamp_array = batch.column(self.time_index_column_index);
let milli_ts_array = arrow::compute::cast(
timestamp_array,
&DataType::Timestamp(TimeUnit::Millisecond, None),
)?;
let timestamp_array = milli_ts_array
.as_any()
.downcast_ref::<TimestampMillisecondArray>()
.unwrap();
// Process against current output cursor position
for &input_ts in timestamp_array.values() {
// Generate absent timestamps up to this input timestamp
while self.output_ts_cursor < input_ts && self.output_ts_cursor <= self.end {
self.output_timestamps.push(self.output_ts_cursor);
self.output_ts_cursor += self.step;
}
// Skip the input timestamp if it matches our cursor
if self.output_ts_cursor == input_ts {
self.output_ts_cursor += self.step;
}
}
Ok(())
}
fn process_remaining_absent_timestamps(&mut self) -> DataFusionResult<()> {
// Generate all remaining absent timestamps (input is finished)
while self.output_ts_cursor <= self.end {
self.output_timestamps.push(self.output_ts_cursor);
self.output_ts_cursor += self.step;
}
Ok(())
}
fn flush_output_batch(&mut self) -> DataFusionResult<Option<RecordBatch>> {
if self.output_timestamps.is_empty() {
return Ok(None);
}
let mut columns: Vec<ArrayRef> = Vec::with_capacity(self.output_schema.fields().len());
let num_rows = self.output_timestamps.len();
columns.push(Arc::new(TimestampMillisecondArray::from(
self.output_timestamps.clone(),
)) as _);
columns.push(Arc::new(Float64Array::from(vec![1.0; num_rows])) as _);
for (_, value) in self.fake_labels.iter() {
columns.push(Arc::new(StringArray::from_iter(std::iter::repeat_n(
Some(value.clone()),
num_rows,
))) as _);
}
let batch = RecordBatch::try_new(self.output_schema.clone(), columns)?;
self.output_timestamps.clear();
Ok(Some(batch))
}
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use datafusion::arrow::datatypes::{DataType, Field, Schema, TimeUnit};
use datafusion::arrow::record_batch::RecordBatch;
use datafusion::physical_plan::memory::MemoryExec;
use datafusion::prelude::SessionContext;
use datatypes::arrow::array::{Float64Array, TimestampMillisecondArray};
use super::*;
#[tokio::test]
async fn test_absent_basic() {
let schema = Arc::new(Schema::new(vec![
Field::new(
"timestamp",
DataType::Timestamp(TimeUnit::Millisecond, None),
true,
),
Field::new("value", DataType::Float64, true),
]));
// Input has timestamps: 0, 2000, 4000
let timestamp_array = Arc::new(TimestampMillisecondArray::from(vec![0, 2000, 4000]));
let value_array = Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0]));
let batch =
RecordBatch::try_new(schema.clone(), vec![timestamp_array, value_array]).unwrap();
let memory_exec = MemoryExec::try_new(&[vec![batch]], schema, None).unwrap();
let output_schema = Arc::new(Schema::new(vec![
Field::new(
"timestamp",
DataType::Timestamp(TimeUnit::Millisecond, None),
true,
),
Field::new("value", DataType::Float64, true),
]));
let absent_exec = AbsentExec {
start: 0,
end: 5000,
step: 1000,
time_index_column: "timestamp".to_string(),
value_column: "value".to_string(),
fake_labels: vec![],
output_schema: output_schema.clone(),
input: Arc::new(memory_exec),
properties: PlanProperties::new(
EquivalenceProperties::new(output_schema.clone()),
Partitioning::UnknownPartitioning(1),
EmissionType::Incremental,
Boundedness::Bounded,
),
metric: ExecutionPlanMetricsSet::new(),
};
let session_ctx = SessionContext::new();
let task_ctx = session_ctx.task_ctx();
let mut stream = absent_exec.execute(0, task_ctx).unwrap();
// Collect all output batches
let mut output_timestamps = Vec::new();
while let Some(batch_result) = stream.next().await {
let batch = batch_result.unwrap();
let ts_array = batch
.column(0)
.as_any()
.downcast_ref::<TimestampMillisecondArray>()
.unwrap();
for i in 0..ts_array.len() {
if !ts_array.is_null(i) {
let ts = ts_array.value(i);
output_timestamps.push(ts);
}
}
}
// Should output absent timestamps: 1000, 3000, 5000
// (0, 2000, 4000 exist in input, so 1000, 3000, 5000 are absent)
assert_eq!(output_timestamps, vec![1000, 3000, 5000]);
}
#[tokio::test]
async fn test_absent_empty_input() {
let schema = Arc::new(Schema::new(vec![
Field::new(
"timestamp",
DataType::Timestamp(TimeUnit::Millisecond, None),
true,
),
Field::new("value", DataType::Float64, true),
]));
// Empty input
let memory_exec = MemoryExec::try_new(&[vec![]], schema, None).unwrap();
let output_schema = Arc::new(Schema::new(vec![
Field::new(
"timestamp",
DataType::Timestamp(TimeUnit::Millisecond, None),
true,
),
Field::new("value", DataType::Float64, true),
]));
let absent_exec = AbsentExec {
start: 0,
end: 2000,
step: 1000,
time_index_column: "timestamp".to_string(),
value_column: "value".to_string(),
fake_labels: vec![],
output_schema: output_schema.clone(),
input: Arc::new(memory_exec),
properties: PlanProperties::new(
EquivalenceProperties::new(output_schema.clone()),
Partitioning::UnknownPartitioning(1),
EmissionType::Incremental,
Boundedness::Bounded,
),
metric: ExecutionPlanMetricsSet::new(),
};
let session_ctx = SessionContext::new();
let task_ctx = session_ctx.task_ctx();
let mut stream = absent_exec.execute(0, task_ctx).unwrap();
// Collect all output timestamps
let mut output_timestamps = Vec::new();
while let Some(batch_result) = stream.next().await {
let batch = batch_result.unwrap();
let ts_array = batch
.column(0)
.as_any()
.downcast_ref::<TimestampMillisecondArray>()
.unwrap();
for i in 0..ts_array.len() {
if !ts_array.is_null(i) {
let ts = ts_array.value(i);
output_timestamps.push(ts);
}
}
}
// Should output all timestamps in range: 0, 1000, 2000
assert_eq!(output_timestamps, vec![0, 1000, 2000]);
}
}
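The stream above essentially advances a cursor from `start` to `end` in `step` increments and emits every timestamp that the (sorted) input does not cover. A standalone sketch of that cursor logic, detached from DataFusion and using plain `i64` millisecond timestamps:

/// Returns the timestamps in [start, end] (stepped by `step`) that are
/// absent from the sorted `input` timestamps.
fn absent_timestamps(start: i64, end: i64, step: i64, input: &[i64]) -> Vec<i64> {
    debug_assert!(step > 0);
    let mut out = Vec::new();
    let mut cursor = start;
    for &ts in input {
        // Emit every cursor position strictly before this input timestamp.
        while cursor < ts && cursor <= end {
            out.push(cursor);
            cursor += step;
        }
        // Skip the cursor position if the input covers it.
        if cursor == ts {
            cursor += step;
        }
    }
    // Input exhausted: everything left in the range is absent.
    while cursor <= end {
        out.push(cursor);
        cursor += step;
    }
    out
}

fn main() {
    // Mirrors `test_absent_basic`: input 0/2000/4000, range 0..=5000, step 1000.
    assert_eq!(absent_timestamps(0, 5000, 1000, &[0, 2000, 4000]), vec![1000, 3000, 5000]);
    // Mirrors `test_absent_empty_input`: empty input yields the whole range.
    assert_eq!(absent_timestamps(0, 2000, 1000, &[]), vec![0, 1000, 2000]);
}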

View File

@@ -22,8 +22,8 @@ use datafusion::physical_plan::ExecutionPlan;
use datafusion::physical_planner::{ExtensionPlanner, PhysicalPlanner};
use crate::extension_plan::{
EmptyMetric, HistogramFold, InstantManipulate, RangeManipulate, ScalarCalculate, SeriesDivide,
SeriesNormalize, UnionDistinctOn,
Absent, EmptyMetric, HistogramFold, InstantManipulate, RangeManipulate, ScalarCalculate,
SeriesDivide, SeriesNormalize, UnionDistinctOn,
};
pub struct PromExtensionPlanner;
@@ -57,6 +57,8 @@ impl ExtensionPlanner for PromExtensionPlanner {
physical_inputs[0].clone(),
physical_inputs[1].clone(),
)))
} else if let Some(node) = node.as_any().downcast_ref::<Absent>() {
Ok(Some(node.to_execution_plan(physical_inputs[0].clone())))
} else {
Ok(None)
}

View File

@@ -237,7 +237,8 @@ fn create_output_batch(
for (node, metric) in sub_stage_metrics.into_iter().enumerate() {
builder.append_metric(1, node as _, metrics_to_string(metric, format)?);
}
return Ok(TreeNodeRecursion::Stop);
// There might be multiple merge scans, so continue.
return Ok(TreeNodeRecursion::Continue);
}
Ok(TreeNodeRecursion::Continue)
})?;

View File

@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::HashSet;
use std::collections::{HashMap, HashSet};
use std::sync::Arc;
use common_telemetry::debug;
@@ -38,6 +38,13 @@ use crate::dist_plan::merge_scan::MergeScanLogicalPlan;
use crate::plan::ExtractExpr;
use crate::query_engine::DefaultSerializer;
#[cfg(test)]
mod test;
mod utils;
pub(crate) use utils::{AliasMapping, AliasTracker};
#[derive(Debug)]
pub struct DistPlannerAnalyzer;
@@ -154,7 +161,33 @@ struct PlanRewriter {
status: RewriterStatus,
/// Partition columns of the table in current pass
partition_cols: Option<Vec<String>>,
column_requirements: HashSet<Column>,
alias_tracker: Option<AliasTracker>,
/// Uses the stack depth as a scope to decide whether a column requirement still applies.
/// E.g. for a logical plan like:
/// ```ignore
/// 1: Projection: t.number
/// 2: Sort: t.pk1+t.pk2
/// 3: Projection: t.number, t.pk1, t.pk2
/// ```
/// `Sort` introduces a column requirement for `t.pk1` at level 2,
/// which means the `Projection` at level 1 needs to add a reference to `t.pk1` as well,
/// so that the expanded plan becomes
/// ```ignore
/// Projection: t.number
/// MergeSort: t.pk1
/// MergeScan: remote_input=
/// Projection: t.number, "t.pk1+t.pk2" <--- the original `Projection` at level 1 gets `t.pk1+t.pk2` added
/// Sort: t.pk1+t.pk2
/// Projection: t.number, t.pk1, t.pk2
/// ```
/// and `MergeSort` can take `t.pk1` as input.
/// Meanwhile the `Projection` at level 3 doesn't need to add any new column, because 3 > 2
/// and the column requirements recorded at level 2 are not applicable to level 3.
///
/// See the tests `expand_proj_step_aggr` and `expand_proj_sort_proj` for more details.
///
/// TODO(discord9): a simpler solution to track column requirements for merge scan
column_requirements: Vec<(HashSet<Column>, usize)>,
/// Whether to expand on next call
/// This is used to handle the case where a plan is transformed, but need to be expanded from it's
/// parent node. For example a Aggregate plan is split into two parts in frontend and datanode, and need
@@ -164,7 +197,7 @@ struct PlanRewriter {
/// This is used to handle the case where a plan is transformed, but still
/// need to push down as many node as possible before next partial/conditional/transformed commutative
/// plan. I.e.
/// ```
/// ```ignore
/// Limit:
/// Sort:
/// ```
@@ -187,6 +220,15 @@ impl PlanRewriter {
/// Return true if should stop and expand. The input plan is the parent node of current node
fn should_expand(&mut self, plan: &LogicalPlan) -> bool {
debug!(
"Check should_expand at level: {} with Stack:\n{}, ",
self.level,
self.stack
.iter()
.map(|(p, l)| format!("{l}:{}{}", " ".repeat(l - 1), p.display()))
.collect::<Vec<String>>()
.join("\n"),
);
if DFLogicalSubstraitConvertor
.encode(plan, DefaultSerializer)
.is_err()
@@ -200,18 +242,21 @@ impl PlanRewriter {
}
if self.expand_on_next_part_cond_trans_commutative {
let comm = Categorizer::check_plan(plan, self.partition_cols.clone());
let comm = Categorizer::check_plan(plan, self.get_aliased_partition_columns());
match comm {
Commutativity::PartialCommutative => {
// a small difference is that for partial commutative, we still need to
// expand on next call(so `Limit` can be pushed down)
// push it down (so `Limit` can be pushed down)
// Note that the limit also needs to be expanded to keep the query correct,
// i.e. `Limit fetch=10` needs to be pushed down to the leaf node.
self.expand_on_next_part_cond_trans_commutative = false;
self.expand_on_next_call = true;
}
Commutativity::ConditionalCommutative(_)
| Commutativity::TransformedCommutative { .. } => {
// for conditional commutative and transformed commutative, we can
// expand now
// Again, a new node that can be pushed down: we should just
// push it down now and avoid further expansion.
self.expand_on_next_part_cond_trans_commutative = false;
return true;
}
@@ -219,11 +264,12 @@ impl PlanRewriter {
}
}
match Categorizer::check_plan(plan, self.partition_cols.clone()) {
match Categorizer::check_plan(plan, self.get_aliased_partition_columns()) {
Commutativity::Commutative => {}
Commutativity::PartialCommutative => {
if let Some(plan) = partial_commutative_transformer(plan) {
self.update_column_requirements(&plan);
// Note: this plan is the parent of the current node, so use `self.level - 1` when updating column requirements.
self.update_column_requirements(&plan, self.level - 1);
self.expand_on_next_part_cond_trans_commutative = true;
self.stage.push(plan)
}
@@ -232,7 +278,8 @@ impl PlanRewriter {
if let Some(transformer) = transformer
&& let Some(plan) = transformer(plan)
{
self.update_column_requirements(&plan);
// Note: this plan is the parent of the current node, so use `self.level - 1` when updating column requirements.
self.update_column_requirements(&plan, self.level - 1);
self.expand_on_next_part_cond_trans_commutative = true;
self.stage.push(plan)
}
@@ -242,12 +289,22 @@ impl PlanRewriter {
&& let Some(transformer_actions) = transformer(plan)
{
debug!(
"PlanRewriter: transformed plan: {:?}\n from {plan}",
transformer_actions.extra_parent_plans
"PlanRewriter: transformed plan: {}\n from {plan}",
transformer_actions
.extra_parent_plans
.iter()
.enumerate()
.map(|(i, p)| format!(
"Extra {i}-th parent plan from parent to child = {}",
p.display()
))
.collect::<Vec<_>>()
.join("\n")
);
if let Some(last_stage) = transformer_actions.extra_parent_plans.last() {
// update the column requirements from the last stage
self.update_column_requirements(last_stage);
// Note: the current plan's parent plan is where the column requirements need to be applied.
self.update_column_requirements(last_stage, self.level - 1);
}
self.stage
.extend(transformer_actions.extra_parent_plans.into_iter().rev());
@@ -265,9 +322,12 @@ impl PlanRewriter {
false
}
fn update_column_requirements(&mut self, plan: &LogicalPlan) {
/// Updates the column requirements for the current plan. `plan_level` is the level of the plan
/// in the stack and is used to determine whether the column requirements are applicable
/// to other plans in the stack.
fn update_column_requirements(&mut self, plan: &LogicalPlan, plan_level: usize) {
debug!(
"PlanRewriter: update column requirements for plan: {plan}\n withcolumn_requirements: {:?}",
"PlanRewriter: update column requirements for plan: {plan}\n with old column_requirements: {:?}",
self.column_requirements
);
let mut container = HashSet::new();
@@ -276,9 +336,7 @@ impl PlanRewriter {
let _ = expr_to_columns(&expr, &mut container);
}
for col in container {
self.column_requirements.insert(col);
}
self.column_requirements.push((container, plan_level));
debug!(
"PlanRewriter: updated column requirements: {:?}",
self.column_requirements
@@ -297,6 +355,45 @@ impl PlanRewriter {
self.status = RewriterStatus::Unexpanded;
}
/// Maybe update alias for original table columns in the plan
fn maybe_update_alias(&mut self, node: &LogicalPlan) {
if let Some(alias_tracker) = &mut self.alias_tracker {
alias_tracker.update_alias(node);
debug!(
"Current partition columns are: {:?}",
self.get_aliased_partition_columns()
);
} else if let LogicalPlan::TableScan(table_scan) = node {
self.alias_tracker = AliasTracker::new(table_scan);
debug!(
"Initialize partition columns: {:?} with table={}",
self.get_aliased_partition_columns(),
table_scan.table_name
);
}
}
fn get_aliased_partition_columns(&self) -> Option<AliasMapping> {
if let Some(part_cols) = self.partition_cols.as_ref() {
let Some(alias_tracker) = &self.alias_tracker else {
// no alias tracker meaning no table scan encountered
return None;
};
let mut aliased = HashMap::new();
for part_col in part_cols {
let all_alias = alias_tracker
.get_all_alias_for_col(part_col)
.cloned()
.unwrap_or_default();
aliased.insert(part_col.clone(), all_alias);
}
Some(aliased)
} else {
None
}
}
fn maybe_set_partitions(&mut self, plan: &LogicalPlan) {
if self.partition_cols.is_some() {
// only need to set once
@@ -342,10 +439,15 @@ impl PlanRewriter {
}
// store schema before expand
let schema = on_node.schema().clone();
let mut rewriter = EnforceDistRequirementRewriter {
column_requirements: std::mem::take(&mut self.column_requirements),
};
let mut rewriter = EnforceDistRequirementRewriter::new(
std::mem::take(&mut self.column_requirements),
self.level,
);
debug!("PlanRewriter: enforce column requirements for node: {on_node} with rewriter: {rewriter:?}");
on_node = on_node.rewrite(&mut rewriter)?.data;
debug!(
"PlanRewriter: after enforced column requirements for node: {on_node} with rewriter: {rewriter:?}"
);
// add merge scan as the new root
let mut node = MergeScanLogicalPlan::new(
@@ -364,7 +466,8 @@ impl PlanRewriter {
}
self.set_expanded();
// recover the schema
// recover the schema; this makes sure the schema after expansion is the same as the old node's,
// because after expansion the raw top node might have extra columns, i.e. sorting columns for a `Sort` node
let node = LogicalPlanBuilder::from(node)
.project(schema.iter().map(|(qualifier, field)| {
Expr::Column(Column::new(qualifier.cloned(), field.name()))
@@ -381,42 +484,96 @@ impl PlanRewriter {
/// Requirements enforced by this rewriter:
/// - Enforce column requirements for `LogicalPlan::Projection` nodes. Makes sure the
/// required columns are available in the sub plan.
///
#[derive(Debug)]
struct EnforceDistRequirementRewriter {
column_requirements: HashSet<Column>,
/// only enforce column requirements after the expanding node in question; that is,
/// only nodes with `cur_level` <= `level` will consider adding those column requirements
/// TODO(discord9): a simpler solution to track column requirements for merge scan
column_requirements: Vec<(HashSet<Column>, usize)>,
/// only apply column requirements recorded at a level >= `cur_level`;
/// this is used to avoid applying column requirements that are not needed
/// for the current node, i.e. when the node is not in the scope of those requirements.
/// For example, for this plan:
/// ```ignore
/// Aggregate: min(t.number)
/// Projection: t.number
/// ```
/// when on `Projection` node, we don't need to apply the column requirements of `Aggregate` node
/// because the `Projection` node is not in the scope of the `Aggregate` node
cur_level: usize,
}
impl EnforceDistRequirementRewriter {
fn new(column_requirements: Vec<(HashSet<Column>, usize)>, cur_level: usize) -> Self {
Self {
column_requirements,
cur_level,
}
}
}
impl TreeNodeRewriter for EnforceDistRequirementRewriter {
type Node = LogicalPlan;
fn f_down(&mut self, node: Self::Node) -> DfResult<Transformed<Self::Node>> {
if let LogicalPlan::Projection(ref projection) = node {
let mut column_requirements = std::mem::take(&mut self.column_requirements);
if column_requirements.is_empty() {
return Ok(Transformed::no(node));
}
for expr in &projection.expr {
let (qualifier, name) = expr.qualified_name();
let column = Column::new(qualifier, name);
column_requirements.remove(&column);
}
if column_requirements.is_empty() {
return Ok(Transformed::no(node));
}
let mut new_exprs = projection.expr.clone();
for col in &column_requirements {
new_exprs.push(Expr::Column(col.clone()));
}
let new_node =
node.with_new_exprs(new_exprs, node.inputs().into_iter().cloned().collect())?;
return Ok(Transformed::yes(new_node));
// check that node doesn't have multiple children, i.e. join/subquery
if node.inputs().len() > 1 {
return Err(datafusion_common::DataFusionError::Internal(
"EnforceDistRequirementRewriter: node with multiple inputs is not supported"
.to_string(),
));
}
self.cur_level += 1;
Ok(Transformed::no(node))
}
fn f_up(&mut self, node: Self::Node) -> DfResult<Transformed<Self::Node>> {
self.cur_level -= 1;
// first get all applicable column requirements
let mut applicable_column_requirements = self
.column_requirements
.iter()
.filter(|(_, level)| *level >= self.cur_level)
.map(|(cols, _)| cols.clone())
.reduce(|mut acc, cols| {
acc.extend(cols);
acc
})
.unwrap_or_default();
debug!(
"EnforceDistRequirementRewriter: applicable column requirements at level {} = {:?} for node {}",
self.cur_level,
applicable_column_requirements,
node.display()
);
// make sure every projection within the applicable scope has the required columns
if let LogicalPlan::Projection(ref projection) = node {
for expr in &projection.expr {
let (qualifier, name) = expr.qualified_name();
let column = Column::new(qualifier, name);
applicable_column_requirements.remove(&column);
}
if applicable_column_requirements.is_empty() {
return Ok(Transformed::no(node));
}
let mut new_exprs = projection.expr.clone();
for col in &applicable_column_requirements {
new_exprs.push(Expr::Column(col.clone()));
}
let new_node =
node.with_new_exprs(new_exprs, node.inputs().into_iter().cloned().collect())?;
debug!(
"EnforceDistRequirementRewriter: added missing columns {:?} to projection node from old node: \n{node}\n Making new node: \n{new_node}",
applicable_column_requirements
);
// still need to continue for next projection if applicable
return Ok(Transformed::yes(new_node));
}
Ok(Transformed::no(node))
}
}
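For orientation, here is a minimal, self-contained sketch (plain strings and a free function, not the planner's actual types) of the level-scoping rule applied in `f_up`: a requirement recorded at `plan_level` only applies to nodes whose depth `cur_level` is less than or equal to it, so a child projection never picks up requirements that belong to its ancestors' subtrees.

use std::collections::HashSet;

fn applicable_requirements(
    requirements: &[(HashSet<String>, usize)],
    cur_level: usize,
) -> HashSet<String> {
    // keep only requirements recorded at this depth or deeper, then merge them
    requirements
        .iter()
        .filter(|(_, level)| *level >= cur_level)
        .flat_map(|(cols, _)| cols.iter().cloned())
        .collect()
}

fn main() {
    // "t.number" was required by an `Aggregate` sitting at depth 1
    let reqs = vec![(HashSet::from(["t.number".to_string()]), 1)];
    // the `Projection` below it (depth 2) is out of scope and stays untouched
    assert!(applicable_requirements(&reqs, 2).is_empty());
    // the node at depth 1 (and any ancestor) must carry the column
    assert!(applicable_requirements(&reqs, 1).contains("t.number"));
}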
@@ -432,6 +589,7 @@ impl TreeNodeRewriter for PlanRewriter {
self.stage.clear();
self.set_unexpanded();
self.partition_cols = None;
self.alias_tracker = None;
Ok(Transformed::no(node))
}
@@ -454,8 +612,19 @@ impl TreeNodeRewriter for PlanRewriter {
self.maybe_set_partitions(&node);
self.maybe_update_alias(&node);
let Some(parent) = self.get_parent() else {
let node = self.expand(node)?;
debug!("Plan Rewriter: expand now for no parent found for node: {node}");
let node = self.expand(node);
debug!(
"PlanRewriter: expanded plan: {}",
match &node {
Ok(n) => n.to_string(),
Err(e) => format!("Error expanding plan: {e}"),
}
);
let node = node?;
self.pop_stack();
return Ok(Transformed::yes(node));
};
@@ -483,160 +652,3 @@ impl TreeNodeRewriter for PlanRewriter {
Ok(Transformed::no(node))
}
}
#[cfg(test)]
mod test {
use std::sync::Arc;
use datafusion::datasource::DefaultTableSource;
use datafusion::functions_aggregate::expr_fn::avg;
use datafusion_common::JoinType;
use datafusion_expr::{col, lit, Expr, LogicalPlanBuilder};
use table::table::adapter::DfTableProviderAdapter;
use table::table::numbers::NumbersTable;
use super::*;
#[ignore = "Projection is disabled for https://github.com/apache/arrow-datafusion/issues/6489"]
#[test]
fn transform_simple_projection_filter() {
let numbers_table = NumbersTable::table(0);
let table_source = Arc::new(DefaultTableSource::new(Arc::new(
DfTableProviderAdapter::new(numbers_table),
)));
let plan = LogicalPlanBuilder::scan_with_filters("t", table_source, None, vec![])
.unwrap()
.filter(col("number").lt(lit(10)))
.unwrap()
.project(vec![col("number")])
.unwrap()
.distinct()
.unwrap()
.build()
.unwrap();
let config = ConfigOptions::default();
let result = DistPlannerAnalyzer {}.analyze(plan, &config).unwrap();
let expected = [
"Distinct:",
" MergeScan [is_placeholder=false]",
" Distinct:",
" Projection: t.number",
" Filter: t.number < Int32(10)",
" TableScan: t",
]
.join("\n");
assert_eq!(expected, result.to_string());
}
#[test]
fn transform_aggregator() {
let numbers_table = NumbersTable::table(0);
let table_source = Arc::new(DefaultTableSource::new(Arc::new(
DfTableProviderAdapter::new(numbers_table),
)));
let plan = LogicalPlanBuilder::scan_with_filters("t", table_source, None, vec![])
.unwrap()
.aggregate(Vec::<Expr>::new(), vec![avg(col("number"))])
.unwrap()
.build()
.unwrap();
let config = ConfigOptions::default();
let result = DistPlannerAnalyzer {}.analyze(plan, &config).unwrap();
let expected = "Projection: avg(t.number)\
\n MergeScan [is_placeholder=false]";
assert_eq!(expected, result.to_string());
}
#[test]
fn transform_distinct_order() {
let numbers_table = NumbersTable::table(0);
let table_source = Arc::new(DefaultTableSource::new(Arc::new(
DfTableProviderAdapter::new(numbers_table),
)));
let plan = LogicalPlanBuilder::scan_with_filters("t", table_source, None, vec![])
.unwrap()
.distinct()
.unwrap()
.sort(vec![col("number").sort(true, false)])
.unwrap()
.build()
.unwrap();
let config = ConfigOptions::default();
let result = DistPlannerAnalyzer {}.analyze(plan, &config).unwrap();
let expected = ["Projection: t.number", " MergeScan [is_placeholder=false]"].join("\n");
assert_eq!(expected, result.to_string());
}
#[test]
fn transform_single_limit() {
let numbers_table = NumbersTable::table(0);
let table_source = Arc::new(DefaultTableSource::new(Arc::new(
DfTableProviderAdapter::new(numbers_table),
)));
let plan = LogicalPlanBuilder::scan_with_filters("t", table_source, None, vec![])
.unwrap()
.limit(0, Some(1))
.unwrap()
.build()
.unwrap();
let config = ConfigOptions::default();
let result = DistPlannerAnalyzer {}.analyze(plan, &config).unwrap();
let expected = "Projection: t.number\
\n MergeScan [is_placeholder=false]";
assert_eq!(expected, result.to_string());
}
#[test]
fn transform_unalighed_join_with_alias() {
let left = NumbersTable::table(0);
let right = NumbersTable::table(1);
let left_source = Arc::new(DefaultTableSource::new(Arc::new(
DfTableProviderAdapter::new(left),
)));
let right_source = Arc::new(DefaultTableSource::new(Arc::new(
DfTableProviderAdapter::new(right),
)));
let right_plan = LogicalPlanBuilder::scan_with_filters("t", right_source, None, vec![])
.unwrap()
.alias("right")
.unwrap()
.build()
.unwrap();
let plan = LogicalPlanBuilder::scan_with_filters("t", left_source, None, vec![])
.unwrap()
.join_on(
right_plan,
JoinType::LeftSemi,
vec![col("t.number").eq(col("right.number"))],
)
.unwrap()
.limit(0, Some(1))
.unwrap()
.build()
.unwrap();
let config = ConfigOptions::default();
let result = DistPlannerAnalyzer {}.analyze(plan, &config).unwrap();
let expected = [
"Limit: skip=0, fetch=1",
" LeftSemi Join: Filter: t.number = right.number",
" Projection: t.number",
" MergeScan [is_placeholder=false]",
" SubqueryAlias: right",
" Projection: t.number",
" MergeScan [is_placeholder=false]",
]
.join("\n");
assert_eq!(expected, result.to_string());
}
}

File diff suppressed because it is too large


@@ -0,0 +1,318 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::{HashMap, HashSet};
use datafusion::datasource::DefaultTableSource;
use datafusion_common::Column;
use datafusion_expr::{Expr, LogicalPlan, TableScan};
use table::metadata::TableType;
use table::table::adapter::DfTableProviderAdapter;
/// Mapping from an original table column to all of its aliases at the current node
pub type AliasMapping = HashMap<String, HashSet<Column>>;
/// tracking aliases for the source table columns in the plan
#[derive(Debug, Clone)]
pub struct AliasTracker {
/// mapping from the original column name to the aliases used in the plan;
/// note that one column might have multiple aliases in the plan
///
pub mapping: AliasMapping,
}
impl AliasTracker {
pub fn new(table_scan: &TableScan) -> Option<Self> {
if let Some(source) = table_scan
.source
.as_any()
.downcast_ref::<DefaultTableSource>()
{
if let Some(provider) = source
.table_provider
.as_any()
.downcast_ref::<DfTableProviderAdapter>()
{
if provider.table().table_type() == TableType::Base {
let info = provider.table().table_info();
let schema = info.meta.schema.clone();
let col_schema = schema.column_schemas();
let mapping = col_schema
.iter()
.map(|col| {
(
col.name.clone(),
HashSet::from_iter(std::iter::once(Column::new_unqualified(
col.name.clone(),
))),
)
})
.collect();
return Some(Self { mapping });
}
}
}
None
}
/// update alias for original columns
///
/// only handle `Alias` with column in `Projection` node
pub fn update_alias(&mut self, node: &LogicalPlan) {
if let LogicalPlan::Projection(projection) = node {
// first collect all the alias mappings, e.g. `col_a AS b AS c AS d` yields the mapping `col_a -> d`
// notice one column might have multiple aliases
let mut alias_mapping: AliasMapping = HashMap::new();
for expr in &projection.expr {
if let Expr::Alias(alias) = expr {
let outer_alias = alias.clone();
let mut cur_alias = alias.clone();
while let Expr::Alias(alias) = *cur_alias.expr {
cur_alias = alias;
}
if let Expr::Column(column) = *cur_alias.expr {
alias_mapping
.entry(column.name.clone())
.or_default()
.insert(Column::new(outer_alias.relation, outer_alias.name));
}
} else if let Expr::Column(column) = expr {
// identity mapping
alias_mapping
.entry(column.name.clone())
.or_default()
.insert(column.clone());
}
}
// update mapping using `alias_mapping`
let mut new_mapping = HashMap::new();
for (table_col_name, cur_columns) in std::mem::take(&mut self.mapping) {
let new_aliases = {
let mut new_aliases = HashSet::new();
for cur_column in &cur_columns {
let new_alias_for_cur_column = alias_mapping
.get(cur_column.name())
.cloned()
.unwrap_or_default();
for new_alias in new_alias_for_cur_column {
let is_table_ref_eq = match (&new_alias.relation, &cur_column.relation)
{
(Some(o), Some(c)) => o.resolved_eq(c),
_ => true,
};
// is the same column if both name and table ref is eq
if is_table_ref_eq {
new_aliases.insert(new_alias.clone());
}
}
}
new_aliases
};
new_mapping.insert(table_col_name, new_aliases);
}
self.mapping = new_mapping;
common_telemetry::debug!(
"Updating alias tracker to {:?} using node: \n{node}",
self.mapping
);
}
}
pub fn get_all_alias_for_col(&self, col_name: &str) -> Option<&HashSet<Column>> {
self.mapping.get(col_name)
}
#[allow(unused)]
pub fn is_alias_for(&self, original_col: &str, cur_col: &Column) -> bool {
self.mapping
.get(original_col)
.map(|cols| cols.contains(cur_col))
.unwrap_or(false)
}
}
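As a quick illustration (plain strings instead of DataFusion `Column`s, so only a sketch of the data's shape), this is what the mapping looks like after a projection such as `SELECT number, pk3 AS pk1, pk3 AS pk2`: one source column can fan out to several aliases, while columns that were projected away end up with an empty alias set. The unit tests below exercise the real types.

use std::collections::{HashMap, HashSet};

fn main() {
    // `pk3` is now visible under two aliases; `pk1` and `pk2` are unreachable
    let mapping: HashMap<&str, HashSet<&str>> = HashMap::from([
        ("number", HashSet::from(["number"])),
        ("pk1", HashSet::new()),
        ("pk2", HashSet::new()),
        ("pk3", HashSet::from(["pk1", "pk2"])),
    ]);
    assert!(mapping["pk3"].contains("pk1") && mapping["pk3"].contains("pk2"));
    assert!(mapping["pk1"].is_empty());
}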
#[cfg(test)]
mod tests {
use std::sync::Arc;
use common_telemetry::init_default_ut_logging;
use datafusion::error::Result as DfResult;
use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion, TreeNodeVisitor};
use datafusion_expr::{col, LogicalPlanBuilder};
use super::*;
use crate::dist_plan::analyzer::test::TestTable;
#[derive(Debug)]
struct TrackerTester {
alias_tracker: Option<AliasTracker>,
mapping_at_each_level: Vec<AliasMapping>,
}
impl TreeNodeVisitor<'_> for TrackerTester {
type Node = LogicalPlan;
fn f_up(&mut self, node: &LogicalPlan) -> DfResult<TreeNodeRecursion> {
if let Some(alias_tracker) = &mut self.alias_tracker {
alias_tracker.update_alias(node);
self.mapping_at_each_level.push(
self.alias_tracker
.as_ref()
.map(|a| a.mapping.clone())
.unwrap_or_default()
.clone(),
);
} else if let LogicalPlan::TableScan(table_scan) = node {
self.alias_tracker = AliasTracker::new(table_scan);
self.mapping_at_each_level.push(
self.alias_tracker
.as_ref()
.map(|a| a.mapping.clone())
.unwrap_or_default()
.clone(),
);
}
Ok(TreeNodeRecursion::Continue)
}
}
#[test]
fn proj_alias_tracker() {
// use logging for better debugging
init_default_ut_logging();
let test_table = TestTable::table_with_name(0, "numbers".to_string());
let table_source = Arc::new(DefaultTableSource::new(Arc::new(
DfTableProviderAdapter::new(test_table),
)));
let plan = LogicalPlanBuilder::scan_with_filters("t", table_source, None, vec![])
.unwrap()
.project(vec![
col("number"),
col("pk3").alias("pk1"),
col("pk2").alias("pk3"),
])
.unwrap()
.project(vec![
col("number"),
col("pk1").alias("pk2"),
col("pk3").alias("pk1"),
])
.unwrap()
.build()
.unwrap();
let mut tracker_tester = TrackerTester {
alias_tracker: None,
mapping_at_each_level: Vec::new(),
};
plan.visit(&mut tracker_tester).unwrap();
assert_eq!(
tracker_tester.mapping_at_each_level,
vec![
HashMap::from([
("number".to_string(), HashSet::from(["number".into()])),
("pk1".to_string(), HashSet::from(["pk1".into()])),
("pk2".to_string(), HashSet::from(["pk2".into()])),
("pk3".to_string(), HashSet::from(["pk3".into()])),
("ts".to_string(), HashSet::from(["ts".into()]))
]),
HashMap::from([
("number".to_string(), HashSet::from(["t.number".into()])),
("pk1".to_string(), HashSet::from([])),
("pk2".to_string(), HashSet::from(["pk3".into()])),
("pk3".to_string(), HashSet::from(["pk1".into()])),
("ts".to_string(), HashSet::from([]))
]),
HashMap::from([
("number".to_string(), HashSet::from(["t.number".into()])),
("pk1".to_string(), HashSet::from([])),
("pk2".to_string(), HashSet::from(["pk1".into()])),
("pk3".to_string(), HashSet::from(["pk2".into()])),
("ts".to_string(), HashSet::from([]))
])
]
);
}
#[test]
fn proj_multi_alias_tracker() {
// use logging for better debugging
init_default_ut_logging();
let test_table = TestTable::table_with_name(0, "numbers".to_string());
let table_source = Arc::new(DefaultTableSource::new(Arc::new(
DfTableProviderAdapter::new(test_table),
)));
let plan = LogicalPlanBuilder::scan_with_filters("t", table_source, None, vec![])
.unwrap()
.project(vec![
col("number"),
col("pk3").alias("pk1"),
col("pk3").alias("pk2"),
])
.unwrap()
.project(vec![
col("number"),
col("pk2").alias("pk4"),
col("pk1").alias("pk5"),
])
.unwrap()
.build()
.unwrap();
let mut tracker_tester = TrackerTester {
alias_tracker: None,
mapping_at_each_level: Vec::new(),
};
plan.visit(&mut tracker_tester).unwrap();
assert_eq!(
tracker_tester.mapping_at_each_level,
vec![
HashMap::from([
("number".to_string(), HashSet::from(["number".into()])),
("pk1".to_string(), HashSet::from(["pk1".into()])),
("pk2".to_string(), HashSet::from(["pk2".into()])),
("pk3".to_string(), HashSet::from(["pk3".into()])),
("ts".to_string(), HashSet::from(["ts".into()]))
]),
HashMap::from([
("number".to_string(), HashSet::from(["t.number".into()])),
("pk1".to_string(), HashSet::from([])),
("pk2".to_string(), HashSet::from([])),
(
"pk3".to_string(),
HashSet::from(["pk1".into(), "pk2".into()])
),
("ts".to_string(), HashSet::from([]))
]),
HashMap::from([
("number".to_string(), HashSet::from(["t.number".into()])),
("pk1".to_string(), HashSet::from([])),
("pk2".to_string(), HashSet::from([])),
(
"pk3".to_string(),
HashSet::from(["pk4".into(), "pk5".into()])
),
("ts".to_string(), HashSet::from([]))
])
]
);
}
}


@@ -27,6 +27,7 @@ use promql::extension_plan::{
EmptyMetric, InstantManipulate, RangeManipulate, SeriesDivide, SeriesNormalize,
};
use crate::dist_plan::analyzer::AliasMapping;
use crate::dist_plan::merge_sort::{merge_sort_transformer, MergeSortLogicalPlan};
use crate::dist_plan::MergeScanLogicalPlan;
@@ -139,9 +140,7 @@ pub fn step_aggr_to_upper_aggr(
new_projection_exprs.push(aliased_output_aggr_expr);
}
let upper_aggr_plan = LogicalPlan::Aggregate(new_aggr);
debug!("Before recompute schema: {upper_aggr_plan:?}");
let upper_aggr_plan = upper_aggr_plan.recompute_schema()?;
debug!("After recompute schema: {upper_aggr_plan:?}");
// create a projection on top of the new aggregate plan
let new_projection =
Projection::try_new(new_projection_exprs, Arc::new(upper_aggr_plan.clone()))?;
@@ -222,7 +221,7 @@ pub enum Commutativity {
pub struct Categorizer {}
impl Categorizer {
pub fn check_plan(plan: &LogicalPlan, partition_cols: Option<Vec<String>>) -> Commutativity {
pub fn check_plan(plan: &LogicalPlan, partition_cols: Option<AliasMapping>) -> Commutativity {
let partition_cols = partition_cols.unwrap_or_default();
match plan {
@@ -247,7 +246,6 @@ impl Categorizer {
transformer: Some(Arc::new(|plan: &LogicalPlan| {
debug!("Before Step optimize: {plan}");
let ret = step_aggr_to_upper_aggr(plan);
debug!("After Step Optimize: {ret:?}");
ret.ok().map(|s| TransformerAction {
extra_parent_plans: s.to_vec(),
new_child_plan: None,
@@ -264,7 +262,11 @@ impl Categorizer {
return commutativity;
}
}
Commutativity::Commutative
// if all group-by expressions are partition columns, the aggregation can be pushed down, unless
// another push-down (including `Limit` or `Sort`) is already in progress (which would then prevent the next conditionally commutative node from being pushed down).
// TODO(discord9): This is a temporary solution(that works), a better description of
// commutativity is needed under this situation.
Commutativity::ConditionalCommutative(None)
}
LogicalPlan::Sort(_) => {
if partition_cols.is_empty() {
@@ -322,17 +324,20 @@ impl Categorizer {
pub fn check_extension_plan(
plan: &dyn UserDefinedLogicalNode,
partition_cols: &[String],
partition_cols: &AliasMapping,
) -> Commutativity {
match plan.name() {
name if name == SeriesDivide::name() => {
let series_divide = plan.as_any().downcast_ref::<SeriesDivide>().unwrap();
let tags = series_divide.tags().iter().collect::<HashSet<_>>();
for partition_col in partition_cols {
if !tags.contains(partition_col) {
for all_alias in partition_cols.values() {
let all_alias = all_alias.iter().map(|c| &c.name).collect::<HashSet<_>>();
if tags.intersection(&all_alias).count() == 0 {
return Commutativity::NonCommutative;
}
}
Commutativity::Commutative
}
name if name == SeriesNormalize::name()
@@ -396,7 +401,7 @@ impl Categorizer {
/// Return true if the given expr and partition cols satisfied the rule.
/// In this case the plan can be treated as fully commutative.
fn check_partition(exprs: &[Expr], partition_cols: &[String]) -> bool {
fn check_partition(exprs: &[Expr], partition_cols: &AliasMapping) -> bool {
let mut ref_cols = HashSet::new();
for expr in exprs {
expr.add_column_refs(&mut ref_cols);
@@ -405,8 +410,14 @@ impl Categorizer {
.into_iter()
.map(|c| c.name.clone())
.collect::<HashSet<_>>();
for col in partition_cols {
if !ref_cols.contains(col) {
for all_alias in partition_cols.values() {
let all_alias = all_alias
.iter()
.map(|c| c.name.clone())
.collect::<HashSet<_>>();
// check whether the ref columns intersect with any alias of this partition column;
// if the intersection is empty, not all partition columns show up in `exprs`
if ref_cols.intersection(&all_alias).count() == 0 {
return false;
}
}
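In effect the check now passes as long as each partition column is referenced through at least one of its aliases. A simplified sketch of that intersection rule, with plain strings standing in for the actual `Expr`/`Column` types:

use std::collections::{HashMap, HashSet};

fn check_partition(ref_cols: &HashSet<&str>, partition_cols: &HashMap<&str, HashSet<&str>>) -> bool {
    // every partition column must appear in `ref_cols` under at least one alias
    partition_cols
        .values()
        .all(|aliases| aliases.iter().any(|a| ref_cols.contains(a)))
}

fn main() {
    // the group-by expressions reference `pk1` and `number`
    let group_by: HashSet<&str> = HashSet::from(["pk1", "number"]);
    // partition column `pk3` is only reachable through its aliases `pk1`/`pk2`
    let partition = HashMap::from([("pk3", HashSet::from(["pk1", "pk2"]))]);
    assert!(check_partition(&group_by, &partition));
}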
@@ -424,7 +435,7 @@ pub type StageTransformer = Arc<dyn Fn(&LogicalPlan) -> Option<TransformerAction
pub struct TransformerAction {
/// list of plans that need to be applied to parent plans, in the order of parent to child.
/// i.e. if this returns `[Projection, Aggregate]`, then the parent plan should be transformed to
/// ```
/// ```ignore
/// Original Parent Plan:
/// Projection:
/// Aggregate:
@@ -453,7 +464,7 @@ mod test {
fetch: None,
});
assert!(matches!(
Categorizer::check_plan(&plan, Some(vec![])),
Categorizer::check_plan(&plan, Some(Default::default())),
Commutativity::Commutative
));
}


@@ -16,7 +16,7 @@ use std::any::Any;
use std::sync::{Arc, Mutex};
use std::time::Duration;
use ahash::HashSet;
use ahash::{HashMap, HashSet};
use arrow_schema::{Schema as ArrowSchema, SchemaRef as ArrowSchemaRef, SortOptions};
use async_stream::stream;
use common_catalog::parse_catalog_and_schema_from_db_string;
@@ -88,7 +88,11 @@ impl UserDefinedLogicalNodeCore for MergeScanLogicalPlan {
}
fn fmt_for_explain(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
write!(f, "MergeScan [is_placeholder={}]", self.is_placeholder)
write!(
f,
"MergeScan [is_placeholder={}, remote_input=[\n{}\n]]",
self.is_placeholder, self.input
)
}
fn with_exprs_and_inputs(
@@ -143,7 +147,7 @@ pub struct MergeScanExec {
metric: ExecutionPlanMetricsSet,
properties: PlanProperties,
/// Metrics from sub stages
sub_stage_metrics: Arc<Mutex<Vec<RecordBatchMetrics>>>,
sub_stage_metrics: Arc<Mutex<HashMap<RegionId, RecordBatchMetrics>>>,
query_ctx: QueryContextRef,
target_partition: usize,
partition_cols: Vec<String>,
@@ -155,6 +159,7 @@ impl std::fmt::Debug for MergeScanExec {
.field("table", &self.table)
.field("regions", &self.regions)
.field("schema", &self.schema)
.field("plan", &self.plan)
.finish()
}
}
@@ -317,6 +322,12 @@ impl MergeScanExec {
if let Some(mut first_consume_timer) = first_consume_timer.take() {
first_consume_timer.stop();
}
if let Some(metrics) = stream.metrics() {
let mut sub_stage_metrics = sub_stage_metrics_moved.lock().unwrap();
sub_stage_metrics.insert(region_id, metrics);
}
yield Ok(batch);
// reset poll timer
poll_timer = Instant::now();
@@ -341,7 +352,8 @@ impl MergeScanExec {
metric.record_greptime_exec_cost(value as usize);
// record metrics from sub stages
sub_stage_metrics_moved.lock().unwrap().push(metrics);
let mut sub_stage_metrics = sub_stage_metrics_moved.lock().unwrap();
sub_stage_metrics.insert(region_id, metrics);
}
MERGE_SCAN_POLL_ELAPSED.observe(poll_duration.as_secs_f64());
@@ -409,7 +421,12 @@ impl MergeScanExec {
}
pub fn sub_stage_metrics(&self) -> Vec<RecordBatchMetrics> {
self.sub_stage_metrics.lock().unwrap().clone()
self.sub_stage_metrics
.lock()
.unwrap()
.values()
.cloned()
.collect()
}
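Switching the container from a `Vec` to a map keyed by region id means a region that reports metrics more than once (for example partial metrics mid-stream and final metrics at stream end) is recorded once instead of being appended repeatedly. A tiny sketch of that behavior, with a plain `u64` standing in for `RegionId`:

use std::collections::HashMap;

fn main() {
    let mut sub_stage_metrics: HashMap<u64, &str> = HashMap::new();
    // a later insert for the same region overwrites the earlier entry
    sub_stage_metrics.insert(42, "partial metrics");
    sub_stage_metrics.insert(42, "final metrics");
    assert_eq!(sub_stage_metrics.len(), 1);
    assert_eq!(sub_stage_metrics.get(&42), Some(&"final metrics"));
}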
pub fn partition_count(&self) -> usize {


@@ -27,6 +27,7 @@ use datafusion::datasource::DefaultTableSource;
use datafusion::execution::context::SessionState;
use datafusion::functions_aggregate::average::avg_udaf;
use datafusion::functions_aggregate::count::count_udaf;
use datafusion::functions_aggregate::expr_fn::first_value;
use datafusion::functions_aggregate::grouping::grouping_udaf;
use datafusion::functions_aggregate::min_max::{max_udaf, min_udaf};
use datafusion::functions_aggregate::stddev::stddev_pop_udaf;
@@ -50,7 +51,7 @@ use datatypes::arrow::datatypes::{DataType as ArrowDataType, TimeUnit as ArrowTi
use datatypes::data_type::ConcreteDataType;
use itertools::Itertools;
use promql::extension_plan::{
build_special_time_expr, EmptyMetric, HistogramFold, InstantManipulate, Millisecond,
build_special_time_expr, Absent, EmptyMetric, HistogramFold, InstantManipulate, Millisecond,
RangeManipulate, ScalarCalculate, SeriesDivide, SeriesNormalize, UnionDistinctOn,
};
use promql::functions::{
@@ -86,6 +87,8 @@ use crate::promql::error::{
const SPECIAL_TIME_FUNCTION: &str = "time";
/// `scalar()` function in PromQL.
const SCALAR_FUNCTION: &str = "scalar";
/// `absent()` function in PromQL
const SPECIAL_ABSENT_FUNCTION: &str = "absent";
/// `histogram_quantile` function in PromQL
const SPECIAL_HISTOGRAM_QUANTILE: &str = "histogram_quantile";
/// `vector` function in PromQL
@@ -124,7 +127,10 @@ struct PromPlannerContext {
time_index_column: Option<String>,
field_columns: Vec<String>,
tag_columns: Vec<String>,
/// The matcher for field columns `__field__`.
field_column_matcher: Option<Vec<Matcher>>,
/// The matcher for selectors (normal matchers).
selector_matcher: Vec<Matcher>,
schema_name: Option<String>,
/// The range in millisecond of range selector. None if there is no range selector.
range: Option<Millisecond>,
@@ -148,6 +154,7 @@ impl PromPlannerContext {
self.field_columns = vec![];
self.tag_columns = vec![];
self.field_column_matcher = None;
self.selector_matcher.clear();
self.schema_name = None;
self.range = None;
}
@@ -191,18 +198,38 @@ impl PromPlanner {
planner.prom_expr_to_plan(&stmt.expr, session_state).await
}
#[async_recursion]
pub async fn prom_expr_to_plan(
&mut self,
prom_expr: &PromExpr,
session_state: &SessionState,
) -> Result<LogicalPlan> {
self.prom_expr_to_plan_inner(prom_expr, false, session_state)
.await
}
/**
Converts a PromQL expression to a logical plan.
NOTE:
The `timestamp_fn` indicates whether the PromQL `timestamp()` function is being evaluated in the current context.
If `true`, the planner generates a logical plan that projects the timestamp (time index) column
as the value column for each input row, implementing the PromQL `timestamp()` function semantics.
If `false`, the planner generates the standard logical plan for the given PromQL expression.
*/
#[async_recursion]
async fn prom_expr_to_plan_inner(
&mut self,
prom_expr: &PromExpr,
timestamp_fn: bool,
session_state: &SessionState,
) -> Result<LogicalPlan> {
let res = match prom_expr {
PromExpr::Aggregate(expr) => self.prom_aggr_expr_to_plan(session_state, expr).await?,
PromExpr::Unary(expr) => self.prom_unary_expr_to_plan(session_state, expr).await?,
PromExpr::Binary(expr) => self.prom_binary_expr_to_plan(session_state, expr).await?,
PromExpr::Paren(ParenExpr { expr }) => {
self.prom_expr_to_plan(expr, session_state).await?
self.prom_expr_to_plan_inner(expr, timestamp_fn, session_state)
.await?
}
PromExpr::Subquery(expr) => {
self.prom_subquery_expr_to_plan(session_state, expr).await?
@@ -210,7 +237,8 @@ impl PromPlanner {
PromExpr::NumberLiteral(lit) => self.prom_number_lit_to_plan(lit)?,
PromExpr::StringLiteral(lit) => self.prom_string_lit_to_plan(lit)?,
PromExpr::VectorSelector(selector) => {
self.prom_vector_selector_to_plan(selector).await?
self.prom_vector_selector_to_plan(selector, timestamp_fn)
.await?
}
PromExpr::MatrixSelector(selector) => {
self.prom_matrix_selector_to_plan(selector).await?
@@ -673,6 +701,7 @@ impl PromPlanner {
async fn prom_vector_selector_to_plan(
&mut self,
vector_selector: &VectorSelector,
timestamp_fn: bool,
) -> Result<LogicalPlan> {
let VectorSelector {
name,
@@ -687,6 +716,15 @@ impl PromPlanner {
let normalize = self
.selector_to_series_normalize_plan(offset, matchers, false)
.await?;
let normalize = if timestamp_fn {
// If evaluating the PromQL `timestamp()` function, project the time index column as the value column
// before wrapping with [`InstantManipulate`], so the output matches PromQL's `timestamp()` semantics.
self.create_timestamp_func_plan(normalize)?
} else {
normalize
};
let manipulate = InstantManipulate::new(
self.ctx.start,
self.ctx.end,
@@ -704,6 +742,43 @@ impl PromPlanner {
}))
}
/// Builds a projection plan for the PromQL `timestamp()` function.
/// Projects the time index column as the value column for each row.
///
/// # Arguments
/// * `normalize` - Input [`LogicalPlan`] for the normalized series.
///
/// # Returns
/// Returns a [`Result<LogicalPlan>`] where the resulting logical plan projects the timestamp
/// column as the value column, along with the original tag and time index columns.
///
/// # Timestamp vs. Time Function
///
/// - **Timestamp Function (`timestamp()`)**: In PromQL, the `timestamp()` function returns the
/// timestamp (time index) of each sample as the value column.
///
/// - **Time Function (`time()`)**: The `time()` function returns the evaluation time of the query
/// as a scalar value.
///
/// # Side Effects
/// Updates the planner context's field columns to the timestamp column name.
///
fn create_timestamp_func_plan(&mut self, normalize: LogicalPlan) -> Result<LogicalPlan> {
let time_expr = build_special_time_expr(self.ctx.time_index_column.as_ref().unwrap())
.alias(DEFAULT_FIELD_COLUMN);
self.ctx.field_columns = vec![time_expr.schema_name().to_string()];
let mut project_exprs = Vec::with_capacity(self.ctx.tag_columns.len() + 2);
project_exprs.push(self.create_time_index_column_expr()?);
project_exprs.push(time_expr);
project_exprs.extend(self.create_tag_column_exprs()?);
LogicalPlanBuilder::from(normalize)
.project(project_exprs)
.context(DataFusionPlanningSnafu)?
.build()
.context(DataFusionPlanningSnafu)
}
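For reference, the semantics this projection implements (not the planner code itself, just PromQL's `timestamp()` behavior in miniature): each output sample's value becomes its own timestamp in seconds, while the tags and the time index are left untouched.

fn main() {
    // (time index in milliseconds, original value)
    let samples = vec![(1_000_i64, 0.5_f64), (2_000, 0.7)];
    // `timestamp()` replaces the value with the sample's own timestamp in seconds
    let out: Vec<(i64, f64)> = samples
        .iter()
        .map(|(ts, _)| (*ts, *ts as f64 / 1000.0))
        .collect();
    assert_eq!(out, vec![(1_000, 1.0), (2_000, 2.0)]);
}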
async fn prom_matrix_selector_to_plan(
&mut self,
matrix_selector: &MatrixSelector,
@@ -762,13 +837,15 @@ impl PromPlanner {
}
SPECIAL_VECTOR_FUNCTION => return self.create_vector_plan(args).await,
SCALAR_FUNCTION => return self.create_scalar_plan(args, session_state).await,
SPECIAL_ABSENT_FUNCTION => return self.create_absent_plan(args, session_state).await,
_ => {}
}
// transform function arguments
let args = self.create_function_args(&args.args)?;
let input = if let Some(prom_expr) = &args.input {
self.prom_expr_to_plan(prom_expr, session_state).await?
self.prom_expr_to_plan_inner(prom_expr, func.name == "timestamp", session_state)
.await?
} else {
self.ctx.time_index_column = Some(SPECIAL_TIME_FUNCTION.to_string());
self.ctx.reset_table_name_and_schema();
@@ -932,6 +1009,7 @@ impl PromPlanner {
);
self.ctx.schema_name = Some(matcher.value.clone());
} else if matcher.name != METRIC_NAME {
self.ctx.selector_matcher.push(matcher.clone());
let _ = matchers.insert(matcher.clone());
}
}
@@ -1177,6 +1255,13 @@ impl PromPlanner {
) -> Result<Vec<DfExpr>> {
let mut exprs = Vec::with_capacity(label_matchers.matchers.len());
for matcher in label_matchers.matchers {
if matcher.name == SCHEMA_COLUMN_MATCHER
|| matcher.name == DB_COLUMN_MATCHER
|| matcher.name == FIELD_COLUMN_MATCHER
{
continue;
}
let col = if table_schema
.field_with_unqualified_name(&matcher.name)
.is_err()
@@ -1654,7 +1739,7 @@ impl PromPlanner {
ScalarFunc::GeneratedExpr
}
"sort" | "sort_desc" | "sort_by_label" | "sort_by_label_desc" => {
"sort" | "sort_desc" | "sort_by_label" | "sort_by_label_desc" | "timestamp" => {
// These functions are not expression but a part of plan,
// they are processed by `prom_call_expr_to_plan`.
for value in &self.ctx.field_columns {
@@ -2265,10 +2350,14 @@ impl PromPlanner {
let input_plan = self.prom_expr_to_plan(&input, session_state).await?;
if !self.ctx.has_le_tag() {
return ColumnNotFoundSnafu {
col: LE_COLUMN_NAME.to_string(),
}
.fail();
// Return empty result instead of error when 'le' column is not found
// This handles the case when histogram metrics don't exist
return Ok(LogicalPlan::EmptyRelation(
datafusion::logical_expr::EmptyRelation {
produce_one_row: false,
schema: Arc::new(DFSchema::empty()),
},
));
}
let time_index_column =
self.ctx
@@ -2376,6 +2465,69 @@ impl PromPlanner {
Ok(scalar_plan)
}
/// Create a [SPECIAL_ABSENT_FUNCTION] plan
async fn create_absent_plan(
&mut self,
args: &PromFunctionArgs,
session_state: &SessionState,
) -> Result<LogicalPlan> {
if args.args.len() != 1 {
return FunctionInvalidArgumentSnafu {
fn_name: SPECIAL_ABSENT_FUNCTION.to_string(),
}
.fail();
}
let input = self.prom_expr_to_plan(&args.args[0], session_state).await?;
let time_index_expr = self.create_time_index_column_expr()?;
let first_field_expr =
self.create_field_column_exprs()?
.pop()
.with_context(|| ValueNotFoundSnafu {
table: self.ctx.table_name.clone().unwrap_or_default(),
})?;
let first_value_expr = first_value(first_field_expr, None);
let ordered_aggregated_input = LogicalPlanBuilder::from(input)
.aggregate(
vec![time_index_expr.clone()],
vec![first_value_expr.clone()],
)
.context(DataFusionPlanningSnafu)?
.sort(vec![time_index_expr.sort(true, false)])
.context(DataFusionPlanningSnafu)?
.build()
.context(DataFusionPlanningSnafu)?;
let fake_labels = self
.ctx
.selector_matcher
.iter()
.filter_map(|matcher| match matcher.op {
MatchOp::Equal => Some((matcher.name.clone(), matcher.value.clone())),
_ => None,
})
.collect::<Vec<_>>();
// Create the absent plan
let absent_plan = LogicalPlan::Extension(Extension {
node: Arc::new(
Absent::try_new(
self.ctx.start,
self.ctx.end,
self.ctx.interval,
self.ctx.time_index_column.as_ref().unwrap().clone(),
self.ctx.field_columns[0].clone(),
fake_labels,
ordered_aggregated_input,
)
.context(DataFusionPlanningSnafu)?,
),
});
Ok(absent_plan)
}
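Note how `fake_labels` keeps only the equality matchers from the original selector, which mirrors PromQL: `absent(up{job="a", instance=~".*"})` reports the series as absent with just `job="a"` attached. A hedged, stand-alone sketch of that extraction (hand-rolled `Matcher`/`MatchOp` types, not the promql-parser ones):

#[derive(Clone, Copy)]
enum MatchOp { Equal, Regex }

struct Matcher { name: &'static str, value: &'static str, op: MatchOp }

fn fake_labels(matchers: &[Matcher]) -> Vec<(&'static str, &'static str)> {
    matchers
        .iter()
        .filter_map(|m| match m.op {
            // only `=` matchers carry enough information to become output labels
            MatchOp::Equal => Some((m.name, m.value)),
            _ => None,
        })
        .collect()
}

fn main() {
    let matchers = [
        Matcher { name: "job", value: "a", op: MatchOp::Equal },
        Matcher { name: "instance", value: ".*", op: MatchOp::Regex },
    ];
    assert_eq!(fake_labels(&matchers), vec![("job", "a")]);
}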
/// Try to build a DataFusion Literal Expression from PromQL Expr, return
/// `None` if the input is not a literal expression.
fn try_build_literal_expr(expr: &PromExpr) -> Option<DfExpr> {
@@ -4659,4 +4811,53 @@ Filter: up.field_0 IS NOT NULL [timestamp:Timestamp(Millisecond, None), field_0:
assert_eq!(plan.display_indent_schema().to_string(), expected);
}
#[tokio::test]
async fn test_histogram_quantile_missing_le_column() {
let mut eval_stmt = EvalStmt {
expr: PromExpr::NumberLiteral(NumberLiteral { val: 1.0 }),
start: UNIX_EPOCH,
end: UNIX_EPOCH
.checked_add(Duration::from_secs(100_000))
.unwrap(),
interval: Duration::from_secs(5),
lookback_delta: Duration::from_secs(1),
};
// Test case: histogram_quantile with a table that doesn't have 'le' column
let case = r#"histogram_quantile(0.99, sum by(pod,instance,le) (rate(non_existent_histogram_bucket{instance=~"xxx"}[1m])))"#;
let prom_expr = parser::parse(case).unwrap();
eval_stmt.expr = prom_expr;
// Create a table provider with a table that doesn't have 'le' column
let table_provider = build_test_table_provider_with_fields(
&[(
DEFAULT_SCHEMA_NAME.to_string(),
"non_existent_histogram_bucket".to_string(),
)],
&["pod", "instance"], // Note: no 'le' column
)
.await;
// Should return empty result instead of error
let result =
PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_session_state()).await;
// This should succeed now (returning empty result) instead of failing with "Cannot find column le"
assert!(
result.is_ok(),
"Expected successful plan creation with empty result, but got error: {:?}",
result.err()
);
// Verify that the result is an EmptyRelation
let plan = result.unwrap();
match plan {
LogicalPlan::EmptyRelation(_) => {
// This is what we expect
}
_ => panic!("Expected EmptyRelation, but got: {:?}", plan),
}
}
}


@@ -36,6 +36,7 @@ use common_telemetry::tracing_context::{FutureExt, TracingContext};
use futures::{future, ready, Stream};
use futures_util::{StreamExt, TryStreamExt};
use prost::Message;
use session::context::{QueryContext, QueryContextRef};
use snafu::{ensure, ResultExt};
use table::table_name::TableName;
use tokio::sync::mpsc;
@@ -188,6 +189,7 @@ impl FlightCraft for GreptimeRequestHandler {
let ticket = request.into_inner().ticket;
let request =
GreptimeRequest::decode(ticket.as_ref()).context(error::InvalidFlightTicketSnafu)?;
let query_ctx = QueryContext::arc();
// The Grpc protocol pass query by Flight. It needs to be wrapped under a span, in order to record stream
let span = info_span!(
@@ -202,6 +204,7 @@ impl FlightCraft for GreptimeRequestHandler {
output,
TracingContext::from_current_span(),
flight_compression,
query_ctx,
);
Ok(Response::new(stream))
}
@@ -371,15 +374,25 @@ fn to_flight_data_stream(
output: Output,
tracing_context: TracingContext,
flight_compression: FlightCompression,
query_ctx: QueryContextRef,
) -> TonicStream<FlightData> {
match output.data {
OutputData::Stream(stream) => {
let stream = FlightRecordBatchStream::new(stream, tracing_context, flight_compression);
let stream = FlightRecordBatchStream::new(
stream,
tracing_context,
flight_compression,
query_ctx,
);
Box::pin(stream) as _
}
OutputData::RecordBatches(x) => {
let stream =
FlightRecordBatchStream::new(x.as_stream(), tracing_context, flight_compression);
let stream = FlightRecordBatchStream::new(
x.as_stream(),
tracing_context,
flight_compression,
query_ctx,
);
Box::pin(stream) as _
}
OutputData::AffectedRows(rows) => {


@@ -25,6 +25,7 @@ use futures::channel::mpsc;
use futures::channel::mpsc::Sender;
use futures::{SinkExt, Stream, StreamExt};
use pin_project::{pin_project, pinned_drop};
use session::context::QueryContextRef;
use snafu::ResultExt;
use tokio::task::JoinHandle;
@@ -46,10 +47,12 @@ impl FlightRecordBatchStream {
recordbatches: SendableRecordBatchStream,
tracing_context: TracingContext,
compression: FlightCompression,
query_ctx: QueryContextRef,
) -> Self {
let should_send_partial_metrics = query_ctx.explain_verbose();
let (tx, rx) = mpsc::channel::<TonicResult<FlightMessage>>(1);
let join_handle = common_runtime::spawn_global(async move {
Self::flight_data_stream(recordbatches, tx)
Self::flight_data_stream(recordbatches, tx, should_send_partial_metrics)
.trace(tracing_context.attach(info_span!("flight_data_stream")))
.await
});
@@ -69,6 +72,7 @@ impl FlightRecordBatchStream {
async fn flight_data_stream(
mut recordbatches: SendableRecordBatchStream,
mut tx: Sender<TonicResult<FlightMessage>>,
should_send_partial_metrics: bool,
) {
let schema = recordbatches.schema().arrow_schema().clone();
if let Err(e) = tx.send(Ok(FlightMessage::Schema(schema))).await {
@@ -88,6 +92,17 @@ impl FlightRecordBatchStream {
warn!(e; "stop sending Flight data");
return;
}
if should_send_partial_metrics {
if let Some(metrics) = recordbatches
.metrics()
.and_then(|m| serde_json::to_string(&m).ok())
{
if let Err(e) = tx.send(Ok(FlightMessage::Metrics(metrics))).await {
warn!(e; "stop sending Flight data");
return;
}
}
}
}
Err(e) => {
let e = Err(e).context(error::CollectRecordbatchSnafu);
@@ -154,6 +169,7 @@ mod test {
use datatypes::schema::{ColumnSchema, Schema};
use datatypes::vectors::Int32Vector;
use futures::StreamExt;
use session::context::QueryContext;
use super::*;
@@ -175,6 +191,7 @@ mod test {
recordbatches,
TracingContext::default(),
FlightCompression::default(),
QueryContext::arc(),
);
let mut raw_data = Vec::with_capacity(2);


@@ -42,6 +42,7 @@ use session::hints::READ_PREFERENCE_HINT;
use snafu::{OptionExt, ResultExt};
use table::TableRef;
use tokio::sync::mpsc;
use tokio::sync::mpsc::error::TrySendError;
use crate::error::Error::UnsupportedAuthScheme;
use crate::error::{
@@ -176,8 +177,9 @@ impl GreptimeRequestHandler {
let result = result
.map(|x| DoPutResponse::new(request_id, x))
.map_err(Into::into);
if result_sender.try_send(result).is_err() {
warn!(r#""DoPut" client maybe unreachable, abort handling its message"#);
if let Err(e) = result_sender.try_send(result)
&& let TrySendError::Closed(_) = e {
warn!(r#""DoPut" client with request_id {} maybe unreachable, abort handling its message"#, request_id);
break;
}
}


@@ -121,7 +121,7 @@ impl PrometheusGatewayService {
let result = self.handler.do_query(&query, ctx).await;
let (metric_name, mut result_type) =
match retrieve_metric_name_and_result_type(&query.query) {
Ok((metric_name, result_type)) => (metric_name.unwrap_or_default(), result_type),
Ok((metric_name, result_type)) => (metric_name, result_type),
Err(err) => {
return PrometheusJsonResponse::error(err.status_code(), err.output_msg())
}


@@ -38,7 +38,7 @@ use crate::error::{self, InternalSnafu, PipelineSnafu, Result};
use crate::http::extractor::PipelineInfo;
use crate::http::header::{write_cost_header_map, GREPTIME_DB_HEADER_METRICS};
use crate::http::PromValidationMode;
use crate::prom_store::{snappy_decompress, zstd_decompress};
use crate::prom_store::{extract_schema_from_read_request, snappy_decompress, zstd_decompress};
use crate::proto::{PromSeriesProcessor, PromWriteRequest};
use crate::query_handler::{PipelineHandlerRef, PromStoreProtocolHandlerRef, PromStoreResponse};
@@ -117,6 +117,7 @@ pub async fn remote_write(
let is_zstd = content_encoding.contains(VM_ENCODING);
let mut processor = PromSeriesProcessor::default_processor();
if let Some(pipeline_name) = pipeline_info.pipeline_name {
let pipeline_def = PipelineDefinition::from_name(
&pipeline_name,
@@ -184,13 +185,19 @@ pub async fn remote_read(
) -> Result<PromStoreResponse> {
let db = params.db.clone().unwrap_or_default();
query_ctx.set_channel(Channel::Prometheus);
let request = decode_remote_read_request(body).await?;
// Extract schema from special labels and set it in query context
if let Some(schema) = extract_schema_from_read_request(&request) {
query_ctx.set_current_schema(&schema);
}
let query_ctx = Arc::new(query_ctx);
let _timer = crate::metrics::METRIC_HTTP_PROM_STORE_READ_ELAPSED
.with_label_values(&[db.as_str()])
.start_timer();
let request = decode_remote_read_request(body).await?;
state.prom_store_handler.read(request, query_ctx).await
}


@@ -56,7 +56,7 @@ use crate::error::{
TableNotFoundSnafu, UnexpectedResultSnafu,
};
use crate::http::header::collect_plan_metrics;
use crate::prom_store::{FIELD_NAME_LABEL, METRIC_NAME_LABEL};
use crate::prom_store::{DATABASE_LABEL, FIELD_NAME_LABEL, METRIC_NAME_LABEL, SCHEMA_LABEL};
use crate::prometheus_handler::PrometheusHandlerRef;
/// For [ValueType::Vector] result type
@@ -318,7 +318,7 @@ async fn do_instant_query(
) -> PrometheusJsonResponse {
let result = handler.do_query(prom_query, query_ctx).await;
let (metric_name, result_type) = match retrieve_metric_name_and_result_type(&prom_query.query) {
Ok((metric_name, result_type)) => (metric_name.unwrap_or_default(), result_type),
Ok((metric_name, result_type)) => (metric_name, result_type),
Err(err) => return PrometheusJsonResponse::error(err.status_code(), err.output_msg()),
};
PrometheusJsonResponse::from_query_result(result, metric_name, result_type).await
@@ -428,7 +428,7 @@ async fn do_range_query(
let result = handler.do_query(prom_query, query_ctx).await;
let metric_name = match retrieve_metric_name_and_result_type(&prom_query.query) {
Err(err) => return PrometheusJsonResponse::error(err.status_code(), err.output_msg()),
Ok((metric_name, _)) => metric_name.unwrap_or_default(),
Ok((metric_name, _)) => metric_name,
};
PrometheusJsonResponse::from_query_result(result, metric_name, ValueType::Matrix).await
}
@@ -824,13 +824,52 @@ pub(crate) fn try_update_catalog_schema(ctx: &mut QueryContext, catalog: &str, s
}
fn promql_expr_to_metric_name(expr: &PromqlExpr) -> Option<String> {
find_metric_name_and_matchers(expr, |name, matchers| {
name.clone().or(matchers
.find_matchers(METRIC_NAME)
.into_iter()
.next()
.map(|m| m.value))
})
let mut metric_names = HashSet::new();
collect_metric_names(expr, &mut metric_names);
// Return the metric name only if there's exactly one unique metric name
if metric_names.len() == 1 {
metric_names.into_iter().next()
} else {
None
}
}
/// Recursively collect all metric names from a PromQL expression
fn collect_metric_names(expr: &PromqlExpr, metric_names: &mut HashSet<String>) {
match expr {
PromqlExpr::Aggregate(AggregateExpr { expr, .. }) => {
collect_metric_names(expr, metric_names)
}
PromqlExpr::Unary(UnaryExpr { expr }) => collect_metric_names(expr, metric_names),
PromqlExpr::Binary(BinaryExpr { lhs, rhs, .. }) => {
collect_metric_names(lhs, metric_names);
collect_metric_names(rhs, metric_names);
}
PromqlExpr::Paren(ParenExpr { expr }) => collect_metric_names(expr, metric_names),
PromqlExpr::Subquery(SubqueryExpr { expr, .. }) => collect_metric_names(expr, metric_names),
PromqlExpr::VectorSelector(VectorSelector { name, matchers, .. }) => {
if let Some(name) = name {
metric_names.insert(name.clone());
} else if let Some(matcher) = matchers.find_matchers(METRIC_NAME).into_iter().next() {
metric_names.insert(matcher.value);
}
}
PromqlExpr::MatrixSelector(MatrixSelector { vs, .. }) => {
let VectorSelector { name, matchers, .. } = vs;
if let Some(name) = name {
metric_names.insert(name.clone());
} else if let Some(matcher) = matchers.find_matchers(METRIC_NAME).into_iter().next() {
metric_names.insert(matcher.value);
}
}
PromqlExpr::Call(Call { args, .. }) => {
args.args
.iter()
.for_each(|e| collect_metric_names(e, metric_names));
}
PromqlExpr::NumberLiteral(_) | PromqlExpr::StringLiteral(_) | PromqlExpr::Extension(_) => {}
}
}
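The net effect: a metric name is reported only when every selector in the query resolves to the same metric; repeated references to one metric are fine, mixing different metrics yields `None`. A tiny sketch of that uniqueness rule (plain strings, not the parser types):

use std::collections::HashSet;

fn unique_metric<'a>(names: &[&'a str]) -> Option<&'a str> {
    let set: HashSet<&str> = names.iter().copied().collect();
    // exactly one distinct metric name, otherwise give up
    if set.len() == 1 {
        set.into_iter().next()
    } else {
        None
    }
}

fn main() {
    assert_eq!(unique_metric(&["cpu_usage", "cpu_usage"]), Some("cpu_usage"));
    assert_eq!(unique_metric(&["cpu_usage", "memory_usage"]), None);
}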
fn find_metric_name_and_matchers<E, F>(expr: &PromqlExpr, f: F) -> Option<E>
@@ -995,6 +1034,19 @@ pub async fn label_values_query(
let mut field_columns = field_columns.into_iter().collect::<Vec<_>>();
field_columns.sort_unstable();
return PrometheusJsonResponse::success(PrometheusResponse::LabelValues(field_columns));
} else if label_name == SCHEMA_LABEL || label_name == DATABASE_LABEL {
let catalog_manager = handler.catalog_manager();
match retrieve_schema_names(&query_ctx, catalog_manager, params.matches.0).await {
Ok(schema_names) => {
return PrometheusJsonResponse::success(PrometheusResponse::LabelValues(
schema_names,
));
}
Err(e) => {
return PrometheusJsonResponse::error(e.status_code(), e.output_msg());
}
}
}
let queries = params.matches.0;
@@ -1112,53 +1164,51 @@ async fn retrieve_field_names(
Ok(field_columns)
}
/// Try to parse and extract the name of referenced metric from the promql query.
///
/// Returns the metric name if a single metric is referenced, otherwise None.
fn retrieve_metric_name_from_promql(query: &str) -> Option<String> {
let promql_expr = promql_parser::parser::parse(query).ok()?;
async fn retrieve_schema_names(
query_ctx: &QueryContext,
catalog_manager: CatalogManagerRef,
matches: Vec<String>,
) -> Result<Vec<String>> {
let mut schemas = Vec::new();
let catalog = query_ctx.current_catalog();
struct MetricNameVisitor {
metric_name: Option<String>,
}
let candidate_schemas = catalog_manager
.schema_names(catalog, Some(query_ctx))
.await
.context(CatalogSnafu)?;
impl promql_parser::util::ExprVisitor for MetricNameVisitor {
type Error = ();
fn pre_visit(&mut self, plan: &PromqlExpr) -> std::result::Result<bool, Self::Error> {
let query_metric_name = match plan {
PromqlExpr::VectorSelector(vs) => vs
.matchers
.find_matchers(METRIC_NAME)
.into_iter()
.next()
.map(|m| m.value)
.or_else(|| vs.name.clone()),
PromqlExpr::MatrixSelector(ms) => ms
.vs
.matchers
.find_matchers(METRIC_NAME)
.into_iter()
.next()
.map(|m| m.value)
.or_else(|| ms.vs.name.clone()),
_ => return Ok(true),
};
// set it to empty string if multiple metrics are referenced.
if self.metric_name.is_some() && query_metric_name.is_some() {
self.metric_name = Some(String::new());
} else {
self.metric_name = query_metric_name.or_else(|| self.metric_name.clone());
for schema in candidate_schemas {
let mut found = true;
for match_item in &matches {
if let Some(table_name) = retrieve_metric_name_from_promql(match_item) {
let exists = catalog_manager
.table_exists(catalog, &schema, &table_name, Some(query_ctx))
.await
.context(CatalogSnafu)?;
if !exists {
found = false;
break;
}
}
}
Ok(true)
if found {
schemas.push(schema);
}
}
let mut visitor = MetricNameVisitor { metric_name: None };
promql_parser::util::walk_expr(&mut visitor, &promql_expr).ok()?;
visitor.metric_name
schemas.sort_unstable();
Ok(schemas)
}
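So `label_values` for `__schema__`/`__database__` returns only the schemas in which every `match[]` selector's metric actually exists as a table. A hypothetical, simplified sketch of that filter, where the `table_exists` closure stands in for the async catalog lookup:

fn filter_schemas<F>(schemas: Vec<String>, metrics: &[&str], table_exists: F) -> Vec<String>
where
    F: Fn(&str, &str) -> bool,
{
    schemas
        .into_iter()
        // keep a schema only if every referenced metric has a table in it
        .filter(|schema| metrics.iter().all(|m| table_exists(schema.as_str(), *m)))
        .collect()
}

fn main() {
    let schemas = vec!["public".to_string(), "metrics_a".to_string()];
    // pretend only `metrics_a` contains a table named `up`
    let exists = |schema: &str, table: &str| schema == "metrics_a" && table == "up";
    assert_eq!(filter_schemas(schemas, &["up"], exists), vec!["metrics_a"]);
}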
/// Try to parse and extract the name of referenced metric from the promql query.
///
/// Returns the metric name if exactly one unique metric is referenced, otherwise None.
/// Multiple references to the same metric are allowed.
fn retrieve_metric_name_from_promql(query: &str) -> Option<String> {
let promql_expr = promql_parser::parser::parse(query).ok()?;
promql_expr_to_metric_name(&promql_expr)
}
#[derive(Debug, Default, Serialize, Deserialize)]
@@ -1275,3 +1325,205 @@ pub async fn parse_query(
PrometheusJsonResponse::error(StatusCode::InvalidArguments, "query is required")
}
}
#[cfg(test)]
mod tests {
use promql_parser::parser::value::ValueType;
use super::*;
struct TestCase {
name: &'static str,
promql: &'static str,
expected_metric: Option<&'static str>,
expected_type: ValueType,
should_error: bool,
}
#[test]
fn test_retrieve_metric_name_and_result_type() {
let test_cases = &[
// Single metric cases
TestCase {
name: "simple metric",
promql: "cpu_usage",
expected_metric: Some("cpu_usage"),
expected_type: ValueType::Vector,
should_error: false,
},
TestCase {
name: "metric with selector",
promql: r#"cpu_usage{instance="localhost"}"#,
expected_metric: Some("cpu_usage"),
expected_type: ValueType::Vector,
should_error: false,
},
TestCase {
name: "metric with range selector",
promql: "cpu_usage[5m]",
expected_metric: Some("cpu_usage"),
expected_type: ValueType::Matrix,
should_error: false,
},
TestCase {
name: "metric with __name__ matcher",
promql: r#"{__name__="cpu_usage"}"#,
expected_metric: Some("cpu_usage"),
expected_type: ValueType::Vector,
should_error: false,
},
TestCase {
name: "metric with unary operator",
promql: "-cpu_usage",
expected_metric: Some("cpu_usage"),
expected_type: ValueType::Vector,
should_error: false,
},
// Aggregation and function cases
TestCase {
name: "metric with aggregation",
promql: "sum(cpu_usage)",
expected_metric: Some("cpu_usage"),
expected_type: ValueType::Vector,
should_error: false,
},
TestCase {
name: "complex aggregation",
promql: r#"sum by (instance) (cpu_usage{job="node"})"#,
expected_metric: Some("cpu_usage"),
expected_type: ValueType::Vector,
should_error: false,
},
// Same metric binary operations
TestCase {
name: "same metric addition",
promql: "cpu_usage + cpu_usage",
expected_metric: Some("cpu_usage"),
expected_type: ValueType::Vector,
should_error: false,
},
TestCase {
name: "metric with scalar addition",
promql: r#"sum(rate(cpu_usage{job="node"}[5m])) by (instance) + 100"#,
expected_metric: Some("cpu_usage"),
expected_type: ValueType::Vector,
should_error: false,
},
// Multiple metrics cases
TestCase {
name: "different metrics addition",
promql: "cpu_usage + memory_usage",
expected_metric: None,
expected_type: ValueType::Vector,
should_error: false,
},
TestCase {
name: "different metrics subtraction",
promql: "network_in - network_out",
expected_metric: None,
expected_type: ValueType::Vector,
should_error: false,
},
// Unless operator cases
TestCase {
name: "unless with different metrics",
promql: "cpu_usage unless memory_usage",
expected_metric: None,
expected_type: ValueType::Vector,
should_error: false,
},
TestCase {
name: "unless with same metric",
promql: "cpu_usage unless cpu_usage",
expected_metric: Some("cpu_usage"),
expected_type: ValueType::Vector,
should_error: false,
},
// Subquery cases
TestCase {
name: "basic subquery",
promql: "cpu_usage[5m:1m]",
expected_metric: Some("cpu_usage"),
expected_type: ValueType::Matrix,
should_error: false,
},
TestCase {
name: "subquery with multiple metrics",
promql: "(cpu_usage + memory_usage)[5m:1m]",
expected_metric: None,
expected_type: ValueType::Matrix,
should_error: false,
},
// Literal values
TestCase {
name: "scalar value",
promql: "42",
expected_metric: None,
expected_type: ValueType::Scalar,
should_error: false,
},
TestCase {
name: "string literal",
promql: r#""hello world""#,
expected_metric: None,
expected_type: ValueType::String,
should_error: false,
},
// Error cases
TestCase {
name: "invalid syntax",
promql: "cpu_usage{invalid=",
expected_metric: None,
expected_type: ValueType::Vector,
should_error: true,
},
TestCase {
name: "empty query",
promql: "",
expected_metric: None,
expected_type: ValueType::Vector,
should_error: true,
},
TestCase {
name: "malformed brackets",
promql: "cpu_usage[5m",
expected_metric: None,
expected_type: ValueType::Vector,
should_error: true,
},
];
for test_case in test_cases {
let result = retrieve_metric_name_and_result_type(test_case.promql);
if test_case.should_error {
assert!(
result.is_err(),
"Test '{}' should have failed but succeeded with: {:?}",
test_case.name,
result
);
} else {
let (metric_name, value_type) = result.unwrap_or_else(|e| {
panic!(
"Test '{}' should have succeeded but failed with error: {}",
test_case.name, e
)
});
let expected_metric_name = test_case.expected_metric.map(|s| s.to_string());
assert_eq!(
metric_name, expected_metric_name,
"Test '{}': metric name mismatch. Expected: {:?}, Got: {:?}",
test_case.name, expected_metric_name, metric_name
);
assert_eq!(
value_type, test_case.expected_type,
"Test '{}': value type mismatch. Expected: {:?}, Got: {:?}",
test_case.name, test_case.expected_type, value_type
);
}
}
}
}


@@ -118,7 +118,7 @@ impl PrometheusJsonResponse {
/// Convert from `Result<Output>`
pub async fn from_query_result(
result: Result<Output>,
metric_name: String,
metric_name: Option<String>,
result_type: ValueType,
) -> Self {
let response: Result<Self> = try {
@@ -182,7 +182,7 @@ impl PrometheusJsonResponse {
/// Convert [RecordBatches] to [PromData]
fn record_batches_to_data(
batches: RecordBatches,
metric_name: String,
metric_name: Option<String>,
result_type: ValueType,
) -> Result<PrometheusResponse> {
// infer semantic type of each column from schema.
@@ -230,7 +230,6 @@ impl PrometheusJsonResponse {
reason: "no value column found".to_string(),
})?;
let metric_name = (METRIC_NAME, metric_name.as_str());
// Preserves the order of output tags.
// Tag order matters, e.g., after sort and sort_desc, the output order must be kept.
let mut buffer = IndexMap::<Vec<(&str, &str)>, Vec<(f64, String)>>::new();
@@ -276,9 +275,10 @@ impl PrometheusJsonResponse {
}
// retrieve tags
// TODO(ruihang): push table name `__metric__`
let mut tags = Vec::with_capacity(num_label_columns + 1);
tags.push(metric_name);
if let Some(metric_name) = &metric_name {
tags.push((METRIC_NAME, metric_name.as_str()));
}
for (tag_column, tag_name) in tag_columns.iter().zip(tag_names.iter()) {
// TODO(ruihang): add test for NULL tag
if let Some(tag_value) = tag_column.get_data(row_index) {


@@ -170,7 +170,7 @@ fn select_variable(query: &str, query_context: QueryContextRef) -> Option<Output
// skip the first "select"
for var in vars.iter().skip(1) {
let var = var.trim_matches(|c| c == ' ' || c == ',');
let var = var.trim_matches(|c| c == ' ' || c == ',' || c == ';');
let var_as: Vec<&str> = var
.split(" as ")
.map(|x| {
@@ -185,6 +185,9 @@ fn select_variable(query: &str, query_context: QueryContextRef) -> Option<Output
let value = match var_as[0] {
"session.time_zone" | "time_zone" => query_context.timezone().to_string(),
"system_time_zone" => system_timezone_name(),
"max_execution_time" | "session.max_execution_time" => {
query_context.query_timeout_as_millis().to_string()
}
_ => VAR_VALUES
.get(var_as[0])
.map(|v| v.to_string())
@@ -352,11 +355,11 @@ mod test {
// complex variables
let query = "/* mysql-connector-java-8.0.17 (Revision: 16a712ddb3f826a1933ab42b0039f7fb9eebc6ec) */SELECT @@session.auto_increment_increment AS auto_increment_increment, @@character_set_client AS character_set_client, @@character_set_connection AS character_set_connection, @@character_set_results AS character_set_results, @@character_set_server AS character_set_server, @@collation_server AS collation_server, @@collation_connection AS collation_connection, @@init_connect AS init_connect, @@interactive_timeout AS interactive_timeout, @@license AS license, @@lower_case_table_names AS lower_case_table_names, @@max_allowed_packet AS max_allowed_packet, @@net_write_timeout AS net_write_timeout, @@performance_schema AS performance_schema, @@sql_mode AS sql_mode, @@system_time_zone AS system_time_zone, @@time_zone AS time_zone, @@transaction_isolation AS transaction_isolation, @@wait_timeout AS wait_timeout;";
let expected = "\
+--------------------------+----------------------+--------------------------+-----------------------+----------------------+------------------+----------------------+--------------+---------------------+---------+------------------------+--------------------+-------------------+--------------------+----------+------------------+---------------+-----------------------+---------------+
| auto_increment_increment | character_set_client | character_set_connection | character_set_results | character_set_server | collation_server | collation_connection | init_connect | interactive_timeout | license | lower_case_table_names | max_allowed_packet | net_write_timeout | performance_schema | sql_mode | system_time_zone | time_zone | transaction_isolation | wait_timeout; |
+--------------------------+----------------------+--------------------------+-----------------------+----------------------+------------------+----------------------+--------------+---------------------+---------+------------------------+--------------------+-------------------+--------------------+----------+------------------+---------------+-----------------------+---------------+
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 31536000 | 0 | 0 | 134217728 | 31536000 | 0 | 0 | Asia/Shanghai | Asia/Shanghai | REPEATABLE-READ | 31536000 |
+--------------------------+----------------------+--------------------------+-----------------------+----------------------+------------------+----------------------+--------------+---------------------+---------+------------------------+--------------------+-------------------+--------------------+----------+------------------+---------------+-----------------------+---------------+";
+--------------------------+----------------------+--------------------------+-----------------------+----------------------+------------------+----------------------+--------------+---------------------+---------+------------------------+--------------------+-------------------+--------------------+----------+------------------+---------------+-----------------------+--------------+
| auto_increment_increment | character_set_client | character_set_connection | character_set_results | character_set_server | collation_server | collation_connection | init_connect | interactive_timeout | license | lower_case_table_names | max_allowed_packet | net_write_timeout | performance_schema | sql_mode | system_time_zone | time_zone | transaction_isolation | wait_timeout |
+--------------------------+----------------------+--------------------------+-----------------------+----------------------+------------------+----------------------+--------------+---------------------+---------+------------------------+--------------------+-------------------+--------------------+----------+------------------+---------------+-----------------------+--------------+
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 31536000 | 0 | 0 | 134217728 | 31536000 | 0 | 0 | Asia/Shanghai | Asia/Shanghai | REPEATABLE-READ | 31536000 |
+--------------------------+----------------------+--------------------------+-----------------------+----------------------+------------------+----------------------+--------------+---------------------+---------+------------------------+--------------------+-------------------+--------------------+----------+------------------+---------------+-----------------------+--------------+";
test(query, expected);
let query = "show variables";

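Besides mapping `max_execution_time` to the session's query timeout, the hunk above also trims a trailing ';' from the last selected variable, which previously leaked into the column header as `wait_timeout;`. A rough, illustrative sketch of the trimming and aliasing step; the helper below is made up for illustration and is not the actual `select_variable` code:

// Illustrative only: mirrors the trimming the patched code does (now also
// stripping ';'), plus splitting an optional `... as alias` suffix.
fn parse_var(raw: &str) -> (String, Option<String>) {
    let var = raw.trim_matches(|c| c == ' ' || c == ',' || c == ';');
    let mut parts = var.splitn(2, " as ");
    let name = parts.next().unwrap_or("").trim().to_string();
    let alias = parts.next().map(|a| a.trim().to_string());
    (name, alias)
}

fn main() {
    // The trailing ';' no longer leaks into the last column name.
    assert_eq!(parse_var("wait_timeout;").0, "wait_timeout");

    let (name, alias) = parse_var("session.max_execution_time as max_execution_time");
    // In the patched code this variable now resolves to the session's query
    // timeout in milliseconds instead of falling through to the default lookup.
    assert_eq!(name, "session.max_execution_time");
    assert_eq!(alias.as_deref(), Some("max_execution_time"));
}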
View File

@@ -167,6 +167,9 @@ async fn run_custom_pipeline(
PipelineExecOutput::DispatchedTo(dispatched_to, val) => {
push_to_map!(dispatched, dispatched_to, val, arr_len);
}
PipelineExecOutput::Filtered => {
continue;
}
}
}

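The new `Filtered` arm lets the pipeline drop a row without emitting anything. A toy model of the dispatch loop, using a simplified stand-in enum instead of the real `PipelineExecOutput`:

// Toy stand-in for the pipeline output enum, showing how a `Filtered` row is skipped.
enum ExecOutput {
    Transformed(String),
    DispatchedTo(String, String), // (target table, value)
    Filtered,
}

fn main() {
    let outputs = vec![
        ExecOutput::Transformed("row-1".into()),
        ExecOutput::Filtered, // dropped by the filter processor
        ExecOutput::DispatchedTo("other_table".into(), "row-3".into()),
    ];

    let mut kept = Vec::new();
    let mut dispatched: Vec<(String, String)> = Vec::new();
    for out in outputs {
        match out {
            ExecOutput::Transformed(v) => kept.push(v),
            ExecOutput::DispatchedTo(table, v) => dispatched.push((table, v)),
            // Filtered rows simply continue; nothing is written for them.
            ExecOutput::Filtered => continue,
        }
    }
    assert_eq!(kept, vec!["row-1".to_string()]);
    assert_eq!(dispatched.len(), 1);
}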
View File

@@ -19,7 +19,7 @@ use std::collections::BTreeMap;
use std::hash::{Hash, Hasher};
use api::prom_store::remote::label_matcher::Type as MatcherType;
use api::prom_store::remote::{Label, Query, Sample, TimeSeries, WriteRequest};
use api::prom_store::remote::{Label, Query, ReadRequest, Sample, TimeSeries, WriteRequest};
use api::v1::RowInsertRequests;
use common_grpc::precision::Precision;
use common_query::prelude::{GREPTIME_TIMESTAMP, GREPTIME_VALUE};
@@ -44,6 +44,9 @@ pub const METRIC_NAME_LABEL_BYTES: &[u8] = b"__name__";
pub const DATABASE_LABEL: &str = "__database__";
pub const DATABASE_LABEL_BYTES: &[u8] = b"__database__";
pub const SCHEMA_LABEL: &str = "__schema__";
pub const SCHEMA_LABEL_BYTES: &[u8] = b"__schema__";
pub const PHYSICAL_TABLE_LABEL: &str = "__physical_table__";
pub const PHYSICAL_TABLE_LABEL_BYTES: &[u8] = b"__physical_table__";
@@ -73,6 +76,29 @@ pub fn table_name(q: &Query) -> Result<String> {
})
}
/// Extract schema from remote read request. Returns the first schema found from any query's matchers.
/// Prioritizes __schema__ over __database__ labels.
pub fn extract_schema_from_read_request(request: &ReadRequest) -> Option<String> {
for query in &request.queries {
for matcher in &query.matchers {
if matcher.name == SCHEMA_LABEL && matcher.r#type == MatcherType::Eq as i32 {
return Some(matcher.value.clone());
}
}
}
// If no __schema__ found, look for __database__
for query in &request.queries {
for matcher in &query.matchers {
if matcher.name == DATABASE_LABEL && matcher.r#type == MatcherType::Eq as i32 {
return Some(matcher.value.clone());
}
}
}
None
}
/// Create a DataFrame from a remote Query
#[tracing::instrument(skip_all)]
pub fn query_to_plan(dataframe: DataFrame, q: &Query) -> Result<LogicalPlan> {
@@ -91,7 +117,7 @@ pub fn query_to_plan(dataframe: DataFrame, q: &Query) -> Result<LogicalPlan> {
for m in label_matches {
let name = &m.name;
if name == METRIC_NAME_LABEL {
if name == METRIC_NAME_LABEL || name == SCHEMA_LABEL || name == DATABASE_LABEL {
continue;
}

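The precedence in `extract_schema_from_read_request` is deliberate: an `Eq` matcher on `__schema__` in any query wins before `__database__` is considered at all. A toy reproduction of that ordering with simplified structs (not the prost-generated protobuf types):

// Simplified stand-ins for the protobuf types, to show only the precedence.
struct Matcher { name: &'static str, value: &'static str, is_eq: bool }
struct Query { matchers: Vec<Matcher> }

fn extract_schema(queries: &[Query]) -> Option<String> {
    // First pass: __schema__ wins even if it appears in a later query.
    for q in queries {
        for m in &q.matchers {
            if m.name == "__schema__" && m.is_eq {
                return Some(m.value.to_string());
            }
        }
    }
    // Second pass: fall back to __database__.
    for q in queries {
        for m in &q.matchers {
            if m.name == "__database__" && m.is_eq {
                return Some(m.value.to_string());
            }
        }
    }
    None
}

fn main() {
    let queries = vec![
        Query { matchers: vec![Matcher { name: "__database__", value: "db_a", is_eq: true }] },
        Query { matchers: vec![Matcher { name: "__schema__", value: "schema_b", is_eq: true }] },
    ];
    // __schema__ from the second query takes precedence over __database__ in the first.
    assert_eq!(extract_schema(&queries), Some("schema_b".to_string()));
}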
View File

@@ -34,7 +34,7 @@ use crate::http::PromValidationMode;
use crate::pipeline::run_pipeline;
use crate::prom_row_builder::{PromCtx, TablesBuilder};
use crate::prom_store::{
DATABASE_LABEL_BYTES, METRIC_NAME_LABEL_BYTES, PHYSICAL_TABLE_LABEL_BYTES,
DATABASE_LABEL_BYTES, METRIC_NAME_LABEL_BYTES, PHYSICAL_TABLE_LABEL_BYTES, SCHEMA_LABEL_BYTES,
};
use crate::query_handler::PipelineHandlerRef;
use crate::repeated_field::{Clear, RepeatedField};
@@ -199,10 +199,17 @@ impl PromTimeSeries {
self.table_name = decode_string(&label.value, prom_validation_mode)?;
self.labels.truncate(self.labels.len() - 1); // remove last label
}
DATABASE_LABEL_BYTES => {
SCHEMA_LABEL_BYTES => {
self.schema = Some(decode_string(&label.value, prom_validation_mode)?);
self.labels.truncate(self.labels.len() - 1); // remove last label
}
DATABASE_LABEL_BYTES => {
// Only set schema from __database__ if __schema__ hasn't been set yet
if self.schema.is_none() {
self.schema = Some(decode_string(&label.value, prom_validation_mode)?);
}
self.labels.truncate(self.labels.len() - 1); // remove last label
}
PHYSICAL_TABLE_LABEL_BYTES => {
self.physical_table =
Some(decode_string(&label.value, prom_validation_mode)?);

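The write path applies the same precedence while decoding labels: `__schema__` always sets the target schema, `__database__` only fills it in when nothing has set it yet, and both special labels are stripped from the stored label set. A compact sketch with plain string labels; the helper below is hypothetical, not the real `PromTimeSeries` builder:

// Hypothetical, simplified label pass: picks the schema and strips the special labels.
fn resolve_schema(labels: &[(String, String)]) -> (Option<String>, Vec<(String, String)>) {
    let mut schema: Option<String> = None;
    let mut kept = Vec::new();
    for (name, value) in labels {
        match name.as_str() {
            "__schema__" => schema = Some(value.clone()),
            // __database__ is a fallback: it must not override an explicit __schema__.
            "__database__" => {
                if schema.is_none() {
                    schema = Some(value.clone());
                }
            }
            _ => kept.push((name.clone(), value.clone())),
        }
    }
    (schema, kept)
}

fn main() {
    let labels = vec![
        ("__database__".to_string(), "fallback_db".to_string()),
        ("__schema__".to_string(), "explicit_schema".to_string()),
        ("instance".to_string(), "host1".to_string()),
    ];
    let (schema, kept) = resolve_schema(&labels);
    assert_eq!(schema.as_deref(), Some("explicit_schema"));
    assert_eq!(kept.len(), 1); // only `instance` survives as a regular label
}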
View File

@@ -882,11 +882,14 @@ CREATE TABLE {table_name} (
let region_id = RegionId::new(table_id, *region);
let stream = region_server
.handle_remote_read(RegionQueryRequest {
region_id: region_id.as_u64(),
plan: plan.to_vec(),
..Default::default()
})
.handle_remote_read(
RegionQueryRequest {
region_id: region_id.as_u64(),
plan: plan.to_vec(),
..Default::default()
},
QueryContext::arc(),
)
.await
.unwrap();

View File

@@ -249,11 +249,14 @@ mod tests {
let region_id = RegionId::new(table_id, *region);
let stream = region_server
.handle_remote_read(QueryRequest {
region_id: region_id.as_u64(),
plan: plan.to_vec(),
..Default::default()
})
.handle_remote_read(
QueryRequest {
region_id: region_id.as_u64(),
plan: plan.to_vec(),
..Default::default()
},
QueryContext::arc(),
)
.await
.unwrap();

View File

@@ -16,7 +16,10 @@ use std::collections::BTreeMap;
use std::io::Write;
use std::str::FromStr;
use api::prom_store::remote::WriteRequest;
use api::prom_store::remote::label_matcher::Type as MatcherType;
use api::prom_store::remote::{
Label, LabelMatcher, Query, ReadRequest, ReadResponse, Sample, TimeSeries, WriteRequest,
};
use auth::user_provider_from_option;
use axum::http::{HeaderName, HeaderValue, StatusCode};
use chrono::Utc;
@@ -94,6 +97,7 @@ macro_rules! http_tests {
test_dashboard_path,
test_prometheus_remote_write,
test_prometheus_remote_special_labels,
test_prometheus_remote_schema_labels,
test_prometheus_remote_write_with_pipeline,
test_vm_proto_remote_write,
@@ -112,6 +116,7 @@ macro_rules! http_tests {
test_pipeline_with_hint_vrl,
test_pipeline_2,
test_pipeline_skip_error,
test_pipeline_filter,
test_otlp_metrics,
test_otlp_traces_v0,
@@ -780,6 +785,89 @@ pub async fn test_prom_http_api(store_type: StorageType) {
serde_json::from_value::<PrometheusResponse>(json!(["host1", "host2"])).unwrap()
);
// special labels
let res = client
.get("/v1/prometheus/api/v1/label/__schema__/values?start=0&end=600")
.send()
.await;
assert_eq!(res.status(), StatusCode::OK);
let body = serde_json::from_str::<PrometheusJsonResponse>(&res.text().await).unwrap();
assert_eq!(body.status, "success");
assert_eq!(
body.data,
serde_json::from_value::<PrometheusResponse>(json!([
"greptime_private",
"information_schema",
"public"
]))
.unwrap()
);
// special labels
let res = client
.get("/v1/prometheus/api/v1/label/__schema__/values?match[]=demo&start=0&end=600")
.send()
.await;
assert_eq!(res.status(), StatusCode::OK);
let body = serde_json::from_str::<PrometheusJsonResponse>(&res.text().await).unwrap();
assert_eq!(body.status, "success");
assert_eq!(
body.data,
serde_json::from_value::<PrometheusResponse>(json!(["public"])).unwrap()
);
// special labels
let res = client
.get("/v1/prometheus/api/v1/label/__database__/values?match[]=demo&start=0&end=600")
.send()
.await;
assert_eq!(res.status(), StatusCode::OK);
let body = serde_json::from_str::<PrometheusJsonResponse>(&res.text().await).unwrap();
assert_eq!(body.status, "success");
assert_eq!(
body.data,
serde_json::from_value::<PrometheusResponse>(json!(["public"])).unwrap()
);
// special labels
let res = client
.get("/v1/prometheus/api/v1/label/__database__/values?match[]=multi_labels{idc=\"idc1\", env=\"dev\"}&start=0&end=600")
.send()
.await;
assert_eq!(res.status(), StatusCode::OK);
let body = serde_json::from_str::<PrometheusJsonResponse>(&res.text().await).unwrap();
assert_eq!(body.status, "success");
assert_eq!(
body.data,
serde_json::from_value::<PrometheusResponse>(json!(["public"])).unwrap()
);
// match special labels.
let res = client
.get("/v1/prometheus/api/v1/label/host/values?match[]=multi_labels{__schema__=\"public\", idc=\"idc1\", env=\"dev\"}&start=0&end=600")
.send()
.await;
assert_eq!(res.status(), StatusCode::OK);
let body = serde_json::from_str::<PrometheusJsonResponse>(&res.text().await).unwrap();
assert_eq!(body.status, "success");
assert_eq!(
body.data,
serde_json::from_value::<PrometheusResponse>(json!(["host1", "host2"])).unwrap()
);
// match special labels.
let res = client
.get("/v1/prometheus/api/v1/label/host/values?match[]=multi_labels{__schema__=\"information_schema\", idc=\"idc1\", env=\"dev\"}&start=0&end=600")
.send()
.await;
assert_eq!(res.status(), StatusCode::OK);
let body = serde_json::from_str::<PrometheusJsonResponse>(&res.text().await).unwrap();
assert_eq!(body.status, "success");
assert_eq!(
body.data,
serde_json::from_value::<PrometheusResponse>(json!([])).unwrap()
);
// search field name
let res = client
.get("/v1/prometheus/api/v1/label/__field__/values?match[]=demo")
@@ -1137,6 +1225,7 @@ write_cache_path = ""
write_cache_size = "5GiB"
sst_write_buffer_size = "8MiB"
parallel_scan_channel_size = 32
max_concurrent_scan_files = 128
allow_stale_entries = false
min_compaction_interval = "0s"
@@ -1464,6 +1553,188 @@ pub async fn test_prometheus_remote_write_with_pipeline(store_type: StorageType)
guard.remove_all().await;
}
pub async fn test_prometheus_remote_schema_labels(store_type: StorageType) {
common_telemetry::init_default_ut_logging();
let (app, mut guard) =
setup_test_prom_app_with_frontend(store_type, "test_prometheus_remote_schema_labels").await;
let client = TestClient::new(app).await;
// Create test schemas
let res = client
.post("/v1/sql?sql=create database test_schema_1")
.header("Content-Type", "application/x-www-form-urlencoded")
.send()
.await;
assert_eq!(res.status(), StatusCode::OK);
let res = client
.post("/v1/sql?sql=create database test_schema_2")
.header("Content-Type", "application/x-www-form-urlencoded")
.send()
.await;
assert_eq!(res.status(), StatusCode::OK);
// Write data with __schema__ label
let schema_series = TimeSeries {
labels: vec![
Label {
name: "__name__".to_string(),
value: "metric_with_schema".to_string(),
},
Label {
name: "__schema__".to_string(),
value: "test_schema_1".to_string(),
},
Label {
name: "instance".to_string(),
value: "host1".to_string(),
},
],
samples: vec![Sample {
value: 100.0,
timestamp: 1000,
}],
..Default::default()
};
let write_request = WriteRequest {
timeseries: vec![schema_series],
..Default::default()
};
let serialized_request = write_request.encode_to_vec();
let compressed_request =
prom_store::snappy_compress(&serialized_request).expect("failed to encode snappy");
let res = client
.post("/v1/prometheus/write")
.header("Content-Encoding", "snappy")
.body(compressed_request)
.send()
.await;
assert_eq!(res.status(), StatusCode::NO_CONTENT);
// Read data from test_schema_1 using __schema__ matcher
let read_request = ReadRequest {
queries: vec![Query {
start_timestamp_ms: 500,
end_timestamp_ms: 1500,
matchers: vec![
LabelMatcher {
name: "__name__".to_string(),
value: "metric_with_schema".to_string(),
r#type: MatcherType::Eq as i32,
},
LabelMatcher {
name: "__schema__".to_string(),
value: "test_schema_1".to_string(),
r#type: MatcherType::Eq as i32,
},
],
..Default::default()
}],
..Default::default()
};
let serialized_read_request = read_request.encode_to_vec();
let compressed_read_request =
prom_store::snappy_compress(&serialized_read_request).expect("failed to encode snappy");
let mut result = client
.post("/v1/prometheus/read")
.body(compressed_read_request)
.send()
.await;
assert_eq!(result.status(), StatusCode::OK);
let response_body = result.chunk().await.unwrap();
let decompressed_response = prom_store::snappy_decompress(&response_body).unwrap();
let read_response = ReadResponse::decode(&decompressed_response[..]).unwrap();
assert_eq!(read_response.results.len(), 1);
assert_eq!(read_response.results[0].timeseries.len(), 1);
let timeseries = &read_response.results[0].timeseries[0];
assert_eq!(timeseries.samples.len(), 1);
assert_eq!(timeseries.samples[0].value, 100.0);
assert_eq!(timeseries.samples[0].timestamp, 1000);
// write data to unknown schema
let unknown_schema_series = TimeSeries {
labels: vec![
Label {
name: "__name__".to_string(),
value: "metric_unknown_schema".to_string(),
},
Label {
name: "__schema__".to_string(),
value: "unknown_schema".to_string(),
},
Label {
name: "instance".to_string(),
value: "host2".to_string(),
},
],
samples: vec![Sample {
value: 200.0,
timestamp: 2000,
}],
..Default::default()
};
let unknown_write_request = WriteRequest {
timeseries: vec![unknown_schema_series],
..Default::default()
};
let serialized_unknown_request = unknown_write_request.encode_to_vec();
let compressed_unknown_request =
prom_store::snappy_compress(&serialized_unknown_request).expect("failed to encode snappy");
// Write data to unknown schema
let res = client
.post("/v1/prometheus/write")
.header("Content-Encoding", "snappy")
.body(compressed_unknown_request)
.send()
.await;
assert_eq!(res.status(), StatusCode::BAD_REQUEST);
// Read data from unknown schema
let unknown_read_request = ReadRequest {
queries: vec![Query {
start_timestamp_ms: 1500,
end_timestamp_ms: 2500,
matchers: vec![
LabelMatcher {
name: "__name__".to_string(),
value: "metric_unknown_schema".to_string(),
r#type: MatcherType::Eq as i32,
},
LabelMatcher {
name: "__schema__".to_string(),
value: "unknown_schema".to_string(),
r#type: MatcherType::Eq as i32,
},
],
..Default::default()
}],
..Default::default()
};
let serialized_unknown_read_request = unknown_read_request.encode_to_vec();
let compressed_unknown_read_request =
prom_store::snappy_compress(&serialized_unknown_read_request)
.expect("failed to encode snappy");
let unknown_result = client
.post("/v1/prometheus/read")
.body(compressed_unknown_read_request)
.send()
.await;
assert_eq!(unknown_result.status(), StatusCode::BAD_REQUEST);
guard.remove_all().await;
}
pub async fn test_vm_proto_remote_write(store_type: StorageType) {
common_telemetry::init_default_ut_logging();
let (app, mut guard) =
@@ -1945,6 +2216,78 @@ transform:
guard.remove_all().await;
}
pub async fn test_pipeline_filter(store_type: StorageType) {
common_telemetry::init_default_ut_logging();
let (app, mut guard) =
setup_test_http_app_with_frontend(store_type, "test_pipeline_filter").await;
// handshake
let client = TestClient::new(app).await;
let pipeline_body = r#"
processors:
- date:
field: time
formats:
- "%Y-%m-%d %H:%M:%S%.3f"
- filter:
field: name
targets:
- John
transform:
- field: name
type: string
- field: time
type: time
index: timestamp
"#;
// 1. create pipeline
let res = client
.post("/v1/events/pipelines/test")
.header("Content-Type", "application/x-yaml")
.body(pipeline_body)
.send()
.await;
assert_eq!(res.status(), StatusCode::OK);
// 2. write data
let data_body = r#"
[
{
"time": "2024-05-25 20:16:37.217",
"name": "John"
},
{
"time": "2024-05-25 20:16:37.218",
"name": "JoHN"
},
{
"time": "2024-05-25 20:16:37.328",
"name": "Jane"
}
]
"#;
let res = client
.post("/v1/events/logs?db=public&table=logs1&pipeline_name=test")
.header("Content-Type", "application/json")
.body(data_body)
.send()
.await;
assert_eq!(res.status(), StatusCode::OK);
validate_data(
"pipeline_filter",
&client,
"select * from logs1",
"[[\"Jane\",1716668197328000000]]",
)
.await;
guard.remove_all().await;
}
pub async fn test_pipeline_dispatcher(storage_type: StorageType) {
common_telemetry::init_default_ut_logging();
let (app, mut guard) =

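Judging by the expected output of `test_pipeline_filter` above (both "John" and "JoHN" are dropped, only "Jane" is stored), the `filter` processor appears to match its targets case-insensitively. A rough model of that observed behavior, not the pipeline's actual implementation:

// Rough model of the filter behavior exercised by `test_pipeline_filter`:
// rows whose `name` matches any target (case-insensitively, by assumption)
// are removed before the transform stage.
fn keep_row(name: &str, targets: &[&str]) -> bool {
    !targets.iter().any(|t| t.eq_ignore_ascii_case(name))
}

fn main() {
    let targets = ["John"];
    let rows = ["John", "JoHN", "Jane"];
    let kept: Vec<&str> = rows
        .iter()
        .copied()
        .filter(|name| keep_row(name, &targets))
        .collect();
    // Matches the test expectation: only "Jane" reaches the logs1 table.
    assert_eq!(kept, vec!["Jane"]);
}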
View File

@@ -84,17 +84,37 @@ limit 1;
|_|_Inner Join: t_2.ts = t_3.ts, t_2.vin = t_3.vin_|
|_|_Inner Join: t_1.ts = t_2.ts, t_1.vin = t_2.vin_|
|_|_Filter: t_1.vin IS NOT NULL_|
|_|_MergeScan [is_placeholder=false]_|
|_|_MergeScan [is_placeholder=false, remote_input=[_|
|_| TableScan: t_1_|
|_| ]]_|
|_|_Filter: t_2.vin IS NOT NULL_|
|_|_MergeScan [is_placeholder=false]_|
|_|_MergeScan [is_placeholder=false]_|
|_|_MergeScan [is_placeholder=false]_|
|_|_MergeScan [is_placeholder=false]_|
|_|_MergeScan [is_placeholder=false]_|
|_|_MergeScan [is_placeholder=false]_|
|_|_MergeScan [is_placeholder=false]_|
|_|_MergeScan [is_placeholder=false]_|
|_|_MergeScan [is_placeholder=false]_|
|_|_MergeScan [is_placeholder=false, remote_input=[_|
|_| TableScan: t_2_|
|_| ]]_|
|_|_MergeScan [is_placeholder=false, remote_input=[_|
|_| TableScan: t_3_|
|_| ]]_|
|_|_MergeScan [is_placeholder=false, remote_input=[_|
|_| TableScan: t_4_|
|_| ]]_|
|_|_MergeScan [is_placeholder=false, remote_input=[_|
|_| TableScan: t_5_|
|_| ]]_|
|_|_MergeScan [is_placeholder=false, remote_input=[_|
|_| TableScan: t_6_|
|_| ]]_|
|_|_MergeScan [is_placeholder=false, remote_input=[_|
|_| TableScan: t_7_|
|_| ]]_|
|_|_MergeScan [is_placeholder=false, remote_input=[_|
|_| TableScan: t_8_|
|_| ]]_|
|_|_MergeScan [is_placeholder=false, remote_input=[_|
|_| TableScan: t_9_|
|_| ]]_|
|_|_MergeScan [is_placeholder=false, remote_input=[_|
|_| TableScan: t_10_|
|_| ]]_|
| physical_plan | SortPreservingMergeExec: [ts@0 DESC], fetch=1_|
|_|_SortExec: TopK(fetch=1), expr=[ts@0 DESC], preserve_partitioning=[true]_|
|_|_CoalesceBatchesExec: target_batch_size=8192_|

View File

@@ -26,7 +26,12 @@ explain SELECT * FROM demo WHERE ts > cast(1000000000 as timestamp) ORDER BY hos
| plan_type_| plan_|
+-+-+
| logical_plan_| MergeSort: demo.host ASC NULLS LAST_|
|_|_MergeScan [is_placeholder=false]_|
|_|_MergeScan [is_placeholder=false, remote_input=[_|
|_| Sort: demo.host ASC NULLS LAST_|
|_|_Projection: demo.host, demo.ts, demo.cpu, demo.memory, demo.disk_util_|
|_|_Filter: demo.ts > arrow_cast(Int64(1000000000), Utf8("Timestamp(Millisecond, None)"))_|
|_|_TableScan: demo_|
|_| ]]_|
| physical_plan | SortPreservingMergeExec: [host@0 ASC NULLS LAST]_|
|_|_MergeScanExec: REDACTED
|_|_|

View File

@@ -12,7 +12,12 @@ EXPLAIN SELECT DISTINCT i%2 FROM integers ORDER BY 1;
+-+-+
| plan_type_| plan_|
+-+-+
| logical_plan_| MergeScan [is_placeholder=false]_|
| logical_plan_| MergeScan [is_placeholder=false, remote_input=[ |
|_| Sort: integers.i % Int64(2) ASC NULLS LAST_|
|_|_Distinct:_|
|_|_Projection: integers.i % Int64(2)_|
|_|_TableScan: integers_|
|_| ]]_|
| physical_plan | MergeScanExec: REDACTED
|_|_|
+-+-+
@@ -35,7 +40,11 @@ EXPLAIN SELECT a, b FROM test ORDER BY a, b;
+-+-+
| plan_type_| plan_|
+-+-+
| logical_plan_| MergeScan [is_placeholder=false]_|
| logical_plan_| MergeScan [is_placeholder=false, remote_input=[_|
|_| Sort: test.a ASC NULLS LAST, test.b ASC NULLS LAST |
|_|_Projection: test.a, test.b_|
|_|_TableScan: test_|
|_| ]]_|
| physical_plan | MergeScanExec: REDACTED
|_|_|
+-+-+
@@ -50,7 +59,12 @@ EXPLAIN SELECT DISTINCT a, b FROM test ORDER BY a, b;
+-+-+
| plan_type_| plan_|
+-+-+
| logical_plan_| MergeScan [is_placeholder=false]_|
| logical_plan_| MergeScan [is_placeholder=false, remote_input=[_|
|_| Sort: test.a ASC NULLS LAST, test.b ASC NULLS LAST |
|_|_Distinct:_|
|_|_Projection: test.a, test.b_|
|_|_TableScan: test_|
|_| ]]_|
| physical_plan | MergeScanExec: REDACTED
|_|_|
+-+-+

View File

@@ -12,7 +12,11 @@ EXPLAIN SELECT COUNT(*) FROM single_partition;
+-+-+
| plan_type_| plan_|
+-+-+
| logical_plan_| MergeScan [is_placeholder=false]_|
| logical_plan_| MergeScan [is_placeholder=false, remote_input=[_|
|_| Projection: count(*)_|
|_|_Aggregate: groupBy=[[]], aggr=[[count(single_partition.j) AS count(*)]] |
|_|_TableScan: single_partition_|
|_| ]]_|
| physical_plan | MergeScanExec: REDACTED
|_|_|
+-+-+
@@ -27,7 +31,11 @@ EXPLAIN SELECT SUM(i) FROM single_partition;
+-+-+
| plan_type_| plan_|
+-+-+
| logical_plan_| MergeScan [is_placeholder=false]_|
| logical_plan_| MergeScan [is_placeholder=false, remote_input=[_|
|_| Projection: sum(single_partition.i)_|
|_|_Aggregate: groupBy=[[]], aggr=[[sum(single_partition.i)]] |
|_|_TableScan: single_partition_|
|_| ]]_|
| physical_plan | MergeScanExec: REDACTED
|_|_|
+-+-+
@@ -42,7 +50,11 @@ EXPLAIN SELECT * FROM single_partition ORDER BY i DESC;
+-+-+
| plan_type_| plan_|
+-+-+
| logical_plan_| MergeScan [is_placeholder=false]_|
| logical_plan_| MergeScan [is_placeholder=false, remote_input=[_|
|_| Sort: single_partition.i DESC NULLS FIRST_|
|_|_Projection: single_partition.i, single_partition.j, single_partition.k |
|_|_TableScan: single_partition_|
|_| ]]_|
| physical_plan | MergeScanExec: REDACTED
|_|_|
+-+-+

View File

@@ -55,7 +55,10 @@ FROM
+-+-+
| logical_plan_| Projection: sum(count(integers.i)) AS count(integers.i), sum(sum(integers.i)) AS sum(integers.i), uddsketch_calc(Float64(0.5), uddsketch_merge(Int64(128),Float64(0.01),uddsketch_state(Int64(128),Float64(0.01),integers.i))) AS uddsketch_calc(Float64(0.5),uddsketch_state(Int64(128),Float64(0.01),integers.i)), hll_count(hll_merge(hll(integers.i))) AS hll_count(hll(integers.i))_|
|_|_Aggregate: groupBy=[[]], aggr=[[sum(count(integers.i)), sum(sum(integers.i)), uddsketch_merge(Int64(128), Float64(0.01), uddsketch_state(Int64(128),Float64(0.01),integers.i)), hll_merge(hll(integers.i))]]_|
|_|_MergeScan [is_placeholder=false]_|
|_|_MergeScan [is_placeholder=false, remote_input=[_|
|_| Aggregate: groupBy=[[]], aggr=[[count(integers.i), sum(integers.i), uddsketch_state(Int64(128), Float64(0.01), CAST(integers.i AS Float64)), hll(CAST(integers.i AS Utf8))]]_|
|_|_TableScan: integers_|
|_| ]]_|
| physical_plan | ProjectionExec: expr=[sum(count(integers.i))@0 as count(integers.i), sum(sum(integers.i))@1 as sum(integers.i), uddsketch_calc(0.5, uddsketch_merge(Int64(128),Float64(0.01),uddsketch_state(Int64(128),Float64(0.01),integers.i))@2) as uddsketch_calc(Float64(0.5),uddsketch_state(Int64(128),Float64(0.01),integers.i)), hll_count(hll_merge(hll(integers.i))@3) as hll_count(hll(integers.i))] |
|_|_AggregateExec: mode=Final, gby=[], aggr=[sum(count(integers.i)), sum(sum(integers.i)), uddsketch_merge(Int64(128),Float64(0.01),uddsketch_state(Int64(128),Float64(0.01),integers.i)), hll_merge(hll(integers.i))]_|
|_|_CoalescePartitionsExec_|
@@ -156,7 +159,10 @@ ORDER BY
| logical_plan_| Sort: integers.ts ASC NULLS LAST_|
|_|_Projection: integers.ts, sum(count(integers.i)) AS count(integers.i), sum(sum(integers.i)) AS sum(integers.i), uddsketch_calc(Float64(0.5), uddsketch_merge(Int64(128),Float64(0.01),uddsketch_state(Int64(128),Float64(0.01),integers.i))) AS uddsketch_calc(Float64(0.5),uddsketch_state(Int64(128),Float64(0.01),integers.i)), hll_count(hll_merge(hll(integers.i))) AS hll_count(hll(integers.i))_|
|_|_Aggregate: groupBy=[[integers.ts]], aggr=[[sum(count(integers.i)), sum(sum(integers.i)), uddsketch_merge(Int64(128), Float64(0.01), uddsketch_state(Int64(128),Float64(0.01),integers.i)), hll_merge(hll(integers.i))]]_|
|_|_MergeScan [is_placeholder=false]_|
|_|_MergeScan [is_placeholder=false, remote_input=[_|
|_| Aggregate: groupBy=[[integers.ts]], aggr=[[count(integers.i), sum(integers.i), uddsketch_state(Int64(128), Float64(0.01), CAST(integers.i AS Float64)), hll(CAST(integers.i AS Utf8))]]_|
|_|_TableScan: integers_|
|_| ]]_|
| physical_plan | SortPreservingMergeExec: [ts@0 ASC NULLS LAST]_|
|_|_SortExec: expr=[ts@0 ASC NULLS LAST], preserve_partitioning=[true]_|
|_|_ProjectionExec: expr=[ts@0 as ts, sum(count(integers.i))@1 as count(integers.i), sum(sum(integers.i))@2 as sum(integers.i), uddsketch_calc(0.5, uddsketch_merge(Int64(128),Float64(0.01),uddsketch_state(Int64(128),Float64(0.01),integers.i))@3) as uddsketch_calc(Float64(0.5),uddsketch_state(Int64(128),Float64(0.01),integers.i)), hll_count(hll_merge(hll(integers.i))@4) as hll_count(hll(integers.i))] |

View File

@@ -0,0 +1,974 @@
CREATE TABLE IF NOT EXISTS aggr_optimize_not (
a STRING NULL,
b STRING NULL,
c STRING NULL,
d STRING NULL,
greptime_timestamp TIMESTAMP(3) NOT NULL,
greptime_value DOUBLE NULL,
TIME INDEX (greptime_timestamp),
PRIMARY KEY (a, b, c, d)
) PARTITION ON COLUMNS (a, b, c) (a < 'b', a >= 'b',);
Affected Rows: 0
-- Case 0: group by columns are the same as partition columns.
-- This query shouldn't push down the aggregation even though the group by columns are the partition columns,
-- because the sort is already pushed down.
-- If it did, it would produce a wrong result.
-- explain at 0s, 5s and 10s. No point at 0s.
-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
-- SQLNESS REPLACE (peers.*) REDACTED
-- SQLNESS REPLACE (Hash.*) REDACTED
tql explain (1752591864, 1752592164, '30s') max by (a, b, c) (max_over_time(aggr_optimize_not [2m]));
+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| logical_plan | Sort: aggr_optimize_not.a ASC NULLS LAST, aggr_optimize_not.b ASC NULLS LAST, aggr_optimize_not.c ASC NULLS LAST, aggr_optimize_not.greptime_timestamp ASC NULLS LAST |
| | Aggregate: groupBy=[[aggr_optimize_not.a, aggr_optimize_not.b, aggr_optimize_not.c, aggr_optimize_not.greptime_timestamp]], aggr=[[max(prom_max_over_time(greptime_timestamp_range,greptime_value))]] |
| | Projection: aggr_optimize_not.greptime_timestamp, prom_max_over_time(greptime_timestamp_range,greptime_value), aggr_optimize_not.a, aggr_optimize_not.b, aggr_optimize_not.c |
| | MergeSort: aggr_optimize_not.a ASC NULLS FIRST, aggr_optimize_not.b ASC NULLS FIRST, aggr_optimize_not.c ASC NULLS FIRST, aggr_optimize_not.d ASC NULLS FIRST, aggr_optimize_not.greptime_timestamp ASC NULLS FIRST |
| | MergeScan [is_placeholder=false, remote_input=[ |
| | Filter: prom_max_over_time(greptime_timestamp_range,greptime_value) IS NOT NULL |
| | Projection: aggr_optimize_not.greptime_timestamp, prom_max_over_time(greptime_timestamp_range, greptime_value) AS prom_max_over_time(greptime_timestamp_range,greptime_value), aggr_optimize_not.a, aggr_optimize_not.b, aggr_optimize_not.c, aggr_optimize_not.d |
| | PromRangeManipulate: req range=[0..0], interval=[300000], eval range=[120000], time index=[greptime_timestamp], values=["greptime_value"] |
| | PromSeriesNormalize: offset=[0], time index=[greptime_timestamp], filter NaN: [true] |
| | PromSeriesDivide: tags=["a", "b", "c", "d"] |
| | Sort: aggr_optimize_not.a ASC NULLS FIRST, aggr_optimize_not.b ASC NULLS FIRST, aggr_optimize_not.c ASC NULLS FIRST, aggr_optimize_not.d ASC NULLS FIRST, aggr_optimize_not.greptime_timestamp ASC NULLS FIRST |
| | Filter: aggr_optimize_not.greptime_timestamp >= TimestampMillisecond(-420000, None) AND aggr_optimize_not.greptime_timestamp <= TimestampMillisecond(300000, None) |
| | TableScan: aggr_optimize_not |
| | ]] |
| physical_plan | SortPreservingMergeExec: [a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST, greptime_timestamp@3 ASC NULLS LAST] |
| | SortExec: expr=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST, greptime_timestamp@3 ASC NULLS LAST], preserve_partitioning=[true] |
| | AggregateExec: mode=FinalPartitioned, gby=[a@0 as a, b@1 as b, c@2 as c, greptime_timestamp@3 as greptime_timestamp], aggr=[max(prom_max_over_time(greptime_timestamp_range,greptime_value))], ordering_mode=PartiallySorted([0, 1, 2]) |
| | SortExec: expr=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST], preserve_partitioning=[true] |
| | CoalesceBatchesExec: target_batch_size=8192 |
| | RepartitionExec: partitioning=REDACTED
| | AggregateExec: mode=Partial, gby=[a@2 as a, b@3 as b, c@4 as c, greptime_timestamp@0 as greptime_timestamp], aggr=[max(prom_max_over_time(greptime_timestamp_range,greptime_value))], ordering_mode=PartiallySorted([0, 1, 2]) |
| | ProjectionExec: expr=[greptime_timestamp@0 as greptime_timestamp, prom_max_over_time(greptime_timestamp_range,greptime_value)@1 as prom_max_over_time(greptime_timestamp_range,greptime_value), a@2 as a, b@3 as b, c@4 as c] |
| | SortExec: expr=[a@2 ASC, b@3 ASC, c@4 ASC, d@5 ASC, greptime_timestamp@0 ASC], preserve_partitioning=[true] |
| | MergeScanExec: REDACTED
| | |
+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-- SQLNESS REPLACE (metrics.*) REDACTED
-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
-- SQLNESS REPLACE (Hash.*) REDACTED
-- SQLNESS REPLACE (-+) -
-- SQLNESS REPLACE (\s\s+) _
-- SQLNESS REPLACE (peers.*) REDACTED
-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED
tql analyze (1752591864, 1752592164, '30s') max by (a, b, c) (max_over_time(aggr_optimize_not [2m]));
+-+-+-+
| stage | node | plan_|
+-+-+-+
| 0_| 0_|_SortPreservingMergeExec: [a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST, greptime_timestamp@3 ASC NULLS LAST] REDACTED
|_|_|_SortExec: expr=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST, greptime_timestamp@3 ASC NULLS LAST], preserve_partitioning=[true] REDACTED
|_|_|_AggregateExec: mode=FinalPartitioned, gby=[a@0 as a, b@1 as b, c@2 as c, greptime_timestamp@3 as greptime_timestamp], aggr=[max(prom_max_over_time(greptime_timestamp_range,greptime_value))], ordering_mode=PartiallySorted([0, 1, 2]) REDACTED
|_|_|_SortExec: expr=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST], preserve_partitioning=[true] REDACTED
|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED
|_|_|_RepartitionExec: partitioning=REDACTED
|_|_|_AggregateExec: mode=Partial, gby=[a@2 as a, b@3 as b, c@4 as c, greptime_timestamp@0 as greptime_timestamp], aggr=[max(prom_max_over_time(greptime_timestamp_range,greptime_value))], ordering_mode=PartiallySorted([0, 1, 2]) REDACTED
|_|_|_ProjectionExec: expr=[greptime_timestamp@0 as greptime_timestamp, prom_max_over_time(greptime_timestamp_range,greptime_value)@1 as prom_max_over_time(greptime_timestamp_range,greptime_value), a@2 as a, b@3 as b, c@4 as c] REDACTED
|_|_|_SortExec: expr=[a@2 ASC, b@3 ASC, c@4 ASC, d@5 ASC, greptime_timestamp@0 ASC], preserve_partitioning=[true] REDACTED
|_|_|_MergeScanExec: REDACTED
|_|_|_|
| 1_| 0_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED
|_|_|_FilterExec: prom_max_over_time(greptime_timestamp_range,greptime_value)@1 IS NOT NULL REDACTED
|_|_|_ProjectionExec: expr=[greptime_timestamp@4 as greptime_timestamp, prom_max_over_time(greptime_timestamp_range@6, greptime_value@5) as prom_max_over_time(greptime_timestamp_range,greptime_value), a@0 as a, b@1 as b, c@2 as c, d@3 as d] REDACTED
|_|_|_PromRangeManipulateExec: req range=[1752591864000..1752592164000], interval=[30000], eval range=[120000], time index=[greptime_timestamp] REDACTED
|_|_|_PromSeriesNormalizeExec: offset=[0], time index=[greptime_timestamp], filter NaN: [true] REDACTED
|_|_|_PromSeriesDivideExec: tags=["a", "b", "c", "d"] REDACTED
|_|_|_SeriesScan: region=REDACTED, "partition_count":{"count":0, "mem_ranges":0, "files":0, "file_ranges":0}, "distribution":"PerSeries" REDACTED
|_|_|_|
| 1_| 1_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED
|_|_|_FilterExec: prom_max_over_time(greptime_timestamp_range,greptime_value)@1 IS NOT NULL REDACTED
|_|_|_ProjectionExec: expr=[greptime_timestamp@4 as greptime_timestamp, prom_max_over_time(greptime_timestamp_range@6, greptime_value@5) as prom_max_over_time(greptime_timestamp_range,greptime_value), a@0 as a, b@1 as b, c@2 as c, d@3 as d] REDACTED
|_|_|_PromRangeManipulateExec: req range=[1752591864000..1752592164000], interval=[30000], eval range=[120000], time index=[greptime_timestamp] REDACTED
|_|_|_PromSeriesNormalizeExec: offset=[0], time index=[greptime_timestamp], filter NaN: [true] REDACTED
|_|_|_PromSeriesDivideExec: tags=["a", "b", "c", "d"] REDACTED
|_|_|_SeriesScan: region=REDACTED, "partition_count":{"count":0, "mem_ranges":0, "files":0, "file_ranges":0}, "distribution":"PerSeries" REDACTED
|_|_|_|
|_|_| Total rows: 0_|
+-+-+-+
-- Case 1: group by columns are prefix of partition columns.
-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
-- SQLNESS REPLACE (peers.*) REDACTED
-- SQLNESS REPLACE (Hash.*) REDACTED
tql explain (1752591864, 1752592164, '30s') sum by (a, b) (max_over_time(aggr_optimize_not [2m]));
+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| logical_plan | Sort: aggr_optimize_not.a ASC NULLS LAST, aggr_optimize_not.b ASC NULLS LAST, aggr_optimize_not.greptime_timestamp ASC NULLS LAST |
| | Aggregate: groupBy=[[aggr_optimize_not.a, aggr_optimize_not.b, aggr_optimize_not.greptime_timestamp]], aggr=[[sum(prom_max_over_time(greptime_timestamp_range,greptime_value))]] |
| | Projection: aggr_optimize_not.greptime_timestamp, prom_max_over_time(greptime_timestamp_range,greptime_value), aggr_optimize_not.a, aggr_optimize_not.b |
| | MergeSort: aggr_optimize_not.a ASC NULLS FIRST, aggr_optimize_not.b ASC NULLS FIRST, aggr_optimize_not.c ASC NULLS FIRST, aggr_optimize_not.d ASC NULLS FIRST, aggr_optimize_not.greptime_timestamp ASC NULLS FIRST |
| | MergeScan [is_placeholder=false, remote_input=[ |
| | Filter: prom_max_over_time(greptime_timestamp_range,greptime_value) IS NOT NULL |
| | Projection: aggr_optimize_not.greptime_timestamp, prom_max_over_time(greptime_timestamp_range, greptime_value) AS prom_max_over_time(greptime_timestamp_range,greptime_value), aggr_optimize_not.a, aggr_optimize_not.b, aggr_optimize_not.c, aggr_optimize_not.d |
| | PromRangeManipulate: req range=[0..0], interval=[300000], eval range=[120000], time index=[greptime_timestamp], values=["greptime_value"] |
| | PromSeriesNormalize: offset=[0], time index=[greptime_timestamp], filter NaN: [true] |
| | PromSeriesDivide: tags=["a", "b", "c", "d"] |
| | Sort: aggr_optimize_not.a ASC NULLS FIRST, aggr_optimize_not.b ASC NULLS FIRST, aggr_optimize_not.c ASC NULLS FIRST, aggr_optimize_not.d ASC NULLS FIRST, aggr_optimize_not.greptime_timestamp ASC NULLS FIRST |
| | Filter: aggr_optimize_not.greptime_timestamp >= TimestampMillisecond(-420000, None) AND aggr_optimize_not.greptime_timestamp <= TimestampMillisecond(300000, None) |
| | TableScan: aggr_optimize_not |
| | ]] |
| physical_plan | SortPreservingMergeExec: [a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, greptime_timestamp@2 ASC NULLS LAST] |
| | SortExec: expr=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, greptime_timestamp@2 ASC NULLS LAST], preserve_partitioning=[true] |
| | AggregateExec: mode=FinalPartitioned, gby=[a@0 as a, b@1 as b, greptime_timestamp@2 as greptime_timestamp], aggr=[sum(prom_max_over_time(greptime_timestamp_range,greptime_value))], ordering_mode=PartiallySorted([0, 1]) |
| | SortExec: expr=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST], preserve_partitioning=[true] |
| | CoalesceBatchesExec: target_batch_size=8192 |
| | RepartitionExec: partitioning=REDACTED
| | AggregateExec: mode=Partial, gby=[a@2 as a, b@3 as b, greptime_timestamp@0 as greptime_timestamp], aggr=[sum(prom_max_over_time(greptime_timestamp_range,greptime_value))], ordering_mode=PartiallySorted([0, 1]) |
| | ProjectionExec: expr=[greptime_timestamp@0 as greptime_timestamp, prom_max_over_time(greptime_timestamp_range,greptime_value)@1 as prom_max_over_time(greptime_timestamp_range,greptime_value), a@2 as a, b@3 as b] |
| | SortExec: expr=[a@2 ASC, b@3 ASC, c@4 ASC, d@5 ASC, greptime_timestamp@0 ASC], preserve_partitioning=[true] |
| | MergeScanExec: REDACTED
| | |
+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-- SQLNESS REPLACE (metrics.*) REDACTED
-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
-- SQLNESS REPLACE (Hash.*) REDACTED
-- SQLNESS REPLACE (-+) -
-- SQLNESS REPLACE (\s\s+) _
-- SQLNESS REPLACE (peers.*) REDACTED
-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED
tql analyze (1752591864, 1752592164, '30s') sum by (a, b) (max_over_time(aggr_optimize_not [2m]));
+-+-+-+
| stage | node | plan_|
+-+-+-+
| 0_| 0_|_SortPreservingMergeExec: [a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, greptime_timestamp@2 ASC NULLS LAST] REDACTED
|_|_|_SortExec: expr=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, greptime_timestamp@2 ASC NULLS LAST], preserve_partitioning=[true] REDACTED
|_|_|_AggregateExec: mode=FinalPartitioned, gby=[a@0 as a, b@1 as b, greptime_timestamp@2 as greptime_timestamp], aggr=[sum(prom_max_over_time(greptime_timestamp_range,greptime_value))], ordering_mode=PartiallySorted([0, 1]) REDACTED
|_|_|_SortExec: expr=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST], preserve_partitioning=[true] REDACTED
|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED
|_|_|_RepartitionExec: partitioning=REDACTED
|_|_|_AggregateExec: mode=Partial, gby=[a@2 as a, b@3 as b, greptime_timestamp@0 as greptime_timestamp], aggr=[sum(prom_max_over_time(greptime_timestamp_range,greptime_value))], ordering_mode=PartiallySorted([0, 1]) REDACTED
|_|_|_ProjectionExec: expr=[greptime_timestamp@0 as greptime_timestamp, prom_max_over_time(greptime_timestamp_range,greptime_value)@1 as prom_max_over_time(greptime_timestamp_range,greptime_value), a@2 as a, b@3 as b] REDACTED
|_|_|_SortExec: expr=[a@2 ASC, b@3 ASC, c@4 ASC, d@5 ASC, greptime_timestamp@0 ASC], preserve_partitioning=[true] REDACTED
|_|_|_MergeScanExec: REDACTED
|_|_|_|
| 1_| 0_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED
|_|_|_FilterExec: prom_max_over_time(greptime_timestamp_range,greptime_value)@1 IS NOT NULL REDACTED
|_|_|_ProjectionExec: expr=[greptime_timestamp@4 as greptime_timestamp, prom_max_over_time(greptime_timestamp_range@6, greptime_value@5) as prom_max_over_time(greptime_timestamp_range,greptime_value), a@0 as a, b@1 as b, c@2 as c, d@3 as d] REDACTED
|_|_|_PromRangeManipulateExec: req range=[1752591864000..1752592164000], interval=[30000], eval range=[120000], time index=[greptime_timestamp] REDACTED
|_|_|_PromSeriesNormalizeExec: offset=[0], time index=[greptime_timestamp], filter NaN: [true] REDACTED
|_|_|_PromSeriesDivideExec: tags=["a", "b", "c", "d"] REDACTED
|_|_|_SeriesScan: region=REDACTED, "partition_count":{"count":0, "mem_ranges":0, "files":0, "file_ranges":0}, "distribution":"PerSeries" REDACTED
|_|_|_|
| 1_| 1_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED
|_|_|_FilterExec: prom_max_over_time(greptime_timestamp_range,greptime_value)@1 IS NOT NULL REDACTED
|_|_|_ProjectionExec: expr=[greptime_timestamp@4 as greptime_timestamp, prom_max_over_time(greptime_timestamp_range@6, greptime_value@5) as prom_max_over_time(greptime_timestamp_range,greptime_value), a@0 as a, b@1 as b, c@2 as c, d@3 as d] REDACTED
|_|_|_PromRangeManipulateExec: req range=[1752591864000..1752592164000], interval=[30000], eval range=[120000], time index=[greptime_timestamp] REDACTED
|_|_|_PromSeriesNormalizeExec: offset=[0], time index=[greptime_timestamp], filter NaN: [true] REDACTED
|_|_|_PromSeriesDivideExec: tags=["a", "b", "c", "d"] REDACTED
|_|_|_SeriesScan: region=REDACTED, "partition_count":{"count":0, "mem_ranges":0, "files":0, "file_ranges":0}, "distribution":"PerSeries" REDACTED
|_|_|_|
|_|_| Total rows: 0_|
+-+-+-+
-- Case 2: group by columns are prefix of partition columns.
-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
-- SQLNESS REPLACE (peers.*) REDACTED
-- SQLNESS REPLACE (Hash.*) REDACTED
tql explain (1752591864, 1752592164, '30s') avg by (a) (max_over_time(aggr_optimize_not [2m]));
+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| logical_plan | Sort: aggr_optimize_not.a ASC NULLS LAST, aggr_optimize_not.greptime_timestamp ASC NULLS LAST |
| | Aggregate: groupBy=[[aggr_optimize_not.a, aggr_optimize_not.greptime_timestamp]], aggr=[[avg(prom_max_over_time(greptime_timestamp_range,greptime_value))]] |
| | Projection: aggr_optimize_not.greptime_timestamp, prom_max_over_time(greptime_timestamp_range,greptime_value), aggr_optimize_not.a |
| | MergeSort: aggr_optimize_not.a ASC NULLS FIRST, aggr_optimize_not.b ASC NULLS FIRST, aggr_optimize_not.c ASC NULLS FIRST, aggr_optimize_not.d ASC NULLS FIRST, aggr_optimize_not.greptime_timestamp ASC NULLS FIRST |
| | MergeScan [is_placeholder=false, remote_input=[ |
| | Filter: prom_max_over_time(greptime_timestamp_range,greptime_value) IS NOT NULL |
| | Projection: aggr_optimize_not.greptime_timestamp, prom_max_over_time(greptime_timestamp_range, greptime_value) AS prom_max_over_time(greptime_timestamp_range,greptime_value), aggr_optimize_not.a, aggr_optimize_not.b, aggr_optimize_not.c, aggr_optimize_not.d |
| | PromRangeManipulate: req range=[0..0], interval=[300000], eval range=[120000], time index=[greptime_timestamp], values=["greptime_value"] |
| | PromSeriesNormalize: offset=[0], time index=[greptime_timestamp], filter NaN: [true] |
| | PromSeriesDivide: tags=["a", "b", "c", "d"] |
| | Sort: aggr_optimize_not.a ASC NULLS FIRST, aggr_optimize_not.b ASC NULLS FIRST, aggr_optimize_not.c ASC NULLS FIRST, aggr_optimize_not.d ASC NULLS FIRST, aggr_optimize_not.greptime_timestamp ASC NULLS FIRST |
| | Filter: aggr_optimize_not.greptime_timestamp >= TimestampMillisecond(-420000, None) AND aggr_optimize_not.greptime_timestamp <= TimestampMillisecond(300000, None) |
| | TableScan: aggr_optimize_not |
| | ]] |
| physical_plan | SortPreservingMergeExec: [a@0 ASC NULLS LAST, greptime_timestamp@1 ASC NULLS LAST] |
| | SortExec: expr=[a@0 ASC NULLS LAST, greptime_timestamp@1 ASC NULLS LAST], preserve_partitioning=[true] |
| | AggregateExec: mode=FinalPartitioned, gby=[a@0 as a, greptime_timestamp@1 as greptime_timestamp], aggr=[avg(prom_max_over_time(greptime_timestamp_range,greptime_value))], ordering_mode=PartiallySorted([0]) |
| | SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true] |
| | CoalesceBatchesExec: target_batch_size=8192 |
| | RepartitionExec: partitioning=REDACTED
| | AggregateExec: mode=Partial, gby=[a@2 as a, greptime_timestamp@0 as greptime_timestamp], aggr=[avg(prom_max_over_time(greptime_timestamp_range,greptime_value))], ordering_mode=PartiallySorted([0]) |
| | ProjectionExec: expr=[greptime_timestamp@0 as greptime_timestamp, prom_max_over_time(greptime_timestamp_range,greptime_value)@1 as prom_max_over_time(greptime_timestamp_range,greptime_value), a@2 as a] |
| | SortExec: expr=[a@2 ASC, b@3 ASC, c@4 ASC, d@5 ASC, greptime_timestamp@0 ASC], preserve_partitioning=[true] |
| | MergeScanExec: REDACTED
| | |
+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-- SQLNESS REPLACE (metrics.*) REDACTED
-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
-- SQLNESS REPLACE (Hash.*) REDACTED
-- SQLNESS REPLACE (-+) -
-- SQLNESS REPLACE (\s\s+) _
-- SQLNESS REPLACE (peers.*) REDACTED
-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED
tql analyze (1752591864, 1752592164, '30s') avg by (a) (max_over_time(aggr_optimize_not [2m]));
+-+-+-+
| stage | node | plan_|
+-+-+-+
| 0_| 0_|_SortPreservingMergeExec: [a@0 ASC NULLS LAST, greptime_timestamp@1 ASC NULLS LAST] REDACTED
|_|_|_SortExec: expr=[a@0 ASC NULLS LAST, greptime_timestamp@1 ASC NULLS LAST], preserve_partitioning=[true] REDACTED
|_|_|_AggregateExec: mode=FinalPartitioned, gby=[a@0 as a, greptime_timestamp@1 as greptime_timestamp], aggr=[avg(prom_max_over_time(greptime_timestamp_range,greptime_value))], ordering_mode=PartiallySorted([0]) REDACTED
|_|_|_SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true] REDACTED
|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED
|_|_|_RepartitionExec: partitioning=REDACTED
|_|_|_AggregateExec: mode=Partial, gby=[a@2 as a, greptime_timestamp@0 as greptime_timestamp], aggr=[avg(prom_max_over_time(greptime_timestamp_range,greptime_value))], ordering_mode=PartiallySorted([0]) REDACTED
|_|_|_ProjectionExec: expr=[greptime_timestamp@0 as greptime_timestamp, prom_max_over_time(greptime_timestamp_range,greptime_value)@1 as prom_max_over_time(greptime_timestamp_range,greptime_value), a@2 as a] REDACTED
|_|_|_SortExec: expr=[a@2 ASC, b@3 ASC, c@4 ASC, d@5 ASC, greptime_timestamp@0 ASC], preserve_partitioning=[true] REDACTED
|_|_|_MergeScanExec: REDACTED
|_|_|_|
| 1_| 0_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED
|_|_|_FilterExec: prom_max_over_time(greptime_timestamp_range,greptime_value)@1 IS NOT NULL REDACTED
|_|_|_ProjectionExec: expr=[greptime_timestamp@4 as greptime_timestamp, prom_max_over_time(greptime_timestamp_range@6, greptime_value@5) as prom_max_over_time(greptime_timestamp_range,greptime_value), a@0 as a, b@1 as b, c@2 as c, d@3 as d] REDACTED
|_|_|_PromRangeManipulateExec: req range=[1752591864000..1752592164000], interval=[30000], eval range=[120000], time index=[greptime_timestamp] REDACTED
|_|_|_PromSeriesNormalizeExec: offset=[0], time index=[greptime_timestamp], filter NaN: [true] REDACTED
|_|_|_PromSeriesDivideExec: tags=["a", "b", "c", "d"] REDACTED
|_|_|_SeriesScan: region=REDACTED, "partition_count":{"count":0, "mem_ranges":0, "files":0, "file_ranges":0}, "distribution":"PerSeries" REDACTED
|_|_|_|
| 1_| 1_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED
|_|_|_FilterExec: prom_max_over_time(greptime_timestamp_range,greptime_value)@1 IS NOT NULL REDACTED
|_|_|_ProjectionExec: expr=[greptime_timestamp@4 as greptime_timestamp, prom_max_over_time(greptime_timestamp_range@6, greptime_value@5) as prom_max_over_time(greptime_timestamp_range,greptime_value), a@0 as a, b@1 as b, c@2 as c, d@3 as d] REDACTED
|_|_|_PromRangeManipulateExec: req range=[1752591864000..1752592164000], interval=[30000], eval range=[120000], time index=[greptime_timestamp] REDACTED
|_|_|_PromSeriesNormalizeExec: offset=[0], time index=[greptime_timestamp], filter NaN: [true] REDACTED
|_|_|_PromSeriesDivideExec: tags=["a", "b", "c", "d"] REDACTED
|_|_|_SeriesScan: region=REDACTED, "partition_count":{"count":0, "mem_ranges":0, "files":0, "file_ranges":0}, "distribution":"PerSeries" REDACTED
|_|_|_|
|_|_| Total rows: 0_|
+-+-+-+
-- Case 3: group by columns are superset of partition columns.
-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
-- SQLNESS REPLACE (peers.*) REDACTED
-- SQLNESS REPLACE (Hash.*) REDACTED
tql explain (1752591864, 1752592164, '30s') count by (a, b, c, d) (max_over_time(aggr_optimize_not [2m]));
+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| logical_plan | Sort: aggr_optimize_not.a ASC NULLS LAST, aggr_optimize_not.b ASC NULLS LAST, aggr_optimize_not.c ASC NULLS LAST, aggr_optimize_not.d ASC NULLS LAST, aggr_optimize_not.greptime_timestamp ASC NULLS LAST |
| | Aggregate: groupBy=[[aggr_optimize_not.a, aggr_optimize_not.b, aggr_optimize_not.c, aggr_optimize_not.d, aggr_optimize_not.greptime_timestamp]], aggr=[[count(prom_max_over_time(greptime_timestamp_range,greptime_value))]] |
| | MergeSort: aggr_optimize_not.a ASC NULLS FIRST, aggr_optimize_not.b ASC NULLS FIRST, aggr_optimize_not.c ASC NULLS FIRST, aggr_optimize_not.d ASC NULLS FIRST, aggr_optimize_not.greptime_timestamp ASC NULLS FIRST |
| | MergeScan [is_placeholder=false, remote_input=[ |
| | Filter: prom_max_over_time(greptime_timestamp_range,greptime_value) IS NOT NULL |
| | Projection: aggr_optimize_not.greptime_timestamp, prom_max_over_time(greptime_timestamp_range, greptime_value) AS prom_max_over_time(greptime_timestamp_range,greptime_value), aggr_optimize_not.a, aggr_optimize_not.b, aggr_optimize_not.c, aggr_optimize_not.d |
| | PromRangeManipulate: req range=[0..0], interval=[300000], eval range=[120000], time index=[greptime_timestamp], values=["greptime_value"] |
| | PromSeriesNormalize: offset=[0], time index=[greptime_timestamp], filter NaN: [true] |
| | PromSeriesDivide: tags=["a", "b", "c", "d"] |
| | Sort: aggr_optimize_not.a ASC NULLS FIRST, aggr_optimize_not.b ASC NULLS FIRST, aggr_optimize_not.c ASC NULLS FIRST, aggr_optimize_not.d ASC NULLS FIRST, aggr_optimize_not.greptime_timestamp ASC NULLS FIRST |
| | Filter: aggr_optimize_not.greptime_timestamp >= TimestampMillisecond(-420000, None) AND aggr_optimize_not.greptime_timestamp <= TimestampMillisecond(300000, None) |
| | TableScan: aggr_optimize_not |
| | ]] |
| physical_plan | SortPreservingMergeExec: [a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST, d@3 ASC NULLS LAST, greptime_timestamp@4 ASC NULLS LAST] |
| | AggregateExec: mode=FinalPartitioned, gby=[a@0 as a, b@1 as b, c@2 as c, d@3 as d, greptime_timestamp@4 as greptime_timestamp], aggr=[count(prom_max_over_time(greptime_timestamp_range,greptime_value))], ordering_mode=Sorted |
| | SortExec: expr=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST, d@3 ASC NULLS LAST, greptime_timestamp@4 ASC NULLS LAST], preserve_partitioning=[true] |
| | CoalesceBatchesExec: target_batch_size=8192 |
| | RepartitionExec: partitioning=REDACTED
| | AggregateExec: mode=Partial, gby=[a@2 as a, b@3 as b, c@4 as c, d@5 as d, greptime_timestamp@0 as greptime_timestamp], aggr=[count(prom_max_over_time(greptime_timestamp_range,greptime_value))], ordering_mode=Sorted |
| | SortExec: expr=[a@2 ASC, b@3 ASC, c@4 ASC, d@5 ASC, greptime_timestamp@0 ASC], preserve_partitioning=[true] |
| | MergeScanExec: REDACTED
| | |
+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-- SQLNESS REPLACE (metrics.*) REDACTED
-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
-- SQLNESS REPLACE (Hash.*) REDACTED
-- SQLNESS REPLACE (-+) -
-- SQLNESS REPLACE (\s\s+) _
-- SQLNESS REPLACE (peers.*) REDACTED
-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED
tql analyze (1752591864, 1752592164, '30s') count by (a, b, c, d) (max_over_time(aggr_optimize_not [2m]));
+-+-+-+
| stage | node | plan_|
+-+-+-+
| 0_| 0_|_SortPreservingMergeExec: [a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST, d@3 ASC NULLS LAST, greptime_timestamp@4 ASC NULLS LAST] REDACTED
|_|_|_AggregateExec: mode=FinalPartitioned, gby=[a@0 as a, b@1 as b, c@2 as c, d@3 as d, greptime_timestamp@4 as greptime_timestamp], aggr=[count(prom_max_over_time(greptime_timestamp_range,greptime_value))], ordering_mode=Sorted REDACTED
|_|_|_SortExec: expr=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST, d@3 ASC NULLS LAST, greptime_timestamp@4 ASC NULLS LAST], preserve_partitioning=[true] REDACTED
|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED
|_|_|_RepartitionExec: partitioning=REDACTED
|_|_|_AggregateExec: mode=Partial, gby=[a@2 as a, b@3 as b, c@4 as c, d@5 as d, greptime_timestamp@0 as greptime_timestamp], aggr=[count(prom_max_over_time(greptime_timestamp_range,greptime_value))], ordering_mode=Sorted REDACTED
|_|_|_SortExec: expr=[a@2 ASC, b@3 ASC, c@4 ASC, d@5 ASC, greptime_timestamp@0 ASC], preserve_partitioning=[true] REDACTED
|_|_|_MergeScanExec: REDACTED
|_|_|_|
| 1_| 0_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED
|_|_|_FilterExec: prom_max_over_time(greptime_timestamp_range,greptime_value)@1 IS NOT NULL REDACTED
|_|_|_ProjectionExec: expr=[greptime_timestamp@4 as greptime_timestamp, prom_max_over_time(greptime_timestamp_range@6, greptime_value@5) as prom_max_over_time(greptime_timestamp_range,greptime_value), a@0 as a, b@1 as b, c@2 as c, d@3 as d] REDACTED
|_|_|_PromRangeManipulateExec: req range=[1752591864000..1752592164000], interval=[30000], eval range=[120000], time index=[greptime_timestamp] REDACTED
|_|_|_PromSeriesNormalizeExec: offset=[0], time index=[greptime_timestamp], filter NaN: [true] REDACTED
|_|_|_PromSeriesDivideExec: tags=["a", "b", "c", "d"] REDACTED
|_|_|_SeriesScan: region=REDACTED, "partition_count":{"count":0, "mem_ranges":0, "files":0, "file_ranges":0}, "distribution":"PerSeries" REDACTED
|_|_|_|
| 1_| 1_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED
|_|_|_FilterExec: prom_max_over_time(greptime_timestamp_range,greptime_value)@1 IS NOT NULL REDACTED
|_|_|_ProjectionExec: expr=[greptime_timestamp@4 as greptime_timestamp, prom_max_over_time(greptime_timestamp_range@6, greptime_value@5) as prom_max_over_time(greptime_timestamp_range,greptime_value), a@0 as a, b@1 as b, c@2 as c, d@3 as d] REDACTED
|_|_|_PromRangeManipulateExec: req range=[1752591864000..1752592164000], interval=[30000], eval range=[120000], time index=[greptime_timestamp] REDACTED
|_|_|_PromSeriesNormalizeExec: offset=[0], time index=[greptime_timestamp], filter NaN: [true] REDACTED
|_|_|_PromSeriesDivideExec: tags=["a", "b", "c", "d"] REDACTED
|_|_|_SeriesScan: region=REDACTED, "partition_count":{"count":0, "mem_ranges":0, "files":0, "file_ranges":0}, "distribution":"PerSeries" REDACTED
|_|_|_|
|_|_| Total rows: 0_|
+-+-+-+
-- Case 4: group by columns are not prefix of partition columns.
-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
-- SQLNESS REPLACE (peers.*) REDACTED
-- SQLNESS REPLACE (Hash.*) REDACTED
tql explain (1752591864, 1752592164, '30s') min by (b, c, d) (max_over_time(aggr_optimize_not [2m]));
+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| logical_plan | Sort: aggr_optimize_not.b ASC NULLS LAST, aggr_optimize_not.c ASC NULLS LAST, aggr_optimize_not.d ASC NULLS LAST, aggr_optimize_not.greptime_timestamp ASC NULLS LAST |
| | Aggregate: groupBy=[[aggr_optimize_not.b, aggr_optimize_not.c, aggr_optimize_not.d, aggr_optimize_not.greptime_timestamp]], aggr=[[min(prom_max_over_time(greptime_timestamp_range,greptime_value))]] |
| | Projection: aggr_optimize_not.greptime_timestamp, prom_max_over_time(greptime_timestamp_range,greptime_value), aggr_optimize_not.b, aggr_optimize_not.c, aggr_optimize_not.d |
| | MergeSort: aggr_optimize_not.a ASC NULLS FIRST, aggr_optimize_not.b ASC NULLS FIRST, aggr_optimize_not.c ASC NULLS FIRST, aggr_optimize_not.d ASC NULLS FIRST, aggr_optimize_not.greptime_timestamp ASC NULLS FIRST |
| | MergeScan [is_placeholder=false, remote_input=[ |
| | Filter: prom_max_over_time(greptime_timestamp_range,greptime_value) IS NOT NULL |
| | Projection: aggr_optimize_not.greptime_timestamp, prom_max_over_time(greptime_timestamp_range, greptime_value) AS prom_max_over_time(greptime_timestamp_range,greptime_value), aggr_optimize_not.a, aggr_optimize_not.b, aggr_optimize_not.c, aggr_optimize_not.d |
| | PromRangeManipulate: req range=[0..0], interval=[300000], eval range=[120000], time index=[greptime_timestamp], values=["greptime_value"] |
| | PromSeriesNormalize: offset=[0], time index=[greptime_timestamp], filter NaN: [true] |
| | PromSeriesDivide: tags=["a", "b", "c", "d"] |
| | Sort: aggr_optimize_not.a ASC NULLS FIRST, aggr_optimize_not.b ASC NULLS FIRST, aggr_optimize_not.c ASC NULLS FIRST, aggr_optimize_not.d ASC NULLS FIRST, aggr_optimize_not.greptime_timestamp ASC NULLS FIRST |
| | Filter: aggr_optimize_not.greptime_timestamp >= TimestampMillisecond(-420000, None) AND aggr_optimize_not.greptime_timestamp <= TimestampMillisecond(300000, None) |
| | TableScan: aggr_optimize_not |
| | ]] |
| physical_plan | SortPreservingMergeExec: [b@0 ASC NULLS LAST, c@1 ASC NULLS LAST, d@2 ASC NULLS LAST, greptime_timestamp@3 ASC NULLS LAST] |
| | SortExec: expr=[b@0 ASC NULLS LAST, c@1 ASC NULLS LAST, d@2 ASC NULLS LAST, greptime_timestamp@3 ASC NULLS LAST], preserve_partitioning=[true] |
| | AggregateExec: mode=FinalPartitioned, gby=[b@0 as b, c@1 as c, d@2 as d, greptime_timestamp@3 as greptime_timestamp], aggr=[min(prom_max_over_time(greptime_timestamp_range,greptime_value))] |
| | CoalesceBatchesExec: target_batch_size=8192 |
| | RepartitionExec: partitioning=REDACTED
| | AggregateExec: mode=Partial, gby=[b@2 as b, c@3 as c, d@4 as d, greptime_timestamp@0 as greptime_timestamp], aggr=[min(prom_max_over_time(greptime_timestamp_range,greptime_value))] |
| | ProjectionExec: expr=[greptime_timestamp@0 as greptime_timestamp, prom_max_over_time(greptime_timestamp_range,greptime_value)@1 as prom_max_over_time(greptime_timestamp_range,greptime_value), b@3 as b, c@4 as c, d@5 as d] |
| | MergeScanExec: REDACTED
| | |
+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-- SQLNESS REPLACE (metrics.*) REDACTED
-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
-- SQLNESS REPLACE (Hash.*) REDACTED
-- SQLNESS REPLACE (-+) -
-- SQLNESS REPLACE (\s\s+) _
-- SQLNESS REPLACE (peers.*) REDACTED
-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED
tql analyze (1752591864, 1752592164, '30s') min by (b, c, d) (max_over_time(aggr_optimize_not [2m]));
+-+-+-+
| stage | node | plan_|
+-+-+-+
| 0_| 0_|_SortPreservingMergeExec: [b@0 ASC NULLS LAST, c@1 ASC NULLS LAST, d@2 ASC NULLS LAST, greptime_timestamp@3 ASC NULLS LAST] REDACTED
|_|_|_SortExec: expr=[b@0 ASC NULLS LAST, c@1 ASC NULLS LAST, d@2 ASC NULLS LAST, greptime_timestamp@3 ASC NULLS LAST], preserve_partitioning=[true] REDACTED
|_|_|_AggregateExec: mode=FinalPartitioned, gby=[b@0 as b, c@1 as c, d@2 as d, greptime_timestamp@3 as greptime_timestamp], aggr=[min(prom_max_over_time(greptime_timestamp_range,greptime_value))] REDACTED
|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED
|_|_|_RepartitionExec: partitioning=REDACTED
|_|_|_AggregateExec: mode=Partial, gby=[b@2 as b, c@3 as c, d@4 as d, greptime_timestamp@0 as greptime_timestamp], aggr=[min(prom_max_over_time(greptime_timestamp_range,greptime_value))] REDACTED
|_|_|_ProjectionExec: expr=[greptime_timestamp@0 as greptime_timestamp, prom_max_over_time(greptime_timestamp_range,greptime_value)@1 as prom_max_over_time(greptime_timestamp_range,greptime_value), b@3 as b, c@4 as c, d@5 as d] REDACTED
|_|_|_MergeScanExec: REDACTED
|_|_|_|
| 1_| 0_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED
|_|_|_FilterExec: prom_max_over_time(greptime_timestamp_range,greptime_value)@1 IS NOT NULL REDACTED
|_|_|_ProjectionExec: expr=[greptime_timestamp@4 as greptime_timestamp, prom_max_over_time(greptime_timestamp_range@6, greptime_value@5) as prom_max_over_time(greptime_timestamp_range,greptime_value), a@0 as a, b@1 as b, c@2 as c, d@3 as d] REDACTED
|_|_|_PromRangeManipulateExec: req range=[1752591864000..1752592164000], interval=[30000], eval range=[120000], time index=[greptime_timestamp] REDACTED
|_|_|_PromSeriesNormalizeExec: offset=[0], time index=[greptime_timestamp], filter NaN: [true] REDACTED
|_|_|_PromSeriesDivideExec: tags=["a", "b", "c", "d"] REDACTED
|_|_|_SeriesScan: region=REDACTED, "partition_count":{"count":0, "mem_ranges":0, "files":0, "file_ranges":0}, "distribution":"PerSeries" REDACTED
|_|_|_|
| 1_| 1_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED
|_|_|_FilterExec: prom_max_over_time(greptime_timestamp_range,greptime_value)@1 IS NOT NULL REDACTED
|_|_|_ProjectionExec: expr=[greptime_timestamp@4 as greptime_timestamp, prom_max_over_time(greptime_timestamp_range@6, greptime_value@5) as prom_max_over_time(greptime_timestamp_range,greptime_value), a@0 as a, b@1 as b, c@2 as c, d@3 as d] REDACTED
|_|_|_PromRangeManipulateExec: req range=[1752591864000..1752592164000], interval=[30000], eval range=[120000], time index=[greptime_timestamp] REDACTED
|_|_|_PromSeriesNormalizeExec: offset=[0], time index=[greptime_timestamp], filter NaN: [true] REDACTED
|_|_|_PromSeriesDivideExec: tags=["a", "b", "c", "d"] REDACTED
|_|_|_SeriesScan: region=REDACTED, "partition_count":{"count":0, "mem_ranges":0, "files":0, "file_ranges":0}, "distribution":"PerSeries" REDACTED
|_|_|_|
|_|_| Total rows: 0_|
+-+-+-+
-- Case 5: a simple sum
-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
-- SQLNESS REPLACE (peers.*) REDACTED
-- SQLNESS REPLACE (Hash.*) REDACTED
tql explain sum(aggr_optimize_not);
+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| logical_plan | Sort: aggr_optimize_not.greptime_timestamp ASC NULLS LAST |
| | Aggregate: groupBy=[[aggr_optimize_not.greptime_timestamp]], aggr=[[sum(aggr_optimize_not.greptime_value)]] |
| | Projection: aggr_optimize_not.greptime_timestamp, aggr_optimize_not.greptime_value |
| | MergeSort: aggr_optimize_not.a ASC NULLS FIRST, aggr_optimize_not.b ASC NULLS FIRST, aggr_optimize_not.c ASC NULLS FIRST, aggr_optimize_not.d ASC NULLS FIRST, aggr_optimize_not.greptime_timestamp ASC NULLS FIRST |
| | MergeScan [is_placeholder=false, remote_input=[ |
| | PromInstantManipulate: range=[0..0], lookback=[300000], interval=[300000], time index=[greptime_timestamp] |
| | PromSeriesDivide: tags=["a", "b", "c", "d"] |
| | Sort: aggr_optimize_not.a ASC NULLS FIRST, aggr_optimize_not.b ASC NULLS FIRST, aggr_optimize_not.c ASC NULLS FIRST, aggr_optimize_not.d ASC NULLS FIRST, aggr_optimize_not.greptime_timestamp ASC NULLS FIRST |
| | Filter: aggr_optimize_not.greptime_timestamp >= TimestampMillisecond(-300000, None) AND aggr_optimize_not.greptime_timestamp <= TimestampMillisecond(300000, None) |
| | TableScan: aggr_optimize_not |
| | ]] |
| physical_plan | SortPreservingMergeExec: [greptime_timestamp@0 ASC NULLS LAST] |
| | SortExec: expr=[greptime_timestamp@0 ASC NULLS LAST], preserve_partitioning=[true] |
| | AggregateExec: mode=FinalPartitioned, gby=[greptime_timestamp@0 as greptime_timestamp], aggr=[sum(aggr_optimize_not.greptime_value)] |
| | CoalesceBatchesExec: target_batch_size=8192 |
| | RepartitionExec: partitioning=REDACTED
| | AggregateExec: mode=Partial, gby=[greptime_timestamp@0 as greptime_timestamp], aggr=[sum(aggr_optimize_not.greptime_value)] |
| | ProjectionExec: expr=[greptime_timestamp@4 as greptime_timestamp, greptime_value@5 as greptime_value] |
| | MergeScanExec: REDACTED
| | |
+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-- SQLNESS REPLACE (metrics.*) REDACTED
-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
-- SQLNESS REPLACE (Hash.*) REDACTED
-- SQLNESS REPLACE (-+) -
-- SQLNESS REPLACE (\s\s+) _
-- SQLNESS REPLACE (peers.*) REDACTED
-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED
tql analyze sum(aggr_optimize_not);
+-+-+-+
| stage | node | plan_|
+-+-+-+
| 0_| 0_|_SortPreservingMergeExec: [greptime_timestamp@0 ASC NULLS LAST] REDACTED
|_|_|_SortExec: expr=[greptime_timestamp@0 ASC NULLS LAST], preserve_partitioning=[true] REDACTED
|_|_|_AggregateExec: mode=FinalPartitioned, gby=[greptime_timestamp@0 as greptime_timestamp], aggr=[sum(aggr_optimize_not.greptime_value)] REDACTED
|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED
|_|_|_RepartitionExec: partitioning=REDACTED
|_|_|_AggregateExec: mode=Partial, gby=[greptime_timestamp@0 as greptime_timestamp], aggr=[sum(aggr_optimize_not.greptime_value)] REDACTED
|_|_|_ProjectionExec: expr=[greptime_timestamp@4 as greptime_timestamp, greptime_value@5 as greptime_value] REDACTED
|_|_|_MergeScanExec: REDACTED
|_|_|_|
| 1_| 0_|_PromInstantManipulateExec: range=[0..0], lookback=[300000], interval=[300000], time index=[greptime_timestamp] REDACTED
|_|_|_PromSeriesDivideExec: tags=["a", "b", "c", "d"] REDACTED
|_|_|_SeriesScan: region=REDACTED, "partition_count":{"count":0, "mem_ranges":0, "files":0, "file_ranges":0}, "distribution":"PerSeries" REDACTED
|_|_|_|
| 1_| 1_|_PromInstantManipulateExec: range=[0..0], lookback=[300000], interval=[300000], time index=[greptime_timestamp] REDACTED
|_|_|_PromSeriesDivideExec: tags=["a", "b", "c", "d"] REDACTED
|_|_|_SeriesScan: region=REDACTED, "partition_count":{"count":0, "mem_ranges":0, "files":0, "file_ranges":0}, "distribution":"PerSeries" REDACTED
|_|_|_|
|_|_| Total rows: 0_|
+-+-+-+
-- TODO(discord9): more cases for aggr push down interacting with partitioning & TQL
CREATE TABLE IF NOT EXISTS aggr_optimize_not_count (
a STRING NULL,
b STRING NULL,
c STRING NULL,
d STRING NULL,
greptime_timestamp TIMESTAMP(3) NOT NULL,
greptime_value DOUBLE NULL,
TIME INDEX (greptime_timestamp),
PRIMARY KEY (a, b, c, d)
) PARTITION ON COLUMNS (a, b, c) (a < 'b', a >= 'b',);
Affected Rows: 0
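For context on how the partition rule above routes data: under `(a < 'b', a >= 'b')` routing is decided by column `a` alone. A minimal, hypothetical sketch follows; the rows are invented for illustration and are not part of the recorded test.

-- Illustration only, not executed by this test:
-- 'alpha' < 'b'  -> the row is routed to the first region,
-- 'beta' >= 'b'  -> the row is routed to the second region.
INSERT INTO aggr_optimize_not_count (a, b, c, d, greptime_timestamp, greptime_value)
VALUES
    ('alpha', 'x', 'y', 'z', '2025-07-15 14:24:24', 1.0),
    ('beta',  'x', 'y', 'z', '2025-07-15 14:24:24', 2.0);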
-- Case 6: Test average rate (sum/count like)
-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
-- SQLNESS REPLACE (peers.*) REDACTED
-- SQLNESS REPLACE (Hash.*) REDACTED
tql explain (1752591864, 1752592164, '30s') sum by (a, b, c) (rate(aggr_optimize_not [2m])) / sum by (a, b, c) (rate(aggr_optimize_not_count [2m]));
+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| logical_plan | Projection: aggr_optimize_not_count.a, aggr_optimize_not_count.b, aggr_optimize_not_count.c, aggr_optimize_not_count.greptime_timestamp, aggr_optimize_not.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) / aggr_optimize_not_count.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) AS aggr_optimize_not.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) / aggr_optimize_not_count.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) |
| | Inner Join: aggr_optimize_not.a = aggr_optimize_not_count.a, aggr_optimize_not.b = aggr_optimize_not_count.b, aggr_optimize_not.c = aggr_optimize_not_count.c, aggr_optimize_not.greptime_timestamp = aggr_optimize_not_count.greptime_timestamp |
| | SubqueryAlias: aggr_optimize_not |
| | Sort: aggr_optimize_not.a ASC NULLS LAST, aggr_optimize_not.b ASC NULLS LAST, aggr_optimize_not.c ASC NULLS LAST, aggr_optimize_not.greptime_timestamp ASC NULLS LAST |
| | Aggregate: groupBy=[[aggr_optimize_not.a, aggr_optimize_not.b, aggr_optimize_not.c, aggr_optimize_not.greptime_timestamp]], aggr=[[sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))]] |
| | Projection: aggr_optimize_not.greptime_timestamp, prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)), aggr_optimize_not.a, aggr_optimize_not.b, aggr_optimize_not.c |
| | MergeSort: aggr_optimize_not.a ASC NULLS FIRST, aggr_optimize_not.b ASC NULLS FIRST, aggr_optimize_not.c ASC NULLS FIRST, aggr_optimize_not.d ASC NULLS FIRST, aggr_optimize_not.greptime_timestamp ASC NULLS FIRST |
| | MergeScan [is_placeholder=false, remote_input=[ |
| | Filter: prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)) IS NOT NULL |
| | Projection: aggr_optimize_not.greptime_timestamp, prom_rate(greptime_timestamp_range, greptime_value, aggr_optimize_not.greptime_timestamp, Int64(120000)) AS prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)), aggr_optimize_not.a, aggr_optimize_not.b, aggr_optimize_not.c, aggr_optimize_not.d |
| | PromRangeManipulate: req range=[0..0], interval=[300000], eval range=[120000], time index=[greptime_timestamp], values=["greptime_value"] |
| | PromSeriesNormalize: offset=[0], time index=[greptime_timestamp], filter NaN: [true] |
| | PromSeriesDivide: tags=["a", "b", "c", "d"] |
| | Sort: aggr_optimize_not.a ASC NULLS FIRST, aggr_optimize_not.b ASC NULLS FIRST, aggr_optimize_not.c ASC NULLS FIRST, aggr_optimize_not.d ASC NULLS FIRST, aggr_optimize_not.greptime_timestamp ASC NULLS FIRST |
| | Filter: aggr_optimize_not.greptime_timestamp >= TimestampMillisecond(-420000, None) AND aggr_optimize_not.greptime_timestamp <= TimestampMillisecond(300000, None) |
| | TableScan: aggr_optimize_not |
| | ]] |
| | SubqueryAlias: aggr_optimize_not_count |
| | Sort: aggr_optimize_not_count.a ASC NULLS LAST, aggr_optimize_not_count.b ASC NULLS LAST, aggr_optimize_not_count.c ASC NULLS LAST, aggr_optimize_not_count.greptime_timestamp ASC NULLS LAST |
| | Aggregate: groupBy=[[aggr_optimize_not_count.a, aggr_optimize_not_count.b, aggr_optimize_not_count.c, aggr_optimize_not_count.greptime_timestamp]], aggr=[[sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))]] |
| | Projection: aggr_optimize_not_count.greptime_timestamp, prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)), aggr_optimize_not_count.a, aggr_optimize_not_count.b, aggr_optimize_not_count.c |
| | MergeSort: aggr_optimize_not_count.a ASC NULLS FIRST, aggr_optimize_not_count.b ASC NULLS FIRST, aggr_optimize_not_count.c ASC NULLS FIRST, aggr_optimize_not_count.d ASC NULLS FIRST, aggr_optimize_not_count.greptime_timestamp ASC NULLS FIRST |
| | MergeScan [is_placeholder=false, remote_input=[ |
| | Filter: prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)) IS NOT NULL |
| | Projection: aggr_optimize_not_count.greptime_timestamp, prom_rate(greptime_timestamp_range, greptime_value, aggr_optimize_not_count.greptime_timestamp, Int64(120000)) AS prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)), aggr_optimize_not_count.a, aggr_optimize_not_count.b, aggr_optimize_not_count.c, aggr_optimize_not_count.d |
| | PromRangeManipulate: req range=[0..0], interval=[300000], eval range=[120000], time index=[greptime_timestamp], values=["greptime_value"] |
| | PromSeriesNormalize: offset=[0], time index=[greptime_timestamp], filter NaN: [true] |
| | PromSeriesDivide: tags=["a", "b", "c", "d"] |
| | Sort: aggr_optimize_not_count.a ASC NULLS FIRST, aggr_optimize_not_count.b ASC NULLS FIRST, aggr_optimize_not_count.c ASC NULLS FIRST, aggr_optimize_not_count.d ASC NULLS FIRST, aggr_optimize_not_count.greptime_timestamp ASC NULLS FIRST |
| | Filter: aggr_optimize_not_count.greptime_timestamp >= TimestampMillisecond(-420000, None) AND aggr_optimize_not_count.greptime_timestamp <= TimestampMillisecond(300000, None) |
| | TableScan: aggr_optimize_not_count |
| | ]] |
| physical_plan | ProjectionExec: expr=[a@1 as a, b@2 as b, c@3 as c, greptime_timestamp@4 as greptime_timestamp, sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))@0 / sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))@5 as aggr_optimize_not.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) / aggr_optimize_not_count.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))] |
| | CoalesceBatchesExec: target_batch_size=8192 |
| | REDACTED
| | AggregateExec: mode=FinalPartitioned, gby=[a@0 as a, b@1 as b, c@2 as c, greptime_timestamp@3 as greptime_timestamp], aggr=[sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))], ordering_mode=PartiallySorted([0, 1, 2]) |
| | SortExec: expr=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST], preserve_partitioning=[true] |
| | CoalesceBatchesExec: target_batch_size=8192 |
| | RepartitionExec: partitioning=REDACTED
| | AggregateExec: mode=Partial, gby=[a@2 as a, b@3 as b, c@4 as c, greptime_timestamp@0 as greptime_timestamp], aggr=[sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))], ordering_mode=PartiallySorted([0, 1, 2]) |
| | ProjectionExec: expr=[greptime_timestamp@0 as greptime_timestamp, prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))@1 as prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)), a@2 as a, b@3 as b, c@4 as c] |
| | SortExec: expr=[a@2 ASC, b@3 ASC, c@4 ASC, d@5 ASC, greptime_timestamp@0 ASC], preserve_partitioning=[true] |
| | MergeScanExec: REDACTED
| | CoalesceBatchesExec: target_batch_size=8192 |
| | RepartitionExec: partitioning=REDACTED
| | CoalescePartitionsExec |
| | AggregateExec: mode=FinalPartitioned, gby=[a@0 as a, b@1 as b, c@2 as c, greptime_timestamp@3 as greptime_timestamp], aggr=[sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))], ordering_mode=PartiallySorted([0, 1, 2]) |
| | SortExec: expr=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST], preserve_partitioning=[true] |
| | CoalesceBatchesExec: target_batch_size=8192 |
| | RepartitionExec: partitioning=REDACTED
| | AggregateExec: mode=Partial, gby=[a@2 as a, b@3 as b, c@4 as c, greptime_timestamp@0 as greptime_timestamp], aggr=[sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))], ordering_mode=PartiallySorted([0, 1, 2]) |
| | ProjectionExec: expr=[greptime_timestamp@0 as greptime_timestamp, prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))@1 as prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)), a@2 as a, b@3 as b, c@4 as c] |
| | SortExec: expr=[a@2 ASC, b@3 ASC, c@4 ASC, d@5 ASC, greptime_timestamp@0 ASC], preserve_partitioning=[true] |
| | MergeScanExec: REDACTED
| | |
+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-- SQLNESS REPLACE (metrics.*) REDACTED
-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
-- SQLNESS REPLACE (Hash.*) REDACTED
-- SQLNESS REPLACE (-+) -
-- SQLNESS REPLACE (\s\s+) _
-- SQLNESS REPLACE (peers.*) REDACTED
-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED
tql analyze (1752591864, 1752592164, '30s') sum by (a, b, c) (rate(aggr_optimize_not [2m])) / sum by (a, b, c) (rate(aggr_optimize_not_count [2m]));
+-+-+-+
| stage | node | plan_|
+-+-+-+
| 0_| 0_|_ProjectionExec: expr=[a@1 as a, b@2 as b, c@3 as c, greptime_timestamp@4 as greptime_timestamp, sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))@0 / sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))@5 as aggr_optimize_not.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) / aggr_optimize_not_count.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))] REDACTED
|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED
|_|_|_REDACTED
|_|_|_AggregateExec: mode=FinalPartitioned, gby=[a@0 as a, b@1 as b, c@2 as c, greptime_timestamp@3 as greptime_timestamp], aggr=[sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))], ordering_mode=PartiallySorted([0, 1, 2]) REDACTED
|_|_|_SortExec: expr=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST], preserve_partitioning=[true] REDACTED
|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED
|_|_|_RepartitionExec: partitioning=REDACTED
|_|_|_AggregateExec: mode=Partial, gby=[a@2 as a, b@3 as b, c@4 as c, greptime_timestamp@0 as greptime_timestamp], aggr=[sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))], ordering_mode=PartiallySorted([0, 1, 2]) REDACTED
|_|_|_ProjectionExec: expr=[greptime_timestamp@0 as greptime_timestamp, prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))@1 as prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)), a@2 as a, b@3 as b, c@4 as c] REDACTED
|_|_|_SortExec: expr=[a@2 ASC, b@3 ASC, c@4 ASC, d@5 ASC, greptime_timestamp@0 ASC], preserve_partitioning=[true] REDACTED
|_|_|_MergeScanExec: REDACTED
|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED
|_|_|_RepartitionExec: partitioning=REDACTED
|_|_|_CoalescePartitionsExec REDACTED
|_|_|_AggregateExec: mode=FinalPartitioned, gby=[a@0 as a, b@1 as b, c@2 as c, greptime_timestamp@3 as greptime_timestamp], aggr=[sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))], ordering_mode=PartiallySorted([0, 1, 2]) REDACTED
|_|_|_SortExec: expr=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST], preserve_partitioning=[true] REDACTED
|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED
|_|_|_RepartitionExec: partitioning=REDACTED
|_|_|_AggregateExec: mode=Partial, gby=[a@2 as a, b@3 as b, c@4 as c, greptime_timestamp@0 as greptime_timestamp], aggr=[sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))], ordering_mode=PartiallySorted([0, 1, 2]) REDACTED
|_|_|_ProjectionExec: expr=[greptime_timestamp@0 as greptime_timestamp, prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))@1 as prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)), a@2 as a, b@3 as b, c@4 as c] REDACTED
|_|_|_SortExec: expr=[a@2 ASC, b@3 ASC, c@4 ASC, d@5 ASC, greptime_timestamp@0 ASC], preserve_partitioning=[true] REDACTED
|_|_|_MergeScanExec: REDACTED
|_|_|_|
| 1_| 0_|_ProjectionExec: expr=[greptime_timestamp@0 as greptime_timestamp, prom_rate(greptime_timestamp_range,greptime_value,aggr_optimize_not.greptime_timestamp,Int64(120000))@1 as prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)), a@2 as a, b@3 as b, c@4 as c, d@5 as d] REDACTED
|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED
|_|_|_FilterExec: prom_rate(greptime_timestamp_range,greptime_value,aggr_optimize_not.greptime_timestamp,Int64(120000))@1 IS NOT NULL REDACTED
|_|_|_ProjectionExec: expr=[greptime_timestamp@4 as greptime_timestamp, prom_rate(greptime_timestamp_range@6, greptime_value@5, greptime_timestamp@4, 120000) as prom_rate(greptime_timestamp_range,greptime_value,aggr_optimize_not.greptime_timestamp,Int64(120000)), a@0 as a, b@1 as b, c@2 as c, d@3 as d] REDACTED
|_|_|_PromRangeManipulateExec: req range=[1752591864000..1752592164000], interval=[30000], eval range=[120000], time index=[greptime_timestamp] REDACTED
|_|_|_PromSeriesNormalizeExec: offset=[0], time index=[greptime_timestamp], filter NaN: [true] REDACTED
|_|_|_PromSeriesDivideExec: tags=["a", "b", "c", "d"] REDACTED
|_|_|_SeriesScan: region=REDACTED, "partition_count":{"count":0, "mem_ranges":0, "files":0, "file_ranges":0}, "distribution":"PerSeries" REDACTED
|_|_|_|
| 1_| 1_|_ProjectionExec: expr=[greptime_timestamp@0 as greptime_timestamp, prom_rate(greptime_timestamp_range,greptime_value,aggr_optimize_not.greptime_timestamp,Int64(120000))@1 as prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)), a@2 as a, b@3 as b, c@4 as c, d@5 as d] REDACTED
|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED
|_|_|_FilterExec: prom_rate(greptime_timestamp_range,greptime_value,aggr_optimize_not.greptime_timestamp,Int64(120000))@1 IS NOT NULL REDACTED
|_|_|_ProjectionExec: expr=[greptime_timestamp@4 as greptime_timestamp, prom_rate(greptime_timestamp_range@6, greptime_value@5, greptime_timestamp@4, 120000) as prom_rate(greptime_timestamp_range,greptime_value,aggr_optimize_not.greptime_timestamp,Int64(120000)), a@0 as a, b@1 as b, c@2 as c, d@3 as d] REDACTED
|_|_|_PromRangeManipulateExec: req range=[1752591864000..1752592164000], interval=[30000], eval range=[120000], time index=[greptime_timestamp] REDACTED
|_|_|_PromSeriesNormalizeExec: offset=[0], time index=[greptime_timestamp], filter NaN: [true] REDACTED
|_|_|_PromSeriesDivideExec: tags=["a", "b", "c", "d"] REDACTED
|_|_|_SeriesScan: region=REDACTED, "partition_count":{"count":0, "mem_ranges":0, "files":0, "file_ranges":0}, "distribution":"PerSeries" REDACTED
|_|_|_|
| 1_| 0_|_ProjectionExec: expr=[greptime_timestamp@0 as greptime_timestamp, prom_rate(greptime_timestamp_range,greptime_value,aggr_optimize_not_count.greptime_timestamp,Int64(120000))@1 as prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)), a@2 as a, b@3 as b, c@4 as c, d@5 as d] REDACTED
|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED
|_|_|_FilterExec: prom_rate(greptime_timestamp_range,greptime_value,aggr_optimize_not_count.greptime_timestamp,Int64(120000))@1 IS NOT NULL REDACTED
|_|_|_ProjectionExec: expr=[greptime_timestamp@4 as greptime_timestamp, prom_rate(greptime_timestamp_range@6, greptime_value@5, greptime_timestamp@4, 120000) as prom_rate(greptime_timestamp_range,greptime_value,aggr_optimize_not_count.greptime_timestamp,Int64(120000)), a@0 as a, b@1 as b, c@2 as c, d@3 as d] REDACTED
|_|_|_PromRangeManipulateExec: req range=[1752591864000..1752592164000], interval=[30000], eval range=[120000], time index=[greptime_timestamp] REDACTED
|_|_|_PromSeriesNormalizeExec: offset=[0], time index=[greptime_timestamp], filter NaN: [true] REDACTED
|_|_|_PromSeriesDivideExec: tags=["a", "b", "c", "d"] REDACTED
|_|_|_SeriesScan: region=REDACTED, "partition_count":{"count":0, "mem_ranges":0, "files":0, "file_ranges":0}, "distribution":"PerSeries" REDACTED
|_|_|_|
| 1_| 1_|_ProjectionExec: expr=[greptime_timestamp@0 as greptime_timestamp, prom_rate(greptime_timestamp_range,greptime_value,aggr_optimize_not_count.greptime_timestamp,Int64(120000))@1 as prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)), a@2 as a, b@3 as b, c@4 as c, d@5 as d] REDACTED
|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED
|_|_|_FilterExec: prom_rate(greptime_timestamp_range,greptime_value,aggr_optimize_not_count.greptime_timestamp,Int64(120000))@1 IS NOT NULL REDACTED
|_|_|_ProjectionExec: expr=[greptime_timestamp@4 as greptime_timestamp, prom_rate(greptime_timestamp_range@6, greptime_value@5, greptime_timestamp@4, 120000) as prom_rate(greptime_timestamp_range,greptime_value,aggr_optimize_not_count.greptime_timestamp,Int64(120000)), a@0 as a, b@1 as b, c@2 as c, d@3 as d] REDACTED
|_|_|_PromRangeManipulateExec: req range=[1752591864000..1752592164000], interval=[30000], eval range=[120000], time index=[greptime_timestamp] REDACTED
|_|_|_PromSeriesNormalizeExec: offset=[0], time index=[greptime_timestamp], filter NaN: [true] REDACTED
|_|_|_PromSeriesDivideExec: tags=["a", "b", "c", "d"] REDACTED
|_|_|_SeriesScan: region=REDACTED, "partition_count":{"count":0, "mem_ranges":0, "files":0, "file_ranges":0}, "distribution":"PerSeries" REDACTED
|_|_|_|
|_|_| Total rows: 0_|
+-+-+-+
-- Case 7: aggregate without sort should be pushed down. This one pushes down because the group by includes all partition columns.
-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
-- SQLNESS REPLACE (peers.*) REDACTED
-- SQLNESS REPLACE (Hash.*) REDACTED
EXPLAIN
SELECT
min(greptime_value)
FROM
aggr_optimize_not
GROUP BY
a,
b,
c;
+---------------+----------------------------------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+---------------+----------------------------------------------------------------------------------------------------------------------------------------+
| logical_plan | MergeScan [is_placeholder=false, remote_input=[ |
| | Projection: min(aggr_optimize_not.greptime_value) |
| | Aggregate: groupBy=[[aggr_optimize_not.a, aggr_optimize_not.b, aggr_optimize_not.c]], aggr=[[min(aggr_optimize_not.greptime_value)]] |
| | TableScan: aggr_optimize_not |
| | ]] |
| physical_plan | MergeScanExec: REDACTED
| | |
+---------------+----------------------------------------------------------------------------------------------------------------------------------------+
-- SQLNESS REPLACE (metrics.*) REDACTED
-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
-- SQLNESS REPLACE (Hash.*) REDACTED
-- SQLNESS REPLACE (-+) -
-- SQLNESS REPLACE (\s\s+) _
-- SQLNESS REPLACE (peers.*) REDACTED
-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED
EXPLAIN ANALYZE
SELECT
min(greptime_value)
FROM
aggr_optimize_not
GROUP BY
a,
b,
c;
+-+-+-+
| stage | node | plan_|
+-+-+-+
| 0_| 0_|_MergeScanExec: REDACTED
|_|_|_|
| 1_| 0_|_ProjectionExec: expr=[min(aggr_optimize_not.greptime_value)@3 as min(aggr_optimize_not.greptime_value)] REDACTED
|_|_|_AggregateExec: mode=FinalPartitioned, gby=[a@0 as a, b@1 as b, c@2 as c], aggr=[min(aggr_optimize_not.greptime_value)] REDACTED
|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED
|_|_|_RepartitionExec: partitioning=REDACTED
|_|_|_AggregateExec: mode=Partial, gby=[a@0 as a, b@1 as b, c@2 as c], aggr=[min(aggr_optimize_not.greptime_value)] REDACTED
|_|_|_SeqScan: region=REDACTED, "partition_count":{"count":0, "mem_ranges":0, "files":0, "file_ranges":0} REDACTED
|_|_|_|
| 1_| 1_|_ProjectionExec: expr=[min(aggr_optimize_not.greptime_value)@3 as min(aggr_optimize_not.greptime_value)] REDACTED
|_|_|_AggregateExec: mode=FinalPartitioned, gby=[a@0 as a, b@1 as b, c@2 as c], aggr=[min(aggr_optimize_not.greptime_value)] REDACTED
|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED
|_|_|_RepartitionExec: partitioning=REDACTED
|_|_|_AggregateExec: mode=Partial, gby=[a@0 as a, b@1 as b, c@2 as c], aggr=[min(aggr_optimize_not.greptime_value)] REDACTED
|_|_|_SeqScan: region=REDACTED, "partition_count":{"count":0, "mem_ranges":0, "files":0, "file_ranges":0} REDACTED
|_|_|_|
|_|_| Total rows: 0_|
+-+-+-+
-- Case 8: aggregate without sort should be pushed down. This one pushes down because the group by includes all partition columns plus an extra column.
-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
-- SQLNESS REPLACE (peers.*) REDACTED
-- SQLNESS REPLACE (Hash.*) REDACTED
EXPLAIN
SELECT
min(greptime_value)
FROM
aggr_optimize_not
GROUP BY
a,
b,
c,
d;
+---------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+---------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------+
| logical_plan | MergeScan [is_placeholder=false, remote_input=[ |
| | Projection: min(aggr_optimize_not.greptime_value) |
| | Aggregate: groupBy=[[aggr_optimize_not.a, aggr_optimize_not.b, aggr_optimize_not.c, aggr_optimize_not.d]], aggr=[[min(aggr_optimize_not.greptime_value)]] |
| | TableScan: aggr_optimize_not |
| | ]] |
| physical_plan | MergeScanExec: REDACTED
| | |
+---------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------+
-- SQLNESS REPLACE (metrics.*) REDACTED
-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
-- SQLNESS REPLACE (Hash.*) REDACTED
-- SQLNESS REPLACE (-+) -
-- SQLNESS REPLACE (\s\s+) _
-- SQLNESS REPLACE (peers.*) REDACTED
-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED
EXPLAIN ANALYZE
SELECT
min(greptime_value)
FROM
aggr_optimize_not
GROUP BY
a,
b,
c,
d;
+-+-+-+
| stage | node | plan_|
+-+-+-+
| 0_| 0_|_MergeScanExec: REDACTED
|_|_|_|
| 1_| 0_|_ProjectionExec: expr=[min(aggr_optimize_not.greptime_value)@4 as min(aggr_optimize_not.greptime_value)] REDACTED
|_|_|_AggregateExec: mode=FinalPartitioned, gby=[a@0 as a, b@1 as b, c@2 as c, d@3 as d], aggr=[min(aggr_optimize_not.greptime_value)] REDACTED
|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED
|_|_|_RepartitionExec: partitioning=REDACTED
|_|_|_AggregateExec: mode=Partial, gby=[a@0 as a, b@1 as b, c@2 as c, d@3 as d], aggr=[min(aggr_optimize_not.greptime_value)] REDACTED
|_|_|_SeqScan: region=REDACTED, "partition_count":{"count":0, "mem_ranges":0, "files":0, "file_ranges":0} REDACTED
|_|_|_|
| 1_| 1_|_ProjectionExec: expr=[min(aggr_optimize_not.greptime_value)@4 as min(aggr_optimize_not.greptime_value)] REDACTED
|_|_|_AggregateExec: mode=FinalPartitioned, gby=[a@0 as a, b@1 as b, c@2 as c, d@3 as d], aggr=[min(aggr_optimize_not.greptime_value)] REDACTED
|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED
|_|_|_RepartitionExec: partitioning=REDACTED
|_|_|_AggregateExec: mode=Partial, gby=[a@0 as a, b@1 as b, c@2 as c, d@3 as d], aggr=[min(aggr_optimize_not.greptime_value)] REDACTED
|_|_|_SeqScan: region=REDACTED, "partition_count":{"count":0, "mem_ranges":0, "files":0, "file_ranges":0} REDACTED
|_|_|_|
|_|_| Total rows: 0_|
+-+-+-+
-- Case 9: aggregate without sort should be pushed down. This one uses step aggregation push down.
-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
-- SQLNESS REPLACE (peers.*) REDACTED
-- SQLNESS REPLACE (Hash.*) REDACTED
EXPLAIN
SELECT
min(greptime_value)
FROM
aggr_optimize_not
GROUP BY
a,
b;
+---------------+------------------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+---------------+------------------------------------------------------------------------------------------------------------------------+
| logical_plan | Projection: min(min(aggr_optimize_not.greptime_value)) AS min(aggr_optimize_not.greptime_value) |
| | Aggregate: groupBy=[[aggr_optimize_not.a, aggr_optimize_not.b]], aggr=[[min(min(aggr_optimize_not.greptime_value))]] |
| | MergeScan [is_placeholder=false, remote_input=[ |
| | Aggregate: groupBy=[[aggr_optimize_not.a, aggr_optimize_not.b]], aggr=[[min(aggr_optimize_not.greptime_value)]] |
| | TableScan: aggr_optimize_not |
| | ]] |
| physical_plan | ProjectionExec: expr=[min(min(aggr_optimize_not.greptime_value))@2 as min(aggr_optimize_not.greptime_value)] |
| | AggregateExec: mode=SinglePartitioned, gby=[a@0 as a, b@1 as b], aggr=[min(min(aggr_optimize_not.greptime_value))] |
| | MergeScanExec: REDACTED
| | |
+---------------+------------------------------------------------------------------------------------------------------------------------+
-- SQLNESS REPLACE (metrics.*) REDACTED
-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
-- SQLNESS REPLACE (Hash.*) REDACTED
-- SQLNESS REPLACE (-+) -
-- SQLNESS REPLACE (\s\s+) _
-- SQLNESS REPLACE (peers.*) REDACTED
-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED
EXPLAIN ANALYZE
SELECT
min(greptime_value)
FROM
aggr_optimize_not
GROUP BY
a,
b;
+-+-+-+
| stage | node | plan_|
+-+-+-+
| 0_| 0_|_ProjectionExec: expr=[min(min(aggr_optimize_not.greptime_value))@2 as min(aggr_optimize_not.greptime_value)] REDACTED
|_|_|_AggregateExec: mode=SinglePartitioned, gby=[a@0 as a, b@1 as b], aggr=[min(min(aggr_optimize_not.greptime_value))] REDACTED
|_|_|_MergeScanExec: REDACTED
|_|_|_|
| 1_| 0_|_AggregateExec: mode=FinalPartitioned, gby=[a@0 as a, b@1 as b], aggr=[min(aggr_optimize_not.greptime_value)] REDACTED
|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED
|_|_|_RepartitionExec: partitioning=REDACTED
|_|_|_AggregateExec: mode=Partial, gby=[a@0 as a, b@1 as b], aggr=[min(aggr_optimize_not.greptime_value)] REDACTED
|_|_|_SeqScan: region=REDACTED, "partition_count":{"count":0, "mem_ranges":0, "files":0, "file_ranges":0} REDACTED
|_|_|_|
| 1_| 1_|_AggregateExec: mode=FinalPartitioned, gby=[a@0 as a, b@1 as b], aggr=[min(aggr_optimize_not.greptime_value)] REDACTED
|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED
|_|_|_RepartitionExec: partitioning=REDACTED
|_|_|_AggregateExec: mode=Partial, gby=[a@0 as a, b@1 as b], aggr=[min(aggr_optimize_not.greptime_value)] REDACTED
|_|_|_SeqScan: region=REDACTED, "partition_count":{"count":0, "mem_ranges":0, "files":0, "file_ranges":0} REDACTED
|_|_|_|
|_|_| Total rows: 0_|
+-+-+-+
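Case 9 above is the step aggregation pattern: each region computes a partial MIN per (a, b), and the frontend combines the partials with min(min(...)). Below is a minimal sketch of that two-level shape written as plain SQL; `region_partial` is an assumed name used only for illustration, and the sketch relies on min-of-mins equaling the global min.

-- Sketch of the two-level plan above, written as plain SQL:
-- the inner aggregate plays the role of the per-region (remote) aggregation,
-- the outer aggregate plays the role of the frontend's min(min(...)).
WITH region_partial AS (
    SELECT a, b, min(greptime_value) AS region_min
    FROM aggr_optimize_not
    GROUP BY a, b
)
SELECT a, b, min(region_min) AS final_min
FROM region_partial
GROUP BY a, b;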
-- Case 10: aggregate without sort should be pushed down. This one uses step aggregation push down with a complex aggregation expression.
-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
-- SQLNESS REPLACE (peers.*) REDACTED
-- SQLNESS REPLACE (Hash.*) REDACTED
EXPLAIN
SELECT
min(greptime_value) + max(greptime_value)
FROM
aggr_optimize_not
GROUP BY
a,
b;
+---------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+---------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| logical_plan | Projection: min(min(aggr_optimize_not.greptime_value)) + max(max(aggr_optimize_not.greptime_value)) AS min(aggr_optimize_not.greptime_value) + max(aggr_optimize_not.greptime_value) |
| | Aggregate: groupBy=[[aggr_optimize_not.a, aggr_optimize_not.b]], aggr=[[min(min(aggr_optimize_not.greptime_value)), max(max(aggr_optimize_not.greptime_value))]] |
| | MergeScan [is_placeholder=false, remote_input=[ |
| | Aggregate: groupBy=[[aggr_optimize_not.a, aggr_optimize_not.b]], aggr=[[min(aggr_optimize_not.greptime_value), max(aggr_optimize_not.greptime_value)]] |
| | TableScan: aggr_optimize_not |
| | ]] |
| physical_plan | ProjectionExec: expr=[min(min(aggr_optimize_not.greptime_value))@2 + max(max(aggr_optimize_not.greptime_value))@3 as min(aggr_optimize_not.greptime_value) + max(aggr_optimize_not.greptime_value)] |
| | AggregateExec: mode=SinglePartitioned, gby=[a@0 as a, b@1 as b], aggr=[min(min(aggr_optimize_not.greptime_value)), max(max(aggr_optimize_not.greptime_value))] |
| | MergeScanExec: REDACTED
| | |
+---------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-- SQLNESS REPLACE (metrics.*) REDACTED
-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
-- SQLNESS REPLACE (Hash.*) REDACTED
-- SQLNESS REPLACE (-+) -
-- SQLNESS REPLACE (\s\s+) _
-- SQLNESS REPLACE (peers.*) REDACTED
-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED
EXPLAIN ANALYZE
SELECT
min(greptime_value) + max(greptime_value)
FROM
aggr_optimize_not
GROUP BY
a,
b;
+-+-+-+
| stage | node | plan_|
+-+-+-+
| 0_| 0_|_ProjectionExec: expr=[min(min(aggr_optimize_not.greptime_value))@2 + max(max(aggr_optimize_not.greptime_value))@3 as min(aggr_optimize_not.greptime_value) + max(aggr_optimize_not.greptime_value)] REDACTED
|_|_|_AggregateExec: mode=SinglePartitioned, gby=[a@0 as a, b@1 as b], aggr=[min(min(aggr_optimize_not.greptime_value)), max(max(aggr_optimize_not.greptime_value))] REDACTED
|_|_|_MergeScanExec: REDACTED
|_|_|_|
| 1_| 0_|_AggregateExec: mode=FinalPartitioned, gby=[a@0 as a, b@1 as b], aggr=[min(aggr_optimize_not.greptime_value), max(aggr_optimize_not.greptime_value)] REDACTED
|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED
|_|_|_RepartitionExec: partitioning=REDACTED
|_|_|_AggregateExec: mode=Partial, gby=[a@0 as a, b@1 as b], aggr=[min(aggr_optimize_not.greptime_value), max(aggr_optimize_not.greptime_value)] REDACTED
|_|_|_SeqScan: region=REDACTED, "partition_count":{"count":0, "mem_ranges":0, "files":0, "file_ranges":0} REDACTED
|_|_|_|
| 1_| 1_|_AggregateExec: mode=FinalPartitioned, gby=[a@0 as a, b@1 as b], aggr=[min(aggr_optimize_not.greptime_value), max(aggr_optimize_not.greptime_value)] REDACTED
|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED
|_|_|_RepartitionExec: partitioning=REDACTED
|_|_|_AggregateExec: mode=Partial, gby=[a@0 as a, b@1 as b], aggr=[min(aggr_optimize_not.greptime_value), max(aggr_optimize_not.greptime_value)] REDACTED
|_|_|_SeqScan: region=REDACTED, "partition_count":{"count":0, "mem_ranges":0, "files":0, "file_ranges":0} REDACTED
|_|_|_|
|_|_| Total rows: 0_|
+-+-+-+
-- Case 11: aggregate with subquery
-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
-- SQLNESS REPLACE (peers.*) REDACTED
-- SQLNESS REPLACE (Hash.*) REDACTED
EXPLAIN
SELECT
a,
min(greptime_value)
FROM
(
SELECT
a,
b,
greptime_value
FROM
aggr_optimize_not
ORDER BY
a,
b
)
GROUP BY
a;
+---------------+------------------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+---------------+------------------------------------------------------------------------------------------------------------------------+
| logical_plan | Projection: aggr_optimize_not.a, min(min(aggr_optimize_not.greptime_value)) AS min(aggr_optimize_not.greptime_value) |
| | Aggregate: groupBy=[[aggr_optimize_not.a]], aggr=[[min(min(aggr_optimize_not.greptime_value))]] |
| | MergeScan [is_placeholder=false, remote_input=[ |
| | Aggregate: groupBy=[[aggr_optimize_not.a]], aggr=[[min(aggr_optimize_not.greptime_value)]] |
| | Projection: aggr_optimize_not.a, aggr_optimize_not.b, aggr_optimize_not.greptime_value |
| | TableScan: aggr_optimize_not |
| | ]] |
| physical_plan | ProjectionExec: expr=[a@0 as a, min(min(aggr_optimize_not.greptime_value))@1 as min(aggr_optimize_not.greptime_value)] |
| | AggregateExec: mode=SinglePartitioned, gby=[a@0 as a], aggr=[min(min(aggr_optimize_not.greptime_value))] |
| | MergeScanExec: REDACTED
| | |
+---------------+------------------------------------------------------------------------------------------------------------------------+
-- SQLNESS REPLACE (metrics.*) REDACTED
-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
-- SQLNESS REPLACE (Hash.*) REDACTED
-- SQLNESS REPLACE (-+) -
-- SQLNESS REPLACE (\s\s+) _
-- SQLNESS REPLACE (peers.*) REDACTED
-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED
EXPLAIN ANALYZE
SELECT
a,
min(greptime_value)
FROM
(
SELECT
a,
b,
greptime_value
FROM
aggr_optimize_not
ORDER BY
a,
b
)
GROUP BY
a;
+-+-+-+
| stage | node | plan_|
+-+-+-+
| 0_| 0_|_ProjectionExec: expr=[a@0 as a, min(min(aggr_optimize_not.greptime_value))@1 as min(aggr_optimize_not.greptime_value)] REDACTED
|_|_|_AggregateExec: mode=SinglePartitioned, gby=[a@0 as a], aggr=[min(min(aggr_optimize_not.greptime_value))] REDACTED
|_|_|_MergeScanExec: REDACTED
|_|_|_|
| 1_| 0_|_AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[min(aggr_optimize_not.greptime_value)] REDACTED
|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED
|_|_|_RepartitionExec: partitioning=REDACTED
|_|_|_AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[min(aggr_optimize_not.greptime_value)] REDACTED
|_|_|_SeqScan: region=REDACTED, "partition_count":{"count":0, "mem_ranges":0, "files":0, "file_ranges":0} REDACTED
|_|_|_|
| 1_| 1_|_AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[min(aggr_optimize_not.greptime_value)] REDACTED
|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED
|_|_|_RepartitionExec: partitioning=REDACTED
|_|_|_AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[min(aggr_optimize_not.greptime_value)] REDACTED
|_|_|_SeqScan: region=REDACTED, "partition_count":{"count":0, "mem_ranges":0, "files":0, "file_ranges":0} REDACTED
|_|_|_|
|_|_| Total rows: 0_|
+-+-+-+
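Case 11 above works because the subquery's ORDER BY has no effect on the outer aggregate, so the planner can treat the query as if it aggregated the base table directly. The sketch below shows that equivalent form; it is an interpretation of the plan above, not text taken from it.

-- Equivalent query once the inner ORDER BY is discarded:
SELECT a, min(greptime_value)
FROM aggr_optimize_not
GROUP BY a;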
drop table aggr_optimize_not_count;
Affected Rows: 0
drop table aggr_optimize_not;
Affected Rows: 0

View File

@@ -0,0 +1,307 @@
CREATE TABLE IF NOT EXISTS aggr_optimize_not (
a STRING NULL,
b STRING NULL,
c STRING NULL,
d STRING NULL,
greptime_timestamp TIMESTAMP(3) NOT NULL,
greptime_value DOUBLE NULL,
TIME INDEX (greptime_timestamp),
PRIMARY KEY (a, b, c, d)
) PARTITION ON COLUMNS (a, b, c) (a < 'b', a >= 'b',);
-- Case 0: group by columns are the same as the partition columns.
-- This query shouldn't push down the aggregation even though the group by columns
-- match the partition columns, because the sort is already pushed down.
-- Pushing the aggregation down as well would produce a wrong result.
-- Explain at 0s, 5s and 10s. No point at 0s.
-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
-- SQLNESS REPLACE (peers.*) REDACTED
-- SQLNESS REPLACE (Hash.*) REDACTED
tql explain (1752591864, 1752592164, '30s') max by (a, b, c) (max_over_time(aggr_optimize_not [2m]));
-- SQLNESS REPLACE (metrics.*) REDACTED
-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
-- SQLNESS REPLACE (Hash.*) REDACTED
-- SQLNESS REPLACE (-+) -
-- SQLNESS REPLACE (\s\s+) _
-- SQLNESS REPLACE (peers.*) REDACTED
-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED
tql analyze (1752591864, 1752592164, '30s') max by (a, b, c) (max_over_time(aggr_optimize_not [2m]));
-- Case 1: group by columns are a prefix of the partition columns.
-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
-- SQLNESS REPLACE (peers.*) REDACTED
-- SQLNESS REPLACE (Hash.*) REDACTED
tql explain (1752591864, 1752592164, '30s') sum by (a, b) (max_over_time(aggr_optimize_not [2m]));
-- SQLNESS REPLACE (metrics.*) REDACTED
-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
-- SQLNESS REPLACE (Hash.*) REDACTED
-- SQLNESS REPLACE (-+) -
-- SQLNESS REPLACE (\s\s+) _
-- SQLNESS REPLACE (peers.*) REDACTED
-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED
tql analyze (1752591864, 1752592164, '30s') sum by (a, b) (max_over_time(aggr_optimize_not [2m]));
-- Case 2: group by columns are a prefix of the partition columns.
-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
-- SQLNESS REPLACE (peers.*) REDACTED
-- SQLNESS REPLACE (Hash.*) REDACTED
tql explain (1752591864, 1752592164, '30s') avg by (a) (max_over_time(aggr_optimize_not [2m]));
-- SQLNESS REPLACE (metrics.*) REDACTED
-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
-- SQLNESS REPLACE (Hash.*) REDACTED
-- SQLNESS REPLACE (-+) -
-- SQLNESS REPLACE (\s\s+) _
-- SQLNESS REPLACE (peers.*) REDACTED
-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED
tql analyze (1752591864, 1752592164, '30s') avg by (a) (max_over_time(aggr_optimize_not [2m]));
-- Case 3: group by columns are a superset of the partition columns.
-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
-- SQLNESS REPLACE (peers.*) REDACTED
-- SQLNESS REPLACE (Hash.*) REDACTED
tql explain (1752591864, 1752592164, '30s') count by (a, b, c, d) (max_over_time(aggr_optimize_not [2m]));
-- SQLNESS REPLACE (metrics.*) REDACTED
-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
-- SQLNESS REPLACE (Hash.*) REDACTED
-- SQLNESS REPLACE (-+) -
-- SQLNESS REPLACE (\s\s+) _
-- SQLNESS REPLACE (peers.*) REDACTED
-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED
tql analyze (1752591864, 1752592164, '30s') count by (a, b, c, d) (max_over_time(aggr_optimize_not [2m]));
-- Case 4: group by columns are not a prefix of the partition columns.
-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
-- SQLNESS REPLACE (peers.*) REDACTED
-- SQLNESS REPLACE (Hash.*) REDACTED
tql explain (1752591864, 1752592164, '30s') min by (b, c, d) (max_over_time(aggr_optimize_not [2m]));
-- SQLNESS REPLACE (metrics.*) REDACTED
-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
-- SQLNESS REPLACE (Hash.*) REDACTED
-- SQLNESS REPLACE (-+) -
-- SQLNESS REPLACE (\s\s+) _
-- SQLNESS REPLACE (peers.*) REDACTED
-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED
tql analyze (1752591864, 1752592164, '30s') min by (b, c, d) (max_over_time(aggr_optimize_not [2m]));
-- Case 5: a simple sum
-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
-- SQLNESS REPLACE (peers.*) REDACTED
-- SQLNESS REPLACE (Hash.*) REDACTED
tql explain sum(aggr_optimize_not);
-- SQLNESS REPLACE (metrics.*) REDACTED
-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
-- SQLNESS REPLACE (Hash.*) REDACTED
-- SQLNESS REPLACE (-+) -
-- SQLNESS REPLACE (\s\s+) _
-- SQLNESS REPLACE (peers.*) REDACTED
-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED
tql analyze sum(aggr_optimize_not);
-- TODO(discord9): more cases for aggr push down interacting with partitioning & TQL
CREATE TABLE IF NOT EXISTS aggr_optimize_not_count (
a STRING NULL,
b STRING NULL,
c STRING NULL,
d STRING NULL,
greptime_timestamp TIMESTAMP(3) NOT NULL,
greptime_value DOUBLE NULL,
TIME INDEX (greptime_timestamp),
PRIMARY KEY (a, b, c, d)
) PARTITION ON COLUMNS (a, b, c) (a < 'b', a >= 'b',);
-- Case 6: Test average rate (sum/count like)
-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
-- SQLNESS REPLACE (peers.*) REDACTED
-- SQLNESS REPLACE (Hash.*) REDACTED
tql explain (1752591864, 1752592164, '30s') sum by (a, b, c) (rate(aggr_optimize_not [2m])) / sum by (a, b, c) (rate(aggr_optimize_not_count [2m]));
-- SQLNESS REPLACE (metrics.*) REDACTED
-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
-- SQLNESS REPLACE (Hash.*) REDACTED
-- SQLNESS REPLACE (-+) -
-- SQLNESS REPLACE (\s\s+) _
-- SQLNESS REPLACE (peers.*) REDACTED
-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED
tql analyze (1752591864, 1752592164, '30s') sum by (a, b, c) (rate(aggr_optimize_not [2m])) / sum by (a, b, c) (rate(aggr_optimize_not_count [2m]));
-- Case 7: aggregate without sort should be pushed down. This one pushes down because the group by includes all partition columns.
-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
-- SQLNESS REPLACE (peers.*) REDACTED
-- SQLNESS REPLACE (Hash.*) REDACTED
EXPLAIN
SELECT
min(greptime_value)
FROM
aggr_optimize_not
GROUP BY
a,
b,
c;
-- SQLNESS REPLACE (metrics.*) REDACTED
-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
-- SQLNESS REPLACE (Hash.*) REDACTED
-- SQLNESS REPLACE (-+) -
-- SQLNESS REPLACE (\s\s+) _
-- SQLNESS REPLACE (peers.*) REDACTED
-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED
EXPLAIN ANALYZE
SELECT
min(greptime_value)
FROM
aggr_optimize_not
GROUP BY
a,
b,
c;
-- Case 8: aggregate without sort should be pushed down. This one pushes down because the group by includes all partition columns plus an extra column.
-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
-- SQLNESS REPLACE (peers.*) REDACTED
-- SQLNESS REPLACE (Hash.*) REDACTED
EXPLAIN
SELECT
min(greptime_value)
FROM
aggr_optimize_not
GROUP BY
a,
b,
c,
d;
-- SQLNESS REPLACE (metrics.*) REDACTED
-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
-- SQLNESS REPLACE (Hash.*) REDACTED
-- SQLNESS REPLACE (-+) -
-- SQLNESS REPLACE (\s\s+) _
-- SQLNESS REPLACE (peers.*) REDACTED
-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED
EXPLAIN ANALYZE
SELECT
min(greptime_value)
FROM
aggr_optimize_not
GROUP BY
a,
b,
c,
d;
-- Case 9: aggregate without sort should be pushed down. This one uses step aggregation push down.
-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
-- SQLNESS REPLACE (peers.*) REDACTED
-- SQLNESS REPLACE (Hash.*) REDACTED
EXPLAIN
SELECT
min(greptime_value)
FROM
aggr_optimize_not
GROUP BY
a,
b;
-- SQLNESS REPLACE (metrics.*) REDACTED
-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
-- SQLNESS REPLACE (Hash.*) REDACTED
-- SQLNESS REPLACE (-+) -
-- SQLNESS REPLACE (\s\s+) _
-- SQLNESS REPLACE (peers.*) REDACTED
-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED
EXPLAIN ANALYZE
SELECT
min(greptime_value)
FROM
aggr_optimize_not
GROUP BY
a,
b;
-- Case 10: aggregate without sort should be pushed down. This one uses step aggregation push down with a complex aggregation expression.
-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
-- SQLNESS REPLACE (peers.*) REDACTED
-- SQLNESS REPLACE (Hash.*) REDACTED
EXPLAIN
SELECT
min(greptime_value) + max(greptime_value)
FROM
aggr_optimize_not
GROUP BY
a,
b;
-- SQLNESS REPLACE (metrics.*) REDACTED
-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
-- SQLNESS REPLACE (Hash.*) REDACTED
-- SQLNESS REPLACE (-+) -
-- SQLNESS REPLACE (\s\s+) _
-- SQLNESS REPLACE (peers.*) REDACTED
-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED
EXPLAIN ANALYZE
SELECT
min(greptime_value) + max(greptime_value)
FROM
aggr_optimize_not
GROUP BY
a,
b;
-- Case 11: aggregate with subquery
-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
-- SQLNESS REPLACE (peers.*) REDACTED
-- SQLNESS REPLACE (Hash.*) REDACTED
EXPLAIN
SELECT
a,
min(greptime_value)
FROM
(
SELECT
a,
b,
greptime_value
FROM
aggr_optimize_not
ORDER BY
a,
b
)
GROUP BY
a;
-- SQLNESS REPLACE (metrics.*) REDACTED
-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
-- SQLNESS REPLACE (Hash.*) REDACTED
-- SQLNESS REPLACE (-+) -
-- SQLNESS REPLACE (\s\s+) _
-- SQLNESS REPLACE (peers.*) REDACTED
-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED
EXPLAIN ANALYZE
SELECT
a,
min(greptime_value)
FROM
(
SELECT
a,
b,
greptime_value
FROM
aggr_optimize_not
ORDER BY
a,
b
)
GROUP BY
a;
drop table aggr_optimize_not_count;
drop table aggr_optimize_not;

View File

@@ -50,7 +50,10 @@ FROM
+-+-+
| logical_plan_| Projection: sum(count(integers.i)) AS count(integers.i)_|
|_|_Aggregate: groupBy=[[]], aggr=[[sum(count(integers.i))]]_|
|_|_MergeScan [is_placeholder=false]_|
|_|_MergeScan [is_placeholder=false, remote_input=[_|
|_| Aggregate: groupBy=[[]], aggr=[[count(integers.i)]]_|
|_|_TableScan: integers_|
|_| ]]_|
| physical_plan | ProjectionExec: expr=[sum(count(integers.i))@0 as count(integers.i)]_|
|_|_AggregateExec: mode=Final, gby=[], aggr=[sum(count(integers.i))]_|
|_|_CoalescePartitionsExec_|
@@ -144,7 +147,10 @@ ORDER BY
| logical_plan_| Sort: integers.ts ASC NULLS LAST, count(integers.i) ASC NULLS LAST_|
|_|_Projection: integers.ts, sum(count(integers.i)) AS count(integers.i)_|
|_|_Aggregate: groupBy=[[integers.ts]], aggr=[[sum(count(integers.i))]]_|
|_|_MergeScan [is_placeholder=false]_|
|_|_MergeScan [is_placeholder=false, remote_input=[_|
|_| Aggregate: groupBy=[[integers.ts]], aggr=[[count(integers.i)]]_|
|_|_TableScan: integers_|
|_| ]]_|
| physical_plan | SortPreservingMergeExec: [ts@0 ASC NULLS LAST, count(integers.i)@1 ASC NULLS LAST]_|
|_|_SortExec: expr=[ts@0 ASC NULLS LAST, count(integers.i)@1 ASC NULLS LAST], preserve_partitioning=[true]_|
|_|_ProjectionExec: expr=[ts@0 as ts, sum(count(integers.i))@1 as count(integers.i)]_|
@@ -253,7 +259,10 @@ ORDER BY
| logical_plan_| Sort: time_window ASC NULLS LAST, count(integers.i) ASC NULLS LAST_|
|_|_Projection: date_bin(Utf8("1 hour"),integers.ts) AS time_window, sum(count(integers.i)) AS count(integers.i)_|
|_|_Aggregate: groupBy=[[date_bin(Utf8("1 hour"),integers.ts)]], aggr=[[sum(count(integers.i))]]_|
|_|_MergeScan [is_placeholder=false]_|
|_|_MergeScan [is_placeholder=false, remote_input=[_|
|_| Aggregate: groupBy=[[date_bin(CAST(Utf8("1 hour") AS Interval(MonthDayNano)), integers.ts)]], aggr=[[count(integers.i)]]_|
|_|_TableScan: integers_|
|_| ]]_|
| physical_plan | SortPreservingMergeExec: [time_window@0 ASC NULLS LAST, count(integers.i)@1 ASC NULLS LAST]_|
|_|_SortExec: expr=[time_window@0 ASC NULLS LAST, count(integers.i)@1 ASC NULLS LAST], preserve_partitioning=[true]_|
|_|_ProjectionExec: expr=[date_bin(Utf8("1 hour"),integers.ts)@0 as time_window, sum(count(integers.i))@1 as count(integers.i)]_|
@@ -369,7 +378,10 @@ ORDER BY
| logical_plan_| Sort: integers.ts + Int64(1) ASC NULLS LAST, integers.i / Int64(2) ASC NULLS LAST_|
|_|_Projection: integers.ts + Int64(1), integers.i / Int64(2), sum(count(integers.i)) AS count(integers.i)_|
|_|_Aggregate: groupBy=[[integers.ts + Int64(1), integers.i / Int64(2)]], aggr=[[sum(count(integers.i))]]_|
|_|_MergeScan [is_placeholder=false]_|
|_|_MergeScan [is_placeholder=false, remote_input=[_|
|_| Aggregate: groupBy=[[CAST(integers.ts AS Int64) + Int64(1), integers.i / Int64(2)]], aggr=[[count(integers.i)]]_|
|_|_TableScan: integers_|
|_| ]]_|
| physical_plan | SortPreservingMergeExec: [integers.ts + Int64(1)@0 ASC NULLS LAST, integers.i / Int64(2)@1 ASC NULLS LAST]_|
|_|_SortExec: expr=[integers.ts + Int64(1)@0 ASC NULLS LAST, integers.i / Int64(2)@1 ASC NULLS LAST], preserve_partitioning=[true]_|
|_|_ProjectionExec: expr=[integers.ts + Int64(1)@0 as integers.ts + Int64(1), integers.i / Int64(2)@1 as integers.i / Int64(2), sum(count(integers.i))@2 as count(integers.i)]_|
@@ -497,7 +509,10 @@ FROM
+-+-+
| logical_plan_| Projection: uddsketch_calc(Float64(0.5), uddsketch_merge(Int64(128),Float64(0.01),uddsketch_merge(Int64(128),Float64(0.01),sink_table.udd_state))) AS udd_result, hll_count(hll_merge(hll_merge(sink_table.hll_state))) AS hll_result_|
|_|_Aggregate: groupBy=[[]], aggr=[[uddsketch_merge(Int64(128), Float64(0.01), uddsketch_merge(Int64(128),Float64(0.01),sink_table.udd_state)), hll_merge(hll_merge(sink_table.hll_state))]]_|
|_|_MergeScan [is_placeholder=false]_|
|_|_MergeScan [is_placeholder=false, remote_input=[_|
|_| Aggregate: groupBy=[[]], aggr=[[uddsketch_merge(Int64(128), Float64(0.01), sink_table.udd_state), hll_merge(sink_table.hll_state)]]_|
|_|_TableScan: sink_table_|
|_| ]]_|
| physical_plan | ProjectionExec: expr=[uddsketch_calc(0.5, uddsketch_merge(Int64(128),Float64(0.01),uddsketch_merge(Int64(128),Float64(0.01),sink_table.udd_state))@0) as udd_result, hll_count(hll_merge(hll_merge(sink_table.hll_state))@1) as hll_result] |
|_|_AggregateExec: mode=Final, gby=[], aggr=[uddsketch_merge(Int64(128),Float64(0.01),uddsketch_merge(Int64(128),Float64(0.01),sink_table.udd_state)), hll_merge(hll_merge(sink_table.hll_state))]_|
|_|_CoalescePartitionsExec_|

View File

@@ -247,7 +247,11 @@ GROUP BY
+-+-+
| logical_plan_| Projection: base_table.env, base_table.service_name, base_table.city, base_table.page, uddsketch_merge(Int64(128),Float64(0.01),uddsketch_state(Int64(128),Float64(0.01),CASE WHEN base_table.lcp > Int64(0) AND base_table.lcp < Int64(3000000) THEN base_table.lcp ELSE NULL END)) AS lcp_state, max(max(CASE WHEN base_table.lcp > Int64(0) AND base_table.lcp < Int64(3000000) THEN base_table.lcp ELSE NULL END)) AS max_lcp, min(min(CASE WHEN base_table.lcp > Int64(0) AND base_table.lcp < Int64(3000000) THEN base_table.lcp ELSE NULL END)) AS min_lcp, uddsketch_merge(Int64(128),Float64(0.01),uddsketch_state(Int64(128),Float64(0.01),CASE WHEN base_table.fmp > Int64(0) AND base_table.fmp < Int64(3000000) THEN base_table.fmp ELSE NULL END)) AS fmp_state, max(max(CASE WHEN base_table.fmp > Int64(0) AND base_table.fmp < Int64(3000000) THEN base_table.fmp ELSE NULL END)) AS max_fmp, min(min(CASE WHEN base_table.fmp > Int64(0) AND base_table.fmp < Int64(3000000) THEN base_table.fmp ELSE NULL END)) AS min_fmp, uddsketch_merge(Int64(128),Float64(0.01),uddsketch_state(Int64(128),Float64(0.01),CASE WHEN base_table.fcp > Int64(0) AND base_table.fcp < Int64(3000000) THEN base_table.fcp ELSE NULL END)) AS fcp_state, max(max(CASE WHEN base_table.fcp > Int64(0) AND base_table.fcp < Int64(3000000) THEN base_table.fcp ELSE NULL END)) AS max_fcp, min(min(CASE WHEN base_table.fcp > Int64(0) AND base_table.fcp < Int64(3000000) THEN base_table.fcp ELSE NULL END)) AS min_fcp, uddsketch_merge(Int64(128),Float64(0.01),uddsketch_state(Int64(128),Float64(0.01),CASE WHEN base_table.fp > Int64(0) AND base_table.fp < Int64(3000000) THEN base_table.fp ELSE NULL END)) AS fp_state, max(max(CASE WHEN base_table.fp > Int64(0) AND base_table.fp < Int64(3000000) THEN base_table.fp ELSE NULL END)) AS max_fp, min(min(CASE WHEN base_table.fp > Int64(0) AND base_table.fp < Int64(3000000) THEN base_table.fp ELSE NULL END)) AS min_fp, uddsketch_merge(Int64(128),Float64(0.01),uddsketch_state(Int64(128),Float64(0.01),CASE WHEN base_table.tti > Int64(0) AND base_table.tti < Int64(3000000) THEN base_table.tti ELSE NULL END)) AS tti_state, max(max(CASE WHEN base_table.tti > Int64(0) AND base_table.tti < Int64(3000000) THEN base_table.tti ELSE NULL END)) AS max_tti, min(min(CASE WHEN base_table.tti > Int64(0) AND base_table.tti < Int64(3000000) THEN base_table.tti ELSE NULL END)) AS min_tti, uddsketch_merge(Int64(128),Float64(0.01),uddsketch_state(Int64(128),Float64(0.01),CASE WHEN base_table.fid > Int64(0) AND base_table.fid < Int64(3000000) THEN base_table.fid ELSE NULL END)) AS fid_state, max(max(CASE WHEN base_table.fid > Int64(0) AND base_table.fid < Int64(3000000) THEN base_table.fid ELSE NULL END)) AS max_fid, min(min(CASE WHEN base_table.fid > Int64(0) AND base_table.fid < Int64(3000000) THEN base_table.fid ELSE NULL END)) AS min_fid, max(max(base_table.shard_key)) AS shard_key, arrow_cast(date_bin(Utf8("60 seconds"),base_table.time),Utf8("Timestamp(Second, None)"))_|
|_|_Aggregate: groupBy=[[base_table.env, base_table.service_name, base_table.city, base_table.page, arrow_cast(date_bin(Utf8("60 seconds"),base_table.time),Utf8("Timestamp(Second, None)"))]], aggr=[[uddsketch_merge(Int64(128), Float64(0.01), uddsketch_state(Int64(128),Float64(0.01),CASE WHEN base_table.lcp > Int64(0) AND base_table.lcp < Int64(3000000) THEN base_table.lcp ELSE NULL END)), max(max(CASE WHEN base_table.lcp > Int64(0) AND base_table.lcp < Int64(3000000) THEN base_table.lcp ELSE NULL END)), min(min(CASE WHEN base_table.lcp > Int64(0) AND base_table.lcp < Int64(3000000) THEN base_table.lcp ELSE NULL END)), uddsketch_merge(Int64(128), Float64(0.01), uddsketch_state(Int64(128),Float64(0.01),CASE WHEN base_table.fmp > Int64(0) AND base_table.fmp < Int64(3000000) THEN base_table.fmp ELSE NULL END)), max(max(CASE WHEN base_table.fmp > Int64(0) AND base_table.fmp < Int64(3000000) THEN base_table.fmp ELSE NULL END)), min(min(CASE WHEN base_table.fmp > Int64(0) AND base_table.fmp < Int64(3000000) THEN base_table.fmp ELSE NULL END)), uddsketch_merge(Int64(128), Float64(0.01), uddsketch_state(Int64(128),Float64(0.01),CASE WHEN base_table.fcp > Int64(0) AND base_table.fcp < Int64(3000000) THEN base_table.fcp ELSE NULL END)), max(max(CASE WHEN base_table.fcp > Int64(0) AND base_table.fcp < Int64(3000000) THEN base_table.fcp ELSE NULL END)), min(min(CASE WHEN base_table.fcp > Int64(0) AND base_table.fcp < Int64(3000000) THEN base_table.fcp ELSE NULL END)), uddsketch_merge(Int64(128), Float64(0.01), uddsketch_state(Int64(128),Float64(0.01),CASE WHEN base_table.fp > Int64(0) AND base_table.fp < Int64(3000000) THEN base_table.fp ELSE NULL END)), max(max(CASE WHEN base_table.fp > Int64(0) AND base_table.fp < Int64(3000000) THEN base_table.fp ELSE NULL END)), min(min(CASE WHEN base_table.fp > Int64(0) AND base_table.fp < Int64(3000000) THEN base_table.fp ELSE NULL END)), uddsketch_merge(Int64(128), Float64(0.01), uddsketch_state(Int64(128),Float64(0.01),CASE WHEN base_table.tti > Int64(0) AND base_table.tti < Int64(3000000) THEN base_table.tti ELSE NULL END)), max(max(CASE WHEN base_table.tti > Int64(0) AND base_table.tti < Int64(3000000) THEN base_table.tti ELSE NULL END)), min(min(CASE WHEN base_table.tti > Int64(0) AND base_table.tti < Int64(3000000) THEN base_table.tti ELSE NULL END)), uddsketch_merge(Int64(128), Float64(0.01), uddsketch_state(Int64(128),Float64(0.01),CASE WHEN base_table.fid > Int64(0) AND base_table.fid < Int64(3000000) THEN base_table.fid ELSE NULL END)), max(max(CASE WHEN base_table.fid > Int64(0) AND base_table.fid < Int64(3000000) THEN base_table.fid ELSE NULL END)), min(min(CASE WHEN base_table.fid > Int64(0) AND base_table.fid < Int64(3000000) THEN base_table.fid ELSE NULL END)), max(max(base_table.shard_key))]]_|
|_|_MergeScan [is_placeholder=false]_|
|_|_MergeScan [is_placeholder=false, remote_input=[_|
|_| Aggregate: groupBy=[[base_table.env, base_table.service_name, base_table.city, base_table.page, arrow_cast(date_bin(CAST(Utf8("60 seconds") AS Interval(MonthDayNano)), base_table.time), Utf8("Timestamp(Second, None)"))]], aggr=[[uddsketch_state(Int64(128), Float64(0.01), CAST(CASE WHEN base_table.lcp > Int64(0) AND base_table.lcp < Int64(3000000) THEN base_table.lcp ELSE CAST(NULL AS Int64) END AS Float64)), max(CASE WHEN base_table.lcp > Int64(0) AND base_table.lcp < Int64(3000000) THEN base_table.lcp ELSE CAST(NULL AS Int64) END), min(CASE WHEN base_table.lcp > Int64(0) AND base_table.lcp < Int64(3000000) THEN base_table.lcp ELSE CAST(NULL AS Int64) END), uddsketch_state(Int64(128), Float64(0.01), CAST(CASE WHEN base_table.fmp > Int64(0) AND base_table.fmp < Int64(3000000) THEN base_table.fmp ELSE CAST(NULL AS Int64) END AS Float64)), max(CASE WHEN base_table.fmp > Int64(0) AND base_table.fmp < Int64(3000000) THEN base_table.fmp ELSE CAST(NULL AS Int64) END), min(CASE WHEN base_table.fmp > Int64(0) AND base_table.fmp < Int64(3000000) THEN base_table.fmp ELSE CAST(NULL AS Int64) END), uddsketch_state(Int64(128), Float64(0.01), CAST(CASE WHEN base_table.fcp > Int64(0) AND base_table.fcp < Int64(3000000) THEN base_table.fcp ELSE CAST(NULL AS Int64) END AS Float64)), max(CASE WHEN base_table.fcp > Int64(0) AND base_table.fcp < Int64(3000000) THEN base_table.fcp ELSE CAST(NULL AS Int64) END), min(CASE WHEN base_table.fcp > Int64(0) AND base_table.fcp < Int64(3000000) THEN base_table.fcp ELSE CAST(NULL AS Int64) END), uddsketch_state(Int64(128), Float64(0.01), CAST(CASE WHEN base_table.fp > Int64(0) AND base_table.fp < Int64(3000000) THEN base_table.fp ELSE CAST(NULL AS Int64) END AS Float64)), max(CASE WHEN base_table.fp > Int64(0) AND base_table.fp < Int64(3000000) THEN base_table.fp ELSE CAST(NULL AS Int64) END), min(CASE WHEN base_table.fp > Int64(0) AND base_table.fp < Int64(3000000) THEN base_table.fp ELSE CAST(NULL AS Int64) END), uddsketch_state(Int64(128), Float64(0.01), CAST(CASE WHEN base_table.tti > Int64(0) AND base_table.tti < Int64(3000000) THEN base_table.tti ELSE CAST(NULL AS Int64) END AS Float64)), max(CASE WHEN base_table.tti > Int64(0) AND base_table.tti < Int64(3000000) THEN base_table.tti ELSE CAST(NULL AS Int64) END), min(CASE WHEN base_table.tti > Int64(0) AND base_table.tti < Int64(3000000) THEN base_table.tti ELSE CAST(NULL AS Int64) END), uddsketch_state(Int64(128), Float64(0.01), CAST(CASE WHEN base_table.fid > Int64(0) AND base_table.fid < Int64(3000000) THEN base_table.fid ELSE CAST(NULL AS Int64) END AS Float64)), max(CASE WHEN base_table.fid > Int64(0) AND base_table.fid < Int64(3000000) THEN base_table.fid ELSE CAST(NULL AS Int64) END), min(CASE WHEN base_table.fid > Int64(0) AND base_table.fid < Int64(3000000) THEN base_table.fid ELSE CAST(NULL AS Int64) END), max(base_table.shard_key)]]_|
|_|_Filter: (base_table.lcp > Int64(0) AND base_table.lcp < Int64(3000000) OR base_table.fmp > Int64(0) AND base_table.fmp < Int64(3000000) OR base_table.fcp > Int64(0) AND base_table.fcp < Int64(3000000) OR base_table.fp > Int64(0) AND base_table.fp < Int64(3000000) OR base_table.tti > Int64(0) AND base_table.tti < Int64(3000000) OR base_table.fid > Int64(0) AND base_table.fid < Int64(3000000)) AND CAST(base_table.time AS Timestamp(Millisecond, Some("+00:00"))) >= CAST(now() AS Timestamp(Millisecond, Some("+00:00")))_|
|_|_TableScan: base_table_|
|_| ]]_|
| physical_plan | ProjectionExec: expr=[env@0 as env, service_name@1 as service_name, city@2 as city, page@3 as page, uddsketch_merge(Int64(128),Float64(0.01),uddsketch_state(Int64(128),Float64(0.01),CASE WHEN base_table.lcp > Int64(0) AND base_table.lcp < Int64(3000000) THEN base_table.lcp ELSE NULL END))@5 as lcp_state, max(max(CASE WHEN base_table.lcp > Int64(0) AND base_table.lcp < Int64(3000000) THEN base_table.lcp ELSE NULL END))@6 as max_lcp, min(min(CASE WHEN base_table.lcp > Int64(0) AND base_table.lcp < Int64(3000000) THEN base_table.lcp ELSE NULL END))@7 as min_lcp, uddsketch_merge(Int64(128),Float64(0.01),uddsketch_state(Int64(128),Float64(0.01),CASE WHEN base_table.fmp > Int64(0) AND base_table.fmp < Int64(3000000) THEN base_table.fmp ELSE NULL END))@8 as fmp_state, max(max(CASE WHEN base_table.fmp > Int64(0) AND base_table.fmp < Int64(3000000) THEN base_table.fmp ELSE NULL END))@9 as max_fmp, min(min(CASE WHEN base_table.fmp > Int64(0) AND base_table.fmp < Int64(3000000) THEN base_table.fmp ELSE NULL END))@10 as min_fmp, uddsketch_merge(Int64(128),Float64(0.01),uddsketch_state(Int64(128),Float64(0.01),CASE WHEN base_table.fcp > Int64(0) AND base_table.fcp < Int64(3000000) THEN base_table.fcp ELSE NULL END))@11 as fcp_state, max(max(CASE WHEN base_table.fcp > Int64(0) AND base_table.fcp < Int64(3000000) THEN base_table.fcp ELSE NULL END))@12 as max_fcp, min(min(CASE WHEN base_table.fcp > Int64(0) AND base_table.fcp < Int64(3000000) THEN base_table.fcp ELSE NULL END))@13 as min_fcp, uddsketch_merge(Int64(128),Float64(0.01),uddsketch_state(Int64(128),Float64(0.01),CASE WHEN base_table.fp > Int64(0) AND base_table.fp < Int64(3000000) THEN base_table.fp ELSE NULL END))@14 as fp_state, max(max(CASE WHEN base_table.fp > Int64(0) AND base_table.fp < Int64(3000000) THEN base_table.fp ELSE NULL END))@15 as max_fp, min(min(CASE WHEN base_table.fp > Int64(0) AND base_table.fp < Int64(3000000) THEN base_table.fp ELSE NULL END))@16 as min_fp, uddsketch_merge(Int64(128),Float64(0.01),uddsketch_state(Int64(128),Float64(0.01),CASE WHEN base_table.tti > Int64(0) AND base_table.tti < Int64(3000000) THEN base_table.tti ELSE NULL END))@17 as tti_state, max(max(CASE WHEN base_table.tti > Int64(0) AND base_table.tti < Int64(3000000) THEN base_table.tti ELSE NULL END))@18 as max_tti, min(min(CASE WHEN base_table.tti > Int64(0) AND base_table.tti < Int64(3000000) THEN base_table.tti ELSE NULL END))@19 as min_tti, uddsketch_merge(Int64(128),Float64(0.01),uddsketch_state(Int64(128),Float64(0.01),CASE WHEN base_table.fid > Int64(0) AND base_table.fid < Int64(3000000) THEN base_table.fid ELSE NULL END))@20 as fid_state, max(max(CASE WHEN base_table.fid > Int64(0) AND base_table.fid < Int64(3000000) THEN base_table.fid ELSE NULL END))@21 as max_fid, min(min(CASE WHEN base_table.fid > Int64(0) AND base_table.fid < Int64(3000000) THEN base_table.fid ELSE NULL END))@22 as min_fid, max(max(base_table.shard_key))@23 as shard_key, arrow_cast(date_bin(Utf8("60 seconds"),base_table.time),Utf8("Timestamp(Second, None)"))@4 as arrow_cast(date_bin(Utf8("60 seconds"),base_table.time),Utf8("Timestamp(Second, None)"))] |
|_|_AggregateExec: mode=FinalPartitioned, gby=[env@0 as env, service_name@1 as service_name, city@2 as city, page@3 as page, arrow_cast(date_bin(Utf8("60 seconds"),base_table.time),Utf8("Timestamp(Second, None)"))@4 as arrow_cast(date_bin(Utf8("60 seconds"),base_table.time),Utf8("Timestamp(Second, None)"))], aggr=[uddsketch_merge(Int64(128),Float64(0.01),uddsketch_state(Int64(128),Float64(0.01),CASE WHEN base_table.lcp > Int64(0) AND base_table.lcp < Int64(3000000) THEN base_table.lcp ELSE NULL END)), max(max(CASE WHEN base_table.lcp > Int64(0) AND base_table.lcp < Int64(3000000) THEN base_table.lcp ELSE NULL END)), min(min(CASE WHEN base_table.lcp > Int64(0) AND base_table.lcp < Int64(3000000) THEN base_table.lcp ELSE NULL END)), uddsketch_merge(Int64(128),Float64(0.01),uddsketch_state(Int64(128),Float64(0.01),CASE WHEN base_table.fmp > Int64(0) AND base_table.fmp < Int64(3000000) THEN base_table.fmp ELSE NULL END)), max(max(CASE WHEN base_table.fmp > Int64(0) AND base_table.fmp < Int64(3000000) THEN base_table.fmp ELSE NULL END)), min(min(CASE WHEN base_table.fmp > Int64(0) AND base_table.fmp < Int64(3000000) THEN base_table.fmp ELSE NULL END)), uddsketch_merge(Int64(128),Float64(0.01),uddsketch_state(Int64(128),Float64(0.01),CASE WHEN base_table.fcp > Int64(0) AND base_table.fcp < Int64(3000000) THEN base_table.fcp ELSE NULL END)), max(max(CASE WHEN base_table.fcp > Int64(0) AND base_table.fcp < Int64(3000000) THEN base_table.fcp ELSE NULL END)), min(min(CASE WHEN base_table.fcp > Int64(0) AND base_table.fcp < Int64(3000000) THEN base_table.fcp ELSE NULL END)), uddsketch_merge(Int64(128),Float64(0.01),uddsketch_state(Int64(128),Float64(0.01),CASE WHEN base_table.fp > Int64(0) AND base_table.fp < Int64(3000000) THEN base_table.fp ELSE NULL END)), max(max(CASE WHEN base_table.fp > Int64(0) AND base_table.fp < Int64(3000000) THEN base_table.fp ELSE NULL END)), min(min(CASE WHEN base_table.fp > Int64(0) AND base_table.fp < Int64(3000000) THEN base_table.fp ELSE NULL END)), uddsketch_merge(Int64(128),Float64(0.01),uddsketch_state(Int64(128),Float64(0.01),CASE WHEN base_table.tti > Int64(0) AND base_table.tti < Int64(3000000) THEN base_table.tti ELSE NULL END)), max(max(CASE WHEN base_table.tti > Int64(0) AND base_table.tti < Int64(3000000) THEN base_table.tti ELSE NULL END)), min(min(CASE WHEN base_table.tti > Int64(0) AND base_table.tti < Int64(3000000) THEN base_table.tti ELSE NULL END)), uddsketch_merge(Int64(128),Float64(0.01),uddsketch_state(Int64(128),Float64(0.01),CASE WHEN base_table.fid > Int64(0) AND base_table.fid < Int64(3000000) THEN base_table.fid ELSE NULL END)), max(max(CASE WHEN base_table.fid > Int64(0) AND base_table.fid < Int64(3000000) THEN base_table.fid ELSE NULL END)), min(min(CASE WHEN base_table.fid > Int64(0) AND base_table.fid < Int64(3000000) THEN base_table.fid ELSE NULL END)), max(max(base_table.shard_key))]_|
|_|_CoalesceBatchesExec: target_batch_size=8192_|
@@ -624,7 +628,11 @@ where
+-+-+
| logical_plan_| Projection: count(*) AS count(*)_|
|_|_Aggregate: groupBy=[[]], aggr=[[sum(count(*)) AS count(*)]]_|
|_|_MergeScan [is_placeholder=false]_|
|_|_MergeScan [is_placeholder=false, remote_input=[_|
|_| Aggregate: groupBy=[[]], aggr=[[count(base_table.time) AS count(*)]]_|
|_|_Filter: CAST(base_table.time AS Timestamp(Millisecond, Some("+00:00"))) >= CAST(now() AS Timestamp(Millisecond, Some("+00:00")))_|
|_|_TableScan: base_table_|
|_| ]]_|
| physical_plan | AggregateExec: mode=Final, gby=[], aggr=[count(*)]_|
|_|_CoalescePartitionsExec_|
|_|_AggregateExec: mode=Partial, gby=[], aggr=[count(*)]_|

View File

@@ -14,9 +14,14 @@ EXPLAIN SELECT * FROM integers WHERE i IN ((SELECT i FROM integers)) ORDER BY i;
+-+-+
| logical_plan_| Sort: integers.i ASC NULLS LAST_|
|_|_LeftSemi Join: integers.i = __correlated_sq_1.i_|
|_|_MergeScan [is_placeholder=false]_|
|_|_MergeScan [is_placeholder=false, remote_input=[_|
|_| TableScan: integers_|
|_| ]]_|
|_|_SubqueryAlias: __correlated_sq_1_|
|_|_MergeScan [is_placeholder=false]_|
|_|_MergeScan [is_placeholder=false, remote_input=[_|
|_| Projection: integers.i_|
|_|_TableScan: integers_|
|_| ]]_|
| physical_plan | SortPreservingMergeExec: [i@0 ASC NULLS LAST]_|
|_|_SortExec: expr=[i@0 ASC NULLS LAST], preserve_partitioning=[true]_|
|_|_CoalesceBatchesExec: target_batch_size=8192_|
@@ -43,10 +48,14 @@ EXPLAIN SELECT * FROM integers i1 WHERE EXISTS(SELECT i FROM integers WHERE i=i1
| logical_plan_| Sort: i1.i ASC NULLS LAST_|
|_|_LeftSemi Join: i1.i = __correlated_sq_1.i_|
|_|_SubqueryAlias: i1_|
|_|_MergeScan [is_placeholder=false]_|
|_|_MergeScan [is_placeholder=false, remote_input=[_|
|_| TableScan: integers_|
|_| ]]_|
|_|_SubqueryAlias: __correlated_sq_1_|
|_|_Projection: integers.i_|
|_|_MergeScan [is_placeholder=false]_|
|_|_MergeScan [is_placeholder=false, remote_input=[_|
|_| TableScan: integers_|
|_| ]]_|
| physical_plan | SortPreservingMergeExec: [i@0 ASC NULLS LAST]_|
|_|_SortExec: expr=[i@0 ASC NULLS LAST], preserve_partitioning=[true]_|
|_|_CoalesceBatchesExec: target_batch_size=8192_|
@@ -85,9 +94,13 @@ order by t.i desc;
|_|_Cross Join:_|
|_|_Filter: integers.i IS NOT NULL_|
|_|_Projection: integers.i_|
|_|_MergeScan [is_placeholder=false]_|
|_|_MergeScan [is_placeholder=false, remote_input=[_|
|_| TableScan: integers_|
|_| ]]_|
|_|_Projection:_|
|_|_MergeScan [is_placeholder=false]_|
|_|_MergeScan [is_placeholder=false, remote_input=[_|
|_| TableScan: other_|
|_| ]]_|
| physical_plan | SortPreservingMergeExec: [i@0 DESC]_|
|_|_SortExec: expr=[i@0 DESC], preserve_partitioning=[true]_|
|_|_CrossJoinExec_|
@@ -116,9 +129,15 @@ EXPLAIN INSERT INTO other SELECT i, 2 FROM integers WHERE i=(SELECT MAX(i) FROM
| | Projection: integers.i AS i, TimestampMillisecond(2, None) AS j |
| | Inner Join: integers.i = __scalar_sq_1.max(integers.i) |
| | Projection: integers.i |
| | MergeScan [is_placeholder=false] |
| | MergeScan [is_placeholder=false, remote_input=[ |
| | TableScan: integers |
| | ]] |
| | SubqueryAlias: __scalar_sq_1 |
| | MergeScan [is_placeholder=false] |
| | MergeScan [is_placeholder=false, remote_input=[ |
| | Projection: max(integers.i) |
| | Aggregate: groupBy=[[]], aggr=[[max(integers.i)]] |
| | TableScan: integers |
| | ]] |
| physical_plan_error | Error during planning: failed to resolve catalog: datafusion |
+---------------------+-------------------------------------------------------------------+

View File

@@ -252,10 +252,14 @@ EXPLAIN SELECT * FROM (SELECT 0=1 AS cond FROM integers i1, integers i2) a1 WHER
|_|_Cross Join:_|
|_|_SubqueryAlias: i1_|
|_|_Projection:_|
|_|_MergeScan [is_placeholder=false]_|
|_|_MergeScan [is_placeholder=false, remote_input=[ |
|_| TableScan: integers_|
|_| ]]_|
|_|_SubqueryAlias: i2_|
|_|_Projection:_|
|_|_MergeScan [is_placeholder=false]_|
|_|_MergeScan [is_placeholder=false, remote_input=[ |
|_| TableScan: integers_|
|_| ]]_|
| physical_plan | CoalescePartitionsExec_|
|_|_ProjectionExec: expr=[false as cond]_|
|_|_CrossJoinExec_|

View File

@@ -4,7 +4,10 @@ explain select * from numbers;
+---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| logical_plan | MergeScan [is_placeholder=false] |
| logical_plan | MergeScan [is_placeholder=false, remote_input=[ |
| | Projection: numbers.number |
| | TableScan: numbers |
| | ]] |
| physical_plan | StreamScanAdapter: [<SendableRecordBatchStream>], schema: [Schema { fields: [Field { name: "number", data_type: UInt32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }], metadata: {"greptime:version": "0"} }] |
| | |
+---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
@@ -15,7 +18,11 @@ explain select * from numbers order by number desc;
+---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| logical_plan | MergeScan [is_placeholder=false] |
| logical_plan | MergeScan [is_placeholder=false, remote_input=[ |
| | Sort: numbers.number DESC NULLS FIRST |
| | Projection: numbers.number |
| | TableScan: numbers |
| | ]] |
| physical_plan | SortExec: expr=[number@0 DESC], preserve_partitioning=[false] |
| | StreamScanAdapter: [<SendableRecordBatchStream>], schema: [Schema { fields: [Field { name: "number", data_type: UInt32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }], metadata: {"greptime:version": "0"} }] |
| | |
@@ -27,7 +34,11 @@ explain select * from numbers order by number asc;
+---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| logical_plan | MergeScan [is_placeholder=false] |
| logical_plan | MergeScan [is_placeholder=false, remote_input=[ |
| | Sort: numbers.number ASC NULLS LAST |
| | Projection: numbers.number |
| | TableScan: numbers |
| | ]] |
| physical_plan | SortExec: expr=[number@0 ASC NULLS LAST], preserve_partitioning=[false] |
| | StreamScanAdapter: [<SendableRecordBatchStream>], schema: [Schema { fields: [Field { name: "number", data_type: UInt32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }], metadata: {"greptime:version": "0"} }] |
| | |
@@ -39,7 +50,12 @@ explain select * from numbers order by number desc limit 10;
+---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| logical_plan | MergeScan [is_placeholder=false] |
| logical_plan | MergeScan [is_placeholder=false, remote_input=[ |
| | Limit: skip=0, fetch=10 |
| | Sort: numbers.number DESC NULLS FIRST |
| | Projection: numbers.number |
| | TableScan: numbers |
| | ]] |
| physical_plan | SortExec: TopK(fetch=10), expr=[number@0 DESC], preserve_partitioning=[false] |
| | StreamScanAdapter: [<SendableRecordBatchStream>], schema: [Schema { fields: [Field { name: "number", data_type: UInt32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }], metadata: {"greptime:version": "0"} }] |
| | |
@@ -51,7 +67,12 @@ explain select * from numbers order by number asc limit 10;
+---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| logical_plan | MergeScan [is_placeholder=false] |
| logical_plan | MergeScan [is_placeholder=false, remote_input=[ |
| | Limit: skip=0, fetch=10 |
| | Sort: numbers.number ASC NULLS LAST |
| | Projection: numbers.number |
| | TableScan: numbers |
| | ]] |
| physical_plan | SortExec: TopK(fetch=10), expr=[number@0 ASC NULLS LAST], preserve_partitioning=[false] |
| | StreamScanAdapter: [<SendableRecordBatchStream>], schema: [Schema { fields: [Field { name: "number", data_type: UInt32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }], metadata: {"greptime:version": "0"} }] |
| | |

View File

@@ -2,19 +2,19 @@
SELECt @@tx_isolation;
+-----------------+
| @@tx_isolation; |
| @@tx_isolation |
+-----------------+
| 0 |
| REPEATABLE-READ |
+-----------------+
-- SQLNESS PROTOCOL MYSQL
SELECT @@version_comment;
+--------------------+
| @@version_comment; |
+--------------------+
| 0 |
+--------------------+
+-------------------+
| @@version_comment |
+-------------------+
| Greptime |
+-------------------+
-- SQLNESS PROTOCOL MYSQL
SHOW DATABASES;

View File

@@ -70,8 +70,14 @@ EXPLAIN SELECT a % 2, b FROM test UNION SELECT a % 2 AS k, b FROM test ORDER BY
| logical_plan | Sort: Int64(-1) ASC NULLS LAST |
| | Aggregate: groupBy=[[test.a % Int64(2), test.b]], aggr=[[]] |
| | Union |
| | MergeScan [is_placeholder=false] |
| | MergeScan [is_placeholder=false] |
| | MergeScan [is_placeholder=false, remote_input=[ |
| | Projection: CAST(test.a AS Int64) % Int64(2) AS test.a % Int64(2), test.b |
| | TableScan: test |
| | ]] |
| | MergeScan [is_placeholder=false, remote_input=[ |
| | Projection: CAST(test.a AS Int64) % Int64(2) AS test.a % Int64(2), test.b |
| | TableScan: test |
| | ]] |
| physical_plan | CoalescePartitionsExec |
| | AggregateExec: mode=SinglePartitioned, gby=[test.a % Int64(2)@0 as test.a % Int64(2), b@1 as b], aggr=[] |
| | InterleaveExec |

View File

@@ -0,0 +1,122 @@
create table t (
ts timestamp(3) time index,
job STRING,
instance STRING,
val DOUBLE,
PRIMARY KEY(job, instance),
);
Affected Rows: 0
insert into t values
(0, 'job1', 'instance1', 1),
(0, 'job2', 'instance2', 2),
(5000, 'job1', 'instance3',3),
(5000, 'job2', 'instance4',4),
(10000, 'job1', 'instance5',5),
(10000, 'job2', 'instance6',6),
(15000, 'job1', 'instance7',7),
(15000, 'job2', 'instance8',8);
Affected Rows: 8
-- SQLNESS SORT_RESULT 3 1
tql eval (0, 15, '5s') absent(t{job="job1"});
++
++
-- SQLNESS SORT_RESULT 3 1
tql eval (0, 15, '5s') absent(t{job="job2"});
++
++
-- SQLNESS SORT_RESULT 3 1
tql eval (0, 15, '5s') absent(t{job="job3"});
+---------------------+-----+------+
| ts | val | job |
+---------------------+-----+------+
| 1970-01-01T00:00:00 | 1.0 | job3 |
| 1970-01-01T00:00:05 | 1.0 | job3 |
| 1970-01-01T00:00:10 | 1.0 | job3 |
| 1970-01-01T00:00:15 | 1.0 | job3 |
+---------------------+-----+------+
-- SQLNESS SORT_RESULT 3 1
tql eval (0, 15, '5s') absent(nonexistent_table);
+---------------------+-------+
| time | value |
+---------------------+-------+
| 1970-01-01T00:00:00 | 1.0 |
| 1970-01-01T00:00:05 | 1.0 |
| 1970-01-01T00:00:10 | 1.0 |
| 1970-01-01T00:00:15 | 1.0 |
+---------------------+-------+
-- SQLNESS SORT_RESULT 3 1
tql eval (0, 15, '5s') absent(t{job="nonexistent_job"});
+---------------------+-----+-----------------+
| ts | val | job |
+---------------------+-----+-----------------+
| 1970-01-01T00:00:00 | 1.0 | nonexistent_job |
| 1970-01-01T00:00:05 | 1.0 | nonexistent_job |
| 1970-01-01T00:00:10 | 1.0 | nonexistent_job |
| 1970-01-01T00:00:15 | 1.0 | nonexistent_job |
+---------------------+-----+-----------------+
-- SQLNESS SORT_RESULT 3 1
tql eval (1000, 1000, '1s') absent(t{job="job1"});
+---------------------+-----+------+
| ts | val | job |
+---------------------+-----+------+
| 1970-01-01T00:16:40 | 1.0 | job1 |
+---------------------+-----+------+
-- SQLNESS SORT_RESULT 3 1
tql eval (0, 15, '5s') absent(t{job="nonexistent_job1", job="nonexistent_job2"});
+---------------------+-----+------------------+
| ts | val | job |
+---------------------+-----+------------------+
| 1970-01-01T00:00:00 | 1.0 | nonexistent_job2 |
| 1970-01-01T00:00:05 | 1.0 | nonexistent_job2 |
| 1970-01-01T00:00:10 | 1.0 | nonexistent_job2 |
| 1970-01-01T00:00:15 | 1.0 | nonexistent_job2 |
+---------------------+-----+------------------+
-- SQLNESS SORT_RESULT 3 1
tql eval (0, 15, '5s') absent(t{job=~"nonexistent_job1", job!="nonexistent_job2"});
+---------------------+-----+
| ts | val |
+---------------------+-----+
| 1970-01-01T00:00:00 | 1.0 |
| 1970-01-01T00:00:05 | 1.0 |
| 1970-01-01T00:00:10 | 1.0 |
| 1970-01-01T00:00:15 | 1.0 |
+---------------------+-----+
-- SQLNESS SORT_RESULT 3 1
tql eval (0, 15, '5s') sum(t{job="job2"});
+---------------------+------------+
| ts | sum(t.val) |
+---------------------+------------+
| 1970-01-01T00:00:00 | 2.0 |
| 1970-01-01T00:00:05 | 6.0 |
| 1970-01-01T00:00:10 | 12.0 |
| 1970-01-01T00:00:15 | 20.0 |
+---------------------+------------+
-- ABSENT is not supported for aggregation functions for now
-- tql eval (0, 15, '5s') absent(sum(t{job="job2"}));
-- tql eval (0, 15, '5s') absent(sum(t{job="job3"}));
drop table t;
Affected Rows: 0
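An observation on the results above (not part of the test output): absent() carries labels over only from plain equality matchers, which is why the job="job3" and job="nonexistent_job" cases keep a job column while the =~ / != combination does not. A hedged example of the usual alerting shape this enables, using a hypothetical selector:

-- tql eval (0, 15, '5s') absent(t{job="job1", instance="no_such_instance"});
-- no series matches, so this would be expected to return a single 1-valued series
-- labeled job="job1", instance="no_such_instance".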

View File

@@ -0,0 +1,50 @@
create table t (
ts timestamp(3) time index,
job STRING,
instance STRING,
val DOUBLE,
PRIMARY KEY(job, instance),
);
insert into t values
(0, 'job1', 'instance1', 1),
(0, 'job2', 'instance2', 2),
(5000, 'job1', 'instance3',3),
(5000, 'job2', 'instance4',4),
(10000, 'job1', 'instance5',5),
(10000, 'job2', 'instance6',6),
(15000, 'job1', 'instance7',7),
(15000, 'job2', 'instance8',8);
-- SQLNESS SORT_RESULT 3 1
tql eval (0, 15, '5s') absent(t{job="job1"});
-- SQLNESS SORT_RESULT 3 1
tql eval (0, 15, '5s') absent(t{job="job2"});
-- SQLNESS SORT_RESULT 3 1
tql eval (0, 15, '5s') absent(t{job="job3"});
-- SQLNESS SORT_RESULT 3 1
tql eval (0, 15, '5s') absent(nonexistent_table);
-- SQLNESS SORT_RESULT 3 1
tql eval (0, 15, '5s') absent(t{job="nonexistent_job"});
-- SQLNESS SORT_RESULT 3 1
tql eval (1000, 1000, '1s') absent(t{job="job1"});
-- SQLNESS SORT_RESULT 3 1
tql eval (0, 15, '5s') absent(t{job="nonexistent_job1", job="nonexistent_job2"});
-- SQLNESS SORT_RESULT 3 1
tql eval (0, 15, '5s') absent(t{job=~"nonexistent_job1", job!="nonexistent_job2"});
-- SQLNESS SORT_RESULT 3 1
tql eval (0, 15, '5s') sum(t{job="job2"});
-- ABSENT is not supported for aggregation functions for now
-- tql eval (0, 15, '5s') absent(sum(t{job="job2"}));
-- tql eval (0, 15, '5s') absent(sum(t{job="job3"}));
drop table t;

View File

@@ -332,3 +332,34 @@ drop table histogram4_bucket;
Affected Rows: 0
tql eval(0, 10, '10s') histogram_quantile(0.99, sum by(pod,instance, fff) (rate(greptime_servers_postgres_query_elapsed_bucket{instance=~"xxx"}[1m])));
++
++
-- test case where the table exists but doesn't have a 'le' column (this used to raise an error)
CREATE TABLE greptime_servers_postgres_query_elapsed_no_le (
pod STRING,
instance STRING,
t TIMESTAMP TIME INDEX,
v DOUBLE,
PRIMARY KEY (pod, instance)
);
Affected Rows: 0
-- should return empty result instead of error when 'le' column is missing
tql eval(0, 10, '10s') histogram_quantile(0.99, sum by(pod,instance, le) (rate(greptime_servers_postgres_query_elapsed_no_le{instance=~"xxx"}[1m])));
++
++
tql eval(0, 10, '10s') histogram_quantile(0.99, sum by(pod,instance, fbf) (rate(greptime_servers_postgres_query_elapsed_no_le{instance=~"xxx"}[1m])));
++
++
drop table greptime_servers_postgres_query_elapsed_no_le;
Affected Rows: 0
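For contrast, a hedged sketch (not part of the test) of a bucket table that does carry the 'le' label histogram_quantile expects; with a schema like this the same query shape would have buckets to work with instead of returning an empty result. The table name and column layout are illustrative assumptions:

CREATE TABLE greptime_servers_postgres_query_elapsed_bucket_example (
    pod STRING,
    instance STRING,
    le STRING,
    t TIMESTAMP TIME INDEX,
    v DOUBLE,
    PRIMARY KEY (pod, instance, le)
);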

View File

@@ -187,3 +187,20 @@ insert into histogram4_bucket values
tql eval (2900, 3000, '100s') histogram_quantile(0.9, histogram4_bucket);
drop table histogram4_bucket;
tql eval(0, 10, '10s') histogram_quantile(0.99, sum by(pod,instance, fff) (rate(greptime_servers_postgres_query_elapsed_bucket{instance=~"xxx"}[1m])));
-- test case where the table exists but doesn't have a 'le' column (this used to raise an error)
CREATE TABLE greptime_servers_postgres_query_elapsed_no_le (
pod STRING,
instance STRING,
t TIMESTAMP TIME INDEX,
v DOUBLE,
PRIMARY KEY (pod, instance)
);
-- should return empty result instead of error when 'le' column is missing
tql eval(0, 10, '10s') histogram_quantile(0.99, sum by(pod,instance, le) (rate(greptime_servers_postgres_query_elapsed_no_le{instance=~"xxx"}[1m])));
tql eval(0, 10, '10s') histogram_quantile(0.99, sum by(pod,instance, fbf) (rate(greptime_servers_postgres_query_elapsed_no_le{instance=~"xxx"}[1m])));
drop table greptime_servers_postgres_query_elapsed_no_le;

View File

@@ -0,0 +1,160 @@
-- Test `timestamp()` function
-- timestamp() returns the timestamp of each sample as seconds since Unix epoch
create table timestamp_test (ts timestamp time index, val double);
Affected Rows: 0
insert into timestamp_test values
(0, 1.0),
(1000, 2.0),
(60000, 3.0),
(3600000, 4.0),
-- 2021-01-01 00:00:00
(1609459200000, 5.0),
-- 2021-01-01 00:01:00
(1609459260000, 6.0);
Affected Rows: 6
-- Test timestamp() with time series
tql eval (0, 3600, '30s') timestamp(timestamp_test);
+---------------------+--------+
| ts | value |
+---------------------+--------+
| 1970-01-01T00:00:00 | 0.0 |
| 1970-01-01T00:00:30 | 1.0 |
| 1970-01-01T00:01:00 | 60.0 |
| 1970-01-01T00:01:30 | 60.0 |
| 1970-01-01T00:02:00 | 60.0 |
| 1970-01-01T00:02:30 | 60.0 |
| 1970-01-01T00:03:00 | 60.0 |
| 1970-01-01T00:03:30 | 60.0 |
| 1970-01-01T00:04:00 | 60.0 |
| 1970-01-01T00:04:30 | 60.0 |
| 1970-01-01T00:05:00 | 60.0 |
| 1970-01-01T00:05:30 | 60.0 |
| 1970-01-01T00:06:00 | 60.0 |
| 1970-01-01T01:00:00 | 3600.0 |
+---------------------+--------+
-- Test timestamp() with specific time range
tql eval (0, 60, '30s') timestamp(timestamp_test);
+---------------------+-------+
| ts | value |
+---------------------+-------+
| 1970-01-01T00:00:00 | 0.0 |
| 1970-01-01T00:00:30 | 1.0 |
| 1970-01-01T00:01:00 | 60.0 |
+---------------------+-------+
tql eval (0, 60, '30s') -timestamp(timestamp_test);
+---------------------+-----------+
| ts | (- value) |
+---------------------+-----------+
| 1970-01-01T00:00:00 | -0.0 |
| 1970-01-01T00:00:30 | -1.0 |
| 1970-01-01T00:01:00 | -60.0 |
+---------------------+-----------+
-- Test timestamp() with 2021 data
tql eval (1609459200, 1609459260, '30s') timestamp(timestamp_test);
+---------------------+--------------+
| ts | value |
+---------------------+--------------+
| 2021-01-01T00:00:00 | 1609459200.0 |
| 2021-01-01T00:00:30 | 1609459200.0 |
| 2021-01-01T00:01:00 | 1609459260.0 |
+---------------------+--------------+
-- Test timestamp() with arithmetic operations
tql eval (0, 60, '30s') timestamp(timestamp_test) + 1;
+---------------------+--------------------+
| ts | value + Float64(1) |
+---------------------+--------------------+
| 1970-01-01T00:00:00 | 1.0 |
| 1970-01-01T00:00:30 | 2.0 |
| 1970-01-01T00:01:00 | 61.0 |
+---------------------+--------------------+
-- Test timestamp() with boolean operations
tql eval (0, 60, '30s') timestamp(timestamp_test) > bool 30;
+---------------------+---------------------+
| ts | value > Float64(30) |
+---------------------+---------------------+
| 1970-01-01T00:00:00 | 0.0 |
| 1970-01-01T00:00:30 | 0.0 |
| 1970-01-01T00:01:00 | 1.0 |
+---------------------+---------------------+
-- Test timestamp() with time functions
tql eval (0, 60, '30s') timestamp(timestamp_test) - time();
+---------------------+----------------------------+
| ts | value - ts / Float64(1000) |
+---------------------+----------------------------+
| 1970-01-01T00:00:00 | 0.0 |
| 1970-01-01T00:00:30 | -29.0 |
| 1970-01-01T00:01:00 | 0.0 |
+---------------------+----------------------------+
-- Test timestamp() with other functions
tql eval (0, 60, '30s') abs(timestamp(timestamp_test) - avg(timestamp(timestamp_test))) > 20;
Error: 1004(InvalidArguments), Invalid function argument for unknown
tql eval (0, 60, '30s') timestamp(timestamp_test) == 60;
+---------------------+-------+
| ts | value |
+---------------------+-------+
| 1970-01-01T00:01:00 | 60.0 |
+---------------------+-------+
-- Test timestamp() with multiple metrics
create table timestamp_test2 (ts timestamp time index, val double);
Affected Rows: 0
insert into timestamp_test2 values
(0, 10.0),
(1000, 20.0),
(60000, 30.0);
Affected Rows: 3
-- SQLNESS SORT_RESULT 3 1
tql eval (0, 60, '30s') timestamp(timestamp_test) + timestamp(timestamp_test2);
+---------------------+----------------------------------------------+
| ts | timestamp_test.value + timestamp_test2.value |
+---------------------+----------------------------------------------+
| 1970-01-01T00:00:00 | 0.0 |
| 1970-01-01T00:00:30 | 2.0 |
| 1970-01-01T00:01:00 | 120.0 |
+---------------------+----------------------------------------------+
-- SQLNESS SORT_RESULT 3 1
tql eval (0, 60, '30s') timestamp(timestamp_test) == timestamp(timestamp_test2);
+---------------------+-------+---------------------+-------+
| ts | value | ts | value |
+---------------------+-------+---------------------+-------+
| 1970-01-01T00:00:00 | 0.0 | 1970-01-01T00:00:00 | 0.0 |
| 1970-01-01T00:00:30 | 1.0 | 1970-01-01T00:00:30 | 1.0 |
| 1970-01-01T00:01:00 | 60.0 | 1970-01-01T00:01:00 | 60.0 |
+---------------------+-------+---------------------+-------+
drop table timestamp_test;
Affected Rows: 0
drop table timestamp_test2;
Affected Rows: 0
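A closing observation (not part of the test output): timestamp() reports epoch seconds while the ts time index stores milliseconds, which is why the sample written at ts = 60000 evaluates to 60.0 in the tables above. A hedged sketch that makes the unit visible:

-- tql eval (0, 60, '30s') timestamp(timestamp_test) * 1000;
-- would be expected to yield 0.0, 1000.0 and 60000.0 at the three evaluated steps.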

View File

@@ -0,0 +1,57 @@
-- Test `timestamp()` function
-- timestamp() returns the timestamp of each sample as seconds since Unix epoch
create table timestamp_test (ts timestamp time index, val double);
insert into timestamp_test values
(0, 1.0),
(1000, 2.0),
(60000, 3.0),
(3600000, 4.0),
-- 2021-01-01 00:00:00
(1609459200000, 5.0),
-- 2021-01-01 00:01:00
(1609459260000, 6.0);
-- Test timestamp() with time series
tql eval (0, 3600, '30s') timestamp(timestamp_test);
-- Test timestamp() with specific time range
tql eval (0, 60, '30s') timestamp(timestamp_test);
tql eval (0, 60, '30s') -timestamp(timestamp_test);
-- Test timestamp() with 2021 data
tql eval (1609459200, 1609459260, '30s') timestamp(timestamp_test);
-- Test timestamp() with arithmetic operations
tql eval (0, 60, '30s') timestamp(timestamp_test) + 1;
-- Test timestamp() with boolean operations
tql eval (0, 60, '30s') timestamp(timestamp_test) > bool 30;
-- Test timestamp() with time functions
tql eval (0, 60, '30s') timestamp(timestamp_test) - time();
-- Test timestamp() with other functions
tql eval (0, 60, '30s') abs(timestamp(timestamp_test) - avg(timestamp(timestamp_test))) > 20;
tql eval (0, 60, '30s') timestamp(timestamp_test) == 60;
-- Test timestamp() with multiple metrics
create table timestamp_test2 (ts timestamp time index, val double);
insert into timestamp_test2 values
(0, 10.0),
(1000, 20.0),
(60000, 30.0);
-- SQLNESS SORT_RESULT 3 1
tql eval (0, 60, '30s') timestamp(timestamp_test) + timestamp(timestamp_test2);
-- SQLNESS SORT_RESULT 3 1
tql eval (0, 60, '30s') timestamp(timestamp_test) == timestamp(timestamp_test2);
drop table timestamp_test;
drop table timestamp_test2;

View File

@@ -57,7 +57,9 @@ EXPLAIN SELECT ts, host, min(val) RANGE '5s' FROM host ALIGN '5s';
+-+-+
| logical_plan_| RangeSelect: range_exprs=[min(host.val) RANGE 5s], align=5000ms, align_to=0ms, align_by=[host.host], time_index=ts |
|_|_Projection: host.ts, host.host, host.val_|
|_|_MergeScan [is_placeholder=false]_|
|_|_MergeScan [is_placeholder=false, remote_input=[_|
|_| TableScan: host_|
|_| ]]_|
| physical_plan | RangeSelectExec: range_expr=[min(host.val) RANGE 5s], align=5000ms, align_to=0ms, align_by=[host@1], time_index=ts |
|_|_CoalescePartitionsExec_|
|_|_MergeScanExec: REDACTED

View File

@@ -0,0 +1,208 @@
-- Test default values
-- SQLNESS PROTOCOL MYSQL
SELECT @@max_execution_time;
+----------------------+
| @@max_execution_time |
+----------------------+
| 0 |
+----------------------+
-- Test basic settings
-- Using global variable
-- SQLNESS PROTOCOL MYSQL
SET MAX_EXECUTION_TIME = 1000;
affected_rows: 0
-- SQLNESS PROTOCOL MYSQL
SELECT @@max_execution_time;
+----------------------+
| @@max_execution_time |
+----------------------+
| 1000 |
+----------------------+
-- Using session variable
-- SQLNESS PROTOCOL MYSQL
SET SESSION MAX_EXECUTION_TIME = 2000;
affected_rows: 0
-- SQLNESS PROTOCOL MYSQL
SELECT @@session.max_execution_time;
+------------------------------+
| @@session.max_execution_time |
+------------------------------+
| 2000 |
+------------------------------+
-- Test different formats
-- Using session variable
-- SQLNESS PROTOCOL MYSQL
SET @@SESSION.MAX_EXECUTION_TIME = 3000;
affected_rows: 0
-- SQLNESS PROTOCOL MYSQL
SELECT @@session.max_execution_time;
+------------------------------+
| @@session.max_execution_time |
+------------------------------+
| 3000 |
+------------------------------+
-- Using local variable
-- SQLNESS PROTOCOL MYSQL
SET LOCAL MAX_EXECUTION_TIME = 4000;
affected_rows: 0
-- SQLNESS PROTOCOL MYSQL
SELECT @@max_execution_time;
+----------------------+
| @@max_execution_time |
+----------------------+
| 4000 |
+----------------------+
-- Test case insensitivity
-- set
-- Lowercase
-- SQLNESS PROTOCOL MYSQL
set max_execution_time = 5000;
affected_rows: 0
-- SQLNESS PROTOCOL MYSQL
SELECT @@max_execution_time;
+----------------------+
| @@max_execution_time |
+----------------------+
| 5000 |
+----------------------+
-- Mixed case
-- SQLNESS PROTOCOL MYSQL
SET max_EXECUTION_time = 6000;
affected_rows: 0
-- SQLNESS PROTOCOL MYSQL
SELECT @@max_execution_time;
+----------------------+
| @@max_execution_time |
+----------------------+
| 6000 |
+----------------------+
-- Uppercase
-- SQLNESS PROTOCOL MYSQL
SET MAX_EXECUTION_TIME = 7000;
affected_rows: 0
-- SQLNESS PROTOCOL MYSQL
SELECT @@max_execution_time;
+----------------------+
| @@max_execution_time |
+----------------------+
| 7000 |
+----------------------+
-- select
-- Lowercase
-- SQLNESS PROTOCOL MYSQL
SET max_execution_time = 8000;
affected_rows: 0
-- SQLNESS PROTOCOL MYSQL
SELECT @@max_execution_time;
+----------------------+
| @@max_execution_time |
+----------------------+
| 8000 |
+----------------------+
-- Mixed case
-- SQLNESS PROTOCOL MYSQL
SET max_execution_time = 9000;
affected_rows: 0
-- SQLNESS PROTOCOL MYSQL
SELECT @@max_Execution_time;
+----------------------+
| @@max_execution_time |
+----------------------+
| 9000 |
+----------------------+
-- Uppercase
-- SQLNESS PROTOCOL MYSQL
SET max_execution_time = 10000;
affected_rows: 0
-- SQLNESS PROTOCOL MYSQL
SELECT @@MAX_EXECUTION_TIME;
+----------------------+
| @@max_execution_time |
+----------------------+
| 10000 |
+----------------------+
-- Test the boundary
-- Negative value (not allowed)
-- SQLNESS PROTOCOL MYSQL
SET max_execution_time = -1;
Failed to execute query, err: MySqlError { ERROR 1235 (42000): (Unsupported): Not supported: Unsupported timeout expr -1 in set variable statement }
-- Maximum value for u64
-- SQLNESS PROTOCOL MYSQL
SET max_execution_time = 18446744073709551615;
affected_rows: 0
-- SQLNESS PROTOCOL MYSQL
SELECT @@max_execution_time;
+----------------------+
| @@max_execution_time |
+----------------------+
| 18446744073709551615 |
+----------------------+
-- Maximum value for u64 + 1 (out of range)
-- SQLNESS PROTOCOL MYSQL
SET max_execution_time = 18446744073709551616;
Failed to execute query, err: MySqlError { ERROR 1235 (42000): (Unsupported): Not supported: Invalid timeout expr 18446744073709551616 in set variable statement }
-- Minimum value for u64
-- SQLNESS PROTOCOL MYSQL
SET max_execution_time = 0;
affected_rows: 0
-- SQLNESS PROTOCOL MYSQL
SELECT @@max_execution_time;
+----------------------+
| @@max_execution_time |
+----------------------+
| 0 |
+----------------------+
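A hedged usage sketch (the table name is hypothetical and this is not part of the test): once a non-zero value is set on a MySQL-protocol connection, later statements on that connection are expected to be cancelled when they exceed the budget, which is expressed in milliseconds:

-- SQLNESS PROTOCOL MYSQL
SET MAX_EXECUTION_TIME = 1000;
-- A subsequent long-running statement on the same connection, e.g.
-- SELECT count(*) FROM some_large_table;
-- should be cut off once it runs past roughly one second.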

View File

@@ -0,0 +1,101 @@
-- Test default values
-- SQLNESS PROTOCOL MYSQL
SELECT @@max_execution_time;
-- Test basic settings
-- Using global variable
-- SQLNESS PROTOCOL MYSQL
SET MAX_EXECUTION_TIME = 1000;
-- SQLNESS PROTOCOL MYSQL
SELECT @@max_execution_time;
-- Using session variable
-- SQLNESS PROTOCOL MYSQL
SET SESSION MAX_EXECUTION_TIME = 2000;
-- SQLNESS PROTOCOL MYSQL
SELECT @@session.max_execution_time;
-- Test different formats
-- Using session variable
-- SQLNESS PROTOCOL MYSQL
SET @@SESSION.MAX_EXECUTION_TIME = 3000;
-- SQLNESS PROTOCOL MYSQL
SELECT @@session.max_execution_time;
-- Using local variable
-- SQLNESS PROTOCOL MYSQL
SET LOCAL MAX_EXECUTION_TIME = 4000;
-- SQLNESS PROTOCOL MYSQL
SELECT @@max_execution_time;
-- Test case insensitivity
-- set
-- Lowercase
-- SQLNESS PROTOCOL MYSQL
set max_execution_time = 5000;
-- SQLNESS PROTOCOL MYSQL
SELECT @@max_execution_time;
-- Mixed case
-- SQLNESS PROTOCOL MYSQL
SET max_EXECUTION_time = 6000;
-- SQLNESS PROTOCOL MYSQL
SELECT @@max_execution_time;
-- Uppercase
-- SQLNESS PROTOCOL MYSQL
SET MAX_EXECUTION_TIME = 7000;
-- SQLNESS PROTOCOL MYSQL
SELECT @@max_execution_time;
-- select
-- Lowercase
-- SQLNESS PROTOCOL MYSQL
SET max_execution_time = 8000;
-- SQLNESS PROTOCOL MYSQL
SELECT @@max_execution_time;
-- Mixed case
-- SQLNESS PROTOCOL MYSQL
SET max_execution_time = 9000;
-- SQLNESS PROTOCOL MYSQL
SELECT @@max_Execution_time;
-- Uppercase
-- SQLNESS PROTOCOL MYSQL
SET max_execution_time = 10000;
-- SQLNESS PROTOCOL MYSQL
SELECT @@MAX_EXECUTION_TIME;
-- Test the boundary
-- Negative value (not allowed)
-- SQLNESS PROTOCOL MYSQL
SET max_execution_time = -1;
-- Maximum value for u64
-- SQLNESS PROTOCOL MYSQL
SET max_execution_time = 18446744073709551615;
-- SQLNESS PROTOCOL MYSQL
SELECT @@max_execution_time;
-- Maximum value for u64 + 1 (out of range)
-- SQLNESS PROTOCOL MYSQL
SET max_execution_time = 18446744073709551616;
-- Minimum value for u64
-- SQLNESS PROTOCOL MYSQL
SET max_execution_time = 0;
-- SQLNESS PROTOCOL MYSQL
SELECT @@max_execution_time;

View File

@@ -12,13 +12,19 @@ Affected Rows: 3
-- SQLNESS REPLACE (peers.*) REDACTED
TQL EXPLAIN (0, 10, '5s') test;
+---------------+-------------------------------------------------+
| plan_type | plan |
+---------------+-------------------------------------------------+
| logical_plan | MergeScan [is_placeholder=false] |
+---------------+--------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+---------------+--------------------------------------------------------------------------------------------------------------+
| logical_plan | MergeScan [is_placeholder=false, remote_input=[ |
| | PromInstantManipulate: range=[0..0], lookback=[300000], interval=[300000], time index=[j] |
| | PromSeriesDivide: tags=["k"] |
| | Sort: test.k ASC NULLS FIRST, test.j ASC NULLS FIRST |
| | Filter: test.j >= TimestampMillisecond(-300000, None) AND test.j <= TimestampMillisecond(300000, None) |
| | TableScan: test |
| | ]] |
| physical_plan | MergeScanExec: REDACTED
| | |
+---------------+-------------------------------------------------+
| | |
+---------------+--------------------------------------------------------------------------------------------------------------+
-- The 'lookback' parameter is not fully supported; the test has to be updated once it is.
-- explain at 0s, 5s and 10s. No point at 0s.
@@ -26,26 +32,38 @@ TQL EXPLAIN (0, 10, '5s') test;
-- SQLNESS REPLACE (peers.*) REDACTED
TQL EXPLAIN (0, 10, '1s', '2s') test;
+---------------+-------------------------------------------------+
| plan_type | plan |
+---------------+-------------------------------------------------+
| logical_plan | MergeScan [is_placeholder=false] |
+---------------+----------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+---------------+----------------------------------------------------------------------------------------------------------+
| logical_plan | MergeScan [is_placeholder=false, remote_input=[ |
| | PromInstantManipulate: range=[0..0], lookback=[2000], interval=[300000], time index=[j] |
| | PromSeriesDivide: tags=["k"] |
| | Sort: test.k ASC NULLS FIRST, test.j ASC NULLS FIRST |
| | Filter: test.j >= TimestampMillisecond(-2000, None) AND test.j <= TimestampMillisecond(2000, None) |
| | TableScan: test |
| | ]] |
| physical_plan | MergeScanExec: REDACTED
| | |
+---------------+-------------------------------------------------+
| | |
+---------------+----------------------------------------------------------------------------------------------------------+
-- explain at 0s, 5s and 10s. No point at 0s.
-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
-- SQLNESS REPLACE (peers.*) REDACTED
TQL EXPLAIN ('1970-01-01T00:00:00'::timestamp, '1970-01-01T00:00:00'::timestamp + '10 seconds'::interval, '5s') test;
+---------------+-------------------------------------------------+
| plan_type | plan |
+---------------+-------------------------------------------------+
| logical_plan | MergeScan [is_placeholder=false] |
+---------------+--------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+---------------+--------------------------------------------------------------------------------------------------------------+
| logical_plan | MergeScan [is_placeholder=false, remote_input=[ |
| | PromInstantManipulate: range=[0..0], lookback=[300000], interval=[300000], time index=[j] |
| | PromSeriesDivide: tags=["k"] |
| | Sort: test.k ASC NULLS FIRST, test.j ASC NULLS FIRST |
| | Filter: test.j >= TimestampMillisecond(-300000, None) AND test.j <= TimestampMillisecond(300000, None) |
| | TableScan: test |
| | ]] |
| physical_plan | MergeScanExec: REDACTED
| | |
+---------------+-------------------------------------------------+
| | |
+---------------+--------------------------------------------------------------------------------------------------------------+
-- explain verbose at 0s, 5s and 10s. No point at 0s.
-- SQLNESS REPLACE (-+) -
@@ -71,7 +89,13 @@ TQL EXPLAIN VERBOSE (0, 10, '5s') test;
| logical_plan after resolve_grouping_function_| SAME TEXT AS ABOVE_|
| logical_plan after type_coercion_| SAME TEXT AS ABOVE_|
| logical_plan after DistPlannerAnalyzer_| Projection: test.i, test.j, test.k_|
|_|_MergeScan [is_placeholder=false]_|
|_|_MergeScan [is_placeholder=false, remote_input=[_|
|_| PromInstantManipulate: range=[0..0], lookback=[300000], interval=[300000], time index=[j]_|
|_|_PromSeriesDivide: tags=["k"]_|
|_|_Sort: test.k ASC NULLS FIRST, test.j ASC NULLS FIRST_|
|_|_Filter: test.j >= TimestampMillisecond(-300000, None) AND test.j <= TimestampMillisecond(300000, None)_|
|_|_TableScan: test_|
|_| ]]_|
| analyzed_logical_plan_| SAME TEXT AS ABOVE_|
| logical_plan after eliminate_nested_union_| SAME TEXT AS ABOVE_|
| logical_plan after simplify_expressions_| SAME TEXT AS ABOVE_|
@@ -97,7 +121,13 @@ TQL EXPLAIN VERBOSE (0, 10, '5s') test;
| logical_plan after unwrap_cast_in_comparison_| SAME TEXT AS ABOVE_|
| logical_plan after common_sub_expression_eliminate_| SAME TEXT AS ABOVE_|
| logical_plan after eliminate_group_by_constant_| SAME TEXT AS ABOVE_|
| logical_plan after optimize_projections_| MergeScan [is_placeholder=false]_|
| logical_plan after optimize_projections_| MergeScan [is_placeholder=false, remote_input=[_|
|_| PromInstantManipulate: range=[0..0], lookback=[300000], interval=[300000], time index=[j]_|
|_|_PromSeriesDivide: tags=["k"]_|
|_|_Sort: test.k ASC NULLS FIRST, test.j ASC NULLS FIRST_|
|_|_Filter: test.j >= TimestampMillisecond(-300000, None) AND test.j <= TimestampMillisecond(300000, None)_|
|_|_TableScan: test_|
|_| ]]_|
| logical_plan after ScanHintRule_| SAME TEXT AS ABOVE_|
| logical_plan after eliminate_nested_union_| SAME TEXT AS ABOVE_|
| logical_plan after simplify_expressions_| SAME TEXT AS ABOVE_|
@@ -125,7 +155,13 @@ TQL EXPLAIN VERBOSE (0, 10, '5s') test;
| logical_plan after eliminate_group_by_constant_| SAME TEXT AS ABOVE_|
| logical_plan after optimize_projections_| SAME TEXT AS ABOVE_|
| logical_plan after ScanHintRule_| SAME TEXT AS ABOVE_|
| logical_plan_| MergeScan [is_placeholder=false]_|
| logical_plan_| MergeScan [is_placeholder=false, remote_input=[_|
|_| PromInstantManipulate: range=[0..0], lookback=[300000], interval=[300000], time index=[j]_|
|_|_PromSeriesDivide: tags=["k"]_|
|_|_Sort: test.k ASC NULLS FIRST, test.j ASC NULLS FIRST_|
|_|_Filter: test.j >= TimestampMillisecond(-300000, None) AND test.j <= TimestampMillisecond(300000, None)_|
|_|_TableScan: test_|
|_| ]]_|
| initial_physical_plan_| MergeScanExec: REDACTED
|_|_|
| initial_physical_plan_with_stats_| MergeScanExec: REDACTED

View File

@@ -5,23 +5,37 @@ SELECT * FROM (SELECT SUM(number) FROM numbers LIMIT 100000000000) LIMIT 0;
EXPLAIN SELECT * FROM (SELECT SUM(number) FROM numbers LIMIT 100000000000) LIMIT 0;
+---------------+----------------------------------+
| plan_type | plan |
+---------------+----------------------------------+
| logical_plan | MergeScan [is_placeholder=false] |
| physical_plan | EmptyExec |
| | |
+---------------+----------------------------------+
+---------------+-------------------------------------------------------------------------------+
| plan_type | plan |
+---------------+-------------------------------------------------------------------------------+
| logical_plan | MergeScan [is_placeholder=false, remote_input=[ |
| | Limit: skip=0, fetch=0 |
| | Projection: sum(numbers.number) |
| | Limit: skip=0, fetch=100000000000 |
| | Projection: sum(numbers.number) |
| | Aggregate: groupBy=[[]], aggr=[[sum(CAST(numbers.number AS UInt64))]] |
| | TableScan: numbers |
| | ]] |
| physical_plan | EmptyExec |
| | |
+---------------+-------------------------------------------------------------------------------+
EXPLAIN SELECT * FROM (SELECT SUM(number) FROM numbers LIMIT 100000000000) WHERE 1=0;
+---------------+----------------------------------+
| plan_type | plan |
+---------------+----------------------------------+
| logical_plan | MergeScan [is_placeholder=false] |
| physical_plan | EmptyExec |
| | |
+---------------+----------------------------------+
+---------------+-------------------------------------------------------------------------------+
| plan_type | plan |
+---------------+-------------------------------------------------------------------------------+
| logical_plan | MergeScan [is_placeholder=false, remote_input=[ |
| | Projection: sum(numbers.number) |
| | Filter: Int64(1) = Int64(0) |
| | Limit: skip=0, fetch=100000000000 |
| | Projection: sum(numbers.number) |
| | Aggregate: groupBy=[[]], aggr=[[sum(CAST(numbers.number AS UInt64))]] |
| | TableScan: numbers |
| | ]] |
| physical_plan | EmptyExec |
| | |
+---------------+-------------------------------------------------------------------------------+
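As a quick sanity check (a sketch, not part of this diff): both statements are expected to return zero rows, which is why the physical plan collapses to EmptyExec even though the logical MergeScan now records the full remote_input plan:

-- each of these should produce an empty result set against the built-in numbers table
SELECT * FROM (SELECT SUM(number) FROM numbers LIMIT 100000000000) LIMIT 0;
SELECT * FROM (SELECT SUM(number) FROM numbers LIMIT 100000000000) WHERE 1=0;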
CREATE TABLE test (a TIMESTAMP TIME INDEX, b INTEGER);

View File

@@ -3,7 +3,10 @@ explain select * from numbers;
+---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| logical_plan | MergeScan [is_placeholder=false] |
| logical_plan | MergeScan [is_placeholder=false, remote_input=[ |
| | Projection: numbers.number |
| | TableScan: numbers |
| | ]] |
| physical_plan | StreamScanAdapter: [<SendableRecordBatchStream>], schema: [Schema { fields: [Field { name: "number", data_type: UInt32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }], metadata: {"greptime:version": "0"} }] |
| | |
+---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
@@ -13,7 +16,11 @@ explain select * from numbers order by number desc;
+---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| logical_plan | MergeScan [is_placeholder=false] |
| logical_plan | MergeScan [is_placeholder=false, remote_input=[ |
| | Sort: numbers.number DESC NULLS FIRST |
| | Projection: numbers.number |
| | TableScan: numbers |
| | ]] |
| physical_plan | SortExec: expr=[number@0 DESC], preserve_partitioning=[false] |
| | StreamScanAdapter: [<SendableRecordBatchStream>], schema: [Schema { fields: [Field { name: "number", data_type: UInt32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }], metadata: {"greptime:version": "0"} }] |
| | |
@@ -24,7 +31,11 @@ explain select * from numbers order by number asc;
+---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| logical_plan | MergeScan [is_placeholder=false] |
| logical_plan | MergeScan [is_placeholder=false, remote_input=[ |
| | Sort: numbers.number ASC NULLS LAST |
| | Projection: numbers.number |
| | TableScan: numbers |
| | ]] |
| physical_plan | SortExec: expr=[number@0 ASC NULLS LAST], preserve_partitioning=[false] |
| | StreamScanAdapter: [<SendableRecordBatchStream>], schema: [Schema { fields: [Field { name: "number", data_type: UInt32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }], metadata: {"greptime:version": "0"} }] |
| | |
@@ -35,7 +46,12 @@ explain select * from numbers order by number desc limit 10;
+---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| logical_plan | MergeScan [is_placeholder=false] |
| logical_plan | MergeScan [is_placeholder=false, remote_input=[ |
| | Limit: skip=0, fetch=10 |
| | Sort: numbers.number DESC NULLS FIRST |
| | Projection: numbers.number |
| | TableScan: numbers |
| | ]] |
| physical_plan | SortExec: TopK(fetch=10), expr=[number@0 DESC], preserve_partitioning=[false] |
| | StreamScanAdapter: [<SendableRecordBatchStream>], schema: [Schema { fields: [Field { name: "number", data_type: UInt32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }], metadata: {"greptime:version": "0"} }] |
| | |
@@ -46,7 +62,12 @@ explain select * from numbers order by number asc limit 10;
+---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| logical_plan | MergeScan [is_placeholder=false] |
| logical_plan | MergeScan [is_placeholder=false, remote_input=[ |
| | Limit: skip=0, fetch=10 |
| | Sort: numbers.number ASC NULLS LAST |
| | Projection: numbers.number |
| | TableScan: numbers |
| | ]] |
| physical_plan | SortExec: TopK(fetch=10), expr=[number@0 ASC NULLS LAST], preserve_partitioning=[false] |
| | StreamScanAdapter: [<SendableRecordBatchStream>], schema: [Schema { fields: [Field { name: "number", data_type: UInt32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }], metadata: {"greptime:version": "0"} }] |
| | |
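To reproduce the four ORDER BY cases above, the statements from the hunk headers can be run directly against the built-in numbers table; each is now expected to print the MergeScan together with its remote_input block as shown:

-- statements taken from the hunk headers above; output should match the updated .result tables
explain select * from numbers order by number desc;
explain select * from numbers order by number asc;
explain select * from numbers order by number desc limit 10;
explain select * from numbers order by number asc limit 10;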