Compare commits

...

27 Commits

Author SHA1 Message Date
zyy17
0847ff36ce fix: config test failed and use similar_asserts::assert_eq to replace assert_eq for long string compare (#4731)
* fix: config test failed and use 'similar_asserts::assert_eq' to replace 'assert_eq' for long string compare

* Update Cargo.toml

Co-authored-by: Yingwen <realevenyag@gmail.com>

* Update src/cmd/tests/load_config_test.rs

Co-authored-by: Yingwen <realevenyag@gmail.com>

---------

Co-authored-by: Ruihang Xia <waynestxia@gmail.com>
Co-authored-by: Yingwen <realevenyag@gmail.com>
2024-09-18 07:41:25 +00:00
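For readers unfamiliar with the crate added here: `similar_asserts::assert_eq!` is a drop-in replacement for the standard `assert_eq!` that prints a diff on mismatch instead of dumping both values in full, which is much easier to read when comparing long strings such as rendered config files. A minimal, illustrative test (not code from the PR):

```rust
// The `use` shadows the std `assert_eq!` macro with the diff-printing one
// from the `similar-asserts` crate added in this compare.
use similar_asserts::assert_eq;

#[test]
fn compare_long_strings_with_diff_output() {
    let expected = "[wal]\nprovider = \"raft_engine\"\nfile_size = \"256MB\"\n";
    let actual = "[wal]\nprovider = \"raft_engine\"\nfile_size = \"256MB\"\n";
    // On failure this prints a unified diff of the two strings rather than
    // printing both strings in full, as std's assert_eq! would.
    assert_eq!(expected, actual);
}
```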
shuiyisong
c014e875f3 chore: add auto-decompression layer for otlp http request (#4723)
* chore: add auto-decompression for http request

* test: otlp
2024-09-18 04:32:00 +00:00
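The decompression layer itself is wired into the HTTP server setup; as a rough sketch of the operation it performs (not the actual middleware, and the function below is made up), a gzip-encoded request body is inflated with flate2 before being handed to the protobuf decoder, while uncompressed bodies pass through untouched. (`flate2` also appears in the test dependencies in the Cargo.lock diff below.)

```rust
use std::io::Read;

use flate2::read::GzDecoder;

/// Illustrative only: inflate the body when the client sent
/// `Content-Encoding: gzip`; otherwise return the bytes unchanged.
fn maybe_decompress(body: &[u8], content_encoding: Option<&str>) -> std::io::Result<Vec<u8>> {
    match content_encoding {
        Some("gzip") => {
            let mut decoder = GzDecoder::new(body);
            let mut decompressed = Vec::new();
            decoder.read_to_end(&mut decompressed)?;
            Ok(decompressed)
        }
        _ => Ok(body.to_vec()),
    }
}
```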
Zhenchi
3b5b906543 feat(index): add explicit adapter between RangeReader and AsyncRead (#4724)
Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>
2024-09-18 03:33:55 +00:00
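The shape of such an adapter, sketched with hypothetical names (these are not the crate's actual trait or struct definitions): a range-based reader can serve arbitrary byte ranges of an object, and the adapter turns it into a sequential `futures::io::AsyncRead` by tracking the current read offset. For simplicity this sketch assumes the underlying range read is synchronous; the real adapter has to drive an async read future inside `poll_read`.

```rust
use std::io;
use std::pin::Pin;
use std::task::{Context, Poll};

use futures::io::AsyncRead;

/// Hypothetical stand-in for a range-based reader: it can serve any byte
/// range of an object and knows the object's total length.
pub trait BlockingRangeReader {
    fn read_at(&self, offset: u64, buf: &mut [u8]) -> io::Result<usize>;
    fn total_len(&self) -> u64;
}

/// Adapter exposing a range reader as a sequential `AsyncRead` by keeping
/// track of the current offset.
pub struct AsyncReadAdapter<R> {
    inner: R,
    offset: u64,
}

impl<R: BlockingRangeReader + Unpin> AsyncRead for AsyncReadAdapter<R> {
    fn poll_read(
        self: Pin<&mut Self>,
        _cx: &mut Context<'_>,
        buf: &mut [u8],
    ) -> Poll<io::Result<usize>> {
        let this = self.get_mut();
        if this.offset >= this.inner.total_len() {
            return Poll::Ready(Ok(0)); // EOF
        }
        let result = this.inner.read_at(this.offset, buf);
        if let Ok(n) = &result {
            this.offset += *n as u64;
        }
        Poll::Ready(result)
    }
}
```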
Weny Xu
d1dfffcdaf chore: enable fuzz test for append table (#4702)
* chore: enable fuzz test for append table

* fix: fix mysql translator
2024-09-18 03:01:30 +00:00
localhost
36b1bafbf0 fix: pipeline dissect error is returned directly to the user, instead of printing a warn log (#4709)
* fix: pipeline dissect error is returned directly to the user, instead of printing a warn log

* chore: add more test for pipeline
2024-09-12 18:21:05 +00:00
Yohan Wal
67fb3d003e feat: add respective get_by_path UDFs for JSON type (#4720)
* feat: add respective get_by_path UDF for JSON type

* Apply review comments

Co-authored-by: Weny Xu <wenymedia@gmail.com>

* fix: fix compile error

* refactor: change name of UDFs, add some tests

---------

Co-authored-by: Weny Xu <wenymedia@gmail.com>
2024-09-11 08:17:57 +00:00
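Conceptually, these UDFs extract a nested value from a JSON column by path, backed by the binary jsonb encoding added elsewhere in this compare. A rough illustration of the idea using `serde_json`'s JSON Pointer support (not the UDF code; the jsonb-based implementation uses its own path syntax):

```rust
use serde_json::Value;

/// Illustrative only: parse a JSON document and extract the value at a
/// JSON Pointer path, returning None when the path does not match.
fn get_by_pointer(doc: &str, pointer: &str) -> Option<Value> {
    let value: Value = serde_json::from_str(doc).ok()?;
    value.pointer(pointer).cloned()
}

fn main() {
    let doc = r#"{"host": {"name": "dn-1", "tags": ["prod", "db"]}}"#;
    assert_eq!(
        get_by_pointer(doc, "/host/tags/0"),
        Some(Value::String("prod".to_string()))
    );
}
```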
zyy17
aa03d3b11c docs: use docs comment prefix and bump toml2docs version (#4711) 2024-09-11 07:49:23 +00:00
discord9
a3d567f0c9 perf(flow): use batch mode for flow (#4599)
* generic bundle trait

* feat: impl get/let

* fix: drop batch

* test: tumble batch

* feat: use batch eval flow

* fix: div use arrow::div not mul

* perf: not append batch

* perf: use bool mask for reduce

* perf: tiny opt

* perf: refactor slow path

* feat: opt if then

* fix: WIP

* perf: if then

* chore: use trace instead

* fix: reduce missing non-first batch

* perf: flow if then using interleave

* docs: add TODO

* perf: remove unnecessary eq

* chore: remove unused import

* fix: run_available no longer loop forever

* feat: blocking on high input buf

* chore: increase threshold

* chore: after rebase

* chore: per review

* chore: per review

* fix: allow empty values in reduce&test

* tests: more flow doc example tests

* chore: per review

* chore: per review
2024-09-11 03:31:52 +00:00
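One bullet above, "perf: flow if then using interleave", refers to arrow's `interleave` kernel, which assembles an output array by picking, for each output row, an element from one of several source arrays. A hedged sketch of how an if-then over two pre-evaluated branch batches can be built that way (the function and data here are made up for illustration):

```rust
use std::sync::Arc;

use arrow::array::{ArrayRef, BooleanArray, Int64Array};
use arrow::compute::interleave;
use arrow::error::ArrowError;

/// Illustrative if-then over pre-evaluated branches: for each row, take the
/// value from `then_branch` when the predicate is true, else from `else_branch`.
fn if_then(
    predicate: &BooleanArray,
    then_branch: &ArrayRef,
    else_branch: &ArrayRef,
) -> Result<ArrayRef, ArrowError> {
    // `interleave` consumes (array_index, row_index) pairs:
    // array 0 is the then-branch, array 1 is the else-branch.
    let indices: Vec<(usize, usize)> = predicate
        .iter()
        .enumerate()
        .map(|(row, cond)| if cond.unwrap_or(false) { (0, row) } else { (1, row) })
        .collect();
    interleave(&[then_branch.as_ref(), else_branch.as_ref()], &indices)
}

fn main() -> Result<(), ArrowError> {
    let predicate = BooleanArray::from(vec![true, false, true]);
    let then_branch: ArrayRef = Arc::new(Int64Array::from(vec![1, 2, 3]));
    let else_branch: ArrayRef = Arc::new(Int64Array::from(vec![10, 20, 30]));
    let result = if_then(&predicate, &then_branch, &else_branch)?;
    println!("{result:?}"); // values: 1, 20, 3
    Ok(())
}
```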
Zhenchi
f252599ac6 feat(index): add RangeReader trait (#4718)
* feat(index): add `RangeReader` trait

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* fix: return content_length as read bytes

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* chore: remove buffer & use `BufMut`

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

---------

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>
2024-09-10 15:24:06 +00:00
Ruihang Xia
ff40d512bd fix: support append-only physical table (#4716)
* fix: support append-only physical table

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* Update src/metric-engine/src/engine/create.rs

Co-authored-by: jeremyhi <jiachun_feng@proton.me>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
Co-authored-by: Weny Xu <wenymedia@gmail.com>
Co-authored-by: jeremyhi <jiachun_feng@proton.me>
2024-09-10 12:23:23 +00:00
jeremyhi
dcae21208b chore: refresh route table (#4673)
* chore: remove error::

* chore: avoid to use get_raw if unnecessary

* chore: clearer method name

* feat: remap node addresses in table route

* chore: add unit test for remap address

* feat: refresh node address mapping via heartbeat

* feat: broadcast table cache invalidate on new epoch

* chore: clarify heartbeat log

* chore: remove InvalidHeartbeatRequest

* chore: add log

* feat: add role into NodeAddressKey

* chore: fix test

* Update src/common/meta/src/key/table_route.rs

Co-authored-by: LFC <990479+MichaelScofield@users.noreply.github.com>

* chore: simplify code

---------

Co-authored-by: LFC <990479+MichaelScofield@users.noreply.github.com>
2024-09-10 12:08:59 +00:00
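The "remap node addresses in table route" step amounts to rewriting stale peer addresses from the freshest id-to-address mapping reported via heartbeats. A hedged sketch with made-up types (the actual code operates on the metasrv's table route metadata):

```rust
use std::collections::HashMap;

/// Made-up peer type for illustration: a datanode id plus its advertised address.
#[derive(Debug, Clone, PartialEq)]
struct Peer {
    id: u64,
    addr: String,
}

/// Rewrite peer addresses using the latest id -> address mapping; peers
/// without a newer address are left untouched.
fn remap_addresses(peers: &mut [Peer], latest: &HashMap<u64, String>) {
    for peer in peers.iter_mut() {
        if let Some(addr) = latest.get(&peer.id) {
            peer.addr.clone_from(addr);
        }
    }
}

fn main() {
    let mut peers = vec![Peer { id: 1, addr: "10.0.0.1:4001".to_string() }];
    let latest = HashMap::from([(1u64, "10.0.0.9:4001".to_string())]);
    remap_addresses(&mut peers, &latest);
    assert_eq!(peers[0].addr, "10.0.0.9:4001");
}
```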
Weny Xu
d0fd79ac7f chore: remove validate_request_with_table (#4710)
perf: remove `validate_request_with_table`
2024-09-10 11:56:18 +00:00
Yingwen
3e17c09e45 feat: skip caching uncompressed pages if they are large (#4705)
* feat: cache each uncompressed page

* chore: remove unused function

* chore: log

* chore: log

* chore: row group pages cache kv

* feat: also support row group level cache

* chore: fix range count

* feat: don't cache compressed page for row group cache

* feat: use function to get part

* chore: log whether scan is from compaction

* chore: avoid get column

* feat: add timer metrics

* chore: Revert "feat: add timer metrics"

This reverts commit 4618f57fa2ba13b1e1a8dec83afd01c00ae4c867.

* feat: don't cache individual uncompressed page

* feat: append in row group level under append mode

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* chore: fetch pages cost

* perf: yield

* Update src/mito2/src/sst/parquet/row_group.rs

* refactor: cache key

* feat: print file num and row groups num in explain

* test: update sqlness test

* chore: Update src/mito2/src/sst/parquet/page_reader.rs

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
Co-authored-by: Ruihang Xia <waynestxia@gmail.com>
2024-09-10 11:52:16 +00:00
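The key idea in the title is a size cutoff: an uncompressed page is only worth caching while it is small, since one huge page can evict many small, frequently-hit pages. A simplified sketch of that decision with made-up names and threshold (the real cache in this PR works at row-group granularity, as the bullets above describe):

```rust
use std::collections::HashMap;

/// Illustrative page cache that skips large uncompressed pages entirely.
struct PageCache {
    max_uncompressed_page_size: usize,
    pages: HashMap<(u64, usize), Vec<u8>>, // (file_id, page_index) -> page bytes
}

impl PageCache {
    fn maybe_insert(&mut self, key: (u64, usize), uncompressed: Vec<u8>) {
        if uncompressed.len() <= self.max_uncompressed_page_size {
            self.pages.insert(key, uncompressed);
        }
        // Otherwise skip caching: the page can be re-read (and decoded again)
        // on the next access instead of crowding out smaller pages.
    }
}
```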
jeremyhi
04de3ed929 chore: avoid schema check when auto_create_table_hint is disabled (#4712)
chore: avoid schema check when auto-create-table-hint is disabled
2024-09-10 07:13:28 +00:00
Ruihang Xia
29f215531a feat: parallel in row group level under append mode (#4704)
feat: append in row group level under append mode

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2024-09-10 07:12:23 +00:00
jeremyhi
545a80c6e0 chore: remove unused method (#4703) 2024-09-09 12:14:17 +00:00
Yohan Wal
04e7dd6fd5 feat: add json data type (#4619)
* feat: add json type and vector

* fix: allow to create and insert json data

* feat: udf to query json as string

* refactor: remove JsonbValue and JsonVector

* feat: show json value as strings

* chore: make ci happy

* test: add unit test and sqlness test

* refactor: use binary as grpc value of json

* fix: use non-preserve-order jsonb

* test: revert changed test

* refactor: change udf get_by_path to jq

* chore: make ci happy

* fix: distinguish binary and json in proto

* chore: delete udf for future pr

* refactor: remove Value(Json)

* chore: follow review comments

* test: some tests and checks

* test: fix unit tests

* chore: follow review comments

* chore: corresponding changes to proto

* fix: change grpc and pgsql server behavior alongside with sqlness/crud tests

* chore: follow review comments

* feat: udf of conversions between json and strings, used for grpc server

* refactor: rename to_string to json_to_string

* test: add more sqlness test for json

* chore: thanks for review :)

* Apply suggestions from code review

---------

Co-authored-by: Weny Xu <wenymedia@gmail.com>
2024-09-09 11:41:36 +00:00
jeremyhi
dc89944570 feat: gRPC auto create table hint (#4700)
* feat: gRPC auto create table hint

* chore: remove the checking of auto_create_table_hint
2024-09-09 09:07:07 +00:00
Weny Xu
8bf549c2fa chore: print downgraded region last_entry_id (#4701) 2024-09-09 08:14:55 +00:00
Lei, HUANG
208afe402b feat(wal): increase recovery parallelism (#4689)
* Refactor RaftEngineLogStore to use references for config

 - Updated `RaftEngineLogStore::try_new` to accept a reference to `RaftEngineConfig` instead of taking ownership.
 - Replaced direct usage of `config` with individual fields (`sync_write`, `sync_period`, `read_batch_size`).
 - Adjusted test cases to pass references to `RaftEngineConfig`.

* Add parallelism configuration for WAL recovery

 - Introduced `recovery_parallelism` setting in `datanode.example.toml` and `standalone.example.toml` for configuring parallelism during WAL recovery.
 - Updated `Cargo.lock` and `Cargo.toml` to include `num_cpus` dependency.
 - Modified `RaftEngineConfig` to include `recovery_parallelism` with a default value set to the number of CPU cores.

* feat/wal-recovery-parallelism:
 Add `wal.recovery_parallelism` configuration option

 - Introduced `wal.recovery_parallelism` to config.md for specifying parallelism during WAL recovery.
 - Updated `RaftEngineLogStore` to include `recovery_threads` from the new configuration.

* fix: ut
2024-09-09 04:25:24 +00:00
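The new `wal.recovery_parallelism` option also appears in the config.md table further down (documented default `2`). As a hedged sketch of how such a knob is typically declared with a serde default, using approximate field names rather than the exact GreptimeDB structs:

```rust
use serde::Deserialize;

fn default_recovery_parallelism() -> usize {
    2
}

/// Approximate shape of the raft-engine WAL options with the new knob; the
/// real struct lives in the log-store crate and carries many more fields.
#[derive(Debug, Deserialize)]
struct RaftEngineWalConfig {
    /// Number of threads used to replay WAL entries on startup.
    #[serde(default = "default_recovery_parallelism")]
    recovery_parallelism: usize,
}

fn main() {
    // The field falls back to its default when omitted from the TOML file.
    let cfg: RaftEngineWalConfig = toml::from_str("").unwrap();
    assert_eq!(cfg.recovery_parallelism, 2);
}
```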
Ning Sun
c22a398f59 fix: return version string based on request protocol (#4680)
* fix: return version string based on request protocol

* fix: resolve lint issue
2024-09-09 03:36:54 +00:00
JohnsonLee
a8477e4142 fix: table resolving logic related to pg_catalog (#4580)
* fix: table resolving logic related to pg_catalog

refer to
https://github.com/GreptimeTeam/greptimedb/issues/3560#issuecomment-2287794348
and #4543

* refactor: remove CatalogProtocol type

* fix: sqlness

* fix: forbid create database pg_catalog with mysql client

* refactor: use QueryContext as arguments rather than Channel

* refactor: pass None as default behaviour in information_schema

* test: fix test
2024-09-09 00:47:59 +00:00
Yiran
b950e705f5 chore: update the document link in README.md (#4690) 2024-09-07 15:27:32 +00:00
Ruihang Xia
d2d62e0c6f fix: unconditional statistics (#4694)
* fix: unconditional statistics

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* add more sqlness case

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2024-09-07 04:28:11 +00:00
localhost
5d9f8a3be7 feat: add test pipeline api (#4667)
* chore: add test pipeline api

* chore: add test for test pipeline api

* chore: fix taplo check

* chore: change pipeline dryrun api path

* chore: add more info for pipeline dryrun api
2024-09-06 08:36:49 +00:00
jeremyhi
e88465840d feat: add extension field to HeartbeatRequest (#4688)
* feat: add extension field to HeartbeatRequest

* chore: extension to extensions

* chore: upgrade proto
2024-09-06 08:29:20 +00:00
localhost
67d95d2088 refactor!: add processor builder and transform builder (#4571)
* chore: add processor builder and transform builder

* chore: in process

* chore: intermediate state from hashmap to vector in pipeline

* chore: remove useless code and rename some struct

* chore: fix typos

* chore: format code

* chore: add error handling and optimize code readability

* chore: fix typos

* chore: remove useless code

* chore: add some doc

* chore: fix by pr commit

* chore: remove useless code and change struct name

* chore: modify the location of the find_key_index function.
2024-09-06 07:51:08 +00:00
218 changed files with 8931 additions and 4416 deletions

Cargo.lock (generated)

@@ -1175,6 +1175,17 @@ dependencies = [
"regex-automata 0.1.10",
]
[[package]]
name = "bstr"
version = "1.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "40723b8fb387abc38f4f4a37c09073622e41dd12327033091ef8950659e6dc0c"
dependencies = [
"memchr",
"regex-automata 0.4.7",
"serde",
]
[[package]]
name = "btoi"
version = "0.4.3"
@@ -1761,6 +1772,7 @@ dependencies = [
"serde_json",
"servers",
"session",
"similar-asserts",
"snafu 0.8.4",
"store-api",
"substrait 0.9.3",
@@ -1812,10 +1824,12 @@ name = "common-base"
version = "0.9.3"
dependencies = [
"anymap",
"async-trait",
"bitvec",
"bytes",
"common-error",
"common-macro",
"futures",
"paste",
"serde",
"snafu 0.8.4",
@@ -1952,6 +1966,7 @@ dependencies = [
"datatypes",
"geohash",
"h3o",
"jsonb",
"num",
"num-traits",
"once_cell",
@@ -2293,6 +2308,7 @@ dependencies = [
"common-telemetry",
"futures-util",
"humantime-serde",
"num_cpus",
"rskafka",
"rustls 0.23.10",
"rustls-native-certs",
@@ -3156,6 +3172,7 @@ dependencies = [
"arrow",
"arrow-array",
"arrow-schema",
"base64 0.21.7",
"common-base",
"common-decimal",
"common-error",
@@ -3164,6 +3181,8 @@ dependencies = [
"common-time",
"datafusion-common",
"enum_dispatch",
"greptime-proto",
"jsonb",
"num",
"num-traits",
"ordered-float 3.9.2",
@@ -3696,6 +3715,12 @@ version = "0.1.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a"
[[package]]
name = "fast-float"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "95765f67b4b18863968b4a1bd5bb576f732b29a4a28c7cd84c09fa3e2875f33c"
[[package]]
name = "fastdivide"
version = "0.4.1"
@@ -4300,7 +4325,7 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b"
[[package]]
name = "greptime-proto"
version = "0.1.0"
source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=c437b55725b7f5224fe9d46db21072b4a682ee4b#c437b55725b7f5224fe9d46db21072b4a682ee4b"
source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=973f49cde88a582fb65755cc572ebcf6fb93ccf7#973f49cde88a582fb65755cc572ebcf6fb93ccf7"
dependencies = [
"prost 0.12.6",
"serde",
@@ -5407,6 +5432,21 @@ dependencies = [
"serde",
]
[[package]]
name = "jsonb"
version = "0.4.1"
source = "git+https://github.com/CookiePieWw/jsonb.git?rev=d0166c130fce903bf6c58643417a3173a6172d31#d0166c130fce903bf6c58643417a3173a6172d31"
dependencies = [
"byteorder",
"fast-float",
"itoa",
"nom",
"ordered-float 4.2.0",
"rand",
"ryu",
"serde_json",
]
[[package]]
name = "jsonpath-rust"
version = "0.5.1"
@@ -7200,9 +7240,11 @@ version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3a8fddc9b68f5b80dae9d6f510b88e02396f006ad48cac349411fbecc80caae4"
dependencies = [
"hex",
"opentelemetry 0.22.0",
"opentelemetry_sdk 0.22.1",
"prost 0.12.6",
"serde",
"tonic 0.11.0",
]
@@ -8060,6 +8102,8 @@ dependencies = [
"chrono",
"fallible-iterator",
"postgres-protocol",
"serde",
"serde_json",
]
[[package]]
@@ -9625,7 +9669,7 @@ source = "git+https://github.com/discord9/RustPython?rev=9ed5137412#9ed51374125b
dependencies = [
"ascii",
"bitflags 1.3.2",
"bstr",
"bstr 0.2.17",
"cfg-if",
"hexf-parse",
"itertools 0.10.5",
@@ -9660,7 +9704,7 @@ version = "0.2.0"
source = "git+https://github.com/discord9/RustPython?rev=9ed5137412#9ed51374125b5f1a9e5cee5dd7e27023b8591f1e"
dependencies = [
"bitflags 1.3.2",
"bstr",
"bstr 0.2.17",
"itertools 0.10.5",
"lz4_flex 0.9.5",
"num-bigint",
@@ -9813,7 +9857,7 @@ dependencies = [
"ascii",
"atty",
"bitflags 1.3.2",
"bstr",
"bstr 0.2.17",
"caseless",
"cfg-if",
"chrono",
@@ -10398,6 +10442,7 @@ dependencies = [
"hyper 0.14.29",
"influxdb_line_protocol",
"itertools 0.10.5",
"jsonb",
"lazy_static",
"mime_guess",
"mysql_async",
@@ -10580,6 +10625,26 @@ version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f27f6278552951f1f2b8cf9da965d10969b2efdea95a6ec47987ab46edfe263a"
[[package]]
name = "similar"
version = "2.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1de1d4f81173b03af4c0cbed3c898f6bff5b870e4a7f5d6f4057d62a7a4b686e"
dependencies = [
"bstr 1.10.0",
"unicode-segmentation",
]
[[package]]
name = "similar-asserts"
version = "1.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cfe85670573cd6f0fa97940f26e7e6601213c3b0555246c24234131f88c5709e"
dependencies = [
"console",
"similar",
]
[[package]]
name = "simple_asn1"
version = "0.6.2"
@@ -10777,6 +10842,7 @@ dependencies = [
"hex",
"iso8601",
"itertools 0.10.5",
"jsonb",
"lazy_static",
"regex",
"serde_json",
@@ -11743,6 +11809,7 @@ dependencies = [
"datanode",
"datatypes",
"dotenv",
"flate2",
"flow",
"frontend",
"futures",
@@ -11766,6 +11833,7 @@ dependencies = [
"serde_json",
"servers",
"session",
"similar-asserts",
"snafu 0.8.4",
"sql",
"sqlx",

Cargo.toml

@@ -120,10 +120,11 @@ etcd-client = { version = "0.13" }
fst = "0.4.7"
futures = "0.3"
futures-util = "0.3"
greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "c437b55725b7f5224fe9d46db21072b4a682ee4b" }
greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "973f49cde88a582fb65755cc572ebcf6fb93ccf7" }
humantime = "2.1"
humantime-serde = "1.1"
itertools = "0.10"
jsonb = { git = "https://github.com/CookiePieWw/jsonb.git", rev = "d0166c130fce903bf6c58643417a3173a6172d31", default-features = false }
lazy_static = "1.4"
meter-core = { git = "https://github.com/GreptimeTeam/greptime-meter.git", rev = "80eb97c24c88af4dd9a86f8bbaf50e741d4eb8cd" }
mockall = "0.11.4"
@@ -135,6 +136,7 @@ opentelemetry-proto = { version = "0.5", features = [
"gen-tonic",
"metrics",
"trace",
"with-serde",
] }
parquet = { version = "51.0.0", default-features = false, features = ["arrow", "async", "object_store"] }
paste = "1.0"
@@ -167,6 +169,7 @@ shadow-rs = "0.31"
smallvec = { version = "1", features = ["serde"] }
snafu = "0.8"
sysinfo = "0.30"
similar-asserts = "1.6.0"
# on branch v0.44.x
sqlparser = { git = "https://github.com/GreptimeTeam/sqlparser-rs.git", rev = "54a267ac89c09b11c0c88934690530807185d3e7", features = [
"visitor",

Makefile

@@ -221,7 +221,7 @@ config-docs: ## Generate configuration documentation from toml files.
docker run --rm \
-v ${PWD}:/greptimedb \
-w /greptimedb/config \
toml2docs/toml2docs:v0.1.1 \
toml2docs/toml2docs:v0.1.3 \
-p '##' \
-t ./config-docs-template.md \
-o ./config.md

README.md

@@ -74,7 +74,7 @@ Our core developers have been building time-series data platforms for years. Bas
* **Compatible with InfluxDB, Prometheus and more protocols**
Widely adopted database protocols and APIs, including MySQL, PostgreSQL, and Prometheus Remote Storage, etc. [Read more](https://docs.greptime.com/user-guide/clients/overview).
Widely adopted database protocols and APIs, including MySQL, PostgreSQL, and Prometheus Remote Storage, etc. [Read more](https://docs.greptime.com/user-guide/protocols/overview).
## Try GreptimeDB

config.md

@@ -14,7 +14,7 @@
| --- | -----| ------- | ----------- |
| `mode` | String | `standalone` | The running mode of the datanode. It can be `standalone` or `distributed`. |
| `enable_telemetry` | Bool | `true` | Enable telemetry to collect anonymous usage data. |
| `default_timezone` | String | `None` | The default timezone of the server. |
| `default_timezone` | String | Unset | The default timezone of the server. |
| `init_regions_in_background` | Bool | `false` | Initialize all regions in the background during the startup.<br/>By default, it provides services after all regions have been initialized. |
| `init_regions_parallelism` | Integer | `16` | Parallelism of initializing regions. |
| `runtime` | -- | -- | The runtime options. |
@@ -29,8 +29,8 @@
| `grpc.runtime_size` | Integer | `8` | The number of server worker threads. |
| `grpc.tls` | -- | -- | gRPC server TLS options, see `mysql.tls` section. |
| `grpc.tls.mode` | String | `disable` | TLS mode. |
| `grpc.tls.cert_path` | String | `None` | Certificate file path. |
| `grpc.tls.key_path` | String | `None` | Private key file path. |
| `grpc.tls.cert_path` | String | Unset | Certificate file path. |
| `grpc.tls.key_path` | String | Unset | Private key file path. |
| `grpc.tls.watch` | Bool | `false` | Watch for Certificate and key file change and auto reload.<br/>For now, gRPC tls config does not support auto reload. |
| `mysql` | -- | -- | MySQL server options. |
| `mysql.enable` | Bool | `true` | Whether to enable. |
@@ -38,8 +38,8 @@
| `mysql.runtime_size` | Integer | `2` | The number of server worker threads. |
| `mysql.tls` | -- | -- | -- |
| `mysql.tls.mode` | String | `disable` | TLS mode, refer to https://www.postgresql.org/docs/current/libpq-ssl.html<br/>- `disable` (default value)<br/>- `prefer`<br/>- `require`<br/>- `verify-ca`<br/>- `verify-full` |
| `mysql.tls.cert_path` | String | `None` | Certificate file path. |
| `mysql.tls.key_path` | String | `None` | Private key file path. |
| `mysql.tls.cert_path` | String | Unset | Certificate file path. |
| `mysql.tls.key_path` | String | Unset | Private key file path. |
| `mysql.tls.watch` | Bool | `false` | Watch for Certificate and key file change and auto reload |
| `postgres` | -- | -- | PostgresSQL server options. |
| `postgres.enable` | Bool | `true` | Whether to enable |
@@ -47,8 +47,8 @@
| `postgres.runtime_size` | Integer | `2` | The number of server worker threads. |
| `postgres.tls` | -- | -- | PostgresSQL server TLS options, see `mysql.tls` section. |
| `postgres.tls.mode` | String | `disable` | TLS mode. |
| `postgres.tls.cert_path` | String | `None` | Certificate file path. |
| `postgres.tls.key_path` | String | `None` | Private key file path. |
| `postgres.tls.cert_path` | String | Unset | Certificate file path. |
| `postgres.tls.key_path` | String | Unset | Private key file path. |
| `postgres.tls.watch` | Bool | `false` | Watch for Certificate and key file change and auto reload |
| `opentsdb` | -- | -- | OpenTSDB protocol options. |
| `opentsdb.enable` | Bool | `true` | Whether to enable OpenTSDB put in HTTP API. |
@@ -59,7 +59,7 @@
| `prom_store.with_metric_engine` | Bool | `true` | Whether to store the data from Prometheus remote write in metric engine. |
| `wal` | -- | -- | The WAL options. |
| `wal.provider` | String | `raft_engine` | The provider of the WAL.<br/>- `raft_engine`: the wal is stored in the local file system by raft-engine.<br/>- `kafka`: it's remote wal that data is stored in Kafka. |
| `wal.dir` | String | `None` | The directory to store the WAL files.<br/>**It's only used when the provider is `raft_engine`**. |
| `wal.dir` | String | Unset | The directory to store the WAL files.<br/>**It's only used when the provider is `raft_engine`**. |
| `wal.file_size` | String | `256MB` | The size of the WAL segment file.<br/>**It's only used when the provider is `raft_engine`**. |
| `wal.purge_threshold` | String | `4GB` | The threshold of the WAL size to trigger a flush.<br/>**It's only used when the provider is `raft_engine`**. |
| `wal.purge_interval` | String | `10m` | The interval to trigger a flush.<br/>**It's only used when the provider is `raft_engine`**. |
@@ -68,6 +68,7 @@
| `wal.enable_log_recycle` | Bool | `true` | Whether to reuse logically truncated log files.<br/>**It's only used when the provider is `raft_engine`**. |
| `wal.prefill_log_files` | Bool | `false` | Whether to pre-create log files on start up.<br/>**It's only used when the provider is `raft_engine`**. |
| `wal.sync_period` | String | `10s` | Duration for fsyncing log files.<br/>**It's only used when the provider is `raft_engine`**. |
| `wal.recovery_parallelism` | Integer | `2` | Parallelism during WAL recovery. |
| `wal.broker_endpoints` | Array | -- | The Kafka broker endpoints.<br/>**It's only used when the provider is `kafka`**. |
| `wal.auto_create_topics` | Bool | `true` | Automatically create topics for WAL.<br/>Set to `true` to automatically create topics for WAL.<br/>Otherwise, use topics named `topic_name_prefix_[0..num_topics)` |
| `wal.num_topics` | Integer | `64` | Number of topics.<br/>**It's only used when the provider is `kafka`**. |
@@ -90,22 +91,22 @@
| `storage` | -- | -- | The data storage options. |
| `storage.data_home` | String | `/tmp/greptimedb/` | The working home directory. |
| `storage.type` | String | `File` | The storage type used to store the data.<br/>- `File`: the data is stored in the local file system.<br/>- `S3`: the data is stored in the S3 object storage.<br/>- `Gcs`: the data is stored in the Google Cloud Storage.<br/>- `Azblob`: the data is stored in the Azure Blob Storage.<br/>- `Oss`: the data is stored in the Aliyun OSS. |
| `storage.cache_path` | String | `None` | Cache configuration for object storage such as 'S3' etc.<br/>The local file cache directory. |
| `storage.cache_capacity` | String | `None` | The local file cache capacity in bytes. |
| `storage.bucket` | String | `None` | The S3 bucket name.<br/>**It's only used when the storage type is `S3`, `Oss` and `Gcs`**. |
| `storage.root` | String | `None` | The S3 data will be stored in the specified prefix, for example, `s3://${bucket}/${root}`.<br/>**It's only used when the storage type is `S3`, `Oss` and `Azblob`**. |
| `storage.access_key_id` | String | `None` | The access key id of the aws account.<br/>It's **highly recommended** to use AWS IAM roles instead of hardcoding the access key id and secret key.<br/>**It's only used when the storage type is `S3` and `Oss`**. |
| `storage.secret_access_key` | String | `None` | The secret access key of the aws account.<br/>It's **highly recommended** to use AWS IAM roles instead of hardcoding the access key id and secret key.<br/>**It's only used when the storage type is `S3`**. |
| `storage.access_key_secret` | String | `None` | The secret access key of the aliyun account.<br/>**It's only used when the storage type is `Oss`**. |
| `storage.account_name` | String | `None` | The account key of the azure account.<br/>**It's only used when the storage type is `Azblob`**. |
| `storage.account_key` | String | `None` | The account key of the azure account.<br/>**It's only used when the storage type is `Azblob`**. |
| `storage.scope` | String | `None` | The scope of the google cloud storage.<br/>**It's only used when the storage type is `Gcs`**. |
| `storage.credential_path` | String | `None` | The credential path of the google cloud storage.<br/>**It's only used when the storage type is `Gcs`**. |
| `storage.credential` | String | `None` | The credential of the google cloud storage.<br/>**It's only used when the storage type is `Gcs`**. |
| `storage.container` | String | `None` | The container of the azure account.<br/>**It's only used when the storage type is `Azblob`**. |
| `storage.sas_token` | String | `None` | The sas token of the azure account.<br/>**It's only used when the storage type is `Azblob`**. |
| `storage.endpoint` | String | `None` | The endpoint of the S3 service.<br/>**It's only used when the storage type is `S3`, `Oss`, `Gcs` and `Azblob`**. |
| `storage.region` | String | `None` | The region of the S3 service.<br/>**It's only used when the storage type is `S3`, `Oss`, `Gcs` and `Azblob`**. |
| `storage.cache_path` | String | Unset | Cache configuration for object storage such as 'S3' etc.<br/>The local file cache directory. |
| `storage.cache_capacity` | String | Unset | The local file cache capacity in bytes. |
| `storage.bucket` | String | Unset | The S3 bucket name.<br/>**It's only used when the storage type is `S3`, `Oss` and `Gcs`**. |
| `storage.root` | String | Unset | The S3 data will be stored in the specified prefix, for example, `s3://${bucket}/${root}`.<br/>**It's only used when the storage type is `S3`, `Oss` and `Azblob`**. |
| `storage.access_key_id` | String | Unset | The access key id of the aws account.<br/>It's **highly recommended** to use AWS IAM roles instead of hardcoding the access key id and secret key.<br/>**It's only used when the storage type is `S3` and `Oss`**. |
| `storage.secret_access_key` | String | Unset | The secret access key of the aws account.<br/>It's **highly recommended** to use AWS IAM roles instead of hardcoding the access key id and secret key.<br/>**It's only used when the storage type is `S3`**. |
| `storage.access_key_secret` | String | Unset | The secret access key of the aliyun account.<br/>**It's only used when the storage type is `Oss`**. |
| `storage.account_name` | String | Unset | The account key of the azure account.<br/>**It's only used when the storage type is `Azblob`**. |
| `storage.account_key` | String | Unset | The account key of the azure account.<br/>**It's only used when the storage type is `Azblob`**. |
| `storage.scope` | String | Unset | The scope of the google cloud storage.<br/>**It's only used when the storage type is `Gcs`**. |
| `storage.credential_path` | String | Unset | The credential path of the google cloud storage.<br/>**It's only used when the storage type is `Gcs`**. |
| `storage.credential` | String | Unset | The credential of the google cloud storage.<br/>**It's only used when the storage type is `Gcs`**. |
| `storage.container` | String | Unset | The container of the azure account.<br/>**It's only used when the storage type is `Azblob`**. |
| `storage.sas_token` | String | Unset | The sas token of the azure account.<br/>**It's only used when the storage type is `Azblob`**. |
| `storage.endpoint` | String | Unset | The endpoint of the S3 service.<br/>**It's only used when the storage type is `S3`, `Oss`, `Gcs` and `Azblob`**. |
| `storage.region` | String | Unset | The region of the S3 service.<br/>**It's only used when the storage type is `S3`, `Oss`, `Gcs` and `Azblob`**. |
| `[[region_engine]]` | -- | -- | The region engine options. You can configure multiple region engines. |
| `region_engine.mito` | -- | -- | The Mito engine options. |
| `region_engine.mito.num_workers` | Integer | `8` | Number of region workers. |
@@ -115,16 +116,16 @@
| `region_engine.mito.compress_manifest` | Bool | `false` | Whether to compress manifest and checkpoint file by gzip (default false). |
| `region_engine.mito.max_background_jobs` | Integer | `4` | Max number of running background jobs |
| `region_engine.mito.auto_flush_interval` | String | `1h` | Interval to auto flush a region if it has not flushed yet. |
| `region_engine.mito.global_write_buffer_size` | String | `1GB` | Global write buffer size for all regions. If not set, it's default to 1/8 of OS memory with a max limitation of 1GB. |
| `region_engine.mito.global_write_buffer_reject_size` | String | `2GB` | Global write buffer size threshold to reject write requests. If not set, it's default to 2 times of `global_write_buffer_size` |
| `region_engine.mito.sst_meta_cache_size` | String | `128MB` | Cache size for SST metadata. Setting it to 0 to disable the cache.<br/>If not set, it's default to 1/32 of OS memory with a max limitation of 128MB. |
| `region_engine.mito.vector_cache_size` | String | `512MB` | Cache size for vectors and arrow arrays. Setting it to 0 to disable the cache.<br/>If not set, it's default to 1/16 of OS memory with a max limitation of 512MB. |
| `region_engine.mito.page_cache_size` | String | `512MB` | Cache size for pages of SST row groups. Setting it to 0 to disable the cache.<br/>If not set, it's default to 1/8 of OS memory. |
| `region_engine.mito.selector_result_cache_size` | String | `512MB` | Cache size for time series selector (e.g. `last_value()`). Setting it to 0 to disable the cache.<br/>If not set, it's default to 1/16 of OS memory with a max limitation of 512MB. |
| `region_engine.mito.global_write_buffer_size` | String | Auto | Global write buffer size for all regions. If not set, it's default to 1/8 of OS memory with a max limitation of 1GB. |
| `region_engine.mito.global_write_buffer_reject_size` | String | Auto | Global write buffer size threshold to reject write requests. If not set, it's default to 2 times of `global_write_buffer_size`. |
| `region_engine.mito.sst_meta_cache_size` | String | Auto | Cache size for SST metadata. Setting it to 0 to disable the cache.<br/>If not set, it's default to 1/32 of OS memory with a max limitation of 128MB. |
| `region_engine.mito.vector_cache_size` | String | Auto | Cache size for vectors and arrow arrays. Setting it to 0 to disable the cache.<br/>If not set, it's default to 1/16 of OS memory with a max limitation of 512MB. |
| `region_engine.mito.page_cache_size` | String | Auto | Cache size for pages of SST row groups. Setting it to 0 to disable the cache.<br/>If not set, it's default to 1/8 of OS memory. |
| `region_engine.mito.selector_result_cache_size` | String | Auto | Cache size for time series selector (e.g. `last_value()`). Setting it to 0 to disable the cache.<br/>If not set, it's default to 1/16 of OS memory with a max limitation of 512MB. |
| `region_engine.mito.enable_experimental_write_cache` | Bool | `false` | Whether to enable the experimental write cache. |
| `region_engine.mito.experimental_write_cache_path` | String | `""` | File system path for write cache, defaults to `{data_home}/write_cache`. |
| `region_engine.mito.experimental_write_cache_size` | String | `512MB` | Capacity for write cache. |
| `region_engine.mito.experimental_write_cache_ttl` | String | `None` | TTL for write cache. |
| `region_engine.mito.experimental_write_cache_ttl` | String | Unset | TTL for write cache. |
| `region_engine.mito.sst_write_buffer_size` | String | `8MB` | Buffer size for SST writing. |
| `region_engine.mito.scan_parallelism` | Integer | `0` | Parallelism to scan a region (default: 1/4 of cpu cores).<br/>- `0`: using the default value (1/4 of cpu cores).<br/>- `1`: scan in current thread.<br/>- `n`: scan in parallelism n. |
| `region_engine.mito.parallel_scan_channel_size` | Integer | `32` | Capacity of the channel to send data from parallel scan tasks to the main task. |
@@ -154,7 +155,7 @@
| `region_engine.file` | -- | -- | Enable the file engine. |
| `logging` | -- | -- | The logging options. |
| `logging.dir` | String | `/tmp/greptimedb/logs` | The directory to store the log files. If set to empty, logs will not be written to files. |
| `logging.level` | String | `None` | The log level. Can be `info`/`debug`/`warn`/`error`. |
| `logging.level` | String | Unset | The log level. Can be `info`/`debug`/`warn`/`error`. |
| `logging.enable_otlp_tracing` | Bool | `false` | Enable OTLP tracing. |
| `logging.otlp_endpoint` | String | `http://localhost:4317` | The OTLP tracing endpoint. |
| `logging.append_stdout` | Bool | `true` | Whether to append logs to stdout. |
@@ -164,13 +165,13 @@
| `export_metrics` | -- | -- | The datanode can export its metrics and send to Prometheus compatible service (e.g. send to `greptimedb` itself) from remote-write API.<br/>This is only used for `greptimedb` to export its own metrics internally. It's different from prometheus scrape. |
| `export_metrics.enable` | Bool | `false` | whether enable export metrics. |
| `export_metrics.write_interval` | String | `30s` | The interval of export metrics. |
| `export_metrics.self_import` | -- | -- | For `standalone` mode, `self_import` is recommend to collect metrics generated by itself<br/>You must create the database before enabling it. |
| `export_metrics.self_import.db` | String | `None` | -- |
| `export_metrics.self_import` | -- | -- | For `standalone` mode, `self_import` is recommended to collect metrics generated by itself<br/>You must create the database before enabling it. |
| `export_metrics.self_import.db` | String | Unset | -- |
| `export_metrics.remote_write` | -- | -- | -- |
| `export_metrics.remote_write.url` | String | `""` | The url the metrics send to. The url example can be: `http://127.0.0.1:4000/v1/prometheus/write?db=greptime_metrics`. |
| `export_metrics.remote_write.headers` | InlineTable | -- | HTTP headers of Prometheus remote-write carry. |
| `tracing` | -- | -- | The tracing options. Only effect when compiled with `tokio-console` feature. |
| `tracing.tokio_console_addr` | String | `None` | The tokio console address. |
| `tracing.tokio_console_addr` | String | Unset | The tokio console address. |
## Distributed Mode
@@ -179,7 +180,7 @@
| Key | Type | Default | Descriptions |
| --- | -----| ------- | ----------- |
| `default_timezone` | String | `None` | The default timezone of the server. |
| `default_timezone` | String | Unset | The default timezone of the server. |
| `runtime` | -- | -- | The runtime options. |
| `runtime.global_rt_size` | Integer | `8` | The number of threads to execute the runtime for global read operations. |
| `runtime.compact_rt_size` | Integer | `4` | The number of threads to execute the runtime for global write operations. |
@@ -196,8 +197,8 @@
| `grpc.runtime_size` | Integer | `8` | The number of server worker threads. |
| `grpc.tls` | -- | -- | gRPC server TLS options, see `mysql.tls` section. |
| `grpc.tls.mode` | String | `disable` | TLS mode. |
| `grpc.tls.cert_path` | String | `None` | Certificate file path. |
| `grpc.tls.key_path` | String | `None` | Private key file path. |
| `grpc.tls.cert_path` | String | Unset | Certificate file path. |
| `grpc.tls.key_path` | String | Unset | Private key file path. |
| `grpc.tls.watch` | Bool | `false` | Watch for Certificate and key file change and auto reload.<br/>For now, gRPC tls config does not support auto reload. |
| `mysql` | -- | -- | MySQL server options. |
| `mysql.enable` | Bool | `true` | Whether to enable. |
@@ -205,8 +206,8 @@
| `mysql.runtime_size` | Integer | `2` | The number of server worker threads. |
| `mysql.tls` | -- | -- | -- |
| `mysql.tls.mode` | String | `disable` | TLS mode, refer to https://www.postgresql.org/docs/current/libpq-ssl.html<br/>- `disable` (default value)<br/>- `prefer`<br/>- `require`<br/>- `verify-ca`<br/>- `verify-full` |
| `mysql.tls.cert_path` | String | `None` | Certificate file path. |
| `mysql.tls.key_path` | String | `None` | Private key file path. |
| `mysql.tls.cert_path` | String | Unset | Certificate file path. |
| `mysql.tls.key_path` | String | Unset | Private key file path. |
| `mysql.tls.watch` | Bool | `false` | Watch for Certificate and key file change and auto reload |
| `postgres` | -- | -- | PostgresSQL server options. |
| `postgres.enable` | Bool | `true` | Whether to enable |
@@ -214,8 +215,8 @@
| `postgres.runtime_size` | Integer | `2` | The number of server worker threads. |
| `postgres.tls` | -- | -- | PostgresSQL server TLS options, see `mysql.tls` section. |
| `postgres.tls.mode` | String | `disable` | TLS mode. |
| `postgres.tls.cert_path` | String | `None` | Certificate file path. |
| `postgres.tls.key_path` | String | `None` | Private key file path. |
| `postgres.tls.cert_path` | String | Unset | Certificate file path. |
| `postgres.tls.key_path` | String | Unset | Private key file path. |
| `postgres.tls.watch` | Bool | `false` | Watch for Certificate and key file change and auto reload |
| `opentsdb` | -- | -- | OpenTSDB protocol options. |
| `opentsdb.enable` | Bool | `true` | Whether to enable OpenTSDB put in HTTP API. |
@@ -240,7 +241,7 @@
| `datanode.client.tcp_nodelay` | Bool | `true` | -- |
| `logging` | -- | -- | The logging options. |
| `logging.dir` | String | `/tmp/greptimedb/logs` | The directory to store the log files. If set to empty, logs will not be written to files. |
| `logging.level` | String | `None` | The log level. Can be `info`/`debug`/`warn`/`error`. |
| `logging.level` | String | Unset | The log level. Can be `info`/`debug`/`warn`/`error`. |
| `logging.enable_otlp_tracing` | Bool | `false` | Enable OTLP tracing. |
| `logging.otlp_endpoint` | String | `http://localhost:4317` | The OTLP tracing endpoint. |
| `logging.append_stdout` | Bool | `true` | Whether to append logs to stdout. |
@@ -251,12 +252,12 @@
| `export_metrics.enable` | Bool | `false` | whether enable export metrics. |
| `export_metrics.write_interval` | String | `30s` | The interval of export metrics. |
| `export_metrics.self_import` | -- | -- | For `standalone` mode, `self_import` is recommend to collect metrics generated by itself<br/>You must create the database before enabling it. |
| `export_metrics.self_import.db` | String | `None` | -- |
| `export_metrics.self_import.db` | String | Unset | -- |
| `export_metrics.remote_write` | -- | -- | -- |
| `export_metrics.remote_write.url` | String | `""` | The url the metrics send to. The url example can be: `http://127.0.0.1:4000/v1/prometheus/write?db=greptime_metrics`. |
| `export_metrics.remote_write.headers` | InlineTable | -- | HTTP headers of Prometheus remote-write carry. |
| `tracing` | -- | -- | The tracing options. Only effect when compiled with `tokio-console` feature. |
| `tracing.tokio_console_addr` | String | `None` | The tokio console address. |
| `tracing.tokio_console_addr` | String | Unset | The tokio console address. |
### Metasrv
@@ -305,7 +306,7 @@
| `wal.backoff_deadline` | String | `5mins` | Stop reconnecting if the total wait time reaches the deadline. If this config is missing, the reconnecting won't terminate. |
| `logging` | -- | -- | The logging options. |
| `logging.dir` | String | `/tmp/greptimedb/logs` | The directory to store the log files. If set to empty, logs will not be written to files. |
| `logging.level` | String | `None` | The log level. Can be `info`/`debug`/`warn`/`error`. |
| `logging.level` | String | Unset | The log level. Can be `info`/`debug`/`warn`/`error`. |
| `logging.enable_otlp_tracing` | Bool | `false` | Enable OTLP tracing. |
| `logging.otlp_endpoint` | String | `http://localhost:4317` | The OTLP tracing endpoint. |
| `logging.append_stdout` | Bool | `true` | Whether to append logs to stdout. |
@@ -316,12 +317,12 @@
| `export_metrics.enable` | Bool | `false` | whether enable export metrics. |
| `export_metrics.write_interval` | String | `30s` | The interval of export metrics. |
| `export_metrics.self_import` | -- | -- | For `standalone` mode, `self_import` is recommend to collect metrics generated by itself<br/>You must create the database before enabling it. |
| `export_metrics.self_import.db` | String | `None` | -- |
| `export_metrics.self_import.db` | String | Unset | -- |
| `export_metrics.remote_write` | -- | -- | -- |
| `export_metrics.remote_write.url` | String | `""` | The url the metrics send to. The url example can be: `http://127.0.0.1:4000/v1/prometheus/write?db=greptime_metrics`. |
| `export_metrics.remote_write.headers` | InlineTable | -- | HTTP headers of Prometheus remote-write carry. |
| `tracing` | -- | -- | The tracing options. Only effect when compiled with `tokio-console` feature. |
| `tracing.tokio_console_addr` | String | `None` | The tokio console address. |
| `tracing.tokio_console_addr` | String | Unset | The tokio console address. |
### Datanode
@@ -329,16 +330,16 @@
| Key | Type | Default | Descriptions |
| --- | -----| ------- | ----------- |
| `mode` | String | `standalone` | The running mode of the datanode. It can be `standalone` or `distributed`. |
| `node_id` | Integer | `None` | The datanode identifier and should be unique in the cluster. |
| `node_id` | Integer | Unset | The datanode identifier and should be unique in the cluster. |
| `require_lease_before_startup` | Bool | `false` | Start services after regions have obtained leases.<br/>It will block the datanode start if it can't receive leases in the heartbeat from metasrv. |
| `init_regions_in_background` | Bool | `false` | Initialize all regions in the background during the startup.<br/>By default, it provides services after all regions have been initialized. |
| `enable_telemetry` | Bool | `true` | Enable telemetry to collect anonymous usage data. |
| `init_regions_parallelism` | Integer | `16` | Parallelism of initializing regions. |
| `rpc_addr` | String | `None` | Deprecated, use `grpc.addr` instead. |
| `rpc_hostname` | String | `None` | Deprecated, use `grpc.hostname` instead. |
| `rpc_runtime_size` | Integer | `None` | Deprecated, use `grpc.runtime_size` instead. |
| `rpc_max_recv_message_size` | String | `None` | Deprecated, use `grpc.rpc_max_recv_message_size` instead. |
| `rpc_max_send_message_size` | String | `None` | Deprecated, use `grpc.rpc_max_send_message_size` instead. |
| `rpc_addr` | String | Unset | Deprecated, use `grpc.addr` instead. |
| `rpc_hostname` | String | Unset | Deprecated, use `grpc.hostname` instead. |
| `rpc_runtime_size` | Integer | Unset | Deprecated, use `grpc.runtime_size` instead. |
| `rpc_max_recv_message_size` | String | Unset | Deprecated, use `grpc.rpc_max_recv_message_size` instead. |
| `rpc_max_send_message_size` | String | Unset | Deprecated, use `grpc.rpc_max_send_message_size` instead. |
| `http` | -- | -- | The HTTP server options. |
| `http.addr` | String | `127.0.0.1:4000` | The address to bind the HTTP server. |
| `http.timeout` | String | `30s` | HTTP request timeout. Set to 0 to disable timeout. |
@@ -351,8 +352,8 @@
| `grpc.max_send_message_size` | String | `512MB` | The maximum send message size for gRPC server. |
| `grpc.tls` | -- | -- | gRPC server TLS options, see `mysql.tls` section. |
| `grpc.tls.mode` | String | `disable` | TLS mode. |
| `grpc.tls.cert_path` | String | `None` | Certificate file path. |
| `grpc.tls.key_path` | String | `None` | Private key file path. |
| `grpc.tls.cert_path` | String | Unset | Certificate file path. |
| `grpc.tls.key_path` | String | Unset | Private key file path. |
| `grpc.tls.watch` | Bool | `false` | Watch for Certificate and key file change and auto reload.<br/>For now, gRPC tls config does not support auto reload. |
| `runtime` | -- | -- | The runtime options. |
| `runtime.global_rt_size` | Integer | `8` | The number of threads to execute the runtime for global read operations. |
@@ -372,7 +373,7 @@
| `meta_client.metadata_cache_tti` | String | `5m` | -- |
| `wal` | -- | -- | The WAL options. |
| `wal.provider` | String | `raft_engine` | The provider of the WAL.<br/>- `raft_engine`: the wal is stored in the local file system by raft-engine.<br/>- `kafka`: it's remote wal that data is stored in Kafka. |
| `wal.dir` | String | `None` | The directory to store the WAL files.<br/>**It's only used when the provider is `raft_engine`**. |
| `wal.dir` | String | Unset | The directory to store the WAL files.<br/>**It's only used when the provider is `raft_engine`**. |
| `wal.file_size` | String | `256MB` | The size of the WAL segment file.<br/>**It's only used when the provider is `raft_engine`**. |
| `wal.purge_threshold` | String | `4GB` | The threshold of the WAL size to trigger a flush.<br/>**It's only used when the provider is `raft_engine`**. |
| `wal.purge_interval` | String | `10m` | The interval to trigger a flush.<br/>**It's only used when the provider is `raft_engine`**. |
@@ -381,6 +382,7 @@
| `wal.enable_log_recycle` | Bool | `true` | Whether to reuse logically truncated log files.<br/>**It's only used when the provider is `raft_engine`**. |
| `wal.prefill_log_files` | Bool | `false` | Whether to pre-create log files on start up.<br/>**It's only used when the provider is `raft_engine`**. |
| `wal.sync_period` | String | `10s` | Duration for fsyncing log files.<br/>**It's only used when the provider is `raft_engine`**. |
| `wal.recovery_parallelism` | Integer | `2` | Parallelism during WAL recovery. |
| `wal.broker_endpoints` | Array | -- | The Kafka broker endpoints.<br/>**It's only used when the provider is `kafka`**. |
| `wal.max_batch_bytes` | String | `1MB` | The max size of a single producer batch.<br/>Warning: Kafka has a default limit of 1MB per message in a topic.<br/>**It's only used when the provider is `kafka`**. |
| `wal.consumer_wait_timeout` | String | `100ms` | The consumer wait timeout.<br/>**It's only used when the provider is `kafka`**. |
@@ -393,22 +395,22 @@
| `storage` | -- | -- | The data storage options. |
| `storage.data_home` | String | `/tmp/greptimedb/` | The working home directory. |
| `storage.type` | String | `File` | The storage type used to store the data.<br/>- `File`: the data is stored in the local file system.<br/>- `S3`: the data is stored in the S3 object storage.<br/>- `Gcs`: the data is stored in the Google Cloud Storage.<br/>- `Azblob`: the data is stored in the Azure Blob Storage.<br/>- `Oss`: the data is stored in the Aliyun OSS. |
| `storage.cache_path` | String | `None` | Cache configuration for object storage such as 'S3' etc.<br/>The local file cache directory. |
| `storage.cache_capacity` | String | `None` | The local file cache capacity in bytes. |
| `storage.bucket` | String | `None` | The S3 bucket name.<br/>**It's only used when the storage type is `S3`, `Oss` and `Gcs`**. |
| `storage.root` | String | `None` | The S3 data will be stored in the specified prefix, for example, `s3://${bucket}/${root}`.<br/>**It's only used when the storage type is `S3`, `Oss` and `Azblob`**. |
| `storage.access_key_id` | String | `None` | The access key id of the aws account.<br/>It's **highly recommended** to use AWS IAM roles instead of hardcoding the access key id and secret key.<br/>**It's only used when the storage type is `S3` and `Oss`**. |
| `storage.secret_access_key` | String | `None` | The secret access key of the aws account.<br/>It's **highly recommended** to use AWS IAM roles instead of hardcoding the access key id and secret key.<br/>**It's only used when the storage type is `S3`**. |
| `storage.access_key_secret` | String | `None` | The secret access key of the aliyun account.<br/>**It's only used when the storage type is `Oss`**. |
| `storage.account_name` | String | `None` | The account key of the azure account.<br/>**It's only used when the storage type is `Azblob`**. |
| `storage.account_key` | String | `None` | The account key of the azure account.<br/>**It's only used when the storage type is `Azblob`**. |
| `storage.scope` | String | `None` | The scope of the google cloud storage.<br/>**It's only used when the storage type is `Gcs`**. |
| `storage.credential_path` | String | `None` | The credential path of the google cloud storage.<br/>**It's only used when the storage type is `Gcs`**. |
| `storage.credential` | String | `None` | The credential of the google cloud storage.<br/>**It's only used when the storage type is `Gcs`**. |
| `storage.container` | String | `None` | The container of the azure account.<br/>**It's only used when the storage type is `Azblob`**. |
| `storage.sas_token` | String | `None` | The sas token of the azure account.<br/>**It's only used when the storage type is `Azblob`**. |
| `storage.endpoint` | String | `None` | The endpoint of the S3 service.<br/>**It's only used when the storage type is `S3`, `Oss`, `Gcs` and `Azblob`**. |
| `storage.region` | String | `None` | The region of the S3 service.<br/>**It's only used when the storage type is `S3`, `Oss`, `Gcs` and `Azblob`**. |
| `storage.cache_path` | String | Unset | Cache configuration for object storage such as 'S3' etc.<br/>The local file cache directory. |
| `storage.cache_capacity` | String | Unset | The local file cache capacity in bytes. |
| `storage.bucket` | String | Unset | The S3 bucket name.<br/>**It's only used when the storage type is `S3`, `Oss` and `Gcs`**. |
| `storage.root` | String | Unset | The S3 data will be stored in the specified prefix, for example, `s3://${bucket}/${root}`.<br/>**It's only used when the storage type is `S3`, `Oss` and `Azblob`**. |
| `storage.access_key_id` | String | Unset | The access key id of the aws account.<br/>It's **highly recommended** to use AWS IAM roles instead of hardcoding the access key id and secret key.<br/>**It's only used when the storage type is `S3` and `Oss`**. |
| `storage.secret_access_key` | String | Unset | The secret access key of the aws account.<br/>It's **highly recommended** to use AWS IAM roles instead of hardcoding the access key id and secret key.<br/>**It's only used when the storage type is `S3`**. |
| `storage.access_key_secret` | String | Unset | The secret access key of the aliyun account.<br/>**It's only used when the storage type is `Oss`**. |
| `storage.account_name` | String | Unset | The account key of the azure account.<br/>**It's only used when the storage type is `Azblob`**. |
| `storage.account_key` | String | Unset | The account key of the azure account.<br/>**It's only used when the storage type is `Azblob`**. |
| `storage.scope` | String | Unset | The scope of the google cloud storage.<br/>**It's only used when the storage type is `Gcs`**. |
| `storage.credential_path` | String | Unset | The credential path of the google cloud storage.<br/>**It's only used when the storage type is `Gcs`**. |
| `storage.credential` | String | Unset | The credential of the google cloud storage.<br/>**It's only used when the storage type is `Gcs`**. |
| `storage.container` | String | Unset | The container of the azure account.<br/>**It's only used when the storage type is `Azblob`**. |
| `storage.sas_token` | String | Unset | The sas token of the azure account.<br/>**It's only used when the storage type is `Azblob`**. |
| `storage.endpoint` | String | Unset | The endpoint of the S3 service.<br/>**It's only used when the storage type is `S3`, `Oss`, `Gcs` and `Azblob`**. |
| `storage.region` | String | Unset | The region of the S3 service.<br/>**It's only used when the storage type is `S3`, `Oss`, `Gcs` and `Azblob`**. |
| `[[region_engine]]` | -- | -- | The region engine options. You can configure multiple region engines. |
| `region_engine.mito` | -- | -- | The Mito engine options. |
| `region_engine.mito.num_workers` | Integer | `8` | Number of region workers. |
@@ -418,16 +420,16 @@
| `region_engine.mito.compress_manifest` | Bool | `false` | Whether to compress manifest and checkpoint file by gzip (default false). |
| `region_engine.mito.max_background_jobs` | Integer | `4` | Max number of running background jobs |
| `region_engine.mito.auto_flush_interval` | String | `1h` | Interval to auto flush a region if it has not flushed yet. |
| `region_engine.mito.global_write_buffer_size` | String | `1GB` | Global write buffer size for all regions. If not set, it's default to 1/8 of OS memory with a max limitation of 1GB. |
| `region_engine.mito.global_write_buffer_reject_size` | String | `2GB` | Global write buffer size threshold to reject write requests. If not set, it's default to 2 times of `global_write_buffer_size` |
| `region_engine.mito.sst_meta_cache_size` | String | `128MB` | Cache size for SST metadata. Setting it to 0 to disable the cache.<br/>If not set, it's default to 1/32 of OS memory with a max limitation of 128MB. |
| `region_engine.mito.vector_cache_size` | String | `512MB` | Cache size for vectors and arrow arrays. Setting it to 0 to disable the cache.<br/>If not set, it's default to 1/16 of OS memory with a max limitation of 512MB. |
| `region_engine.mito.page_cache_size` | String | `512MB` | Cache size for pages of SST row groups. Setting it to 0 to disable the cache.<br/>If not set, it's default to 1/8 of OS memory. |
| `region_engine.mito.selector_result_cache_size` | String | `512MB` | Cache size for time series selector (e.g. `last_value()`). Setting it to 0 to disable the cache.<br/>If not set, it's default to 1/16 of OS memory with a max limitation of 512MB. |
| `region_engine.mito.global_write_buffer_size` | String | Auto | Global write buffer size for all regions. If not set, it's default to 1/8 of OS memory with a max limitation of 1GB. |
| `region_engine.mito.global_write_buffer_reject_size` | String | Auto | Global write buffer size threshold to reject write requests. If not set, it's default to 2 times of `global_write_buffer_size` |
| `region_engine.mito.sst_meta_cache_size` | String | Auto | Cache size for SST metadata. Setting it to 0 to disable the cache.<br/>If not set, it's default to 1/32 of OS memory with a max limitation of 128MB. |
| `region_engine.mito.vector_cache_size` | String | Auto | Cache size for vectors and arrow arrays. Setting it to 0 to disable the cache.<br/>If not set, it's default to 1/16 of OS memory with a max limitation of 512MB. |
| `region_engine.mito.page_cache_size` | String | Auto | Cache size for pages of SST row groups. Setting it to 0 to disable the cache.<br/>If not set, it's default to 1/8 of OS memory. |
| `region_engine.mito.selector_result_cache_size` | String | Auto | Cache size for time series selector (e.g. `last_value()`). Setting it to 0 to disable the cache.<br/>If not set, it's default to 1/16 of OS memory with a max limitation of 512MB. |
| `region_engine.mito.enable_experimental_write_cache` | Bool | `false` | Whether to enable the experimental write cache. |
| `region_engine.mito.experimental_write_cache_path` | String | `""` | File system path for write cache, defaults to `{data_home}/write_cache`. |
| `region_engine.mito.experimental_write_cache_size` | String | `512MB` | Capacity for write cache. |
| `region_engine.mito.experimental_write_cache_ttl` | String | `None` | TTL for write cache. |
| `region_engine.mito.experimental_write_cache_ttl` | String | Unset | TTL for write cache. |
| `region_engine.mito.sst_write_buffer_size` | String | `8MB` | Buffer size for SST writing. |
| `region_engine.mito.scan_parallelism` | Integer | `0` | Parallelism to scan a region (default: 1/4 of cpu cores).<br/>- `0`: using the default value (1/4 of cpu cores).<br/>- `1`: scan in current thread.<br/>- `n`: scan in parallelism n. |
| `region_engine.mito.parallel_scan_channel_size` | Integer | `32` | Capacity of the channel to send data from parallel scan tasks to the main task. |
@@ -455,7 +457,7 @@
| `region_engine.file` | -- | -- | Enable the file engine. |
| `logging` | -- | -- | The logging options. |
| `logging.dir` | String | `/tmp/greptimedb/logs` | The directory to store the log files. If set to empty, logs will not be written to files. |
| `logging.level` | String | `None` | The log level. Can be `info`/`debug`/`warn`/`error`. |
| `logging.level` | String | Unset | The log level. Can be `info`/`debug`/`warn`/`error`. |
| `logging.enable_otlp_tracing` | Bool | `false` | Enable OTLP tracing. |
| `logging.otlp_endpoint` | String | `http://localhost:4317` | The OTLP tracing endpoint. |
| `logging.append_stdout` | Bool | `true` | Whether to append logs to stdout. |
@@ -466,12 +468,12 @@
| `export_metrics.enable` | Bool | `false` | Whether to enable exporting metrics. |
| `export_metrics.write_interval` | String | `30s` | The interval at which metrics are exported. |
| `export_metrics.self_import` | -- | -- | For `standalone` mode, `self_import` is recommended for collecting the metrics generated by the instance itself.<br/>You must create the database before enabling it. |
| `export_metrics.self_import.db` | String | `None` | -- |
| `export_metrics.self_import.db` | String | Unset | -- |
| `export_metrics.remote_write` | -- | -- | -- |
| `export_metrics.remote_write.url` | String | `""` | The URL to send metrics to, for example `http://127.0.0.1:4000/v1/prometheus/write?db=greptime_metrics`. |
| `export_metrics.remote_write.headers` | InlineTable | -- | HTTP headers carried by Prometheus remote-write requests. |
| `tracing` | -- | -- | The tracing options. Only takes effect when compiled with the `tokio-console` feature. |
| `tracing.tokio_console_addr` | String | `None` | The tokio console address. |
| `tracing.tokio_console_addr` | String | Unset | The tokio console address. |
### Flownode
@@ -479,7 +481,7 @@
| Key | Type | Default | Descriptions |
| --- | -----| ------- | ----------- |
| `mode` | String | `distributed` | The running mode of the flownode. It can be `standalone` or `distributed`. |
| `node_id` | Integer | `None` | The flownode identifier; it should be unique in the cluster. |
| `node_id` | Integer | Unset | The flownode identifier; it should be unique in the cluster. |
| `grpc` | -- | -- | The gRPC server options. |
| `grpc.addr` | String | `127.0.0.1:6800` | The address to bind the gRPC server. |
| `grpc.hostname` | String | `127.0.0.1` | The hostname advertised to the metasrv,<br/>and used for connections from outside the host |
@@ -501,7 +503,7 @@
| `heartbeat.retry_interval` | String | `3s` | Interval for retrying to send heartbeat messages to the metasrv. |
| `logging` | -- | -- | The logging options. |
| `logging.dir` | String | `/tmp/greptimedb/logs` | The directory to store the log files. If set to empty, logs will not be written to files. |
| `logging.level` | String | `None` | The log level. Can be `info`/`debug`/`warn`/`error`. |
| `logging.level` | String | Unset | The log level. Can be `info`/`debug`/`warn`/`error`. |
| `logging.enable_otlp_tracing` | Bool | `false` | Enable OTLP tracing. |
| `logging.otlp_endpoint` | String | `http://localhost:4317` | The OTLP tracing endpoint. |
| `logging.append_stdout` | Bool | `true` | Whether to append logs to stdout. |
@@ -509,4 +511,4 @@
| `logging.tracing_sample_ratio` | -- | -- | The percentage of traces that will be sampled and exported.<br/>Valid range `[0, 1]`: 1 means all traces are sampled, 0 means no traces are sampled; the default value is 1.<br/>Ratios > 1 are treated as 1 and ratios < 0 are treated as 0. |
| `logging.tracing_sample_ratio.default_ratio` | Float | `1.0` | -- |
| `tracing` | -- | -- | The tracing options. Only takes effect when compiled with the `tokio-console` feature. |
| `tracing.tokio_console_addr` | String | `None` | The tokio console address. |
| `tracing.tokio_console_addr` | String | Unset | The tokio console address. |

View File

@@ -2,7 +2,7 @@
mode = "standalone"
## The datanode identifier and should be unique in the cluster.
## +toml2docs:none-default
## @toml2docs:none-default
node_id = 42
## Start services after regions have obtained leases.
@@ -20,23 +20,23 @@ enable_telemetry = true
init_regions_parallelism = 16
## Deprecated, use `grpc.addr` instead.
## +toml2docs:none-default
## @toml2docs:none-default
rpc_addr = "127.0.0.1:3001"
## Deprecated, use `grpc.hostname` instead.
## +toml2docs:none-default
## @toml2docs:none-default
rpc_hostname = "127.0.0.1"
## Deprecated, use `grpc.runtime_size` instead.
## +toml2docs:none-default
## @toml2docs:none-default
rpc_runtime_size = 8
## Deprecated, use `grpc.rpc_max_recv_message_size` instead.
## +toml2docs:none-default
## @toml2docs:none-default
rpc_max_recv_message_size = "512MB"
## Deprecated, use `grpc.rpc_max_send_message_size` instead.
## +toml2docs:none-default
## @toml2docs:none-default
rpc_max_send_message_size = "512MB"
@@ -71,11 +71,11 @@ max_send_message_size = "512MB"
mode = "disable"
## Certificate file path.
## +toml2docs:none-default
## @toml2docs:none-default
cert_path = ""
## Private key file path.
## +toml2docs:none-default
## @toml2docs:none-default
key_path = ""
## Watch for Certificate and key file change and auto reload.
@@ -83,11 +83,11 @@ key_path = ""
watch = false
## The runtime options.
[runtime]
#+ [runtime]
## The number of threads to execute the runtime for global read operations.
global_rt_size = 8
#+ global_rt_size = 8
## The number of threads to execute the runtime for global write operations.
compact_rt_size = 4
#+ compact_rt_size = 4
## The heartbeat options.
[heartbeat]
@@ -135,7 +135,7 @@ provider = "raft_engine"
## The directory to store the WAL files.
## **It's only used when the provider is `raft_engine`**.
## +toml2docs:none-default
## @toml2docs:none-default
dir = "/tmp/greptimedb/wal"
## The size of the WAL segment file.
@@ -170,6 +170,9 @@ prefill_log_files = false
## **It's only used when the provider is `raft_engine`**.
sync_period = "10s"
## Parallelism during WAL recovery.
recovery_parallelism = 2
## The Kafka broker endpoints.
## **It's only used when the provider is `kafka`**.
broker_endpoints = ["127.0.0.1:9092"]
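The `provider` key decides which of the surrounding options apply (`raft_engine` vs. `kafka`). As an illustration only, here is a hypothetical, heavily reduced sketch of how such a provider-tagged section can be modeled with `serde`'s internally tagged enums and the `toml` crate; the real `DatanodeWalConfig` in `common-wal` is richer than this.

```rust
use serde::Deserialize;

/// Hypothetical, heavily reduced mirror of the WAL section: the `provider`
/// key selects which variant (and therefore which options) applies.
#[derive(Debug, Deserialize)]
#[serde(tag = "provider", rename_all = "snake_case")]
enum WalConfig {
    RaftEngine {
        dir: Option<String>,
        sync_period: Option<String>,
        /// Unset means "use the engine default" (2 in the example above).
        recovery_parallelism: Option<usize>,
    },
    Kafka {
        broker_endpoints: Vec<String>,
    },
}

fn main() {
    let raft = r#"
        provider = "raft_engine"
        dir = "/tmp/greptimedb/wal"
        sync_period = "10s"
    "#;
    let kafka = r#"
        provider = "kafka"
        broker_endpoints = ["127.0.0.1:9092"]
    "#;
    let a: WalConfig = toml::from_str(raft).expect("valid TOML");
    let b: WalConfig = toml::from_str(kafka).expect("valid TOML");
    println!("{a:?}\n{b:?}");
}
```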
@@ -279,83 +282,83 @@ type = "File"
## Cache configuration for object storage such as 'S3' etc.
## The local file cache directory.
## +toml2docs:none-default
## @toml2docs:none-default
cache_path = "/path/local_cache"
## The local file cache capacity in bytes.
## +toml2docs:none-default
## @toml2docs:none-default
cache_capacity = "256MB"
## The S3 bucket name.
## **It's only used when the storage type is `S3`, `Oss` and `Gcs`**.
## +toml2docs:none-default
## @toml2docs:none-default
bucket = "greptimedb"
## The S3 data will be stored in the specified prefix, for example, `s3://${bucket}/${root}`.
## **It's only used when the storage type is `S3`, `Oss` and `Azblob`**.
## +toml2docs:none-default
## @toml2docs:none-default
root = "greptimedb"
## The access key id of the aws account.
## It's **highly recommended** to use AWS IAM roles instead of hardcoding the access key id and secret key.
## **It's only used when the storage type is `S3` and `Oss`**.
## +toml2docs:none-default
## @toml2docs:none-default
access_key_id = "test"
## The secret access key of the aws account.
## It's **highly recommended** to use AWS IAM roles instead of hardcoding the access key id and secret key.
## **It's only used when the storage type is `S3`**.
## +toml2docs:none-default
## @toml2docs:none-default
secret_access_key = "test"
## The secret access key of the aliyun account.
## **It's only used when the storage type is `Oss`**.
## +toml2docs:none-default
## @toml2docs:none-default
access_key_secret = "test"
## The account key of the azure account.
## **It's only used when the storage type is `Azblob`**.
## +toml2docs:none-default
## @toml2docs:none-default
account_name = "test"
## The account key of the azure account.
## **It's only used when the storage type is `Azblob`**.
## +toml2docs:none-default
## @toml2docs:none-default
account_key = "test"
## The scope of the google cloud storage.
## **It's only used when the storage type is `Gcs`**.
## +toml2docs:none-default
## @toml2docs:none-default
scope = "test"
## The credential path of the google cloud storage.
## **It's only used when the storage type is `Gcs`**.
## +toml2docs:none-default
## @toml2docs:none-default
credential_path = "test"
## The credential of the google cloud storage.
## **It's only used when the storage type is `Gcs`**.
## +toml2docs:none-default
## @toml2docs:none-default
credential = "base64-credential"
## The container of the azure account.
## **It's only used when the storage type is `Azblob`**.
## +toml2docs:none-default
## @toml2docs:none-default
container = "greptimedb"
## The sas token of the azure account.
## **It's only used when the storage type is `Azblob`**.
## +toml2docs:none-default
## @toml2docs:none-default
sas_token = ""
## The endpoint of the S3 service.
## **It's only used when the storage type is `S3`, `Oss`, `Gcs` and `Azblob`**.
## +toml2docs:none-default
## @toml2docs:none-default
endpoint = "https://s3.amazonaws.com"
## The region of the S3 service.
## **It's only used when the storage type is `S3`, `Oss`, `Gcs` and `Azblob`**.
## +toml2docs:none-default
## @toml2docs:none-default
region = "us-west-2"
# Custom storage options
@@ -385,7 +388,7 @@ region = "us-west-2"
[region_engine.mito]
## Number of region workers.
num_workers = 8
#+ num_workers = 8
## Request channel size of each worker.
worker_channel_size = 128
@@ -406,26 +409,32 @@ max_background_jobs = 4
auto_flush_interval = "1h"
## Global write buffer size for all regions. If not set, it's default to 1/8 of OS memory with a max limitation of 1GB.
global_write_buffer_size = "1GB"
## @toml2docs:none-default="Auto"
#+ global_write_buffer_size = "1GB"
## Global write buffer size threshold to reject write requests. If not set, it's default to 2 times of `global_write_buffer_size`
global_write_buffer_reject_size = "2GB"
## @toml2docs:none-default="Auto"
#+ global_write_buffer_reject_size = "2GB"
## Cache size for SST metadata. Setting it to 0 to disable the cache.
## If not set, it's default to 1/32 of OS memory with a max limitation of 128MB.
sst_meta_cache_size = "128MB"
## @toml2docs:none-default="Auto"
#+ sst_meta_cache_size = "128MB"
## Cache size for vectors and arrow arrays. Setting it to 0 to disable the cache.
## If not set, it's default to 1/16 of OS memory with a max limitation of 512MB.
vector_cache_size = "512MB"
## @toml2docs:none-default="Auto"
#+ vector_cache_size = "512MB"
## Cache size for pages of SST row groups. Setting it to 0 to disable the cache.
## If not set, it's default to 1/8 of OS memory.
page_cache_size = "512MB"
## @toml2docs:none-default="Auto"
#+ page_cache_size = "512MB"
## Cache size for time series selector (e.g. `last_value()`). Setting it to 0 to disable the cache.
## If not set, it's default to 1/16 of OS memory with a max limitation of 512MB.
selector_result_cache_size = "512MB"
## @toml2docs:none-default="Auto"
#+ selector_result_cache_size = "512MB"
## Whether to enable the experimental write cache.
enable_experimental_write_cache = false
@@ -437,7 +446,7 @@ experimental_write_cache_path = ""
experimental_write_cache_size = "512MB"
## TTL for write cache.
## +toml2docs:none-default
## @toml2docs:none-default
experimental_write_cache_ttl = "8h"
## Buffer size for SST writing.
@@ -553,7 +562,7 @@ fork_dictionary_bytes = "1GiB"
dir = "/tmp/greptimedb/logs"
## The log level. Can be `info`/`debug`/`warn`/`error`.
## +toml2docs:none-default
## @toml2docs:none-default
level = "info"
## Enable OTLP tracing.
@@ -587,7 +596,7 @@ write_interval = "30s"
## For `standalone` mode, `self_import` is recommend to collect metrics generated by itself
## You must create the database before enabling it.
[export_metrics.self_import]
## +toml2docs:none-default
## @toml2docs:none-default
db = "greptime_metrics"
[export_metrics.remote_write]
@@ -600,5 +609,5 @@ headers = { }
## The tracing options. Only effect when compiled with `tokio-console` feature.
[tracing]
## The tokio console address.
## +toml2docs:none-default
## @toml2docs:none-default
tokio_console_addr = "127.0.0.1"

View File

@@ -2,7 +2,7 @@
mode = "distributed"
## The flownode identifier and should be unique in the cluster.
## +toml2docs:none-default
## @toml2docs:none-default
node_id = 14
## The gRPC server options.
@@ -63,7 +63,7 @@ retry_interval = "3s"
dir = "/tmp/greptimedb/logs"
## The log level. Can be `info`/`debug`/`warn`/`error`.
## +toml2docs:none-default
## @toml2docs:none-default
level = "info"
## Enable OTLP tracing.
@@ -87,6 +87,6 @@ default_ratio = 1.0
## The tracing options. Only effect when compiled with `tokio-console` feature.
[tracing]
## The tokio console address.
## +toml2docs:none-default
## @toml2docs:none-default
tokio_console_addr = "127.0.0.1"

View File

@@ -1,13 +1,13 @@
## The default timezone of the server.
## +toml2docs:none-default
## @toml2docs:none-default
default_timezone = "UTC"
## The runtime options.
[runtime]
#+ [runtime]
## The number of threads to execute the runtime for global read operations.
global_rt_size = 8
#+ global_rt_size = 8
## The number of threads to execute the runtime for global write operations.
compact_rt_size = 4
#+ compact_rt_size = 4
## The heartbeat options.
[heartbeat]
@@ -44,11 +44,11 @@ runtime_size = 8
mode = "disable"
## Certificate file path.
## +toml2docs:none-default
## @toml2docs:none-default
cert_path = ""
## Private key file path.
## +toml2docs:none-default
## @toml2docs:none-default
key_path = ""
## Watch for Certificate and key file change and auto reload.
@@ -76,11 +76,11 @@ runtime_size = 2
mode = "disable"
## Certificate file path.
## +toml2docs:none-default
## @toml2docs:none-default
cert_path = ""
## Private key file path.
## +toml2docs:none-default
## @toml2docs:none-default
key_path = ""
## Watch for Certificate and key file change and auto reload
@@ -101,11 +101,11 @@ runtime_size = 2
mode = "disable"
## Certificate file path.
## +toml2docs:none-default
## @toml2docs:none-default
cert_path = ""
## Private key file path.
## +toml2docs:none-default
## @toml2docs:none-default
key_path = ""
## Watch for Certificate and key file change and auto reload
@@ -170,7 +170,7 @@ tcp_nodelay = true
dir = "/tmp/greptimedb/logs"
## The log level. Can be `info`/`debug`/`warn`/`error`.
## +toml2docs:none-default
## @toml2docs:none-default
level = "info"
## Enable OTLP tracing.
@@ -204,7 +204,7 @@ write_interval = "30s"
## For `standalone` mode, `self_import` is recommend to collect metrics generated by itself
## You must create the database before enabling it.
[export_metrics.self_import]
## +toml2docs:none-default
## @toml2docs:none-default
db = "greptime_metrics"
[export_metrics.remote_write]
@@ -217,5 +217,5 @@ headers = { }
## The tracing options. Only effect when compiled with `tokio-console` feature.
[tracing]
## The tokio console address.
## +toml2docs:none-default
## @toml2docs:none-default
tokio_console_addr = "127.0.0.1"

View File

@@ -36,11 +36,11 @@ enable_region_failover = false
backend = "EtcdStore"
## The runtime options.
[runtime]
#+ [runtime]
## The number of threads to execute the runtime for global read operations.
global_rt_size = 8
#+ global_rt_size = 8
## The number of threads to execute the runtime for global write operations.
compact_rt_size = 4
#+ compact_rt_size = 4
## Procedure storage options.
[procedure]
@@ -157,7 +157,7 @@ backoff_deadline = "5mins"
dir = "/tmp/greptimedb/logs"
## The log level. Can be `info`/`debug`/`warn`/`error`.
## +toml2docs:none-default
## @toml2docs:none-default
level = "info"
## Enable OTLP tracing.
@@ -191,7 +191,7 @@ write_interval = "30s"
## For `standalone` mode, `self_import` is recommend to collect metrics generated by itself
## You must create the database before enabling it.
[export_metrics.self_import]
## +toml2docs:none-default
## @toml2docs:none-default
db = "greptime_metrics"
[export_metrics.remote_write]
@@ -204,5 +204,5 @@ headers = { }
## The tracing options. Only effect when compiled with `tokio-console` feature.
[tracing]
## The tokio console address.
## +toml2docs:none-default
## @toml2docs:none-default
tokio_console_addr = "127.0.0.1"

View File

@@ -5,7 +5,7 @@ mode = "standalone"
enable_telemetry = true
## The default timezone of the server.
## +toml2docs:none-default
## @toml2docs:none-default
default_timezone = "UTC"
## Initialize all regions in the background during the startup.
@@ -16,11 +16,11 @@ init_regions_in_background = false
init_regions_parallelism = 16
## The runtime options.
[runtime]
#+ [runtime]
## The number of threads to execute the runtime for global read operations.
global_rt_size = 8
#+ global_rt_size = 8
## The number of threads to execute the runtime for global write operations.
compact_rt_size = 4
#+ compact_rt_size = 4
## The HTTP server options.
[http]
@@ -46,11 +46,11 @@ runtime_size = 8
mode = "disable"
## Certificate file path.
## +toml2docs:none-default
## @toml2docs:none-default
cert_path = ""
## Private key file path.
## +toml2docs:none-default
## @toml2docs:none-default
key_path = ""
## Watch for Certificate and key file change and auto reload.
@@ -78,11 +78,11 @@ runtime_size = 2
mode = "disable"
## Certificate file path.
## +toml2docs:none-default
## @toml2docs:none-default
cert_path = ""
## Private key file path.
## +toml2docs:none-default
## @toml2docs:none-default
key_path = ""
## Watch for Certificate and key file change and auto reload
@@ -103,11 +103,11 @@ runtime_size = 2
mode = "disable"
## Certificate file path.
## +toml2docs:none-default
## @toml2docs:none-default
cert_path = ""
## Private key file path.
## +toml2docs:none-default
## @toml2docs:none-default
key_path = ""
## Watch for Certificate and key file change and auto reload
@@ -139,7 +139,7 @@ provider = "raft_engine"
## The directory to store the WAL files.
## **It's only used when the provider is `raft_engine`**.
## +toml2docs:none-default
## @toml2docs:none-default
dir = "/tmp/greptimedb/wal"
## The size of the WAL segment file.
@@ -174,6 +174,9 @@ prefill_log_files = false
## **It's only used when the provider is `raft_engine`**.
sync_period = "10s"
## Parallelism during WAL recovery.
recovery_parallelism = 2
## The Kafka broker endpoints.
## **It's only used when the provider is `kafka`**.
broker_endpoints = ["127.0.0.1:9092"]
@@ -317,83 +320,83 @@ type = "File"
## Cache configuration for object storage such as 'S3' etc.
## The local file cache directory.
## +toml2docs:none-default
## @toml2docs:none-default
cache_path = "/path/local_cache"
## The local file cache capacity in bytes.
## +toml2docs:none-default
## @toml2docs:none-default
cache_capacity = "256MB"
## The S3 bucket name.
## **It's only used when the storage type is `S3`, `Oss` and `Gcs`**.
## +toml2docs:none-default
## @toml2docs:none-default
bucket = "greptimedb"
## The S3 data will be stored in the specified prefix, for example, `s3://${bucket}/${root}`.
## **It's only used when the storage type is `S3`, `Oss` and `Azblob`**.
## +toml2docs:none-default
## @toml2docs:none-default
root = "greptimedb"
## The access key id of the aws account.
## It's **highly recommended** to use AWS IAM roles instead of hardcoding the access key id and secret key.
## **It's only used when the storage type is `S3` and `Oss`**.
## +toml2docs:none-default
## @toml2docs:none-default
access_key_id = "test"
## The secret access key of the aws account.
## It's **highly recommended** to use AWS IAM roles instead of hardcoding the access key id and secret key.
## **It's only used when the storage type is `S3`**.
## +toml2docs:none-default
## @toml2docs:none-default
secret_access_key = "test"
## The secret access key of the aliyun account.
## **It's only used when the storage type is `Oss`**.
## +toml2docs:none-default
## @toml2docs:none-default
access_key_secret = "test"
## The account key of the azure account.
## **It's only used when the storage type is `Azblob`**.
## +toml2docs:none-default
## @toml2docs:none-default
account_name = "test"
## The account key of the azure account.
## **It's only used when the storage type is `Azblob`**.
## +toml2docs:none-default
## @toml2docs:none-default
account_key = "test"
## The scope of the google cloud storage.
## **It's only used when the storage type is `Gcs`**.
## +toml2docs:none-default
## @toml2docs:none-default
scope = "test"
## The credential path of the google cloud storage.
## **It's only used when the storage type is `Gcs`**.
## +toml2docs:none-default
## @toml2docs:none-default
credential_path = "test"
## The credential of the google cloud storage.
## **It's only used when the storage type is `Gcs`**.
## +toml2docs:none-default
## @toml2docs:none-default
credential = "base64-credential"
## The container of the azure account.
## **It's only used when the storage type is `Azblob`**.
## +toml2docs:none-default
## @toml2docs:none-default
container = "greptimedb"
## The sas token of the azure account.
## **It's only used when the storage type is `Azblob`**.
## +toml2docs:none-default
## @toml2docs:none-default
sas_token = ""
## The endpoint of the S3 service.
## **It's only used when the storage type is `S3`, `Oss`, `Gcs` and `Azblob`**.
## +toml2docs:none-default
## @toml2docs:none-default
endpoint = "https://s3.amazonaws.com"
## The region of the S3 service.
## **It's only used when the storage type is `S3`, `Oss`, `Gcs` and `Azblob`**.
## +toml2docs:none-default
## @toml2docs:none-default
region = "us-west-2"
# Custom storage options
@@ -423,7 +426,7 @@ region = "us-west-2"
[region_engine.mito]
## Number of region workers.
num_workers = 8
#+ num_workers = 8
## Request channel size of each worker.
worker_channel_size = 128
@@ -444,26 +447,32 @@ max_background_jobs = 4
auto_flush_interval = "1h"
## Global write buffer size for all regions. If not set, it's default to 1/8 of OS memory with a max limitation of 1GB.
global_write_buffer_size = "1GB"
## @toml2docs:none-default="Auto"
#+ global_write_buffer_size = "1GB"
## Global write buffer size threshold to reject write requests. If not set, it's default to 2 times of `global_write_buffer_size`
global_write_buffer_reject_size = "2GB"
## Global write buffer size threshold to reject write requests. If not set, it's default to 2 times of `global_write_buffer_size`.
## @toml2docs:none-default="Auto"
#+ global_write_buffer_reject_size = "2GB"
## Cache size for SST metadata. Setting it to 0 to disable the cache.
## If not set, it's default to 1/32 of OS memory with a max limitation of 128MB.
sst_meta_cache_size = "128MB"
## @toml2docs:none-default="Auto"
#+ sst_meta_cache_size = "128MB"
## Cache size for vectors and arrow arrays. Setting it to 0 to disable the cache.
## If not set, it's default to 1/16 of OS memory with a max limitation of 512MB.
vector_cache_size = "512MB"
## @toml2docs:none-default="Auto"
#+ vector_cache_size = "512MB"
## Cache size for pages of SST row groups. Setting it to 0 to disable the cache.
## If not set, it's default to 1/8 of OS memory.
page_cache_size = "512MB"
## @toml2docs:none-default="Auto"
#+ page_cache_size = "512MB"
## Cache size for time series selector (e.g. `last_value()`). Setting it to 0 to disable the cache.
## If not set, it's default to 1/16 of OS memory with a max limitation of 512MB.
selector_result_cache_size = "512MB"
## @toml2docs:none-default="Auto"
#+ selector_result_cache_size = "512MB"
## Whether to enable the experimental write cache.
enable_experimental_write_cache = false
@@ -475,7 +484,7 @@ experimental_write_cache_path = ""
experimental_write_cache_size = "512MB"
## TTL for write cache.
## +toml2docs:none-default
## @toml2docs:none-default
experimental_write_cache_ttl = "8h"
## Buffer size for SST writing.
@@ -597,7 +606,7 @@ fork_dictionary_bytes = "1GiB"
dir = "/tmp/greptimedb/logs"
## The log level. Can be `info`/`debug`/`warn`/`error`.
## +toml2docs:none-default
## @toml2docs:none-default
level = "info"
## Enable OTLP tracing.
@@ -628,10 +637,10 @@ enable = false
## The interval of export metrics.
write_interval = "30s"
## For `standalone` mode, `self_import` is recommend to collect metrics generated by itself
## For `standalone` mode, `self_import` is recommended to collect metrics generated by itself
## You must create the database before enabling it.
[export_metrics.self_import]
## +toml2docs:none-default
## @toml2docs:none-default
db = "greptime_metrics"
[export_metrics.remote_write]
@@ -644,5 +653,5 @@ headers = { }
## The tracing options. Only effect when compiled with `tokio-console` feature.
[tracing]
## The tokio console address.
## +toml2docs:none-default
## @toml2docs:none-default
tokio_console_addr = "127.0.0.1"

View File

@@ -42,7 +42,8 @@ use greptime_proto::v1::greptime_request::Request;
use greptime_proto::v1::query_request::Query;
use greptime_proto::v1::value::ValueData;
use greptime_proto::v1::{
ColumnDataTypeExtension, DdlRequest, DecimalTypeExtension, QueryRequest, Row, SemanticType,
ColumnDataTypeExtension, DdlRequest, DecimalTypeExtension, JsonTypeExtension, QueryRequest,
Row, SemanticType,
};
use paste::paste;
use snafu::prelude::*;
@@ -103,7 +104,17 @@ impl From<ColumnDataTypeWrapper> for ConcreteDataType {
ColumnDataType::Uint64 => ConcreteDataType::uint64_datatype(),
ColumnDataType::Float32 => ConcreteDataType::float32_datatype(),
ColumnDataType::Float64 => ConcreteDataType::float64_datatype(),
ColumnDataType::Binary => ConcreteDataType::binary_datatype(),
ColumnDataType::Binary => {
if let Some(TypeExt::JsonType(_)) = datatype_wrapper
.datatype_ext
.as_ref()
.and_then(|datatype_ext| datatype_ext.type_ext.as_ref())
{
ConcreteDataType::json_datatype()
} else {
ConcreteDataType::binary_datatype()
}
}
ColumnDataType::String => ConcreteDataType::string_datatype(),
ColumnDataType::Date => ConcreteDataType::date_datatype(),
ColumnDataType::Datetime => ConcreteDataType::datetime_datatype(),
@@ -236,7 +247,7 @@ impl TryFrom<ConcreteDataType> for ColumnDataTypeWrapper {
ConcreteDataType::UInt64(_) => ColumnDataType::Uint64,
ConcreteDataType::Float32(_) => ColumnDataType::Float32,
ConcreteDataType::Float64(_) => ColumnDataType::Float64,
ConcreteDataType::Binary(_) => ColumnDataType::Binary,
ConcreteDataType::Binary(_) | ConcreteDataType::Json(_) => ColumnDataType::Binary,
ConcreteDataType::String(_) => ColumnDataType::String,
ConcreteDataType::Date(_) => ColumnDataType::Date,
ConcreteDataType::DateTime(_) => ColumnDataType::Datetime,
@@ -276,6 +287,16 @@ impl TryFrom<ConcreteDataType> for ColumnDataTypeWrapper {
})),
})
}
ColumnDataType::Binary => {
if datatype == ConcreteDataType::json_datatype() {
// Json is the same as binary in proto. The extension marks the binary in proto is actually a json.
Some(ColumnDataTypeExtension {
type_ext: Some(TypeExt::JsonType(JsonTypeExtension::JsonBinary.into())),
})
} else {
None
}
}
_ => None,
};
Ok(Self {
@@ -649,7 +670,8 @@ pub fn pb_values_to_vector_ref(data_type: &ConcreteDataType, values: Values) ->
ConcreteDataType::Null(_)
| ConcreteDataType::List(_)
| ConcreteDataType::Dictionary(_)
| ConcreteDataType::Duration(_) => {
| ConcreteDataType::Duration(_)
| ConcreteDataType::Json(_) => {
unreachable!()
}
}
@@ -813,7 +835,8 @@ pub fn pb_values_to_values(data_type: &ConcreteDataType, values: Values) -> Vec<
ConcreteDataType::Null(_)
| ConcreteDataType::List(_)
| ConcreteDataType::Dictionary(_)
| ConcreteDataType::Duration(_) => {
| ConcreteDataType::Duration(_)
| ConcreteDataType::Json(_) => {
unreachable!()
}
}
@@ -831,7 +854,13 @@ pub fn is_column_type_value_eq(
expect_type: &ConcreteDataType,
) -> bool {
ColumnDataTypeWrapper::try_new(type_value, type_extension)
.map(|wrapper| ConcreteDataType::from(wrapper) == *expect_type)
.map(|wrapper| {
let datatype = ConcreteDataType::from(wrapper);
(datatype == *expect_type)
// Json type leverage binary type in pb, so this is valid.
|| (datatype == ConcreteDataType::binary_datatype()
&& *expect_type == ConcreteDataType::json_datatype())
})
.unwrap_or(false)
}
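In short, JSON columns reuse the Binary data type on the wire, and a `JsonTypeExtension` is what tells the two apart when converting back. A self-contained sketch of that round trip, using hypothetical stand-in enums rather than the real `greptime_proto` / `ConcreteDataType` types:

```rust
/// Hypothetical stand-ins for the proto-side and engine-side type enums.
#[derive(Clone, Copy, PartialEq, Debug)]
enum WireType { Binary, String }

#[derive(Clone, Copy, PartialEq, Debug)]
enum WireTypeExt { Json }

#[derive(Clone, Copy, PartialEq, Debug)]
enum ConcreteType { Binary, Json, String }

/// Engine type -> (wire type, optional extension): JSON is sent as Binary
/// plus an extension flag.
fn to_wire(ty: ConcreteType) -> (WireType, Option<WireTypeExt>) {
    match ty {
        ConcreteType::Json => (WireType::Binary, Some(WireTypeExt::Json)),
        ConcreteType::Binary => (WireType::Binary, None),
        ConcreteType::String => (WireType::String, None),
    }
}

/// Wire type + extension -> engine type: the extension disambiguates Binary.
fn from_wire(ty: WireType, ext: Option<WireTypeExt>) -> ConcreteType {
    match (ty, ext) {
        (WireType::Binary, Some(WireTypeExt::Json)) => ConcreteType::Json,
        (WireType::Binary, None) => ConcreteType::Binary,
        (WireType::String, _) => ConcreteType::String,
    }
}

fn main() {
    let (wire, ext) = to_wire(ConcreteType::Json);
    assert_eq!(from_wire(wire, ext), ConcreteType::Json);
    // Without the extension, the same wire type is plain binary.
    assert_eq!(from_wire(WireType::Binary, None), ConcreteType::Binary);
}
```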

View File

@@ -21,14 +21,14 @@ use greptime_proto::v1::region::RegionResponse as RegionResponseV1;
#[derive(Debug)]
pub struct RegionResponse {
pub affected_rows: AffectedRows,
pub extension: HashMap<String, Vec<u8>>,
pub extensions: HashMap<String, Vec<u8>>,
}
impl RegionResponse {
pub fn from_region_response(region_response: RegionResponseV1) -> Self {
Self {
affected_rows: region_response.affected_rows as _,
extension: region_response.extension,
extensions: region_response.extensions,
}
}
@@ -36,7 +36,7 @@ impl RegionResponse {
pub fn new(affected_rows: AffectedRows) -> Self {
Self {
affected_rows,
extension: Default::default(),
extensions: Default::default(),
}
}
}
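A small illustrative sketch of the renamed field, with a local stand-in struct rather than the real `RegionResponse`: the plural `extensions` reflects that one response can carry several opaque key/value entries (the key names below are made up for the example).

```rust
use std::collections::HashMap;

/// Hypothetical stand-in mirroring the shape of the struct above.
#[derive(Debug, Default)]
struct RegionResponse {
    affected_rows: usize,
    extensions: HashMap<String, Vec<u8>>,
}

fn main() {
    let mut resp = RegionResponse { affected_rows: 3, ..Default::default() };
    // More than one extension entry can ride on a single response.
    resp.extensions.insert("example-key".to_string(), b"payload".to_vec());
    resp.extensions.insert("another-key".to_string(), vec![0x01, 0x02]);
    assert_eq!(resp.affected_rows, 3);
    assert_eq!(resp.extensions.len(), 2);
}
```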

View File

@@ -36,6 +36,7 @@ use futures_util::{StreamExt, TryStreamExt};
use meta_client::client::MetaClient;
use moka::sync::Cache;
use partition::manager::{PartitionRuleManager, PartitionRuleManagerRef};
use session::context::{Channel, QueryContext};
use snafu::prelude::*;
use table::dist_table::DistTable;
use table::table::numbers::{NumbersTable, NUMBERS_TABLE_NAME};
@@ -152,7 +153,11 @@ impl CatalogManager for KvBackendCatalogManager {
Ok(keys)
}
async fn schema_names(&self, catalog: &str) -> Result<Vec<String>> {
async fn schema_names(
&self,
catalog: &str,
query_ctx: Option<&QueryContext>,
) -> Result<Vec<String>> {
let stream = self
.table_metadata_manager
.schema_manager()
@@ -163,12 +168,17 @@ impl CatalogManager for KvBackendCatalogManager {
.map_err(BoxedError::new)
.context(ListSchemasSnafu { catalog })?;
keys.extend(self.system_catalog.schema_names());
keys.extend(self.system_catalog.schema_names(query_ctx));
Ok(keys.into_iter().collect())
}
async fn table_names(&self, catalog: &str, schema: &str) -> Result<Vec<String>> {
async fn table_names(
&self,
catalog: &str,
schema: &str,
query_ctx: Option<&QueryContext>,
) -> Result<Vec<String>> {
let stream = self
.table_metadata_manager
.table_name_manager()
@@ -181,7 +191,7 @@ impl CatalogManager for KvBackendCatalogManager {
.into_iter()
.map(|(k, _)| k)
.collect::<Vec<_>>();
tables.extend_from_slice(&self.system_catalog.table_names(schema));
tables.extend_from_slice(&self.system_catalog.table_names(schema, query_ctx));
Ok(tables.into_iter().collect())
}
@@ -194,8 +204,13 @@ impl CatalogManager for KvBackendCatalogManager {
.context(TableMetadataManagerSnafu)
}
async fn schema_exists(&self, catalog: &str, schema: &str) -> Result<bool> {
if self.system_catalog.schema_exists(schema) {
async fn schema_exists(
&self,
catalog: &str,
schema: &str,
query_ctx: Option<&QueryContext>,
) -> Result<bool> {
if self.system_catalog.schema_exists(schema, query_ctx) {
return Ok(true);
}
@@ -206,8 +221,14 @@ impl CatalogManager for KvBackendCatalogManager {
.context(TableMetadataManagerSnafu)
}
async fn table_exists(&self, catalog: &str, schema: &str, table: &str) -> Result<bool> {
if self.system_catalog.table_exists(schema, table) {
async fn table_exists(
&self,
catalog: &str,
schema: &str,
table: &str,
query_ctx: Option<&QueryContext>,
) -> Result<bool> {
if self.system_catalog.table_exists(schema, table, query_ctx) {
return Ok(true);
}
@@ -225,10 +246,12 @@ impl CatalogManager for KvBackendCatalogManager {
catalog_name: &str,
schema_name: &str,
table_name: &str,
query_ctx: Option<&QueryContext>,
) -> Result<Option<TableRef>> {
if let Some(table) = self
.system_catalog
.table(catalog_name, schema_name, table_name)
let channel = query_ctx.map_or(Channel::Unknown, |ctx| ctx.channel());
if let Some(table) =
self.system_catalog
.table(catalog_name, schema_name, table_name, query_ctx)
{
return Ok(Some(table));
}
@@ -236,23 +259,45 @@ impl CatalogManager for KvBackendCatalogManager {
let table_cache: TableCacheRef = self.cache_registry.get().context(CacheNotFoundSnafu {
name: "table_cache",
})?;
table_cache
if let Some(table) = table_cache
.get_by_ref(&TableName {
catalog_name: catalog_name.to_string(),
schema_name: schema_name.to_string(),
table_name: table_name.to_string(),
})
.await
.context(GetTableCacheSnafu)
.context(GetTableCacheSnafu)?
{
return Ok(Some(table));
}
if channel == Channel::Postgres {
// falldown to pg_catalog
if let Some(table) =
self.system_catalog
.table(catalog_name, PG_CATALOG_NAME, table_name, query_ctx)
{
return Ok(Some(table));
}
}
return Ok(None);
}
fn tables<'a>(&'a self, catalog: &'a str, schema: &'a str) -> BoxStream<'a, Result<TableRef>> {
fn tables<'a>(
&'a self,
catalog: &'a str,
schema: &'a str,
query_ctx: Option<&'a QueryContext>,
) -> BoxStream<'a, Result<TableRef>> {
let sys_tables = try_stream!({
// System tables
let sys_table_names = self.system_catalog.table_names(schema);
let sys_table_names = self.system_catalog.table_names(schema, query_ctx);
for table_name in sys_table_names {
if let Some(table) = self.system_catalog.table(catalog, schema, &table_name) {
if let Some(table) =
self.system_catalog
.table(catalog, schema, &table_name, query_ctx)
{
yield table;
}
}
@@ -320,18 +365,27 @@ struct SystemCatalog {
}
impl SystemCatalog {
// TODO(j0hn50n133): remove the duplicated hard-coded table names logic
fn schema_names(&self) -> Vec<String> {
vec![
INFORMATION_SCHEMA_NAME.to_string(),
PG_CATALOG_NAME.to_string(),
]
fn schema_names(&self, query_ctx: Option<&QueryContext>) -> Vec<String> {
let channel = query_ctx.map_or(Channel::Unknown, |ctx| ctx.channel());
match channel {
// pg_catalog only visible under postgres protocol
Channel::Postgres => vec![
INFORMATION_SCHEMA_NAME.to_string(),
PG_CATALOG_NAME.to_string(),
],
_ => {
vec![INFORMATION_SCHEMA_NAME.to_string()]
}
}
}
fn table_names(&self, schema: &str) -> Vec<String> {
fn table_names(&self, schema: &str, query_ctx: Option<&QueryContext>) -> Vec<String> {
let channel = query_ctx.map_or(Channel::Unknown, |ctx| ctx.channel());
match schema {
INFORMATION_SCHEMA_NAME => self.information_schema_provider.table_names(),
PG_CATALOG_NAME => self.pg_catalog_provider.table_names(),
PG_CATALOG_NAME if channel == Channel::Postgres => {
self.pg_catalog_provider.table_names()
}
DEFAULT_SCHEMA_NAME => {
vec![NUMBERS_TABLE_NAME.to_string()]
}
@@ -339,23 +393,35 @@ impl SystemCatalog {
}
}
fn schema_exists(&self, schema: &str) -> bool {
schema == INFORMATION_SCHEMA_NAME || schema == PG_CATALOG_NAME
fn schema_exists(&self, schema: &str, query_ctx: Option<&QueryContext>) -> bool {
let channel = query_ctx.map_or(Channel::Unknown, |ctx| ctx.channel());
match channel {
Channel::Postgres => schema == PG_CATALOG_NAME || schema == INFORMATION_SCHEMA_NAME,
_ => schema == INFORMATION_SCHEMA_NAME,
}
}
fn table_exists(&self, schema: &str, table: &str) -> bool {
fn table_exists(&self, schema: &str, table: &str, query_ctx: Option<&QueryContext>) -> bool {
let channel = query_ctx.map_or(Channel::Unknown, |ctx| ctx.channel());
if schema == INFORMATION_SCHEMA_NAME {
self.information_schema_provider.table(table).is_some()
} else if schema == DEFAULT_SCHEMA_NAME {
table == NUMBERS_TABLE_NAME
} else if schema == PG_CATALOG_NAME {
} else if schema == PG_CATALOG_NAME && channel == Channel::Postgres {
self.pg_catalog_provider.table(table).is_some()
} else {
false
}
}
fn table(&self, catalog: &str, schema: &str, table_name: &str) -> Option<TableRef> {
fn table(
&self,
catalog: &str,
schema: &str,
table_name: &str,
query_ctx: Option<&QueryContext>,
) -> Option<TableRef> {
let channel = query_ctx.map_or(Channel::Unknown, |ctx| ctx.channel());
if schema == INFORMATION_SCHEMA_NAME {
let information_schema_provider =
self.catalog_cache.get_with_by_ref(catalog, move || {
@@ -366,7 +432,7 @@ impl SystemCatalog {
))
});
information_schema_provider.table(table_name)
} else if schema == PG_CATALOG_NAME {
} else if schema == PG_CATALOG_NAME && channel == Channel::Postgres {
if catalog == DEFAULT_CATALOG_NAME {
self.pg_catalog_provider.table(table_name)
} else {
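To summarize the rule these changes thread through `schema_names`, `table_names`, `schema_exists`, `table_exists`, and `table`: `pg_catalog` is only visible to sessions whose channel is Postgres. A standalone sketch of that gating, with hypothetical stand-ins for `Channel` and `QueryContext` from the `session` crate:

```rust
#[derive(Clone, Copy, Debug)]
enum Channel { Mysql, Postgres, Unknown }

/// Hypothetical stand-in for the session context carried with each request.
struct QueryContext { channel: Channel }

const INFORMATION_SCHEMA_NAME: &str = "information_schema";
const PG_CATALOG_NAME: &str = "pg_catalog";

/// Mirrors the shape of `SystemCatalog::schema_names`: with no context the
/// caller is treated as `Channel::Unknown` and does not see `pg_catalog`.
fn system_schema_names(query_ctx: Option<&QueryContext>) -> Vec<&'static str> {
    let channel = query_ctx.map_or(Channel::Unknown, |ctx| ctx.channel);
    match channel {
        Channel::Postgres => vec![INFORMATION_SCHEMA_NAME, PG_CATALOG_NAME],
        _ => vec![INFORMATION_SCHEMA_NAME],
    }
}

fn main() {
    let pg = QueryContext { channel: Channel::Postgres };
    let mysql = QueryContext { channel: Channel::Mysql };
    assert!(system_schema_names(Some(&pg)).contains(&PG_CATALOG_NAME));
    assert!(!system_schema_names(Some(&mysql)).contains(&PG_CATALOG_NAME));
    assert!(!system_schema_names(None).contains(&PG_CATALOG_NAME));
}
```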

View File

@@ -20,8 +20,10 @@ use std::fmt::{Debug, Formatter};
use std::sync::Arc;
use api::v1::CreateTableExpr;
use common_catalog::consts::{INFORMATION_SCHEMA_NAME, PG_CATALOG_NAME};
use futures::future::BoxFuture;
use futures_util::stream::BoxStream;
use session::context::QueryContext;
use table::metadata::TableId;
use table::TableRef;
@@ -44,15 +46,35 @@ pub trait CatalogManager: Send + Sync {
async fn catalog_names(&self) -> Result<Vec<String>>;
async fn schema_names(&self, catalog: &str) -> Result<Vec<String>>;
async fn schema_names(
&self,
catalog: &str,
query_ctx: Option<&QueryContext>,
) -> Result<Vec<String>>;
async fn table_names(&self, catalog: &str, schema: &str) -> Result<Vec<String>>;
async fn table_names(
&self,
catalog: &str,
schema: &str,
query_ctx: Option<&QueryContext>,
) -> Result<Vec<String>>;
async fn catalog_exists(&self, catalog: &str) -> Result<bool>;
async fn schema_exists(&self, catalog: &str, schema: &str) -> Result<bool>;
async fn schema_exists(
&self,
catalog: &str,
schema: &str,
query_ctx: Option<&QueryContext>,
) -> Result<bool>;
async fn table_exists(&self, catalog: &str, schema: &str, table: &str) -> Result<bool>;
async fn table_exists(
&self,
catalog: &str,
schema: &str,
table: &str,
query_ctx: Option<&QueryContext>,
) -> Result<bool>;
/// Returns the table by catalog, schema and table name.
async fn table(
@@ -60,10 +82,25 @@ pub trait CatalogManager: Send + Sync {
catalog: &str,
schema: &str,
table_name: &str,
query_ctx: Option<&QueryContext>,
) -> Result<Option<TableRef>>;
/// Returns all tables with a stream by catalog and schema.
fn tables<'a>(&'a self, catalog: &'a str, schema: &'a str) -> BoxStream<'a, Result<TableRef>>;
fn tables<'a>(
&'a self,
catalog: &'a str,
schema: &'a str,
query_ctx: Option<&'a QueryContext>,
) -> BoxStream<'a, Result<TableRef>>;
/// Check if `schema` is a reserved schema name
fn is_reserved_schema_name(&self, schema: &str) -> bool {
// We have to check whether a schema name is reserved before create schema.
// We need this rather than use schema_exists directly because `pg_catalog` is
// only visible via postgres protocol. So if we don't check, a mysql client may
// create a schema named `pg_catalog` which is somehow malformed.
schema == INFORMATION_SCHEMA_NAME || schema == PG_CATALOG_NAME
}
}
pub type CatalogManagerRef = Arc<dyn CatalogManager>;
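Across the trait, the added `query_ctx: Option<&QueryContext>` parameter is the only signature change: internal callers without a session pass `None`, and protocol frontends forward their context. A minimal sketch of that calling convention with toy stand-ins for the real `CatalogManager` and `QueryContext`, including the new `is_reserved_schema_name` default method:

```rust
/// Hypothetical, heavily reduced stand-ins for the real trait and context.
struct QueryContext;

trait CatalogManager {
    fn schema_names(&self, catalog: &str, query_ctx: Option<&QueryContext>) -> Vec<String>;

    /// Reserved names must be rejected at CREATE SCHEMA time even when they
    /// are not visible to the current protocol (e.g. `pg_catalog` over MySQL).
    fn is_reserved_schema_name(&self, schema: &str) -> bool {
        schema == "information_schema" || schema == "pg_catalog"
    }
}

struct MemoryCatalog;

impl CatalogManager for MemoryCatalog {
    fn schema_names(&self, _catalog: &str, _query_ctx: Option<&QueryContext>) -> Vec<String> {
        vec!["public".to_string(), "information_schema".to_string()]
    }
}

fn main() {
    let manager = MemoryCatalog;
    // Background tasks (e.g. the information_schema builders) pass `None`.
    let schemas = manager.schema_names("greptime", None);
    // A frontend handling a client request forwards its session context.
    let ctx = QueryContext;
    let _ = manager.schema_names("greptime", Some(&ctx));
    assert!(schemas.contains(&"public".to_string()));
    assert!(manager.is_reserved_schema_name("pg_catalog"));
}
```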

View File

@@ -26,6 +26,7 @@ use common_catalog::consts::{
use common_meta::key::flow::FlowMetadataManager;
use common_meta::kv_backend::memory::MemoryKvBackend;
use futures_util::stream::BoxStream;
use session::context::QueryContext;
use snafu::OptionExt;
use table::TableRef;
@@ -53,7 +54,11 @@ impl CatalogManager for MemoryCatalogManager {
Ok(self.catalogs.read().unwrap().keys().cloned().collect())
}
async fn schema_names(&self, catalog: &str) -> Result<Vec<String>> {
async fn schema_names(
&self,
catalog: &str,
_query_ctx: Option<&QueryContext>,
) -> Result<Vec<String>> {
Ok(self
.catalogs
.read()
@@ -67,7 +72,12 @@ impl CatalogManager for MemoryCatalogManager {
.collect())
}
async fn table_names(&self, catalog: &str, schema: &str) -> Result<Vec<String>> {
async fn table_names(
&self,
catalog: &str,
schema: &str,
_query_ctx: Option<&QueryContext>,
) -> Result<Vec<String>> {
Ok(self
.catalogs
.read()
@@ -87,11 +97,22 @@ impl CatalogManager for MemoryCatalogManager {
self.catalog_exist_sync(catalog)
}
async fn schema_exists(&self, catalog: &str, schema: &str) -> Result<bool> {
async fn schema_exists(
&self,
catalog: &str,
schema: &str,
_query_ctx: Option<&QueryContext>,
) -> Result<bool> {
self.schema_exist_sync(catalog, schema)
}
async fn table_exists(&self, catalog: &str, schema: &str, table: &str) -> Result<bool> {
async fn table_exists(
&self,
catalog: &str,
schema: &str,
table: &str,
_query_ctx: Option<&QueryContext>,
) -> Result<bool> {
let catalogs = self.catalogs.read().unwrap();
Ok(catalogs
.get(catalog)
@@ -108,6 +129,7 @@ impl CatalogManager for MemoryCatalogManager {
catalog: &str,
schema: &str,
table_name: &str,
_query_ctx: Option<&QueryContext>,
) -> Result<Option<TableRef>> {
let result = try {
self.catalogs
@@ -121,7 +143,12 @@ impl CatalogManager for MemoryCatalogManager {
Ok(result)
}
fn tables<'a>(&'a self, catalog: &'a str, schema: &'a str) -> BoxStream<'a, Result<TableRef>> {
fn tables<'a>(
&'a self,
catalog: &'a str,
schema: &'a str,
_query_ctx: Option<&QueryContext>,
) -> BoxStream<'a, Result<TableRef>> {
let catalogs = self.catalogs.read().unwrap();
let Some(schemas) = catalogs.get(catalog) else {
@@ -371,11 +398,12 @@ mod tests {
DEFAULT_CATALOG_NAME,
DEFAULT_SCHEMA_NAME,
NUMBERS_TABLE_NAME,
None,
)
.await
.unwrap()
.unwrap();
let stream = catalog_list.tables(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME);
let stream = catalog_list.tables(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, None);
let tables = stream.try_collect::<Vec<_>>().await.unwrap();
assert_eq!(tables.len(), 1);
assert_eq!(
@@ -384,7 +412,12 @@ mod tests {
);
assert!(catalog_list
.table(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, "not_exists")
.table(
DEFAULT_CATALOG_NAME,
DEFAULT_SCHEMA_NAME,
"not_exists",
None
)
.await
.unwrap()
.is_none());
@@ -411,7 +444,7 @@ mod tests {
};
catalog.register_table_sync(register_table_req).unwrap();
assert!(catalog
.table(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, table_name)
.table(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, table_name, None)
.await
.unwrap()
.is_some());
@@ -423,7 +456,7 @@ mod tests {
};
catalog.deregister_table_sync(deregister_table_req).unwrap();
assert!(catalog
.table(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, table_name)
.table(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, table_name, None)
.await
.unwrap()
.is_none());

View File

@@ -257,8 +257,8 @@ impl InformationSchemaColumnsBuilder {
.context(UpgradeWeakCatalogManagerRefSnafu)?;
let predicates = Predicates::from_scan_request(&request);
for schema_name in catalog_manager.schema_names(&catalog_name).await? {
let mut stream = catalog_manager.tables(&catalog_name, &schema_name);
for schema_name in catalog_manager.schema_names(&catalog_name, None).await? {
let mut stream = catalog_manager.tables(&catalog_name, &schema_name, None);
while let Some(table) = stream.try_next().await? {
let keys = &table.table_info().meta.primary_key_indices;

View File

@@ -212,8 +212,8 @@ impl InformationSchemaKeyColumnUsageBuilder {
.context(UpgradeWeakCatalogManagerRefSnafu)?;
let predicates = Predicates::from_scan_request(&request);
for schema_name in catalog_manager.schema_names(&catalog_name).await? {
let mut stream = catalog_manager.tables(&catalog_name, &schema_name);
for schema_name in catalog_manager.schema_names(&catalog_name, None).await? {
let mut stream = catalog_manager.tables(&catalog_name, &schema_name, None);
while let Some(table) = stream.try_next().await? {
let mut primary_constraints = vec![];

View File

@@ -240,9 +240,9 @@ impl InformationSchemaPartitionsBuilder {
let predicates = Predicates::from_scan_request(&request);
for schema_name in catalog_manager.schema_names(&catalog_name).await? {
for schema_name in catalog_manager.schema_names(&catalog_name, None).await? {
let table_info_stream = catalog_manager
.tables(&catalog_name, &schema_name)
.tables(&catalog_name, &schema_name, None)
.try_filter_map(|t| async move {
let table_info = t.table_info();
if table_info.table_type == TableType::Temporary {

View File

@@ -176,9 +176,9 @@ impl InformationSchemaRegionPeersBuilder {
let predicates = Predicates::from_scan_request(&request);
for schema_name in catalog_manager.schema_names(&catalog_name).await? {
for schema_name in catalog_manager.schema_names(&catalog_name, None).await? {
let table_id_stream = catalog_manager
.tables(&catalog_name, &schema_name)
.tables(&catalog_name, &schema_name, None)
.try_filter_map(|t| async move {
let table_info = t.table_info();
if table_info.table_type == TableType::Temporary {

View File

@@ -171,7 +171,7 @@ impl InformationSchemaSchemataBuilder {
let table_metadata_manager = utils::table_meta_manager(&self.catalog_manager)?;
let predicates = Predicates::from_scan_request(&request);
for schema_name in catalog_manager.schema_names(&catalog_name).await? {
for schema_name in catalog_manager.schema_names(&catalog_name, None).await? {
let opts = if let Some(table_metadata_manager) = &table_metadata_manager {
table_metadata_manager
.schema_manager()

View File

@@ -176,8 +176,8 @@ impl InformationSchemaTableConstraintsBuilder {
.context(UpgradeWeakCatalogManagerRefSnafu)?;
let predicates = Predicates::from_scan_request(&request);
for schema_name in catalog_manager.schema_names(&catalog_name).await? {
let mut stream = catalog_manager.tables(&catalog_name, &schema_name);
for schema_name in catalog_manager.schema_names(&catalog_name, None).await? {
let mut stream = catalog_manager.tables(&catalog_name, &schema_name, None);
while let Some(table) = stream.try_next().await? {
let keys = &table.table_info().meta.primary_key_indices;

View File

@@ -234,8 +234,8 @@ impl InformationSchemaTablesBuilder {
.context(UpgradeWeakCatalogManagerRefSnafu)?;
let predicates = Predicates::from_scan_request(&request);
for schema_name in catalog_manager.schema_names(&catalog_name).await? {
let mut stream = catalog_manager.tables(&catalog_name, &schema_name);
for schema_name in catalog_manager.schema_names(&catalog_name, None).await? {
let mut stream = catalog_manager.tables(&catalog_name, &schema_name, None);
while let Some(table) = stream.try_next().await? {
let table_info = table.table_info();

View File

@@ -192,8 +192,8 @@ impl InformationSchemaViewsBuilder {
.context(CastManagerSnafu)?
.view_info_cache()?;
for schema_name in catalog_manager.schema_names(&catalog_name).await? {
let mut stream = catalog_manager.tables(&catalog_name, &schema_name);
for schema_name in catalog_manager.schema_names(&catalog_name, None).await? {
let mut stream = catalog_manager.tables(&catalog_name, &schema_name, None);
while let Some(table) = stream.try_next().await? {
let table_info = table.table_info();

View File

@@ -18,15 +18,16 @@ mod pg_namespace;
mod table_names;
use std::collections::HashMap;
use std::sync::{Arc, Weak};
use std::sync::{Arc, LazyLock, Weak};
use common_catalog::consts::{self, PG_CATALOG_NAME};
use common_catalog::consts::{self, DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, PG_CATALOG_NAME};
use datatypes::schema::ColumnSchema;
use lazy_static::lazy_static;
use paste::paste;
use pg_catalog_memory_table::get_schema_columns;
use pg_class::PGClass;
use pg_namespace::PGNamespace;
use session::context::{Channel, QueryContext};
use table::TableRef;
pub use table_names::*;
@@ -142,3 +143,12 @@ impl SystemSchemaProviderInner for PGCatalogProvider {
&self.catalog_name
}
}
/// Provide query context to call the [`CatalogManager`]'s method.
static PG_QUERY_CTX: LazyLock<QueryContext> = LazyLock::new(|| {
QueryContext::with_channel(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, Channel::Postgres)
});
fn query_ctx() -> Option<&'static QueryContext> {
Some(&PG_QUERY_CTX)
}
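`std::sync::LazyLock` (stable since Rust 1.80) initializes the static on first access and shares it for the process lifetime. A self-contained sketch of the same pattern, with a simplified context type standing in for the real `QueryContext`:

```rust
use std::sync::LazyLock;

/// Hypothetical stand-in for the session context the pg_catalog tables need.
#[derive(Debug)]
struct QueryContext {
    catalog: &'static str,
    schema: &'static str,
    channel: &'static str,
}

/// Built once, on first access, then shared for the process lifetime.
static PG_QUERY_CTX: LazyLock<QueryContext> = LazyLock::new(|| QueryContext {
    catalog: "greptime",
    schema: "public",
    channel: "postgres",
});

fn query_ctx() -> Option<&'static QueryContext> {
    Some(&PG_QUERY_CTX)
}

fn main() {
    // Every call sees the same initialized instance.
    let ctx = query_ctx().expect("always present");
    assert_eq!(ctx.channel, "postgres");
    println!("{ctx:?}");
}
```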

View File

@@ -32,7 +32,7 @@ use store_api::storage::ScanRequest;
use table::metadata::TableType;
use super::pg_namespace::oid_map::PGNamespaceOidMapRef;
use super::{OID_COLUMN_NAME, PG_CLASS};
use super::{query_ctx, OID_COLUMN_NAME, PG_CLASS};
use crate::error::{
CreateRecordBatchSnafu, InternalSnafu, Result, UpgradeWeakCatalogManagerRefSnafu,
};
@@ -202,8 +202,11 @@ impl PGClassBuilder {
.upgrade()
.context(UpgradeWeakCatalogManagerRefSnafu)?;
let predicates = Predicates::from_scan_request(&request);
for schema_name in catalog_manager.schema_names(&catalog_name).await? {
let mut stream = catalog_manager.tables(&catalog_name, &schema_name);
for schema_name in catalog_manager
.schema_names(&catalog_name, query_ctx())
.await?
{
let mut stream = catalog_manager.tables(&catalog_name, &schema_name, query_ctx());
while let Some(table) = stream.try_next().await? {
let table_info = table.table_info();
self.add_class(

View File

@@ -31,7 +31,7 @@ use datatypes::vectors::{StringVectorBuilder, UInt32VectorBuilder, VectorRef};
use snafu::{OptionExt, ResultExt};
use store_api::storage::ScanRequest;
use super::{PGNamespaceOidMapRef, OID_COLUMN_NAME, PG_NAMESPACE};
use super::{query_ctx, PGNamespaceOidMapRef, OID_COLUMN_NAME, PG_NAMESPACE};
use crate::error::{
CreateRecordBatchSnafu, InternalSnafu, Result, UpgradeWeakCatalogManagerRefSnafu,
};
@@ -180,7 +180,10 @@ impl PGNamespaceBuilder {
.upgrade()
.context(UpgradeWeakCatalogManagerRefSnafu)?;
let predicates = Predicates::from_scan_request(&request);
for schema_name in catalog_manager.schema_names(&catalog_name).await? {
for schema_name in catalog_manager
.schema_names(&catalog_name, query_ctx())
.await?
{
self.add_namespace(&predicates, &schema_name);
}
self.finish()

View File

@@ -23,7 +23,7 @@ use datafusion::datasource::view::ViewTable;
use datafusion::datasource::{provider_as_source, TableProvider};
use datafusion::logical_expr::TableSource;
use itertools::Itertools;
use session::context::QueryContext;
use session::context::QueryContextRef;
use snafu::{ensure, OptionExt, ResultExt};
use table::metadata::TableType;
use table::table::adapter::DfTableProviderAdapter;
@@ -45,6 +45,7 @@ pub struct DfTableSourceProvider {
disallow_cross_catalog_query: bool,
default_catalog: String,
default_schema: String,
query_ctx: QueryContextRef,
plan_decoder: SubstraitPlanDecoderRef,
enable_ident_normalization: bool,
}
@@ -53,7 +54,7 @@ impl DfTableSourceProvider {
pub fn new(
catalog_manager: CatalogManagerRef,
disallow_cross_catalog_query: bool,
query_ctx: &QueryContext,
query_ctx: QueryContextRef,
plan_decoder: SubstraitPlanDecoderRef,
enable_ident_normalization: bool,
) -> Self {
@@ -63,6 +64,7 @@ impl DfTableSourceProvider {
resolved_tables: HashMap::new(),
default_catalog: query_ctx.current_catalog().to_owned(),
default_schema: query_ctx.current_schema(),
query_ctx,
plan_decoder,
enable_ident_normalization,
}
@@ -71,8 +73,7 @@ impl DfTableSourceProvider {
pub fn resolve_table_ref(&self, table_ref: TableReference) -> Result<ResolvedTableReference> {
if self.disallow_cross_catalog_query {
match &table_ref {
TableReference::Bare { .. } => (),
TableReference::Partial { .. } => {}
TableReference::Bare { .. } | TableReference::Partial { .. } => {}
TableReference::Full {
catalog, schema, ..
} => {
@@ -107,7 +108,7 @@ impl DfTableSourceProvider {
let table = self
.catalog_manager
.table(catalog_name, schema_name, table_name)
.table(catalog_name, schema_name, table_name, Some(&self.query_ctx))
.await?
.with_context(|| TableNotExistSnafu {
table: format_full_table_name(catalog_name, schema_name, table_name),
@@ -210,12 +211,12 @@ mod tests {
#[test]
fn test_validate_table_ref() {
let query_ctx = &QueryContext::with("greptime", "public");
let query_ctx = Arc::new(QueryContext::with("greptime", "public"));
let table_provider = DfTableSourceProvider::new(
MemoryCatalogManager::with_default_setup(),
true,
query_ctx,
query_ctx.clone(),
DummyDecoder::arc(),
true,
);
@@ -308,7 +309,7 @@ mod tests {
#[tokio::test]
async fn test_resolve_view() {
let query_ctx = &QueryContext::with("greptime", "public");
let query_ctx = Arc::new(QueryContext::with("greptime", "public"));
let backend = Arc::new(MemoryKvBackend::default());
let layered_cache_builder = LayeredCacheRegistryBuilder::default()
.add_cache_registry(CacheRegistryBuilder::default().build());
@@ -344,8 +345,13 @@ mod tests {
.await
.unwrap();
let mut table_provider =
DfTableSourceProvider::new(catalog_manager, true, query_ctx, MockDecoder::arc(), true);
let mut table_provider = DfTableSourceProvider::new(
catalog_manager,
true,
query_ctx.clone(),
MockDecoder::arc(),
true,
);
// View not found
let table_ref = TableReference::bare("not_exists_view");
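The switch from `&QueryContext` to `QueryContextRef` (an `Arc`) lets the provider keep its own handle to the session context rather than borrowing it from the caller. A generic sketch of that ownership pattern, with a placeholder context type instead of the real `session::QueryContext`:

```rust
use std::sync::Arc;

/// Placeholder context type; the real one is `session::QueryContext`.
struct QueryContext {
    current_catalog: String,
    current_schema: String,
}

type QueryContextRef = Arc<QueryContext>;

/// Mirrors the ownership change: the provider keeps its own `Arc` handle to
/// the context instead of borrowing it from the caller.
struct TableSourceProvider {
    default_catalog: String,
    default_schema: String,
    query_ctx: QueryContextRef,
}

impl TableSourceProvider {
    fn new(query_ctx: QueryContextRef) -> Self {
        Self {
            default_catalog: query_ctx.current_catalog.clone(),
            default_schema: query_ctx.current_schema.clone(),
            query_ctx,
        }
    }
}

fn main() {
    let ctx: QueryContextRef = Arc::new(QueryContext {
        current_catalog: "greptime".to_string(),
        current_schema: "public".to_string(),
    });
    // Cloning the `Arc` is cheap; caller and provider share one context.
    let provider = TableSourceProvider::new(ctx.clone());
    assert_eq!(Arc::strong_count(&ctx), 2);
    assert_eq!(provider.default_catalog, "greptime");
    assert_eq!(provider.default_schema, "public");
    assert_eq!(provider.query_ctx.current_catalog, ctx.current_catalog);
}
```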

View File

@@ -112,7 +112,7 @@ impl SchemaProvider for DummySchemaProvider {
async fn table(&self, name: &str) -> datafusion::error::Result<Option<Arc<dyn TableProvider>>> {
let table = self
.catalog_manager
.table(&self.catalog_name, &self.schema_name, name)
.table(&self.catalog_name, &self.schema_name, name, None)
.await?
.with_context(|| TableNotExistSnafu {
table: format_full_table_name(&self.catalog_name, &self.schema_name, name),

View File

@@ -70,6 +70,7 @@ serde.workspace = true
serde_json.workspace = true
servers.workspace = true
session.workspace = true
similar-asserts.workspace = true
snafu.workspace = true
store-api.workspace = true
substrait.workspace = true

View File

@@ -16,12 +16,10 @@ use std::time::Duration;
use cmd::options::GreptimeOptions;
use cmd::standalone::StandaloneOptions;
use common_base::readable_size::ReadableSize;
use common_config::Configurable;
use common_grpc::channel_manager::{
DEFAULT_MAX_GRPC_RECV_MESSAGE_SIZE, DEFAULT_MAX_GRPC_SEND_MESSAGE_SIZE,
};
use common_runtime::global::RuntimeOptions;
use common_telemetry::logging::{LoggingOptions, DEFAULT_OTLP_ENDPOINT};
use common_wal::config::raft_engine::RaftEngineConfig;
use common_wal::config::DatanodeWalConfig;
@@ -45,10 +43,6 @@ fn test_load_datanode_example_config() {
.unwrap();
let expected = GreptimeOptions::<DatanodeOptions> {
runtime: RuntimeOptions {
global_rt_size: 8,
compact_rt_size: 4,
},
component: DatanodeOptions {
node_id: Some(42),
meta_client: Some(MetaClientOptions {
@@ -65,6 +59,7 @@ fn test_load_datanode_example_config() {
wal: DatanodeWalConfig::RaftEngine(RaftEngineConfig {
dir: Some("/tmp/greptimedb/wal".to_string()),
sync_period: Some(Duration::from_secs(10)),
recovery_parallelism: 2,
..Default::default()
}),
storage: StorageConfig {
@@ -73,16 +68,8 @@ fn test_load_datanode_example_config() {
},
region_engine: vec![
RegionEngineConfig::Mito(MitoConfig {
num_workers: 8,
auto_flush_interval: Duration::from_secs(3600),
scan_parallelism: 0,
global_write_buffer_size: ReadableSize::gb(1),
global_write_buffer_reject_size: ReadableSize::gb(2),
sst_meta_cache_size: ReadableSize::mb(128),
vector_cache_size: ReadableSize::mb(512),
page_cache_size: ReadableSize::mb(512),
selector_result_cache_size: ReadableSize::mb(512),
max_background_jobs: 4,
experimental_write_cache_ttl: Some(Duration::from_secs(60 * 60 * 8)),
..Default::default()
}),
@@ -107,9 +94,10 @@ fn test_load_datanode_example_config() {
rpc_max_send_message_size: Some(DEFAULT_MAX_GRPC_SEND_MESSAGE_SIZE),
..Default::default()
},
..Default::default()
};
assert_eq!(options, expected);
similar_asserts::assert_eq!(options, expected);
}
#[test]
@@ -119,10 +107,6 @@ fn test_load_frontend_example_config() {
GreptimeOptions::<FrontendOptions>::load_layered_options(example_config.to_str(), "")
.unwrap();
let expected = GreptimeOptions::<FrontendOptions> {
runtime: RuntimeOptions {
global_rt_size: 8,
compact_rt_size: 4,
},
component: FrontendOptions {
default_timezone: Some("UTC".to_string()),
meta_client: Some(MetaClientOptions {
@@ -155,8 +139,9 @@ fn test_load_frontend_example_config() {
},
..Default::default()
},
..Default::default()
};
assert_eq!(options, expected);
similar_asserts::assert_eq!(options, expected);
}
#[test]
@@ -166,10 +151,6 @@ fn test_load_metasrv_example_config() {
GreptimeOptions::<MetasrvOptions>::load_layered_options(example_config.to_str(), "")
.unwrap();
let expected = GreptimeOptions::<MetasrvOptions> {
runtime: RuntimeOptions {
global_rt_size: 8,
compact_rt_size: 4,
},
component: MetasrvOptions {
selector: SelectorType::default(),
data_home: "/tmp/metasrv/".to_string(),
@@ -187,8 +168,9 @@ fn test_load_metasrv_example_config() {
},
..Default::default()
},
..Default::default()
};
assert_eq!(options, expected);
similar_asserts::assert_eq!(options, expected);
}
#[test]
@@ -198,30 +180,19 @@ fn test_load_standalone_example_config() {
GreptimeOptions::<StandaloneOptions>::load_layered_options(example_config.to_str(), "")
.unwrap();
let expected = GreptimeOptions::<StandaloneOptions> {
runtime: RuntimeOptions {
global_rt_size: 8,
compact_rt_size: 4,
},
component: StandaloneOptions {
default_timezone: Some("UTC".to_string()),
wal: DatanodeWalConfig::RaftEngine(RaftEngineConfig {
dir: Some("/tmp/greptimedb/wal".to_string()),
sync_period: Some(Duration::from_secs(10)),
recovery_parallelism: 2,
..Default::default()
}),
region_engine: vec![
RegionEngineConfig::Mito(MitoConfig {
num_workers: 8,
auto_flush_interval: Duration::from_secs(3600),
scan_parallelism: 0,
global_write_buffer_size: ReadableSize::gb(1),
global_write_buffer_reject_size: ReadableSize::gb(2),
sst_meta_cache_size: ReadableSize::mb(128),
vector_cache_size: ReadableSize::mb(512),
page_cache_size: ReadableSize::mb(512),
selector_result_cache_size: ReadableSize::mb(512),
max_background_jobs: 4,
experimental_write_cache_ttl: Some(Duration::from_secs(60 * 60 * 8)),
scan_parallelism: 0,
..Default::default()
}),
RegionEngineConfig::File(EngineConfig {}),
@@ -243,6 +214,7 @@ fn test_load_standalone_example_config() {
},
..Default::default()
},
..Default::default()
};
assert_eq!(options, expected);
similar_asserts::assert_eq!(options, expected);
}
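`similar_asserts::assert_eq!` is a drop-in replacement for `assert_eq!` that prints a readable diff of the two values on failure, which is far easier to scan when the compared values are large option structs or long strings. A minimal usage sketch (the struct and field names below are made up; the compared types only need `Debug` and `PartialEq`, and the `similar-asserts` crate must be listed as a dev-dependency):

```rust
// [dev-dependencies]
// similar-asserts = "1"

#[derive(Debug, PartialEq, Default)]
struct Options {
    data_home: String,
    num_workers: usize,
    enable_telemetry: bool,
}

#[test]
fn options_round_trip() {
    let loaded = Options {
        data_home: "/tmp/greptimedb".to_string(),
        num_workers: 8,
        ..Default::default()
    };
    let expected = Options {
        data_home: "/tmp/greptimedb".to_string(),
        num_workers: 8,
        ..Default::default()
    };
    // On mismatch this prints a unified diff of the two Debug representations
    // instead of dumping both values onto one long line.
    similar_asserts::assert_eq!(loaded, expected);
}
```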

View File

@@ -9,10 +9,12 @@ workspace = true
[dependencies]
anymap = "1.0.0-beta.2"
async-trait.workspace = true
bitvec = "1.0"
bytes.workspace = true
common-error.workspace = true
common-macro.workspace = true
futures.workspace = true
paste = "1.0"
serde = { version = "1.0", features = ["derive"] }
snafu.workspace = true

View File

@@ -1,242 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::any::Any;
use std::io::{Read, Write};
use bytes::{Buf, BufMut, BytesMut};
use common_error::ext::ErrorExt;
use common_macro::stack_trace_debug;
use paste::paste;
use snafu::{ensure, Location, ResultExt, Snafu};
#[derive(Snafu)]
#[snafu(visibility(pub))]
#[stack_trace_debug]
pub enum Error {
#[snafu(display(
"Destination buffer overflow, src_len: {}, dst_len: {}",
src_len,
dst_len
))]
Overflow {
src_len: usize,
dst_len: usize,
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Buffer underflow"))]
Underflow {
#[snafu(implicit)]
location: Location,
},
#[snafu(display("IO operation reach EOF"))]
Eof {
#[snafu(source)]
error: std::io::Error,
#[snafu(implicit)]
location: Location,
},
}
pub type Result<T> = std::result::Result<T, Error>;
impl ErrorExt for Error {
fn as_any(&self) -> &dyn Any {
self
}
}
macro_rules! impl_read_le {
( $($num_ty: ty), *) => {
$(
paste!{
// TODO(hl): default implementation requires allocating a
// temp buffer. maybe use more efficient impls in concrete buffers.
// see https://github.com/GrepTimeTeam/greptimedb/pull/97#discussion_r930798941
fn [<read_ $num_ty _le>](&mut self) -> Result<$num_ty> {
let mut buf = [0u8; std::mem::size_of::<$num_ty>()];
self.read_to_slice(&mut buf)?;
Ok($num_ty::from_le_bytes(buf))
}
fn [<peek_ $num_ty _le>](&mut self) -> Result<$num_ty> {
let mut buf = [0u8; std::mem::size_of::<$num_ty>()];
self.peek_to_slice(&mut buf)?;
Ok($num_ty::from_le_bytes(buf))
}
}
)*
}
}
macro_rules! impl_write_le {
( $($num_ty: ty), *) => {
$(
paste!{
fn [<write_ $num_ty _le>](&mut self, n: $num_ty) -> Result<()> {
self.write_from_slice(&n.to_le_bytes())?;
Ok(())
}
}
)*
}
}
pub trait Buffer {
/// Returns remaining data size for read.
fn remaining_size(&self) -> usize;
/// Returns true if buffer has no data for read.
fn is_empty(&self) -> bool {
self.remaining_size() == 0
}
/// Peeks data into dst. This method should not change internal cursor,
/// invoke `advance_by` if needed.
/// # Panics
/// This method **may** panic if buffer does not have enough data to be copied to dst.
fn peek_to_slice(&self, dst: &mut [u8]) -> Result<()>;
/// Reads data into dst. This method will change internal cursor.
/// # Panics
/// This method **may** panic if buffer does not have enough data to be copied to dst.
fn read_to_slice(&mut self, dst: &mut [u8]) -> Result<()> {
self.peek_to_slice(dst)?;
self.advance_by(dst.len());
Ok(())
}
/// Advances internal cursor for next read.
/// # Panics
/// This method **may** panic if the offset after advancing exceeds the length of underlying buffer.
fn advance_by(&mut self, by: usize);
impl_read_le![u8, i8, u16, i16, u32, i32, u64, i64, f32, f64];
}
macro_rules! impl_buffer_for_bytes {
( $($buf_ty:ty), *) => {
$(
impl Buffer for $buf_ty {
fn remaining_size(&self) -> usize{
self.len()
}
fn peek_to_slice(&self, dst: &mut [u8]) -> Result<()> {
let dst_len = dst.len();
ensure!(self.remaining() >= dst.len(), OverflowSnafu {
src_len: self.remaining_size(),
dst_len,
}
);
dst.copy_from_slice(&self[0..dst_len]);
Ok(())
}
#[inline]
fn advance_by(&mut self, by: usize) {
self.advance(by);
}
}
)*
};
}
impl_buffer_for_bytes![bytes::Bytes, bytes::BytesMut];
impl Buffer for &[u8] {
fn remaining_size(&self) -> usize {
self.len()
}
fn peek_to_slice(&self, dst: &mut [u8]) -> Result<()> {
let dst_len = dst.len();
ensure!(
self.len() >= dst.len(),
OverflowSnafu {
src_len: self.remaining_size(),
dst_len,
}
);
dst.copy_from_slice(&self[0..dst_len]);
Ok(())
}
fn read_to_slice(&mut self, dst: &mut [u8]) -> Result<()> {
ensure!(
self.len() >= dst.len(),
OverflowSnafu {
src_len: self.remaining_size(),
dst_len: dst.len(),
}
);
self.read_exact(dst).context(EofSnafu)
}
fn advance_by(&mut self, by: usize) {
*self = &self[by..];
}
}
/// Mutable buffer.
pub trait BufferMut {
fn as_slice(&self) -> &[u8];
fn write_from_slice(&mut self, src: &[u8]) -> Result<()>;
impl_write_le![i8, u8, i16, u16, i32, u32, i64, u64, f32, f64];
}
impl BufferMut for BytesMut {
fn as_slice(&self) -> &[u8] {
self
}
fn write_from_slice(&mut self, src: &[u8]) -> Result<()> {
self.put_slice(src);
Ok(())
}
}
impl BufferMut for &mut [u8] {
fn as_slice(&self) -> &[u8] {
self
}
fn write_from_slice(&mut self, src: &[u8]) -> Result<()> {
// see std::io::Write::write_all
// https://doc.rust-lang.org/src/std/io/impls.rs.html#363
self.write_all(src).map_err(|_| {
OverflowSnafu {
src_len: src.len(),
dst_len: self.as_slice().len(),
}
.build()
})
}
}
impl BufferMut for Vec<u8> {
fn as_slice(&self) -> &[u8] {
self
}
fn write_from_slice(&mut self, src: &[u8]) -> Result<()> {
self.extend_from_slice(src);
Ok(())
}
}

View File

@@ -44,6 +44,12 @@ impl From<Vec<u8>> for Bytes {
}
}
impl From<Bytes> for Vec<u8> {
fn from(bytes: Bytes) -> Vec<u8> {
bytes.0.into()
}
}
impl Deref for Bytes {
type Target = [u8];

View File

@@ -13,9 +13,9 @@
// limitations under the License.
pub mod bit_vec;
pub mod buffer;
pub mod bytes;
pub mod plugins;
pub mod range_read;
#[allow(clippy::all)]
pub mod readable_size;
pub mod secrets;

View File

@@ -0,0 +1,105 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::io;
use std::ops::Range;
use async_trait::async_trait;
use bytes::{BufMut, Bytes};
use futures::{AsyncReadExt, AsyncSeekExt};
/// `Metadata` contains the metadata of a source.
pub struct Metadata {
/// The length of the source in bytes.
pub content_length: u64,
}
/// `RangeReader` reads a range of bytes from a source.
#[async_trait]
pub trait RangeReader: Send + Unpin {
/// Returns the metadata of the source.
async fn metadata(&mut self) -> io::Result<Metadata>;
/// Reads the bytes in the given range.
async fn read(&mut self, range: Range<u64>) -> io::Result<Bytes>;
/// Reads the bytes in the given range into the buffer.
///
/// Handles the buffer based on its capacity:
/// - If the buffer is insufficient to hold the bytes, it will either:
/// - Allocate additional space (e.g., for `Vec<u8>`)
/// - Panic (e.g., for `&mut [u8]`)
async fn read_into(
&mut self,
range: Range<u64>,
buf: &mut (impl BufMut + Send),
) -> io::Result<()> {
let bytes = self.read(range).await?;
buf.put_slice(&bytes);
Ok(())
}
/// Reads the bytes in the given ranges.
async fn read_vec(&mut self, ranges: &[Range<u64>]) -> io::Result<Vec<Bytes>> {
let mut result = Vec::with_capacity(ranges.len());
for range in ranges {
result.push(self.read(range.clone()).await?);
}
Ok(result)
}
}
#[async_trait]
impl<R: RangeReader + Send + Unpin> RangeReader for &mut R {
async fn metadata(&mut self) -> io::Result<Metadata> {
(*self).metadata().await
}
async fn read(&mut self, range: Range<u64>) -> io::Result<Bytes> {
(*self).read(range).await
}
async fn read_into(
&mut self,
range: Range<u64>,
buf: &mut (impl BufMut + Send),
) -> io::Result<()> {
(*self).read_into(range, buf).await
}
async fn read_vec(&mut self, ranges: &[Range<u64>]) -> io::Result<Vec<Bytes>> {
(*self).read_vec(ranges).await
}
}
/// `RangeReaderAdapter` bridges `RangeReader` and `AsyncRead + AsyncSeek`.
pub struct RangeReaderAdapter<R>(pub R);
/// Implements `RangeReader` for a type that implements `AsyncRead + AsyncSeek`.
///
/// TODO(zhongzc): This is a temporary bridge for porting the codebase from `AsyncRead + AsyncSeek` to `RangeReader`.
/// Remove this implementation once the codebase is fully ported to `RangeReader`.
#[async_trait]
impl<R: futures::AsyncRead + futures::AsyncSeek + Send + Unpin> RangeReader
for RangeReaderAdapter<R>
{
async fn metadata(&mut self) -> io::Result<Metadata> {
let content_length = self.0.seek(io::SeekFrom::End(0)).await?;
Ok(Metadata { content_length })
}
async fn read(&mut self, range: Range<u64>) -> io::Result<Bytes> {
let mut buf = vec![0; (range.end - range.start) as usize];
self.0.seek(io::SeekFrom::Start(range.start)).await?;
self.0.read_exact(&mut buf).await?;
Ok(Bytes::from(buf))
}
}
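A hedged usage sketch (an editor's example, not part of the diff) for the `RangeReader` trait and `RangeReaderAdapter` above. It assumes the module is exposed as `common_base::range_read` and that the `futures` executor is available; `futures::io::Cursor` implements `AsyncRead + AsyncSeek`, so it can stand in for a real source.

use common_base::range_read::{RangeReader, RangeReaderAdapter};
use futures::io::Cursor;

fn main() -> std::io::Result<()> {
    futures::executor::block_on(async {
        // Wrap an in-memory cursor so it can be driven through the RangeReader API.
        let mut reader = RangeReaderAdapter(Cursor::new(b"hello, range reader".to_vec()));

        // content_length is obtained by seeking to the end of the source.
        let meta = reader.metadata().await?;
        assert_eq!(meta.content_length, 19);

        // Read bytes 7..12, i.e. "range".
        let bytes = reader.read(7..12).await?;
        assert_eq!(&bytes[..], b"range");

        // read_vec fetches several ranges sequentially via the default implementation.
        let parts = reader.read_vec(&[0..5, 14..19]).await?;
        assert_eq!(&parts[0][..], b"hello");
        assert_eq!(&parts[1][..], b"eader");
        Ok(())
    })
}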

View File

@@ -1,182 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#![feature(assert_matches)]
#[cfg(test)]
mod tests {
use std::assert_matches::assert_matches;
use bytes::{Buf, Bytes, BytesMut};
use common_base::buffer::Error::Overflow;
use common_base::buffer::{Buffer, BufferMut};
use paste::paste;
#[test]
pub fn test_buffer_read_write() {
let mut buf = BytesMut::with_capacity(16);
buf.write_u64_le(1234u64).unwrap();
let result = buf.peek_u64_le().unwrap();
assert_eq!(1234u64, result);
buf.advance_by(8);
buf.write_from_slice("hello, world".as_bytes()).unwrap();
let mut content = vec![0u8; 5];
buf.peek_to_slice(&mut content).unwrap();
let read = String::from_utf8_lossy(&content);
assert_eq!("hello", read);
buf.advance_by(5);
// after read, buffer should still have 7 bytes to read.
assert_eq!(7, buf.remaining());
let mut content = vec![0u8; 6];
buf.read_to_slice(&mut content).unwrap();
let read = String::from_utf8_lossy(&content);
assert_eq!(", worl", read);
// after read, buffer should still have 1 byte to read.
assert_eq!(1, buf.remaining());
}
#[test]
pub fn test_buffer_read() {
let mut bytes = Bytes::from_static("hello".as_bytes());
assert_eq!(5, bytes.remaining_size());
assert_eq!(b'h', bytes.peek_u8_le().unwrap());
bytes.advance_by(1);
assert_eq!(4, bytes.remaining_size());
}
macro_rules! test_primitive_read_write {
( $($num_ty: ty), *) => {
$(
paste!{
#[test]
fn [<test_read_write_ $num_ty>]() {
assert_eq!($num_ty::MAX,(&mut $num_ty::MAX.to_le_bytes() as &[u8]).[<read_ $num_ty _le>]().unwrap());
assert_eq!($num_ty::MIN,(&mut $num_ty::MIN.to_le_bytes() as &[u8]).[<read_ $num_ty _le>]().unwrap());
}
}
)*
}
}
test_primitive_read_write![u8, u16, u32, u64, i8, i16, i32, i64, f32, f64];
#[test]
pub fn test_read_write_from_slice_buffer() {
let mut buf = "hello".as_bytes();
assert_eq!(104, buf.peek_u8_le().unwrap());
buf.advance_by(1);
assert_eq!(101, buf.peek_u8_le().unwrap());
buf.advance_by(1);
assert_eq!(108, buf.peek_u8_le().unwrap());
buf.advance_by(1);
assert_eq!(108, buf.peek_u8_le().unwrap());
buf.advance_by(1);
assert_eq!(111, buf.peek_u8_le().unwrap());
buf.advance_by(1);
assert_matches!(buf.peek_u8_le(), Err(Overflow { .. }));
}
#[test]
pub fn test_read_u8_from_slice_buffer() {
let mut buf = "hello".as_bytes();
assert_eq!(104, buf.read_u8_le().unwrap());
assert_eq!(101, buf.read_u8_le().unwrap());
assert_eq!(108, buf.read_u8_le().unwrap());
assert_eq!(108, buf.read_u8_le().unwrap());
assert_eq!(111, buf.read_u8_le().unwrap());
assert_matches!(buf.read_u8_le(), Err(Overflow { .. }));
}
#[test]
pub fn test_read_write_numbers() {
let mut buf: Vec<u8> = vec![];
buf.write_u64_le(1234).unwrap();
assert_eq!(1234, (&buf[..]).read_u64_le().unwrap());
buf.write_u32_le(4242).unwrap();
let mut p = &buf[..];
assert_eq!(1234, p.read_u64_le().unwrap());
assert_eq!(4242, p.read_u32_le().unwrap());
}
macro_rules! test_primitive_vec_read_write {
( $($num_ty: ty), *) => {
$(
paste!{
#[test]
fn [<test_read_write_ $num_ty _from_vec_buffer>]() {
let mut buf = vec![];
let _ = buf.[<write_ $num_ty _le>]($num_ty::MAX).unwrap();
assert_eq!($num_ty::MAX, buf.as_slice().[<read_ $num_ty _le>]().unwrap());
}
}
)*
}
}
test_primitive_vec_read_write![u8, u16, u32, u64, i8, i16, i32, i64, f32, f64];
#[test]
pub fn test_peek_write_from_vec_buffer() {
let mut buf: Vec<u8> = vec![];
buf.write_from_slice("hello".as_bytes()).unwrap();
let mut slice = buf.as_slice();
assert_eq!(104, slice.peek_u8_le().unwrap());
slice.advance_by(1);
assert_eq!(101, slice.peek_u8_le().unwrap());
slice.advance_by(1);
assert_eq!(108, slice.peek_u8_le().unwrap());
slice.advance_by(1);
assert_eq!(108, slice.peek_u8_le().unwrap());
slice.advance_by(1);
assert_eq!(111, slice.peek_u8_le().unwrap());
slice.advance_by(1);
assert_matches!(slice.read_u8_le(), Err(Overflow { .. }));
}
macro_rules! test_primitive_bytes_read_write {
( $($num_ty: ty), *) => {
$(
paste!{
#[test]
fn [<test_read_write_ $num_ty _from_bytes>]() {
let mut bytes = bytes::Bytes::from($num_ty::MAX.to_le_bytes().to_vec());
assert_eq!($num_ty::MAX, bytes.[<read_ $num_ty _le>]().unwrap());
let mut bytes = bytes::Bytes::from($num_ty::MIN.to_le_bytes().to_vec());
assert_eq!($num_ty::MIN, bytes.[<read_ $num_ty _le>]().unwrap());
}
}
)*
}
}
test_primitive_bytes_read_write![u8, u16, u32, u64, i8, i16, i32, i64, f32, f64];
#[test]
pub fn test_write_overflow() {
let mut buf = [0u8; 4];
assert_matches!(
(&mut buf[..]).write_from_slice("hell".as_bytes()),
Ok { .. }
);
assert_matches!(
(&mut buf[..]).write_from_slice("hello".as_bytes()),
Err(common_base::buffer::Error::Overflow { .. })
);
}
}

View File

@@ -29,6 +29,7 @@ datafusion.workspace = true
datatypes.workspace = true
geohash = { version = "0.13", optional = true }
h3o = { version = "0.6", optional = true }
jsonb.workspace = true
num = "0.4"
num-traits = "0.2"
once_cell.workspace = true

View File

@@ -22,6 +22,7 @@ use crate::function::{AsyncFunctionRef, FunctionRef};
use crate::scalars::aggregate::{AggregateFunctionMetaRef, AggregateFunctions};
use crate::scalars::date::DateFunction;
use crate::scalars::expression::ExpressionFunction;
use crate::scalars::json::JsonFunction;
use crate::scalars::matches::MatchesFunction;
use crate::scalars::math::MathFunction;
use crate::scalars::numpy::NumpyFunction;
@@ -116,6 +117,9 @@ pub static FUNCTION_REGISTRY: Lazy<Arc<FunctionRegistry>> = Lazy::new(|| {
SystemFunction::register(&function_registry);
TableFunction::register(&function_registry);
// Json related functions
JsonFunction::register(&function_registry);
// Geo functions
#[cfg(feature = "geo")]
crate::scalars::geo::GeoFunctions::register(&function_registry);

View File

@@ -17,9 +17,11 @@ pub(crate) mod date;
pub mod expression;
#[cfg(feature = "geo")]
pub mod geo;
pub mod json;
pub mod matches;
pub mod math;
pub mod numpy;
#[cfg(test)]
pub(crate) mod test;
pub(crate) mod timestamp;

View File

@@ -0,0 +1,38 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::sync::Arc;
mod json_get;
mod json_to_string;
mod to_json;
use json_get::{JsonGetBool, JsonGetFloat, JsonGetInt, JsonGetString};
use json_to_string::JsonToStringFunction;
use to_json::ToJsonFunction;
use crate::function_registry::FunctionRegistry;
pub(crate) struct JsonFunction;
impl JsonFunction {
pub fn register(registry: &FunctionRegistry) {
registry.register(Arc::new(JsonToStringFunction));
registry.register(Arc::new(ToJsonFunction));
registry.register(Arc::new(JsonGetInt));
registry.register(Arc::new(JsonGetFloat));
registry.register(Arc::new(JsonGetString));
registry.register(Arc::new(JsonGetBool));
}
}

View File

@@ -0,0 +1,454 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::fmt::{self, Display};
use common_query::error::{InvalidFuncArgsSnafu, Result, UnsupportedInputDataTypeSnafu};
use common_query::prelude::Signature;
use datafusion::logical_expr::Volatility;
use datatypes::data_type::ConcreteDataType;
use datatypes::prelude::VectorRef;
use datatypes::scalars::ScalarVectorBuilder;
use datatypes::vectors::{
BooleanVectorBuilder, Float64VectorBuilder, Int64VectorBuilder, MutableVector,
StringVectorBuilder,
};
use snafu::ensure;
use crate::function::{Function, FunctionContext};
fn get_json_by_path(json: &[u8], path: &str) -> Option<Vec<u8>> {
let json_path = jsonb::jsonpath::parse_json_path(path.as_bytes());
match json_path {
Ok(json_path) => {
let mut sub_jsonb = Vec::new();
let mut sub_offsets = Vec::new();
match jsonb::get_by_path(json, json_path, &mut sub_jsonb, &mut sub_offsets) {
Ok(_) => Some(sub_jsonb),
Err(_) => None,
}
}
_ => None,
}
}
/// Get the value from the JSONB by the given path and return it as the specified type.
/// If the path does not exist or the value is not of the specified type, return `NULL`.
macro_rules! json_get {
// e.g. name = JsonGetInt, type = Int64, rust_type = i64, doc = "Get the value from the JSONB by the given path and return it as an integer."
($name: ident, $type: ident, $rust_type: ident, $doc:expr) => {
paste::paste! {
#[doc = $doc]
#[derive(Clone, Debug, Default)]
pub struct $name;
impl Function for $name {
fn name(&self) -> &str {
stringify!([<$name:snake>])
}
fn return_type(&self, _input_types: &[ConcreteDataType]) -> Result<ConcreteDataType> {
Ok(ConcreteDataType::[<$type:snake _datatype>]())
}
fn signature(&self) -> Signature {
Signature::exact(
vec![
ConcreteDataType::json_datatype(),
ConcreteDataType::string_datatype(),
],
Volatility::Immutable,
)
}
fn eval(&self, _func_ctx: FunctionContext, columns: &[VectorRef]) -> Result<VectorRef> {
ensure!(
columns.len() == 2,
InvalidFuncArgsSnafu {
err_msg: format!(
"The length of the args is not correct, expect exactly two, have: {}",
columns.len()
),
}
);
let jsons = &columns[0];
let paths = &columns[1];
let size = jsons.len();
let datatype = jsons.data_type();
let mut results = [<$type VectorBuilder>]::with_capacity(size);
match datatype {
// JSON data type uses binary vector
ConcreteDataType::Binary(_) => {
for i in 0..size {
let json = jsons.get_ref(i);
let path = paths.get_ref(i);
let json = json.as_binary();
let path = path.as_string();
let result = match (json, path) {
(Ok(Some(json)), Ok(Some(path))) => {
get_json_by_path(json, path)
.and_then(|json| { jsonb::[<to_ $rust_type>](&json).ok() })
}
_ => None,
};
results.push(result);
}
}
_ => {
return UnsupportedInputDataTypeSnafu {
function: stringify!([<$name:snake>]),
datatypes: columns.iter().map(|c| c.data_type()).collect::<Vec<_>>(),
}
.fail();
}
}
Ok(results.to_vector())
}
}
impl Display for $name {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "{}", stringify!([<$name:snake>]).to_ascii_uppercase())
}
}
}
};
}
json_get!(
JsonGetInt,
Int64,
i64,
"Get the value from the JSONB by the given path and return it as an integer."
);
json_get!(
JsonGetFloat,
Float64,
f64,
"Get the value from the JSONB by the given path and return it as a float."
);
json_get!(
JsonGetBool,
Boolean,
bool,
"Get the value from the JSONB by the given path and return it as a boolean."
);
/// Get the value from the JSONB by the given path and return it as a string.
#[derive(Clone, Debug, Default)]
pub struct JsonGetString;
impl Function for JsonGetString {
fn name(&self) -> &str {
"json_get_string"
}
fn return_type(&self, _input_types: &[ConcreteDataType]) -> Result<ConcreteDataType> {
Ok(ConcreteDataType::string_datatype())
}
fn signature(&self) -> Signature {
Signature::exact(
vec![
ConcreteDataType::json_datatype(),
ConcreteDataType::string_datatype(),
],
Volatility::Immutable,
)
}
fn eval(&self, _func_ctx: FunctionContext, columns: &[VectorRef]) -> Result<VectorRef> {
ensure!(
columns.len() == 2,
InvalidFuncArgsSnafu {
err_msg: format!(
"The length of the args is not correct, expect exactly two, have: {}",
columns.len()
),
}
);
let jsons = &columns[0];
let paths = &columns[1];
let size = jsons.len();
let datatype = jsons.data_type();
let mut results = StringVectorBuilder::with_capacity(size);
match datatype {
// JSON data type uses binary vector
ConcreteDataType::Binary(_) => {
for i in 0..size {
let json = jsons.get_ref(i);
let path = paths.get_ref(i);
let json = json.as_binary();
let path = path.as_string();
let result = match (json, path) {
(Ok(Some(json)), Ok(Some(path))) => {
get_json_by_path(json, path).and_then(|json| jsonb::to_str(&json).ok())
}
_ => None,
};
results.push(result.as_deref());
}
}
_ => {
return UnsupportedInputDataTypeSnafu {
function: "json_get_string",
datatypes: columns.iter().map(|c| c.data_type()).collect::<Vec<_>>(),
}
.fail();
}
}
Ok(results.to_vector())
}
}
impl Display for JsonGetString {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "{}", "json_get_string".to_ascii_uppercase())
}
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use common_query::prelude::TypeSignature;
use datatypes::scalars::ScalarVector;
use datatypes::vectors::{BinaryVector, StringVector};
use super::*;
#[test]
fn test_json_get_int() {
let json_get_int = JsonGetInt;
assert_eq!("json_get_int", json_get_int.name());
assert_eq!(
ConcreteDataType::int64_datatype(),
json_get_int
.return_type(&[
ConcreteDataType::json_datatype(),
ConcreteDataType::string_datatype()
])
.unwrap()
);
assert!(matches!(json_get_int.signature(),
Signature {
type_signature: TypeSignature::Exact(valid_types),
volatility: Volatility::Immutable
} if valid_types == vec![ConcreteDataType::json_datatype(), ConcreteDataType::string_datatype()]
));
let json_strings = [
r#"{"a": {"b": 2}, "b": 2, "c": 3}"#,
r#"{"a": 4, "b": {"c": 6}, "c": 6}"#,
r#"{"a": 7, "b": 8, "c": {"a": 7}}"#,
];
let paths = vec!["$.a.b", "$.a", "$.c"];
let results = [Some(2), Some(4), None];
let jsonbs = json_strings
.iter()
.map(|s| {
let value = jsonb::parse_value(s.as_bytes()).unwrap();
value.to_vec()
})
.collect::<Vec<_>>();
let json_vector = BinaryVector::from_vec(jsonbs);
let path_vector = StringVector::from_vec(paths);
let args: Vec<VectorRef> = vec![Arc::new(json_vector), Arc::new(path_vector)];
let vector = json_get_int
.eval(FunctionContext::default(), &args)
.unwrap();
assert_eq!(3, vector.len());
for (i, gt) in results.iter().enumerate() {
let result = vector.get_ref(i);
let result = result.as_i64().unwrap();
assert_eq!(*gt, result);
}
}
#[test]
fn test_json_get_float() {
let json_get_float = JsonGetFloat;
assert_eq!("json_get_float", json_get_float.name());
assert_eq!(
ConcreteDataType::float64_datatype(),
json_get_float
.return_type(&[
ConcreteDataType::json_datatype(),
ConcreteDataType::string_datatype()
])
.unwrap()
);
assert!(matches!(json_get_float.signature(),
Signature {
type_signature: TypeSignature::Exact(valid_types),
volatility: Volatility::Immutable
} if valid_types == vec![ConcreteDataType::json_datatype(), ConcreteDataType::string_datatype()]
));
let json_strings = [
r#"{"a": {"b": 2.1}, "b": 2.2, "c": 3.3}"#,
r#"{"a": 4.4, "b": {"c": 6.6}, "c": 6.6}"#,
r#"{"a": 7.7, "b": 8.8, "c": {"a": 7.7}}"#,
];
let paths = vec!["$.a.b", "$.a", "$.c"];
let results = [Some(2.1), Some(4.4), None];
let jsonbs = json_strings
.iter()
.map(|s| {
let value = jsonb::parse_value(s.as_bytes()).unwrap();
value.to_vec()
})
.collect::<Vec<_>>();
let json_vector = BinaryVector::from_vec(jsonbs);
let path_vector = StringVector::from_vec(paths);
let args: Vec<VectorRef> = vec![Arc::new(json_vector), Arc::new(path_vector)];
let vector = json_get_float
.eval(FunctionContext::default(), &args)
.unwrap();
assert_eq!(3, vector.len());
for (i, gt) in results.iter().enumerate() {
let result = vector.get_ref(i);
let result = result.as_f64().unwrap();
assert_eq!(*gt, result);
}
}
#[test]
fn test_json_get_bool() {
let json_get_bool = JsonGetBool;
assert_eq!("json_get_bool", json_get_bool.name());
assert_eq!(
ConcreteDataType::boolean_datatype(),
json_get_bool
.return_type(&[
ConcreteDataType::json_datatype(),
ConcreteDataType::string_datatype()
])
.unwrap()
);
assert!(matches!(json_get_bool.signature(),
Signature {
type_signature: TypeSignature::Exact(valid_types),
volatility: Volatility::Immutable
} if valid_types == vec![ConcreteDataType::json_datatype(), ConcreteDataType::string_datatype()]
));
let json_strings = [
r#"{"a": {"b": true}, "b": false, "c": true}"#,
r#"{"a": false, "b": {"c": true}, "c": false}"#,
r#"{"a": true, "b": false, "c": {"a": true}}"#,
];
let paths = vec!["$.a.b", "$.a", "$.c"];
let results = [Some(true), Some(false), None];
let jsonbs = json_strings
.iter()
.map(|s| {
let value = jsonb::parse_value(s.as_bytes()).unwrap();
value.to_vec()
})
.collect::<Vec<_>>();
let json_vector = BinaryVector::from_vec(jsonbs);
let path_vector = StringVector::from_vec(paths);
let args: Vec<VectorRef> = vec![Arc::new(json_vector), Arc::new(path_vector)];
let vector = json_get_bool
.eval(FunctionContext::default(), &args)
.unwrap();
assert_eq!(3, vector.len());
for (i, gt) in results.iter().enumerate() {
let result = vector.get_ref(i);
let result = result.as_boolean().unwrap();
assert_eq!(*gt, result);
}
}
#[test]
fn test_json_get_string() {
let json_get_string = JsonGetString;
assert_eq!("json_get_string", json_get_string.name());
assert_eq!(
ConcreteDataType::string_datatype(),
json_get_string
.return_type(&[
ConcreteDataType::json_datatype(),
ConcreteDataType::string_datatype()
])
.unwrap()
);
assert!(matches!(json_get_string.signature(),
Signature {
type_signature: TypeSignature::Exact(valid_types),
volatility: Volatility::Immutable
} if valid_types == vec![ConcreteDataType::json_datatype(), ConcreteDataType::string_datatype()]
));
let json_strings = [
r#"{"a": {"b": "a"}, "b": "b", "c": "c"}"#,
r#"{"a": "d", "b": {"c": "e"}, "c": "f"}"#,
r#"{"a": "g", "b": "h", "c": {"a": "g"}}"#,
];
let paths = vec!["$.a.b", "$.a", ""];
let results = [Some("a"), Some("d"), None];
let jsonbs = json_strings
.iter()
.map(|s| {
let value = jsonb::parse_value(s.as_bytes()).unwrap();
value.to_vec()
})
.collect::<Vec<_>>();
let json_vector = BinaryVector::from_vec(jsonbs);
let path_vector = StringVector::from_vec(paths);
let args: Vec<VectorRef> = vec![Arc::new(json_vector), Arc::new(path_vector)];
let vector = json_get_string
.eval(FunctionContext::default(), &args)
.unwrap();
assert_eq!(3, vector.len());
for (i, gt) in results.iter().enumerate() {
let result = vector.get_ref(i);
let result = result.as_string().unwrap();
assert_eq!(*gt, result);
}
}
}

View File

@@ -0,0 +1,174 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::fmt::{self, Display};
use common_query::error::{InvalidFuncArgsSnafu, Result, UnsupportedInputDataTypeSnafu};
use common_query::prelude::Signature;
use datafusion::logical_expr::Volatility;
use datatypes::data_type::ConcreteDataType;
use datatypes::prelude::VectorRef;
use datatypes::scalars::ScalarVectorBuilder;
use datatypes::vectors::{MutableVector, StringVectorBuilder};
use snafu::ensure;
use crate::function::{Function, FunctionContext};
/// Converts the `JSONB` into `String`. It's useful for displaying JSONB content.
#[derive(Clone, Debug, Default)]
pub struct JsonToStringFunction;
const NAME: &str = "json_to_string";
impl Function for JsonToStringFunction {
fn name(&self) -> &str {
NAME
}
fn return_type(&self, _input_types: &[ConcreteDataType]) -> Result<ConcreteDataType> {
Ok(ConcreteDataType::string_datatype())
}
fn signature(&self) -> Signature {
Signature::exact(
vec![ConcreteDataType::json_datatype()],
Volatility::Immutable,
)
}
fn eval(&self, _func_ctx: FunctionContext, columns: &[VectorRef]) -> Result<VectorRef> {
ensure!(
columns.len() == 1,
InvalidFuncArgsSnafu {
err_msg: format!(
"The length of the args is not correct, expect exactly one, have: {}",
columns.len()
),
}
);
let jsons = &columns[0];
let size = jsons.len();
let datatype = jsons.data_type();
let mut results = StringVectorBuilder::with_capacity(size);
match datatype {
// JSON data type uses binary vector
ConcreteDataType::Binary(_) => {
for i in 0..size {
let json = jsons.get_ref(i);
let json = json.as_binary();
let result = match json {
Ok(Some(json)) => match jsonb::from_slice(json) {
Ok(json) => {
let json = json.to_string();
Some(json)
}
Err(_) => {
return InvalidFuncArgsSnafu {
err_msg: format!("Illegal json binary: {:?}", json),
}
.fail()
}
},
_ => None,
};
results.push(result.as_deref());
}
}
_ => {
return UnsupportedInputDataTypeSnafu {
function: NAME,
datatypes: columns.iter().map(|c| c.data_type()).collect::<Vec<_>>(),
}
.fail();
}
}
Ok(results.to_vector())
}
}
impl Display for JsonToStringFunction {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "JSON_TO_STRING")
}
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use common_query::prelude::TypeSignature;
use datatypes::scalars::ScalarVector;
use datatypes::vectors::BinaryVector;
use super::*;
#[test]
fn test_json_to_string_function() {
let json_to_string = JsonToStringFunction;
assert_eq!("json_to_string", json_to_string.name());
assert_eq!(
ConcreteDataType::string_datatype(),
json_to_string
.return_type(&[ConcreteDataType::json_datatype()])
.unwrap()
);
assert!(matches!(json_to_string.signature(),
Signature {
type_signature: TypeSignature::Exact(valid_types),
volatility: Volatility::Immutable
} if valid_types == vec![ConcreteDataType::json_datatype()]
));
let json_strings = [
r#"{"a": {"b": 2}, "b": 2, "c": 3}"#,
r#"{"a": 4, "b": {"c": 6}, "c": 6}"#,
r#"{"a": 7, "b": 8, "c": {"a": 7}}"#,
];
let jsonbs = json_strings
.iter()
.map(|s| {
let value = jsonb::parse_value(s.as_bytes()).unwrap();
value.to_vec()
})
.collect::<Vec<_>>();
let json_vector = BinaryVector::from_vec(jsonbs);
let args: Vec<VectorRef> = vec![Arc::new(json_vector)];
let vector = json_to_string
.eval(FunctionContext::default(), &args)
.unwrap();
assert_eq!(3, vector.len());
for (i, gt) in json_strings.iter().enumerate() {
let result = vector.get_ref(i);
let result = result.as_string().unwrap().unwrap();
// remove whitespaces
assert_eq!(gt.replace(" ", ""), result);
}
let invalid_jsonb = vec![b"invalid json"];
let invalid_json_vector = BinaryVector::from_vec(invalid_jsonb);
let args: Vec<VectorRef> = vec![Arc::new(invalid_json_vector)];
let vector = json_to_string.eval(FunctionContext::default(), &args);
assert!(vector.is_err());
}
}

View File

@@ -0,0 +1,165 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::fmt::{self, Display};
use common_query::error::{InvalidFuncArgsSnafu, Result, UnsupportedInputDataTypeSnafu};
use common_query::prelude::Signature;
use datafusion::logical_expr::Volatility;
use datatypes::data_type::ConcreteDataType;
use datatypes::prelude::VectorRef;
use datatypes::scalars::ScalarVectorBuilder;
use datatypes::vectors::{BinaryVectorBuilder, MutableVector};
use snafu::ensure;
use crate::function::{Function, FunctionContext};
/// Parses the `String` into `JSONB`.
#[derive(Clone, Debug, Default)]
pub struct ToJsonFunction;
const NAME: &str = "to_json";
impl Function for ToJsonFunction {
fn name(&self) -> &str {
NAME
}
fn return_type(&self, _input_types: &[ConcreteDataType]) -> Result<ConcreteDataType> {
Ok(ConcreteDataType::json_datatype())
}
fn signature(&self) -> Signature {
Signature::exact(
vec![ConcreteDataType::string_datatype()],
Volatility::Immutable,
)
}
fn eval(&self, _func_ctx: FunctionContext, columns: &[VectorRef]) -> Result<VectorRef> {
ensure!(
columns.len() == 1,
InvalidFuncArgsSnafu {
err_msg: format!(
"The length of the args is not correct, expect exactly one, have: {}",
columns.len()
),
}
);
let json_strings = &columns[0];
let size = json_strings.len();
let datatype = json_strings.data_type();
let mut results = BinaryVectorBuilder::with_capacity(size);
match datatype {
ConcreteDataType::String(_) => {
for i in 0..size {
let json_string = json_strings.get_ref(i);
let json_string = json_string.as_string();
let result = match json_string {
Ok(Some(json_string)) => match jsonb::parse_value(json_string.as_bytes()) {
Ok(json) => Some(json.to_vec()),
Err(_) => {
return InvalidFuncArgsSnafu {
err_msg: format!(
"Cannot convert the string to json, have: {}",
json_string
),
}
.fail()
}
},
_ => None,
};
results.push(result.as_deref());
}
}
_ => {
return UnsupportedInputDataTypeSnafu {
function: NAME,
datatypes: columns.iter().map(|c| c.data_type()).collect::<Vec<_>>(),
}
.fail();
}
}
Ok(results.to_vector())
}
}
impl Display for ToJsonFunction {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "TO_JSON")
}
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use common_query::prelude::TypeSignature;
use datatypes::scalars::ScalarVector;
use datatypes::vectors::StringVector;
use super::*;
#[test]
fn test_to_json_function() {
let to_json = ToJsonFunction;
assert_eq!("to_json", to_json.name());
assert_eq!(
ConcreteDataType::json_datatype(),
to_json
.return_type(&[ConcreteDataType::json_datatype()])
.unwrap()
);
assert!(matches!(to_json.signature(),
Signature {
type_signature: TypeSignature::Exact(valid_types),
volatility: Volatility::Immutable
} if valid_types == vec![ConcreteDataType::string_datatype()]
));
let json_strings = [
r#"{"a": {"b": 2}, "b": 2, "c": 3}"#,
r#"{"a": 4, "b": {"c": 6}, "c": 6}"#,
r#"{"a": 7, "b": 8, "c": {"a": 7}}"#,
];
let jsonbs = json_strings
.iter()
.map(|s| {
let value = jsonb::parse_value(s.as_bytes()).unwrap();
value.to_vec()
})
.collect::<Vec<_>>();
let json_string_vector = StringVector::from_vec(json_strings.to_vec());
let args: Vec<VectorRef> = vec![Arc::new(json_string_vector)];
let vector = to_json.eval(FunctionContext::default(), &args).unwrap();
assert_eq!(3, vector.len());
for (i, gt) in jsonbs.iter().enumerate() {
let result = vector.get_ref(i);
let result = result.as_binary().unwrap().unwrap();
// compare the encoded JSONB bytes
assert_eq!(gt, result);
}
}
}

View File

@@ -19,6 +19,7 @@ use common_query::error::Result;
use common_query::prelude::{Signature, Volatility};
use datatypes::data_type::ConcreteDataType;
use datatypes::vectors::{StringVector, VectorRef};
use session::context::Channel;
use crate::function::{Function, FunctionContext};
@@ -44,11 +45,22 @@ impl Function for VersionFunction {
Signature::exact(vec![], Volatility::Immutable)
}
fn eval(&self, _func_ctx: FunctionContext, _columns: &[VectorRef]) -> Result<VectorRef> {
let result = StringVector::from(vec![format!(
"5.7.20-greptimedb-{}",
env!("CARGO_PKG_VERSION")
)]);
fn eval(&self, func_ctx: FunctionContext, _columns: &[VectorRef]) -> Result<VectorRef> {
let version = match func_ctx.query_ctx.channel() {
Channel::Mysql => {
format!(
"{}-greptimedb-{}",
std::env::var("GREPTIMEDB_MYSQL_SERVER_VERSION")
.unwrap_or_else(|_| "8.4.2".to_string()),
env!("CARGO_PKG_VERSION")
)
}
Channel::Postgres => {
format!("16.3-greptimedb-{}", env!("CARGO_PKG_VERSION"))
}
_ => env!("CARGO_PKG_VERSION").to_string(),
};
let result = StringVector::from(vec![version]);
Ok(Arc::new(result))
}
}

View File

@@ -14,11 +14,10 @@
use api::helper;
use api::v1::column::Values;
use api::v1::{AddColumns, Column, CreateTableExpr};
use api::v1::{Column, CreateTableExpr};
use common_base::BitVec;
use datatypes::data_type::{ConcreteDataType, DataType};
use datatypes::prelude::VectorRef;
use datatypes::schema::SchemaRef;
use snafu::{ensure, ResultExt};
use table::metadata::TableId;
use table::table_reference::TableReference;
@@ -27,11 +26,6 @@ use crate::error::{CreateVectorSnafu, Result, UnexpectedValuesLengthSnafu};
use crate::util;
use crate::util::ColumnExpr;
pub fn find_new_columns(schema: &SchemaRef, columns: &[Column]) -> Result<Option<AddColumns>> {
let column_exprs = ColumnExpr::from_columns(columns);
util::extract_new_columns(schema, column_exprs)
}
/// Try to build create table request from insert data.
pub fn build_create_expr_from_insertion(
catalog_name: &str,
@@ -114,7 +108,6 @@ mod tests {
use super::*;
use crate::error;
use crate::error::ColumnDataTypeSnafu;
use crate::insert::find_new_columns;
#[inline]
fn build_column_schema(
@@ -281,11 +274,18 @@ mod tests {
let schema = Arc::new(SchemaBuilder::try_from(columns).unwrap().build().unwrap());
assert!(find_new_columns(&schema, &[]).unwrap().is_none());
assert!(
util::extract_new_columns(&schema, ColumnExpr::from_columns(&[]))
.unwrap()
.is_none()
);
let insert_batch = mock_insert_batch();
let add_columns = find_new_columns(&schema, &insert_batch.0).unwrap().unwrap();
let add_columns =
util::extract_new_columns(&schema, ColumnExpr::from_columns(&insert_batch.0))
.unwrap()
.unwrap();
assert_eq!(5, add_columns.add_columns.len());
let host_column = &add_columns.add_columns[0];

View File

@@ -19,4 +19,4 @@ pub mod insert;
pub mod util;
pub use alter::{alter_expr_to_request, create_table_schema};
pub use insert::{build_create_expr_from_insertion, find_new_columns};
pub use insert::build_create_expr_from_insertion;

View File

@@ -70,7 +70,7 @@ macro_rules! convert_arrow_array_to_grpc_vals {
return Ok(vals);
},
)+
ConcreteDataType::Null(_) | ConcreteDataType::List(_) | ConcreteDataType::Dictionary(_) | ConcreteDataType::Duration(_) => unreachable!("Should not send {:?} in gRPC", $data_type),
ConcreteDataType::Null(_) | ConcreteDataType::List(_) | ConcreteDataType::Dictionary(_) | ConcreteDataType::Duration(_) | ConcreteDataType::Json(_) => unreachable!("Should not send {:?} in gRPC", $data_type),
}
}};
}

View File

@@ -39,7 +39,7 @@ use crate::key::DeserializedValueWithBytes;
use crate::lock_key::{CatalogLock, SchemaLock, TableLock};
use crate::rpc::ddl::AlterTableTask;
use crate::rpc::router::find_leaders;
use crate::{cache_invalidator, metrics, ClusterId};
use crate::{metrics, ClusterId};
pub struct AlterLogicalTablesProcedure {
pub context: DdlContext,
@@ -131,7 +131,7 @@ impl AlterLogicalTablesProcedure {
let phy_raw_schemas = future::join_all(alter_region_tasks)
.await
.into_iter()
.map(|res| res.map(|mut res| res.extension.remove(ALTER_PHYSICAL_EXTENSION_KEY)))
.map(|res| res.map(|mut res| res.extensions.remove(ALTER_PHYSICAL_EXTENSION_KEY)))
.collect::<Result<Vec<_>>>()?;
if phy_raw_schemas.is_empty() {
@@ -170,12 +170,11 @@ impl AlterLogicalTablesProcedure {
}
pub(crate) async fn on_invalidate_table_cache(&mut self) -> Result<Status> {
let ctx = cache_invalidator::Context::default();
let to_invalidate = self.build_table_cache_keys_to_invalidate();
self.context
.cache_invalidator
.invalidate(&ctx, &to_invalidate)
.invalidate(&Default::default(), &to_invalidate)
.await?;
Ok(Status::done())
}

View File

@@ -157,7 +157,7 @@ impl CreateLogicalTablesProcedure {
let phy_raw_schemas = join_all(create_region_tasks)
.await
.into_iter()
.map(|res| res.map(|mut res| res.extension.remove(ALTER_PHYSICAL_EXTENSION_KEY)))
.map(|res| res.map(|mut res| res.extensions.remove(ALTER_PHYSICAL_EXTENSION_KEY)))
.collect::<Result<Vec<_>>>()?;
if phy_raw_schemas.is_empty() {

View File

@@ -441,11 +441,9 @@ async fn handle_alter_table_task(
.table_metadata_manager()
.table_route_manager()
.table_route_storage()
.get_raw(table_id)
.get(table_id)
.await?
.context(TableRouteNotFoundSnafu { table_id })?
.into_inner();
.context(TableRouteNotFoundSnafu { table_id })?;
ensure!(
table_route_value.is_physical(),
UnexpectedLogicalRouteTableSnafu {

View File

@@ -90,6 +90,7 @@
pub mod catalog_name;
pub mod datanode_table;
pub mod flow;
pub mod node_address;
pub mod schema_name;
pub mod table_info;
pub mod table_name;
@@ -102,7 +103,7 @@ pub mod view_info;
use std::collections::{BTreeMap, HashMap, HashSet};
use std::fmt::Debug;
use std::ops::Deref;
use std::ops::{Deref, DerefMut};
use std::sync::Arc;
use bytes::Bytes;
@@ -134,6 +135,7 @@ use self::table_route::{TableRouteManager, TableRouteValue};
use self::tombstone::TombstoneManager;
use crate::ddl::utils::region_storage_path;
use crate::error::{self, Result, SerdeJsonSnafu};
use crate::key::node_address::NodeAddressValue;
use crate::key::table_route::TableRouteKey;
use crate::key::txn_helper::TxnOpGetResponseSet;
use crate::kv_backend::txn::{Txn, TxnOp};
@@ -152,12 +154,15 @@ pub const TABLE_NAME_KEY_PREFIX: &str = "__table_name";
pub const CATALOG_NAME_KEY_PREFIX: &str = "__catalog_name";
pub const SCHEMA_NAME_KEY_PREFIX: &str = "__schema_name";
pub const TABLE_ROUTE_PREFIX: &str = "__table_route";
pub const NODE_ADDRESS_PREFIX: &str = "__node_address";
pub const CACHE_KEY_PREFIXES: [&str; 4] = [
/// The keys with these prefixes will be loaded into the cache when the leader starts.
pub const CACHE_KEY_PREFIXES: [&str; 5] = [
TABLE_NAME_KEY_PREFIX,
CATALOG_NAME_KEY_PREFIX,
SCHEMA_NAME_KEY_PREFIX,
TABLE_ROUTE_PREFIX,
NODE_ADDRESS_PREFIX,
];
pub type RegionDistribution = BTreeMap<DatanodeId, Vec<RegionNumber>>;
@@ -210,6 +215,11 @@ lazy_static! {
.unwrap();
}
lazy_static! {
static ref NODE_ADDRESS_PATTERN: Regex =
Regex::new(&format!("^{NODE_ADDRESS_PREFIX}/([0-9]+)/([0-9]+)$")).unwrap();
}
/// The key of metadata.
pub trait MetadataKey<'a, T> {
fn to_bytes(&self) -> Vec<u8>;
@@ -306,6 +316,12 @@ impl<T: DeserializeOwned + Serialize> Deref for DeserializedValueWithBytes<T> {
}
}
impl<T: DeserializeOwned + Serialize> DerefMut for DeserializedValueWithBytes<T> {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut self.inner
}
}
impl<T: DeserializeOwned + Serialize + Debug> Debug for DeserializedValueWithBytes<T> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
@@ -1230,7 +1246,8 @@ impl_metadata_value! {
FlowInfoValue,
FlowNameValue,
FlowRouteValue,
TableFlowValue
TableFlowValue,
NodeAddressValue
}
impl_optional_metadata_value! {
@@ -1952,7 +1969,7 @@ mod tests {
let table_route_value = table_metadata_manager
.table_route_manager
.table_route_storage()
.get_raw(table_id)
.get_with_raw_bytes(table_id)
.await
.unwrap()
.unwrap();
@@ -2005,7 +2022,7 @@ mod tests {
let table_route_value = table_metadata_manager
.table_route_manager
.table_route_storage()
.get_raw(table_id)
.get_with_raw_bytes(table_id)
.await
.unwrap()
.unwrap();

View File

@@ -0,0 +1,114 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::fmt::Display;
use api::v1::meta::Role;
use serde::{Deserialize, Serialize};
use snafu::OptionExt;
use crate::error::{InvalidMetadataSnafu, Result};
use crate::key::{MetadataKey, NODE_ADDRESS_PATTERN, NODE_ADDRESS_PREFIX};
use crate::peer::Peer;
/// The key that stores a node's address.
///
/// The layout: `__node_address/{role}/{node_id}`
#[derive(Debug, PartialEq)]
pub struct NodeAddressKey {
pub role: Role,
pub node_id: u64,
}
impl NodeAddressKey {
pub fn new(role: Role, node_id: u64) -> Self {
Self { role, node_id }
}
pub fn with_datanode(node_id: u64) -> Self {
Self::new(Role::Datanode, node_id)
}
}
#[derive(Debug, PartialEq, Serialize, Deserialize, Clone)]
pub struct NodeAddressValue {
pub peer: Peer,
}
impl NodeAddressValue {
pub fn new(peer: Peer) -> Self {
Self { peer }
}
}
impl<'a> MetadataKey<'a, NodeAddressKey> for NodeAddressKey {
fn to_bytes(&self) -> Vec<u8> {
self.to_string().into_bytes()
}
fn from_bytes(bytes: &[u8]) -> Result<NodeAddressKey> {
let key = std::str::from_utf8(bytes).map_err(|e| {
InvalidMetadataSnafu {
err_msg: format!(
"NodeAddressKey '{}' is not a valid UTF8 string: {e}",
String::from_utf8_lossy(bytes)
),
}
.build()
})?;
let captures = NODE_ADDRESS_PATTERN
.captures(key)
.context(InvalidMetadataSnafu {
err_msg: format!("Invalid NodeAddressKey '{key}'"),
})?;
// Safety: pass the regex check above
let role = captures[1].parse::<i32>().unwrap();
let role = Role::try_from(role).map_err(|_| {
InvalidMetadataSnafu {
err_msg: format!("Invalid Role value: {role}"),
}
.build()
})?;
let node_id = captures[2].parse::<u64>().unwrap();
Ok(NodeAddressKey::new(role, node_id))
}
}
impl Display for NodeAddressKey {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"{}/{}/{}",
NODE_ADDRESS_PREFIX, self.role as i32, self.node_id
)
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_node_address_key() {
let key = NodeAddressKey::new(Role::Datanode, 1);
let bytes = key.to_bytes();
let key2 = NodeAddressKey::from_bytes(&bytes).unwrap();
assert_eq!(key, key2);
let key = NodeAddressKey::new(Role::Flownode, 3);
let bytes = key.to_bytes();
let key2 = NodeAddressKey::from_bytes(&bytes).unwrap();
assert_eq!(key, key2);
}
}
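A small hedged sketch (not part of the diff) showing the rendered key layout described above; the `common_meta::key::node_address::NodeAddressKey` path is an assumption based on the module additions in this diff.

use common_meta::key::node_address::NodeAddressKey;

fn main() {
    let key = NodeAddressKey::with_datanode(42);
    // to_bytes() renders the Display form: "__node_address/<role as i32>/<node_id>".
    let rendered = String::from_utf8(key.to_bytes()).unwrap();
    assert!(rendered.starts_with("__node_address/"));
    assert!(rendered.ends_with("/42"));
}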

View File

@@ -22,9 +22,10 @@ use store_api::storage::{RegionId, RegionNumber};
use table::metadata::TableId;
use crate::error::{
self, InvalidMetadataSnafu, MetadataCorruptionSnafu, Result, SerdeJsonSnafu,
TableRouteNotFoundSnafu, UnexpectedLogicalRouteTableSnafu,
InvalidMetadataSnafu, MetadataCorruptionSnafu, Result, SerdeJsonSnafu, TableRouteNotFoundSnafu,
UnexpectedLogicalRouteTableSnafu,
};
use crate::key::node_address::{NodeAddressKey, NodeAddressValue};
use crate::key::txn_helper::TxnOpGetResponseSet;
use crate::key::{
DeserializedValueWithBytes, MetadataKey, MetadataValue, RegionDistribution,
@@ -85,7 +86,7 @@ impl TableRouteValue {
debug_assert_eq!(region.region.id.table_id(), physical_table_id);
RegionId::new(table_id, region.region.id.region_number())
})
.collect::<Vec<_>>();
.collect();
TableRouteValue::logical(physical_table_id, region_routes)
}
}
@@ -189,12 +190,12 @@ impl TableRouteValue {
.region_routes
.iter()
.map(|region_route| region_route.region.id.region_number())
.collect::<Vec<_>>(),
.collect(),
TableRouteValue::Logical(x) => x
.region_ids()
.iter()
.map(|region_id| region_id.region_number())
.collect::<Vec<_>>(),
.collect(),
}
}
}
@@ -301,7 +302,7 @@ impl TableRouteManager {
Some(route) => {
ensure!(
route.is_physical(),
error::UnexpectedLogicalRouteTableSnafu {
UnexpectedLogicalRouteTableSnafu {
err_msg: format!("{route:?} is a non-physical TableRouteValue.")
}
);
@@ -321,7 +322,7 @@ impl TableRouteManager {
) -> Result<TableId> {
let table_route = self
.storage
.get(logical_or_physical_table_id)
.get_inner(logical_or_physical_table_id)
.await?
.context(TableRouteNotFoundSnafu {
table_id: logical_or_physical_table_id,
@@ -335,7 +336,7 @@ impl TableRouteManager {
/// Returns the [TableRouteValue::Physical] recursively.
///
/// Returns a [TableRouteNotFound](crate::error::Error::TableRouteNotFound) Error if:
/// Returns a [TableRouteNotFound](error::Error::TableRouteNotFound) Error if:
/// - the physical table(`logical_or_physical_table_id`) does not exist
/// - the corresponding physical table of the logical table(`logical_or_physical_table_id`) does not exist.
pub async fn get_physical_table_route(
@@ -528,6 +529,15 @@ impl TableRouteStorage {
/// Returns the [`TableRouteValue`].
pub async fn get(&self, table_id: TableId) -> Result<Option<TableRouteValue>> {
let mut table_route = self.get_inner(table_id).await?;
if let Some(table_route) = &mut table_route {
self.remap_route_address(table_route).await?;
};
Ok(table_route)
}
async fn get_inner(&self, table_id: TableId) -> Result<Option<TableRouteValue>> {
let key = TableRouteKey::new(table_id);
self.kv_backend
.get(&key.to_bytes())
@@ -537,7 +547,19 @@ impl TableRouteStorage {
}
/// Returns the [`TableRouteValue`] wrapped with [`DeserializedValueWithBytes`].
pub async fn get_raw(
pub async fn get_with_raw_bytes(
&self,
table_id: TableId,
) -> Result<Option<DeserializedValueWithBytes<TableRouteValue>>> {
let mut table_route = self.get_with_raw_bytes_inner(table_id).await?;
if let Some(table_route) = &mut table_route {
self.remap_route_address(table_route).await?;
};
Ok(table_route)
}
async fn get_with_raw_bytes_inner(
&self,
table_id: TableId,
) -> Result<Option<DeserializedValueWithBytes<TableRouteValue>>> {
@@ -554,27 +576,27 @@ impl TableRouteStorage {
/// Returns a [TableRouteNotFound](crate::error::Error::TableRouteNotFound) Error if:
/// - the physical table(`logical_or_physical_table_id`) does not exist
/// - the corresponding physical table of the logical table(`logical_or_physical_table_id`) does not exist.
pub async fn get_raw_physical_table_route(
pub async fn get_physical_table_route_with_raw_bytes(
&self,
logical_or_physical_table_id: TableId,
) -> Result<(TableId, DeserializedValueWithBytes<TableRouteValue>)> {
let table_route =
self.get_raw(logical_or_physical_table_id)
.await?
.context(TableRouteNotFoundSnafu {
table_id: logical_or_physical_table_id,
})?;
let table_route = self
.get_with_raw_bytes(logical_or_physical_table_id)
.await?
.context(TableRouteNotFoundSnafu {
table_id: logical_or_physical_table_id,
})?;
match table_route.get_inner_ref() {
TableRouteValue::Physical(_) => Ok((logical_or_physical_table_id, table_route)),
TableRouteValue::Logical(x) => {
let physical_table_id = x.physical_table_id();
let physical_table_route =
self.get_raw(physical_table_id)
.await?
.context(TableRouteNotFoundSnafu {
table_id: physical_table_id,
})?;
let physical_table_route = self
.get_with_raw_bytes(physical_table_id)
.await?
.context(TableRouteNotFoundSnafu {
table_id: physical_table_id,
})?;
Ok((physical_table_id, physical_table_route))
}
}
@@ -582,6 +604,13 @@ impl TableRouteStorage {
/// Returns batch of [`TableRouteValue`] that respects the order of `table_ids`.
pub async fn batch_get(&self, table_ids: &[TableId]) -> Result<Vec<Option<TableRouteValue>>> {
let mut table_routes = self.batch_get_inner(table_ids).await?;
self.remap_routes_addresses(&mut table_routes).await?;
Ok(table_routes)
}
async fn batch_get_inner(&self, table_ids: &[TableId]) -> Result<Vec<Option<TableRouteValue>>> {
let keys = table_ids
.iter()
.map(|id| TableRouteKey::new(*id).to_bytes())
@@ -604,8 +633,107 @@ impl TableRouteStorage {
Ok(None)
}
})
.collect::<Result<Vec<_>>>()
.collect()
}
async fn remap_routes_addresses(
&self,
table_routes: &mut [Option<TableRouteValue>],
) -> Result<()> {
let keys = table_routes
.iter()
.flat_map(|table_route| {
table_route
.as_ref()
.map(extract_address_keys)
.unwrap_or_default()
})
.collect::<HashSet<_>>()
.into_iter()
.collect();
let node_addrs = self.get_node_addresses(keys).await?;
for table_route in table_routes.iter_mut().flatten() {
set_addresses(&node_addrs, table_route)?;
}
Ok(())
}
async fn remap_route_address(&self, table_route: &mut TableRouteValue) -> Result<()> {
let keys = extract_address_keys(table_route).into_iter().collect();
let node_addrs = self.get_node_addresses(keys).await?;
set_addresses(&node_addrs, table_route)?;
Ok(())
}
async fn get_node_addresses(
&self,
keys: Vec<Vec<u8>>,
) -> Result<HashMap<u64, NodeAddressValue>> {
if keys.is_empty() {
return Ok(HashMap::default());
}
self.kv_backend
.batch_get(BatchGetRequest { keys })
.await?
.kvs
.into_iter()
.map(|kv| {
let node_id = NodeAddressKey::from_bytes(&kv.key)?.node_id;
let node_addr = NodeAddressValue::try_from_raw_value(&kv.value)?;
Ok((node_id, node_addr))
})
.collect()
}
}
fn set_addresses(
node_addrs: &HashMap<u64, NodeAddressValue>,
table_route: &mut TableRouteValue,
) -> Result<()> {
let TableRouteValue::Physical(physical_table_route) = table_route else {
return Ok(());
};
for region_route in &mut physical_table_route.region_routes {
if let Some(leader) = &mut region_route.leader_peer {
if let Some(node_addr) = node_addrs.get(&leader.id) {
leader.addr = node_addr.peer.addr.clone();
}
}
for follower in &mut region_route.follower_peers {
if let Some(node_addr) = node_addrs.get(&follower.id) {
follower.addr = node_addr.peer.addr.clone();
}
}
}
Ok(())
}
fn extract_address_keys(table_route: &TableRouteValue) -> HashSet<Vec<u8>> {
let TableRouteValue::Physical(physical_table_route) = table_route else {
return HashSet::default();
};
physical_table_route
.region_routes
.iter()
.flat_map(|region_route| {
region_route
.follower_peers
.iter()
.map(|peer| NodeAddressKey::with_datanode(peer.id).to_bytes())
.chain(
region_route
.leader_peer
.as_ref()
.map(|leader| NodeAddressKey::with_datanode(leader.id).to_bytes()),
)
})
.collect()
}
#[cfg(test)]
@@ -614,7 +742,9 @@ mod tests {
use super::*;
use crate::kv_backend::memory::MemoryKvBackend;
use crate::kv_backend::TxnService;
use crate::kv_backend::{KvBackend, TxnService};
use crate::peer::Peer;
use crate::rpc::store::PutRequest;
#[test]
fn test_table_route_compatibility() {
@@ -643,18 +773,18 @@ mod tests {
}
#[tokio::test]
async fn test_table_route_storage_get_raw_empty() {
async fn test_table_route_storage_get_with_raw_bytes_empty() {
let kv = Arc::new(MemoryKvBackend::default());
let table_route_storage = TableRouteStorage::new(kv);
let table_route = table_route_storage.get_raw(1024).await.unwrap();
let table_route = table_route_storage.get_with_raw_bytes(1024).await.unwrap();
assert!(table_route.is_none());
}
#[tokio::test]
async fn test_table_route_storage_get_raw() {
async fn test_table_route_storage_get_with_raw_bytes() {
let kv = Arc::new(MemoryKvBackend::default());
let table_route_storage = TableRouteStorage::new(kv.clone());
let table_route = table_route_storage.get_raw(1024).await.unwrap();
let table_route = table_route_storage.get_with_raw_bytes(1024).await.unwrap();
assert!(table_route.is_none());
let table_route_manager = TableRouteManager::new(kv.clone());
let table_route_value = TableRouteValue::Logical(LogicalTableRouteValue {
@@ -667,7 +797,7 @@ mod tests {
.unwrap();
let r = kv.txn(txn).await.unwrap();
assert!(r.succeeded);
let table_route = table_route_storage.get_raw(1024).await.unwrap();
let table_route = table_route_storage.get_with_raw_bytes(1024).await.unwrap();
assert!(table_route.is_some());
let got = table_route.unwrap().inner;
assert_eq!(got, table_route_value);
@@ -718,4 +848,61 @@ mod tests {
assert!(results[2].is_none());
assert_eq!(results[3].as_ref().unwrap(), &routes[0].1);
}
#[tokio::test]
async fn remap_route_address_updates_addresses() {
let kv = Arc::new(MemoryKvBackend::default());
let table_route_storage = TableRouteStorage::new(kv.clone());
let mut table_route = TableRouteValue::Physical(PhysicalTableRouteValue {
region_routes: vec![RegionRoute {
leader_peer: Some(Peer {
id: 1,
..Default::default()
}),
follower_peers: vec![Peer {
id: 2,
..Default::default()
}],
..Default::default()
}],
version: 0,
});
kv.put(PutRequest {
key: NodeAddressKey::with_datanode(1).to_bytes(),
value: NodeAddressValue {
peer: Peer {
addr: "addr1".to_string(),
..Default::default()
},
}
.try_as_raw_value()
.unwrap(),
..Default::default()
})
.await
.unwrap();
table_route_storage
.remap_route_address(&mut table_route)
.await
.unwrap();
if let TableRouteValue::Physical(physical_table_route) = table_route {
assert_eq!(
physical_table_route.region_routes[0]
.leader_peer
.as_ref()
.unwrap()
.addr,
"addr1"
);
assert_eq!(
physical_table_route.region_routes[0].follower_peers[0].addr,
""
);
} else {
panic!("Expected PhysicalTableRouteValue");
}
}
}

View File

@@ -17,6 +17,7 @@ common-macro.workspace = true
common-telemetry.workspace = true
futures-util.workspace = true
humantime-serde.workspace = true
num_cpus.workspace = true
rskafka.workspace = true
rustls = { version = "0.23", default-features = false, features = ["ring", "logging", "std", "tls12"] }
rustls-native-certs = "0.7"

View File

@@ -41,6 +41,8 @@ pub struct RaftEngineConfig {
/// Duration for fsyncing log files.
#[serde(with = "humantime_serde")]
pub sync_period: Option<Duration>,
/// Parallelism during log recovery.
pub recovery_parallelism: usize,
}
impl Default for RaftEngineConfig {
@@ -55,6 +57,7 @@ impl Default for RaftEngineConfig {
enable_log_recycle: true,
prefill_log_files: false,
sync_period: None,
recovery_parallelism: num_cpus::get(),
}
}
}
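For reference, a hedged sketch of overriding the new recovery_parallelism knob in code rather than taking the num_cpus default. Only the fields shown above are relied on; the use path below is an assumption for illustration.
// Cap log-recovery parallelism at 4 threads instead of one per core.
// The module path of RaftEngineConfig is assumed here, not taken from the diff.
use common_wal::config::raft_engine::RaftEngineConfig;
fn limited_recovery_config() -> RaftEngineConfig {
    RaftEngineConfig {
        recovery_parallelism: num_cpus::get().min(4),
        ..Default::default()
    }
}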

View File

@@ -454,7 +454,7 @@ impl DatanodeBuilder {
"Creating raft-engine logstore with config: {:?} and storage path: {}",
config, &wal_dir
);
let logstore = RaftEngineLogStore::try_new(wal_dir, config.clone())
let logstore = RaftEngineLogStore::try_new(wal_dir, config)
.await
.map_err(Box::new)
.context(OpenLogStoreSnafu)?;

View File

@@ -192,7 +192,7 @@ impl HeartbeatTask {
let (outgoing_tx, mut outgoing_rx) = mpsc::channel(16);
let mailbox = Arc::new(HeartbeatMailbox::new(outgoing_tx));
let quit_signal = Arc::new(tokio::sync::Notify::new());
let quit_signal = Arc::new(Notify::new());
let mut tx = Self::create_streams(
&meta_client,
@@ -324,10 +324,12 @@ impl HeartbeatTask {
region_id: stat.region_id.as_u64(),
engine: stat.engine,
role: RegionRole::from(stat.role).into(),
// TODO(jeremy): w/rcus
// TODO(weny): w/rcus
rcus: 0,
wcus: 0,
approximate_bytes: region_server.region_disk_usage(stat.region_id).unwrap_or(0),
// TODO(weny): add extensions
extensions: Default::default(),
})
.collect()
}

View File

@@ -366,10 +366,10 @@ impl RegionServerHandler for RegionServer {
// merge results by sum up affected rows and merge extensions.
let mut affected_rows = 0;
let mut extension = HashMap::new();
let mut extensions = HashMap::new();
for result in results {
affected_rows += result.affected_rows;
extension.extend(result.extension);
extensions.extend(result.extensions);
}
Ok(RegionResponseV1 {
@@ -380,7 +380,7 @@ impl RegionServerHandler for RegionServer {
}),
}),
affected_rows: affected_rows as _,
extension,
extensions,
})
}
}
@@ -708,7 +708,7 @@ impl RegionServerInner {
.await?;
Ok(RegionResponse {
affected_rows: result.affected_rows,
extension: result.extension,
extensions: result.extensions,
})
}
Err(err) => {

View File

@@ -15,6 +15,7 @@ workspace = true
arrow.workspace = true
arrow-array.workspace = true
arrow-schema.workspace = true
base64.workspace = true
common-base.workspace = true
common-decimal.workspace = true
common-error.workspace = true
@@ -23,6 +24,8 @@ common-telemetry.workspace = true
common-time.workspace = true
datafusion-common.workspace = true
enum_dispatch = "0.3"
greptime-proto.workspace = true
jsonb.workspace = true
num = "0.4"
num-traits = "0.2"
ordered-float = { version = "3.0", features = ["serde"] }

View File

@@ -33,8 +33,8 @@ use crate::types::{
BinaryType, BooleanType, DateTimeType, DateType, Decimal128Type, DictionaryType,
DurationMicrosecondType, DurationMillisecondType, DurationNanosecondType, DurationSecondType,
DurationType, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type,
IntervalDayTimeType, IntervalMonthDayNanoType, IntervalType, IntervalYearMonthType, ListType,
NullType, StringType, TimeMillisecondType, TimeType, TimestampMicrosecondType,
IntervalDayTimeType, IntervalMonthDayNanoType, IntervalType, IntervalYearMonthType, JsonType,
ListType, NullType, StringType, TimeMillisecondType, TimeType, TimestampMicrosecondType,
TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, TimestampType,
UInt16Type, UInt32Type, UInt64Type, UInt8Type,
};
@@ -81,6 +81,9 @@ pub enum ConcreteDataType {
// Compound types:
List(ListType),
Dictionary(DictionaryType),
// JSON type:
Json(JsonType),
}
impl fmt::Display for ConcreteDataType {
@@ -128,6 +131,7 @@ impl fmt::Display for ConcreteDataType {
ConcreteDataType::Decimal128(v) => write!(f, "{}", v.name()),
ConcreteDataType::List(v) => write!(f, "{}", v.name()),
ConcreteDataType::Dictionary(v) => write!(f, "{}", v.name()),
ConcreteDataType::Json(v) => write!(f, "{}", v.name()),
}
}
}
@@ -162,6 +166,7 @@ impl ConcreteDataType {
| ConcreteDataType::Duration(_)
| ConcreteDataType::Decimal128(_)
| ConcreteDataType::Binary(_)
| ConcreteDataType::Json(_)
)
}
@@ -216,6 +221,10 @@ impl ConcreteDataType {
matches!(self, ConcreteDataType::Decimal128(_))
}
pub fn is_json(&self) -> bool {
matches!(self, ConcreteDataType::Json(_))
}
pub fn numerics() -> Vec<ConcreteDataType> {
vec![
ConcreteDataType::int8_datatype(),
@@ -404,7 +413,7 @@ macro_rules! impl_new_concrete_type_functions {
impl_new_concrete_type_functions!(
Null, Boolean, UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64, Float32, Float64,
Binary, Date, DateTime, String
Binary, Date, DateTime, String, Json
);
impl ConcreteDataType {

View File

@@ -25,6 +25,7 @@ use datafusion_common::DFSchemaRef;
use snafu::{ensure, ResultExt};
use crate::error::{self, DuplicateColumnSnafu, Error, ProjectArrowSchemaSnafu, Result};
use crate::prelude::DataType;
pub use crate::schema::column_schema::{
ColumnSchema, FulltextAnalyzer, FulltextOptions, Metadata, COMMENT_KEY, FULLTEXT_KEY,
TIME_INDEX_KEY,
@@ -34,6 +35,8 @@ pub use crate::schema::raw::RawSchema;
/// Key used to store version number of the schema in metadata.
pub const VERSION_KEY: &str = "greptime:version";
/// Key used to store actual column type in field metadata.
pub const TYPE_KEY: &str = "greptime:type";
/// A common schema, should be immutable.
#[derive(Clone, PartialEq, Eq)]
@@ -256,7 +259,13 @@ fn collect_fields(column_schemas: &[ColumnSchema]) -> Result<FieldsAndIndices> {
if column_schema.is_time_index() && timestamp_index.is_none() {
timestamp_index = Some(index);
}
let field = Field::try_from(column_schema)?;
let mut field = Field::try_from(column_schema)?;
// A Json column behaves the same as a binary column in Arrow, so we need to mark it
if column_schema.data_type.is_json() {
let metadata = HashMap::from([(TYPE_KEY.to_string(), column_schema.data_type.name())]);
field = field.with_metadata(metadata);
}
fields.push(field);
ensure!(
name_to_index

View File

@@ -22,6 +22,8 @@ use snafu::{ensure, ResultExt};
use crate::data_type::{ConcreteDataType, DataType};
use crate::error::{self, Error, Result};
use crate::schema::constraint::ColumnDefaultConstraint;
use crate::schema::TYPE_KEY;
use crate::types::JSON_TYPE_NAME;
use crate::value::Value;
use crate::vectors::VectorRef;
@@ -268,7 +270,14 @@ impl TryFrom<&Field> for ColumnSchema {
type Error = Error;
fn try_from(field: &Field) -> Result<ColumnSchema> {
let data_type = ConcreteDataType::try_from(field.data_type())?;
let mut data_type = ConcreteDataType::try_from(field.data_type())?;
// Override the data type if it is specified in the metadata.
if field.metadata().contains_key(TYPE_KEY) {
data_type = match field.metadata().get(TYPE_KEY).unwrap().as_str() {
JSON_TYPE_NAME => ConcreteDataType::json_datatype(),
_ => data_type,
};
}
let mut metadata = field.metadata().clone();
let default_constraint = match metadata.remove(DEFAULT_CONSTRAINT_KEY) {
Some(json) => {
@@ -528,4 +537,32 @@ mod tests {
assert_eq!(formatted_int8, "test_column_1 Int8 null");
assert_eq!(formatted_int32, "test_column_2 Int32 not null");
}
#[test]
fn test_from_field_to_column_schema() {
let field = Field::new("test", ArrowDataType::Int32, true);
let column_schema = ColumnSchema::try_from(&field).unwrap();
assert_eq!("test", column_schema.name);
assert_eq!(ConcreteDataType::int32_datatype(), column_schema.data_type);
assert!(column_schema.is_nullable);
assert!(!column_schema.is_time_index);
assert!(column_schema.default_constraint.is_none());
assert!(column_schema.metadata.is_empty());
let field = Field::new("test", ArrowDataType::Binary, true);
let field = field.with_metadata(Metadata::from([(
TYPE_KEY.to_string(),
ConcreteDataType::json_datatype().name(),
)]));
let column_schema = ColumnSchema::try_from(&field).unwrap();
assert_eq!("test", column_schema.name);
assert_eq!(ConcreteDataType::json_datatype(), column_schema.data_type);
assert!(column_schema.is_nullable);
assert!(!column_schema.is_time_index);
assert!(column_schema.default_constraint.is_none());
assert_eq!(
column_schema.metadata.get(TYPE_KEY).unwrap(),
&ConcreteDataType::json_datatype().name()
);
}
}

View File

@@ -68,6 +68,8 @@ pub enum LogicalTypeId {
List,
Dictionary,
Json,
}
impl LogicalTypeId {
@@ -126,6 +128,7 @@ impl LogicalTypeId {
LogicalTypeId::DurationMicrosecond => ConcreteDataType::duration_microsecond_datatype(),
LogicalTypeId::DurationNanosecond => ConcreteDataType::duration_nanosecond_datatype(),
LogicalTypeId::Decimal128 => ConcreteDataType::decimal128_default_datatype(),
LogicalTypeId::Json => ConcreteDataType::json_datatype(),
}
}
}

View File

@@ -21,6 +21,7 @@ mod decimal_type;
mod dictionary_type;
mod duration_type;
mod interval_type;
mod json_type;
mod list_type;
mod null_type;
mod primitive_type;
@@ -42,6 +43,7 @@ pub use duration_type::{
pub use interval_type::{
IntervalDayTimeType, IntervalMonthDayNanoType, IntervalType, IntervalYearMonthType,
};
pub use json_type::{JsonType, JSON_TYPE_NAME};
pub use list_type::ListType;
pub use null_type::NullType;
pub use primitive_type::{

View File

@@ -0,0 +1,67 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::sync::Arc;
use arrow::datatypes::DataType as ArrowDataType;
use common_base::bytes::Bytes;
use serde::{Deserialize, Serialize};
use crate::data_type::{DataType, DataTypeRef};
use crate::scalars::ScalarVectorBuilder;
use crate::type_id::LogicalTypeId;
use crate::value::Value;
use crate::vectors::{BinaryVectorBuilder, MutableVector};
pub const JSON_TYPE_NAME: &str = "Json";
/// JsonType is a data type for JSON data. It is stored as binary data in the jsonb format.
/// It reuses the existing binary value and vector implementations.
#[derive(Debug, Default, Clone, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize, Deserialize)]
pub struct JsonType;
impl JsonType {
pub fn arc() -> DataTypeRef {
Arc::new(Self)
}
}
impl DataType for JsonType {
fn name(&self) -> String {
JSON_TYPE_NAME.to_string()
}
fn logical_type_id(&self) -> LogicalTypeId {
LogicalTypeId::Json
}
fn default_value(&self) -> Value {
Bytes::default().into()
}
fn as_arrow_type(&self) -> ArrowDataType {
ArrowDataType::Binary
}
fn create_mutable_vector(&self, capacity: usize) -> Box<dyn MutableVector> {
Box::new(BinaryVectorBuilder::with_capacity(capacity))
}
fn try_cast(&self, from: Value) -> Option<Value> {
match from {
Value::Binary(v) => Some(Value::Binary(v)),
_ => None,
}
}
}
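To make the mapping concrete, here is a hedged sketch (not part of this diff) of building a one-element Json vector: the JSON text is encoded to jsonb bytes and pushed through the plain binary builder that create_mutable_vector returns. It assumes ConcreteDataType dispatches the DataType trait shown above and reuses the jsonb::parse_value helper that appears in the value tests later in this diff.
use datatypes::data_type::ConcreteDataType;
use datatypes::prelude::DataType;
use datatypes::value::Value;
fn build_json_vector() {
    let json_type = ConcreteDataType::json_datatype();
    // Json is represented as plain Binary at the Arrow level.
    assert_eq!(json_type.as_arrow_type(), arrow::datatypes::DataType::Binary);
    // Encode the JSON text to jsonb bytes, then push it as a binary value.
    let encoded = jsonb::parse_value(r#"{"key": "value"}"#.as_bytes())
        .unwrap()
        .to_vec();
    let mut builder = json_type.create_mutable_vector(1);
    builder
        .try_push_value_ref(Value::Binary(encoded.into()).as_value_ref())
        .unwrap();
    assert_eq!(builder.to_vector().len(), 1);
}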

View File

@@ -18,6 +18,8 @@ use std::sync::Arc;
use arrow::datatypes::{DataType as ArrowDataType, Field};
use arrow_array::{Array, ListArray};
use base64::engine::general_purpose::URL_SAFE;
use base64::Engine as _;
use common_base::bytes::{Bytes, StringBytes};
use common_decimal::Decimal128;
use common_telemetry::error;
@@ -28,8 +30,10 @@ use common_time::time::Time;
use common_time::timestamp::{TimeUnit, Timestamp};
use common_time::{Duration, Interval, Timezone};
use datafusion_common::ScalarValue;
use greptime_proto::v1::value::ValueData;
pub use ordered_float::OrderedFloat;
use serde::{Deserialize, Serialize, Serializer};
use serde_json::{Number, Value as JsonValue};
use snafu::{ensure, ResultExt};
use crate::error::{self, ConvertArrowArrayToScalarsSnafu, Error, Result, TryFromValueSnafu};
@@ -338,7 +342,8 @@ impl Value {
let value_type_id = self.logical_type_id();
let output_type_id = output_type.logical_type_id();
ensure!(
output_type_id == value_type_id || self.is_null(),
// Json type leverages Value(Binary) for storage.
output_type_id == value_type_id || self.is_null() || (output_type_id == LogicalTypeId::Json && value_type_id == LogicalTypeId::Binary),
error::ToScalarValueSnafu {
reason: format!(
"expect value to return output_type {output_type_id:?}, actual: {value_type_id:?}",
@@ -480,7 +485,7 @@ pub fn to_null_scalar_value(output_type: &ConcreteDataType) -> Result<ScalarValu
ConcreteDataType::UInt64(_) => ScalarValue::UInt64(None),
ConcreteDataType::Float32(_) => ScalarValue::Float32(None),
ConcreteDataType::Float64(_) => ScalarValue::Float64(None),
ConcreteDataType::Binary(_) => ScalarValue::Binary(None),
ConcreteDataType::Binary(_) | ConcreteDataType::Json(_) => ScalarValue::Binary(None),
ConcreteDataType::String(_) => ScalarValue::Utf8(None),
ConcreteDataType::Date(_) => ScalarValue::Date32(None),
ConcreteDataType::DateTime(_) => ScalarValue::Date64(None),
@@ -1364,15 +1369,179 @@ impl<'a> ValueRef<'a> {
}
}
pub fn column_data_to_json(data: ValueData) -> JsonValue {
match data {
ValueData::BinaryValue(b) => JsonValue::String(URL_SAFE.encode(b)),
ValueData::BoolValue(b) => JsonValue::Bool(b),
ValueData::U8Value(i) => JsonValue::Number(i.into()),
ValueData::U16Value(i) => JsonValue::Number(i.into()),
ValueData::U32Value(i) => JsonValue::Number(i.into()),
ValueData::U64Value(i) => JsonValue::Number(i.into()),
ValueData::I8Value(i) => JsonValue::Number(i.into()),
ValueData::I16Value(i) => JsonValue::Number(i.into()),
ValueData::I32Value(i) => JsonValue::Number(i.into()),
ValueData::I64Value(i) => JsonValue::Number(i.into()),
ValueData::F32Value(f) => Number::from_f64(f as f64)
.map(JsonValue::Number)
.unwrap_or(JsonValue::Null),
ValueData::F64Value(f) => Number::from_f64(f)
.map(JsonValue::Number)
.unwrap_or(JsonValue::Null),
ValueData::StringValue(s) => JsonValue::String(s),
ValueData::DateValue(d) => JsonValue::String(Date::from(d).to_string()),
ValueData::DatetimeValue(d) => JsonValue::String(DateTime::from(d).to_string()),
ValueData::TimeSecondValue(d) => JsonValue::String(Time::new_second(d).to_iso8601_string()),
ValueData::TimeMillisecondValue(d) => {
JsonValue::String(Time::new_millisecond(d).to_iso8601_string())
}
ValueData::TimeMicrosecondValue(d) => {
JsonValue::String(Time::new_microsecond(d).to_iso8601_string())
}
ValueData::TimeNanosecondValue(d) => {
JsonValue::String(Time::new_nanosecond(d).to_iso8601_string())
}
ValueData::TimestampMicrosecondValue(d) => {
JsonValue::String(Timestamp::new_microsecond(d).to_iso8601_string())
}
ValueData::TimestampMillisecondValue(d) => {
JsonValue::String(Timestamp::new_millisecond(d).to_iso8601_string())
}
ValueData::TimestampNanosecondValue(d) => {
JsonValue::String(Timestamp::new_nanosecond(d).to_iso8601_string())
}
ValueData::TimestampSecondValue(d) => {
JsonValue::String(Timestamp::new_second(d).to_iso8601_string())
}
ValueData::IntervalYearMonthValue(d) => JsonValue::String(format!("interval year [{}]", d)),
ValueData::IntervalMonthDayNanoValue(d) => JsonValue::String(format!(
"interval month [{}][{}][{}]",
d.months, d.days, d.nanoseconds
)),
ValueData::IntervalDayTimeValue(d) => JsonValue::String(format!("interval day [{}]", d)),
ValueData::Decimal128Value(d) => {
JsonValue::String(format!("decimal128 [{}][{}]", d.hi, d.lo))
}
}
}
#[cfg(test)]
mod tests {
use arrow::datatypes::DataType as ArrowDataType;
use common_time::timezone::set_default_timezone;
use greptime_proto::v1::{Decimal128 as ProtoDecimal128, IntervalMonthDayNano};
use num_traits::Float;
use super::*;
use crate::vectors::ListVectorBuilder;
#[test]
fn test_column_data_to_json() {
assert_eq!(
column_data_to_json(ValueData::BinaryValue(b"hello".to_vec())),
JsonValue::String("aGVsbG8=".to_string())
);
assert_eq!(
column_data_to_json(ValueData::BoolValue(true)),
JsonValue::Bool(true)
);
assert_eq!(
column_data_to_json(ValueData::U8Value(1)),
JsonValue::Number(1.into())
);
assert_eq!(
column_data_to_json(ValueData::U16Value(2)),
JsonValue::Number(2.into())
);
assert_eq!(
column_data_to_json(ValueData::U32Value(3)),
JsonValue::Number(3.into())
);
assert_eq!(
column_data_to_json(ValueData::U64Value(4)),
JsonValue::Number(4.into())
);
assert_eq!(
column_data_to_json(ValueData::I8Value(5)),
JsonValue::Number(5.into())
);
assert_eq!(
column_data_to_json(ValueData::I16Value(6)),
JsonValue::Number(6.into())
);
assert_eq!(
column_data_to_json(ValueData::I32Value(7)),
JsonValue::Number(7.into())
);
assert_eq!(
column_data_to_json(ValueData::I64Value(8)),
JsonValue::Number(8.into())
);
assert_eq!(
column_data_to_json(ValueData::F32Value(9.0)),
JsonValue::Number(Number::from_f64(9.0_f64).unwrap())
);
assert_eq!(
column_data_to_json(ValueData::F64Value(10.0)),
JsonValue::Number(Number::from_f64(10.0_f64).unwrap())
);
assert_eq!(
column_data_to_json(ValueData::StringValue("hello".to_string())),
JsonValue::String("hello".to_string())
);
assert_eq!(
column_data_to_json(ValueData::DateValue(123)),
JsonValue::String("1970-05-04".to_string())
);
assert_eq!(
column_data_to_json(ValueData::DatetimeValue(456)),
JsonValue::String("1970-01-01 00:00:00.456+0000".to_string())
);
assert_eq!(
column_data_to_json(ValueData::TimeSecondValue(789)),
JsonValue::String("00:13:09+0000".to_string())
);
assert_eq!(
column_data_to_json(ValueData::TimeMillisecondValue(789)),
JsonValue::String("00:00:00.789+0000".to_string())
);
assert_eq!(
column_data_to_json(ValueData::TimeMicrosecondValue(789)),
JsonValue::String("00:00:00.000789+0000".to_string())
);
assert_eq!(
column_data_to_json(ValueData::TimestampMillisecondValue(1234567890)),
JsonValue::String("1970-01-15 06:56:07.890+0000".to_string())
);
assert_eq!(
column_data_to_json(ValueData::TimestampNanosecondValue(1234567890123456789)),
JsonValue::String("2009-02-13 23:31:30.123456789+0000".to_string())
);
assert_eq!(
column_data_to_json(ValueData::TimestampSecondValue(1234567890)),
JsonValue::String("2009-02-13 23:31:30+0000".to_string())
);
assert_eq!(
column_data_to_json(ValueData::IntervalYearMonthValue(12)),
JsonValue::String("interval year [12]".to_string())
);
assert_eq!(
column_data_to_json(ValueData::IntervalMonthDayNanoValue(IntervalMonthDayNano {
months: 1,
days: 2,
nanoseconds: 3,
})),
JsonValue::String("interval month [1][2][3]".to_string())
);
assert_eq!(
column_data_to_json(ValueData::IntervalDayTimeValue(4)),
JsonValue::String("interval day [4]".to_string())
);
assert_eq!(
column_data_to_json(ValueData::Decimal128Value(ProtoDecimal128 { hi: 5, lo: 6 })),
JsonValue::String("decimal128 [5][6]".to_string())
);
}
#[test]
fn test_try_from_scalar_value() {
assert_eq!(
@@ -1826,6 +1995,10 @@ mod tests {
&ConcreteDataType::duration_nanosecond_datatype(),
&Value::Duration(Duration::new_nanosecond(1)),
);
check_type_and_value(
&ConcreteDataType::decimal128_datatype(38, 10),
&Value::Decimal128(Decimal128::new(1, 38, 10)),
);
}
#[test]
@@ -2010,6 +2183,14 @@ mod tests {
ValueRef::List(ListValueRef::Ref { val: &list }),
Value::List(list.clone()).as_value_ref()
);
let jsonb_value = jsonb::parse_value(r#"{"key": "value"}"#.as_bytes())
.unwrap()
.to_vec();
assert_eq!(
ValueRef::Binary(jsonb_value.clone().as_slice()),
Value::Binary(jsonb_value.into()).as_value_ref()
);
}
#[test]
@@ -2223,6 +2404,16 @@ mod tests {
.try_to_scalar_value(&ConcreteDataType::binary_datatype())
.unwrap()
);
let jsonb_value = jsonb::parse_value(r#"{"key": "value"}"#.as_bytes())
.unwrap()
.to_vec();
assert_eq!(
ScalarValue::Binary(Some(jsonb_value.clone())),
Value::Binary(jsonb_value.into())
.try_to_scalar_value(&ConcreteDataType::json_datatype())
.unwrap()
);
}
#[test]
@@ -2355,6 +2546,12 @@ mod tests {
.try_to_scalar_value(&ConcreteDataType::duration_nanosecond_datatype())
.unwrap()
);
assert_eq!(
ScalarValue::Binary(None),
Value::Null
.try_to_scalar_value(&ConcreteDataType::json_datatype())
.unwrap()
);
}
#[test]

View File

@@ -80,7 +80,7 @@ fn equal(lhs: &dyn Vector, rhs: &dyn Vector) -> bool {
match lhs.data_type() {
Null(_) => true,
Boolean(_) => is_vector_eq!(BooleanVector, lhs, rhs),
Binary(_) => is_vector_eq!(BinaryVector, lhs, rhs),
Binary(_) | Json(_) => is_vector_eq!(BinaryVector, lhs, rhs),
String(_) => is_vector_eq!(StringVector, lhs, rhs),
Date(_) => is_vector_eq!(DateVector, lhs, rhs),
DateTime(_) => is_vector_eq!(DateTimeVector, lhs, rhs),

View File

@@ -50,11 +50,9 @@ use crate::adapter::util::column_schemas_to_proto;
use crate::adapter::worker::{create_worker, Worker, WorkerHandle};
use crate::compute::ErrCollector;
use crate::df_optimizer::sql_to_flow_plan;
use crate::error::{ExternalSnafu, InternalSnafu, TableNotFoundSnafu, UnexpectedSnafu};
use crate::expr::GlobalId;
use crate::metrics::{
METRIC_FLOW_INPUT_BUF_SIZE, METRIC_FLOW_INSERT_ELAPSED, METRIC_FLOW_RUN_INTERVAL_MS,
};
use crate::error::{EvalSnafu, ExternalSnafu, InternalSnafu, TableNotFoundSnafu, UnexpectedSnafu};
use crate::expr::{Batch, GlobalId};
use crate::metrics::{METRIC_FLOW_INSERT_ELAPSED, METRIC_FLOW_RUN_INTERVAL_MS};
use crate::repr::{self, DiffRow, Row, BATCH_SIZE};
mod flownode_impl;
@@ -227,11 +225,24 @@ pub fn diff_row_to_request(rows: Vec<DiffRow>) -> Vec<DiffRequest> {
reqs
}
pub fn batches_to_rows_req(batches: Vec<Batch>) -> Result<Vec<DiffRequest>, Error> {
let mut reqs = Vec::new();
for batch in batches {
let mut rows = Vec::with_capacity(batch.row_count());
for i in 0..batch.row_count() {
let row = batch.get_row(i).context(EvalSnafu)?;
rows.push((Row::new(row), 0));
}
reqs.push(DiffRequest::Insert(rows));
}
Ok(reqs)
}
/// This impl block contains methods to send writeback requests to frontend
impl FlowWorkerManager {
/// Return the number of requests it made
pub async fn send_writeback_requests(&self) -> Result<usize, Error> {
let all_reqs = self.generate_writeback_request().await;
let all_reqs = self.generate_writeback_request().await?;
if all_reqs.is_empty() || all_reqs.iter().all(|v| v.1.is_empty()) {
return Ok(0);
}
@@ -242,122 +253,16 @@ impl FlowWorkerManager {
}
let (catalog, schema) = (table_name[0].clone(), table_name[1].clone());
let ctx = Arc::new(QueryContext::with(&catalog, &schema));
// TODO(discord9): instead of auto-building the table from the request schema, actually build the table
// before `create flow` so that pk, ts, etc. can be assigned.
let (primary_keys, schema, is_ts_placeholder) = if let Some(table_id) = self
.table_info_source
.get_table_id_from_name(&table_name)
.await?
{
let table_info = self
.table_info_source
.get_table_info_value(&table_id)
.await?
.unwrap();
let meta = table_info.table_info.meta;
let primary_keys = meta
.primary_key_indices
.into_iter()
.map(|i| meta.schema.column_schemas[i].name.clone())
.collect_vec();
let schema = meta.schema.column_schemas;
// check whether the last column is the auto-created timestamp column, which means the table was
// auto-created from the flow's plan type
let is_auto_create = {
let correct_name = schema
.last()
.map(|s| s.name == AUTO_CREATED_PLACEHOLDER_TS_COL)
.unwrap_or(false);
let correct_time_index = meta.schema.timestamp_index == Some(schema.len() - 1);
correct_name && correct_time_index
};
(primary_keys, schema, is_auto_create)
} else {
// TODO(discord9): consider removing the buggy auto create by schema
let node_ctx = self.node_context.read().await;
let gid: GlobalId = node_ctx
.table_repr
.get_by_name(&table_name)
.map(|x| x.1)
.unwrap();
let schema = node_ctx
.schema
.get(&gid)
.with_context(|| TableNotFoundSnafu {
name: format!("Table name = {:?}", table_name),
})?
.clone();
// TODO(discord9): use default key from schema
let primary_keys = schema
.typ()
.keys
.first()
.map(|v| {
v.column_indices
.iter()
.map(|i| {
schema
.get_name(*i)
.clone()
.unwrap_or_else(|| format!("col_{i}"))
})
.collect_vec()
})
.unwrap_or_default();
let update_at = ColumnSchema::new(
UPDATE_AT_TS_COL,
ConcreteDataType::timestamp_millisecond_datatype(),
true,
);
let (is_ts_placeholder, proto_schema) =
self.try_fetch_or_create_table(&table_name).await?;
let schema_len = proto_schema.len();
let original_schema = schema
.typ()
.column_types
.clone()
.into_iter()
.enumerate()
.map(|(idx, typ)| {
let name = schema
.names
.get(idx)
.cloned()
.flatten()
.unwrap_or(format!("col_{}", idx));
let ret = ColumnSchema::new(name, typ.scalar_type, typ.nullable);
if schema.typ().time_index == Some(idx) {
ret.with_time_index(true)
} else {
ret
}
})
.collect_vec();
let mut with_auto_added_col = original_schema.clone();
with_auto_added_col.push(update_at);
// if no time index, add one as placeholder
let no_time_index = schema.typ().time_index.is_none();
if no_time_index {
let ts_col = ColumnSchema::new(
AUTO_CREATED_PLACEHOLDER_TS_COL,
ConcreteDataType::timestamp_millisecond_datatype(),
true,
)
.with_time_index(true);
with_auto_added_col.push(ts_col);
}
(primary_keys, with_auto_added_col, no_time_index)
};
let schema_len = schema.len();
let proto_schema = column_schemas_to_proto(schema, &primary_keys)?;
debug!(
"Sending {} writeback requests to table {}, reqs={:?}",
trace!(
"Sending {} writeback requests to table {}, reqs total rows={}",
reqs.len(),
table_name.join("."),
reqs
reqs.iter().map(|r| r.len()).sum::<usize>()
);
let now = self.tick_manager.tick();
for req in reqs {
@@ -450,8 +355,12 @@ impl FlowWorkerManager {
}
/// Generate writeback requests for all sink tables
pub async fn generate_writeback_request(&self) -> BTreeMap<TableName, Vec<DiffRequest>> {
pub async fn generate_writeback_request(
&self,
) -> Result<BTreeMap<TableName, Vec<DiffRequest>>, Error> {
trace!("Start to generate writeback request");
let mut output = BTreeMap::new();
let mut total_row_count = 0;
for (name, sink_recv) in self
.node_context
.write()
@@ -460,14 +369,133 @@ impl FlowWorkerManager {
.iter_mut()
.map(|(n, (_s, r))| (n, r))
{
let mut rows = Vec::new();
while let Ok(row) = sink_recv.try_recv() {
rows.push(row);
let mut batches = Vec::new();
while let Ok(batch) = sink_recv.try_recv() {
total_row_count += batch.row_count();
batches.push(batch);
}
let reqs = diff_row_to_request(rows);
let reqs = batches_to_rows_req(batches)?;
output.insert(name.clone(), reqs);
}
output
trace!("Prepare writeback req: total row count={}", total_row_count);
Ok(output)
}
/// Fetch table info or create table from flow's schema if not exist
async fn try_fetch_or_create_table(
&self,
table_name: &TableName,
) -> Result<(bool, Vec<api::v1::ColumnSchema>), Error> {
// TODO(discord9): instead of auto-building the table from the request schema, actually build the table
// before `create flow` so that pk, ts, etc. can be assigned.
let (primary_keys, schema, is_ts_placeholder) = if let Some(table_id) = self
.table_info_source
.get_table_id_from_name(table_name)
.await?
{
let table_info = self
.table_info_source
.get_table_info_value(&table_id)
.await?
.unwrap();
let meta = table_info.table_info.meta;
let primary_keys = meta
.primary_key_indices
.into_iter()
.map(|i| meta.schema.column_schemas[i].name.clone())
.collect_vec();
let schema = meta.schema.column_schemas;
// check whether the last column is the auto-created timestamp column, which means the table was
// auto-created from the flow's plan type
let is_auto_create = {
let correct_name = schema
.last()
.map(|s| s.name == AUTO_CREATED_PLACEHOLDER_TS_COL)
.unwrap_or(false);
let correct_time_index = meta.schema.timestamp_index == Some(schema.len() - 1);
correct_name && correct_time_index
};
(primary_keys, schema, is_auto_create)
} else {
// TODO(discord9): consider removing the buggy auto create by schema
let node_ctx = self.node_context.read().await;
let gid: GlobalId = node_ctx
.table_repr
.get_by_name(table_name)
.map(|x| x.1)
.unwrap();
let schema = node_ctx
.schema
.get(&gid)
.with_context(|| TableNotFoundSnafu {
name: format!("Table name = {:?}", table_name),
})?
.clone();
// TODO(discord9): use default key from schema
let primary_keys = schema
.typ()
.keys
.first()
.map(|v| {
v.column_indices
.iter()
.map(|i| {
schema
.get_name(*i)
.clone()
.unwrap_or_else(|| format!("col_{i}"))
})
.collect_vec()
})
.unwrap_or_default();
let update_at = ColumnSchema::new(
UPDATE_AT_TS_COL,
ConcreteDataType::timestamp_millisecond_datatype(),
true,
);
let original_schema = schema
.typ()
.column_types
.clone()
.into_iter()
.enumerate()
.map(|(idx, typ)| {
let name = schema
.names
.get(idx)
.cloned()
.flatten()
.unwrap_or(format!("col_{}", idx));
let ret = ColumnSchema::new(name, typ.scalar_type, typ.nullable);
if schema.typ().time_index == Some(idx) {
ret.with_time_index(true)
} else {
ret
}
})
.collect_vec();
let mut with_auto_added_col = original_schema.clone();
with_auto_added_col.push(update_at);
// if no time index, add one as placeholder
let no_time_index = schema.typ().time_index.is_none();
if no_time_index {
let ts_col = ColumnSchema::new(
AUTO_CREATED_PLACEHOLDER_TS_COL,
ConcreteDataType::timestamp_millisecond_datatype(),
true,
)
.with_time_index(true);
with_auto_added_col.push(ts_col);
}
(primary_keys, with_auto_added_col, no_time_index)
};
let proto_schema = column_schemas_to_proto(schema, &primary_keys)?;
Ok((is_ts_placeholder, proto_schema))
}
}
@@ -498,10 +526,6 @@ impl FlowWorkerManager {
}
}
async fn get_buf_size(&self) -> usize {
self.node_context.read().await.get_send_buf_size().await
}
/// Trigger dataflow running, and then send writeback requests to the source sender
///
/// note that this method doesn't handle input mirror requests, as those should be handled by the grpc server
@@ -575,43 +599,37 @@ impl FlowWorkerManager {
/// TODO(discord9): add a flag for subgraphs that have received input since the last run
pub async fn run_available(&self, blocking: bool) -> Result<usize, Error> {
let mut row_cnt = 0;
loop {
let now = self.tick_manager.tick();
for worker in self.worker_handles.iter() {
// TODO(discord9): consider how to handle error in individual worker
if blocking {
worker.lock().await.run_available(now, blocking).await?;
} else if let Ok(worker) = worker.try_lock() {
worker.run_available(now, blocking).await?;
} else {
return Ok(row_cnt);
}
}
// check rows sent and rows remaining in the send buf
let (flush_res, _buf_len) = if blocking {
let ctx = self.node_context.read().await;
(ctx.flush_all_sender().await, ctx.get_send_buf_size().await)
let now = self.tick_manager.tick();
for worker in self.worker_handles.iter() {
// TODO(discord9): consider how to handle error in individual worker
if blocking {
worker.lock().await.run_available(now, blocking).await?;
} else if let Ok(worker) = worker.try_lock() {
worker.run_available(now, blocking).await?;
} else {
match self.node_context.try_read() {
Ok(ctx) => (ctx.flush_all_sender().await, ctx.get_send_buf_size().await),
Err(_) => return Ok(row_cnt),
}
};
match flush_res {
Ok(r) => {
common_telemetry::trace!("Flushed {} rows", r);
row_cnt += r;
// send buf is likely to be somewhere empty now, wait
if r < BATCH_SIZE / 2 {
break;
}
}
Err(err) => {
common_telemetry::error!("Flush send buf errors: {:?}", err);
break;
}
};
return Ok(row_cnt);
}
}
// check rows sent and rows remaining in the send buf
let flush_res = if blocking {
let ctx = self.node_context.read().await;
ctx.flush_all_sender().await
} else {
match self.node_context.try_read() {
Ok(ctx) => ctx.flush_all_sender().await,
Err(_) => return Ok(row_cnt),
}
};
match flush_res {
Ok(r) => {
common_telemetry::trace!("Total flushed {} rows", r);
row_cnt += r;
}
Err(err) => {
common_telemetry::error!("Flush send buf errors: {:?}", err);
}
};
Ok(row_cnt)
}
@@ -624,14 +642,14 @@ impl FlowWorkerManager {
) -> Result<(), Error> {
let rows_len = rows.len();
let table_id = region_id.table_id();
METRIC_FLOW_INPUT_BUF_SIZE.add(rows_len as _);
let _timer = METRIC_FLOW_INSERT_ELAPSED
.with_label_values(&[table_id.to_string().as_str()])
.start_timer();
self.node_context.read().await.send(table_id, rows).await?;
debug!(
trace!(
"Handling write request for table_id={} with {} rows",
table_id, rows_len
table_id,
rows_len
);
Ok(())
}
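A hedged usage sketch for the batches_to_rows_req helper introduced near the top of this file. Batch, Row, DiffRequest, Error and EvalSnafu are the flow-crate items shown in this diff; Value::from(1i64) and the snafu ResultExt import are assumptions made for brevity.
fn example_batches_to_rows_req() -> Result<(), Error> {
    // One single-column, single-row batch; the value is arbitrary.
    let batch = Batch::try_from_rows(vec![Row::new(vec![Value::from(1i64)])])
        .context(EvalSnafu)?;
    let reqs = batches_to_rows_req(vec![batch])?;
    // Every batch row becomes one (Row, ts) pair inside a DiffRequest::Insert.
    assert!(matches!(&reqs[0], DiffRequest::Insert(rows) if rows.len() == 1));
    Ok(())
}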

View File

@@ -23,7 +23,7 @@ use api::v1::region::InsertRequests;
use common_error::ext::BoxedError;
use common_meta::error::{ExternalSnafu, Result, UnexpectedSnafu};
use common_meta::node_manager::Flownode;
use common_telemetry::debug;
use common_telemetry::{debug, trace};
use itertools::Itertools;
use snafu::{OptionExt, ResultExt};
use store_api::storage::RegionId;
@@ -189,7 +189,7 @@ impl Flownode for FlowWorkerManager {
})
.try_collect()?;
if !fetch_order.iter().enumerate().all(|(i, &v)| i == v) {
debug!("Reordering columns: {:?}", fetch_order)
trace!("Reordering columns: {:?}", fetch_order)
}
fetch_order
};

View File

@@ -15,10 +15,10 @@
//! Node context, prone to change with every incoming requests
use std::collections::{BTreeMap, BTreeSet, HashMap};
use std::sync::atomic::AtomicUsize;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
use common_telemetry::debug;
use common_telemetry::trace;
use session::context::QueryContext;
use snafu::{OptionExt, ResultExt};
use table::metadata::TableId;
@@ -27,9 +27,9 @@ use tokio::sync::{broadcast, mpsc, RwLock};
use crate::adapter::{FlowId, TableName, TableSource};
use crate::error::{Error, EvalSnafu, TableNotFoundSnafu};
use crate::expr::error::InternalSnafu;
use crate::expr::GlobalId;
use crate::expr::{Batch, GlobalId};
use crate::metrics::METRIC_FLOW_INPUT_BUF_SIZE;
use crate::repr::{DiffRow, RelationDesc, BROADCAST_CAP, SEND_BUF_CAP};
use crate::repr::{DiffRow, RelationDesc, BATCH_SIZE, BROADCAST_CAP, SEND_BUF_CAP};
/// A context that holds the information of the dataflow
#[derive(Default, Debug)]
@@ -47,13 +47,8 @@ pub struct FlownodeContext {
///
/// and send it back to the client, since we are mocking the sink table as a client, we should use table name as the key
/// note that the sink receiver should only have one, and we are using broadcast as mpsc channel here
pub sink_receiver: BTreeMap<
TableName,
(
mpsc::UnboundedSender<DiffRow>,
mpsc::UnboundedReceiver<DiffRow>,
),
>,
pub sink_receiver:
BTreeMap<TableName, (mpsc::UnboundedSender<Batch>, mpsc::UnboundedReceiver<Batch>)>,
/// the schema of the table, query from metasrv or inferred from TypedPlan
pub schema: HashMap<GlobalId, RelationDesc>,
/// All the tables that have been registered in the worker
@@ -61,25 +56,27 @@ pub struct FlownodeContext {
pub query_context: Option<Arc<QueryContext>>,
}
/// a simple broadcast sender with backpressure and unbound capacity
/// a simple broadcast sender with backpressure, bounded capacity and blocking on send when send buf is full
/// note that it wouldn't evict old data, so it's possible to block forever if the receiver is slow
///
/// the receiver still uses a tokio broadcast channel, since only the sender side needs to know about
/// backpressure and adjust the dataflow running duration to avoid blocking
#[derive(Debug)]
pub struct SourceSender {
// TODO(discord9): make it all Vec<DiffRow>?
sender: broadcast::Sender<DiffRow>,
send_buf_tx: mpsc::Sender<Vec<DiffRow>>,
send_buf_rx: RwLock<mpsc::Receiver<Vec<DiffRow>>>,
sender: broadcast::Sender<Batch>,
send_buf_tx: mpsc::Sender<Batch>,
send_buf_rx: RwLock<mpsc::Receiver<Batch>>,
send_buf_row_cnt: AtomicUsize,
}
impl Default for SourceSender {
fn default() -> Self {
// TODO(discord9): the capacity is arbitrary, we can adjust it later, might also want to limit the max number of rows in send buf
let (send_buf_tx, send_buf_rx) = mpsc::channel(SEND_BUF_CAP);
Self {
// TODO(discord9): find a better way than increasing this to prevent lagging and hence missing input data
sender: broadcast::Sender::new(BROADCAST_CAP * 2),
sender: broadcast::Sender::new(SEND_BUF_CAP),
send_buf_tx,
send_buf_rx: RwLock::new(send_buf_rx),
send_buf_row_cnt: AtomicUsize::new(0),
@@ -90,7 +87,7 @@ impl Default for SourceSender {
impl SourceSender {
/// max number of iterations to try flush send buf
const MAX_ITERATIONS: usize = 16;
pub fn get_receiver(&self) -> broadcast::Receiver<DiffRow> {
pub fn get_receiver(&self) -> broadcast::Receiver<Batch> {
self.sender.subscribe()
}
@@ -106,30 +103,27 @@ impl SourceSender {
break;
}
// TODO(discord9): send rows instead so it's just moving a pointer
if let Some(rows) = send_buf.recv().await {
let len = rows.len();
self.send_buf_row_cnt
.fetch_sub(len, std::sync::atomic::Ordering::SeqCst);
for row in rows {
self.sender
.send(row)
.map_err(|err| {
InternalSnafu {
reason: format!("Failed to send row, error = {:?}", err),
}
.build()
})
.with_context(|_| EvalSnafu)?;
row_cnt += 1;
}
if let Some(batch) = send_buf.recv().await {
let len = batch.row_count();
self.send_buf_row_cnt.fetch_sub(len, Ordering::SeqCst);
row_cnt += len;
self.sender
.send(batch)
.map_err(|err| {
InternalSnafu {
reason: format!("Failed to send row, error = {:?}", err),
}
.build()
})
.with_context(|_| EvalSnafu)?;
}
}
if row_cnt > 0 {
debug!("Send {} rows", row_cnt);
trace!("Source Flushed {} rows", row_cnt);
METRIC_FLOW_INPUT_BUF_SIZE.sub(row_cnt as _);
debug!(
"Remaining Send buf.len() = {}",
self.send_buf_rx.read().await.len()
trace!(
"Remaining Source Send buf.len() = {}",
METRIC_FLOW_INPUT_BUF_SIZE.get()
);
}
@@ -138,12 +132,23 @@ impl SourceSender {
/// return the number of rows it actually sent (including what's in the buffer)
pub async fn send_rows(&self, rows: Vec<DiffRow>) -> Result<usize, Error> {
self.send_buf_tx.send(rows).await.map_err(|e| {
METRIC_FLOW_INPUT_BUF_SIZE.add(rows.len() as _);
while self.send_buf_row_cnt.load(Ordering::SeqCst) >= BATCH_SIZE * 4 {
tokio::task::yield_now().await;
}
// row count metrics is approx so relaxed order is ok
self.send_buf_row_cnt
.fetch_add(rows.len(), Ordering::SeqCst);
let batch = Batch::try_from_rows(rows.into_iter().map(|(row, _, _)| row).collect())
.context(EvalSnafu)?;
common_telemetry::trace!("Send one batch to worker with {} rows", batch.row_count());
self.send_buf_tx.send(batch).await.map_err(|e| {
crate::error::InternalSnafu {
reason: format!("Failed to send row, error = {:?}", e),
}
.build()
})?;
Ok(0)
}
}
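The new send path applies simple backpressure before enqueueing. Below is a standalone, hedged sketch of the same pattern with plain std/tokio types; the names are illustrative, not Greptime APIs.
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
const MAX_BUFFERED_ROWS: usize = 4096; // stand-in for BATCH_SIZE * 4
async fn enqueue_with_backpressure(in_flight: Arc<AtomicUsize>, rows: usize) {
    // Spin-yield until the buffer drains enough; the flush loop decrements the
    // counter with fetch_sub once it forwards a batch, mirroring the code above.
    while in_flight.load(Ordering::SeqCst) >= MAX_BUFFERED_ROWS {
        tokio::task::yield_now().await;
    }
    in_flight.fetch_add(rows, Ordering::SeqCst);
    // ...send the batch over the bounded channel here...
}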
@@ -159,8 +164,6 @@ impl FlownodeContext {
.with_context(|| TableNotFoundSnafu {
name: table_id.to_string(),
})?;
debug!("FlownodeContext::send: trying to send {} rows", rows.len());
sender.send_rows(rows).await
}
@@ -174,16 +177,6 @@ impl FlownodeContext {
}
Ok(sum)
}
/// Return the sum number of rows in all send buf
/// TODO(discord9): remove this since we can't get correct row cnt anyway
pub async fn get_send_buf_size(&self) -> usize {
let mut sum = 0;
for sender in self.source_sender.values() {
sum += sender.send_buf_rx.read().await.len();
}
sum
}
}
impl FlownodeContext {
@@ -230,7 +223,7 @@ impl FlownodeContext {
pub fn add_sink_receiver(&mut self, table_name: TableName) {
self.sink_receiver
.entry(table_name)
.or_insert_with(mpsc::unbounded_channel::<DiffRow>);
.or_insert_with(mpsc::unbounded_channel);
}
pub fn get_source_by_global_id(&self, id: &GlobalId) -> Result<&SourceSender, Error> {
@@ -254,7 +247,7 @@ impl FlownodeContext {
pub fn get_sink_by_global_id(
&self,
id: &GlobalId,
) -> Result<mpsc::UnboundedSender<DiffRow>, Error> {
) -> Result<mpsc::UnboundedSender<Batch>, Error> {
let table_name = self
.table_repr
.get_by_global_id(id)

View File

@@ -27,7 +27,7 @@ use tokio::sync::{broadcast, mpsc, oneshot, Mutex};
use crate::adapter::FlowId;
use crate::compute::{Context, DataflowState, ErrCollector};
use crate::error::{Error, FlowAlreadyExistSnafu, InternalSnafu, UnexpectedSnafu};
use crate::expr::GlobalId;
use crate::expr::{Batch, GlobalId};
use crate::plan::TypedPlan;
use crate::repr::{self, DiffRow};
@@ -89,6 +89,8 @@ impl<'subgraph> ActiveDataflowState<'subgraph> {
err_collector: self.err_collector.clone(),
input_collection: Default::default(),
local_scope: Default::default(),
input_collection_batch: Default::default(),
local_scope_batch: Default::default(),
}
}
@@ -156,13 +158,13 @@ impl WorkerHandle {
///
/// the returned error is unrecoverable, and the worker should be shutdown/rebooted
pub async fn run_available(&self, now: repr::Timestamp, blocking: bool) -> Result<(), Error> {
common_telemetry::debug!("Running available with blocking={}", blocking);
common_telemetry::trace!("Running available with blocking={}", blocking);
if blocking {
let resp = self
.itc_client
.call_with_resp(Request::RunAvail { now, blocking })
.await?;
common_telemetry::debug!("Running available with response={:?}", resp);
common_telemetry::trace!("Running available with response={:?}", resp);
Ok(())
} else {
self.itc_client
@@ -225,9 +227,9 @@ impl<'s> Worker<'s> {
flow_id: FlowId,
plan: TypedPlan,
sink_id: GlobalId,
sink_sender: mpsc::UnboundedSender<DiffRow>,
sink_sender: mpsc::UnboundedSender<Batch>,
source_ids: &[GlobalId],
src_recvs: Vec<broadcast::Receiver<DiffRow>>,
src_recvs: Vec<broadcast::Receiver<Batch>>,
// TODO(discord9): set expire duration for all arrangement and compare to sys timestamp instead
expire_after: Option<repr::Duration>,
create_if_not_exists: bool,
@@ -249,12 +251,12 @@ impl<'s> Worker<'s> {
{
let mut ctx = cur_task_state.new_ctx(sink_id);
for (source_id, src_recv) in source_ids.iter().zip(src_recvs) {
let bundle = ctx.render_source(src_recv)?;
ctx.insert_global(*source_id, bundle);
let bundle = ctx.render_source_batch(src_recv)?;
ctx.insert_global_batch(*source_id, bundle);
}
let rendered = ctx.render_plan(plan)?;
ctx.render_unbounded_sink(rendered, sink_sender);
let rendered = ctx.render_plan_batch(plan)?;
ctx.render_unbounded_sink_batch(rendered, sink_sender);
}
self.task_states.insert(flow_id, cur_task_state);
Ok(Some(flow_id))
@@ -370,9 +372,9 @@ pub enum Request {
flow_id: FlowId,
plan: TypedPlan,
sink_id: GlobalId,
sink_sender: mpsc::UnboundedSender<DiffRow>,
sink_sender: mpsc::UnboundedSender<Batch>,
source_ids: Vec<GlobalId>,
src_recvs: Vec<broadcast::Receiver<DiffRow>>,
src_recvs: Vec<broadcast::Receiver<Batch>>,
expire_after: Option<repr::Duration>,
create_if_not_exists: bool,
err_collector: ErrCollector,
@@ -472,7 +474,7 @@ mod test {
use super::*;
use crate::expr::Id;
use crate::plan::Plan;
use crate::repr::{RelationType, Row};
use crate::repr::RelationType;
#[test]
fn drop_handle() {
@@ -497,8 +499,8 @@ mod test {
});
let handle = rx.await.unwrap();
let src_ids = vec![GlobalId::User(1)];
let (tx, rx) = broadcast::channel::<DiffRow>(1024);
let (sink_tx, mut sink_rx) = mpsc::unbounded_channel::<DiffRow>();
let (tx, rx) = broadcast::channel::<Batch>(1024);
let (sink_tx, mut sink_rx) = mpsc::unbounded_channel::<Batch>();
let (flow_id, plan) = (
1,
TypedPlan {
@@ -523,9 +525,9 @@ mod test {
handle.create_flow(create_reqs).await.unwrap(),
Some(flow_id)
);
tx.send((Row::empty(), 0, 0)).unwrap();
tx.send(Batch::empty()).unwrap();
handle.run_available(0, true).await.unwrap();
assert_eq!(sink_rx.recv().await.unwrap().0, Row::empty());
assert_eq!(sink_rx.recv().await.unwrap(), Batch::empty());
drop(handle);
worker_thread_handle.join().unwrap();
}

View File

@@ -49,6 +49,14 @@ pub struct Context<'referred, 'df> {
///
/// TODO(discord9): consider whether to use Vec<(LocalId, CollectionBundle)> instead
pub local_scope: Vec<BTreeMap<LocalId, CollectionBundle>>,
/// a list of all collections being used in the operator
///
/// TODO(discord9): remove extra clone by counting usage and remove it on last usage?
pub input_collection_batch: BTreeMap<GlobalId, CollectionBundle<Batch>>,
/// used by `Get`/`Let` Plan for getting/setting local variables
///
/// TODO(discord9): consider whether to use Vec<(LocalId, CollectionBundle)> instead
pub local_scope_batch: Vec<BTreeMap<LocalId, CollectionBundle<Batch>>>,
// Collect all errors in this operator's evaluation
pub err_collector: ErrCollector,
}
@@ -67,6 +75,19 @@ impl<'referred, 'df> Drop for Context<'referred, 'df> {
bundle.collection.into_inner().drop(self.df);
drop(bundle.arranged);
}
for bundle in std::mem::take(&mut self.input_collection_batch)
.into_values()
.chain(
std::mem::take(&mut self.local_scope_batch)
.into_iter()
.flat_map(|v| v.into_iter())
.map(|(_k, v)| v),
)
{
bundle.collection.into_inner().drop(self.df);
drop(bundle.arranged);
}
// The automatically generated "drop glue" which recursively calls the destructors of all the fields (including the now empty `input_collection`)
}
}
@@ -84,6 +105,19 @@ impl<'referred, 'df> Context<'referred, 'df> {
self.local_scope.push(first);
}
}
pub fn insert_global_batch(&mut self, id: GlobalId, collection: CollectionBundle<Batch>) {
self.input_collection_batch.insert(id, collection);
}
pub fn insert_local_batch(&mut self, id: LocalId, collection: CollectionBundle<Batch>) {
if let Some(last) = self.local_scope_batch.last_mut() {
last.insert(id, collection);
} else {
let first = BTreeMap::from([(id, collection)]);
self.local_scope_batch.push(first);
}
}
}
impl<'referred, 'df> Context<'referred, 'df> {
@@ -91,14 +125,8 @@ impl<'referred, 'df> Context<'referred, 'df> {
pub fn render_plan_batch(&mut self, plan: TypedPlan) -> Result<CollectionBundle<Batch>, Error> {
match plan.plan {
Plan::Constant { rows } => Ok(self.render_constant_batch(rows)),
Plan::Get { .. } => NotImplementedSnafu {
reason: "Get is still WIP in batchmode",
}
.fail(),
Plan::Let { .. } => NotImplementedSnafu {
reason: "Let is still WIP in batchmode",
}
.fail(),
Plan::Get { id } => self.get_batch_by_id(id),
Plan::Let { id, value, body } => self.eval_batch_let(id, value, body),
Plan::Mfp { input, mfp } => self.render_mfp_batch(input, mfp),
Plan::Reduce {
input,
@@ -225,6 +253,32 @@ impl<'referred, 'df> Context<'referred, 'df> {
CollectionBundle::from_collection(Collection::from_port(recv_port))
}
pub fn get_batch_by_id(&mut self, id: expr::Id) -> Result<CollectionBundle<Batch>, Error> {
let ret = match id {
expr::Id::Local(local) => {
let bundle = self
.local_scope_batch
.iter()
.rev()
.find_map(|scope| scope.get(&local))
.with_context(|| InvalidQuerySnafu {
reason: format!("Local variable {:?} not found", local),
})?;
bundle.clone(self.df)
}
expr::Id::Global(id) => {
let bundle =
self.input_collection_batch
.get(&id)
.with_context(|| InvalidQuerySnafu {
reason: format!("Collection {:?} not found", id),
})?;
bundle.clone(self.df)
}
};
Ok(ret)
}
pub fn get_by_id(&mut self, id: expr::Id) -> Result<CollectionBundle, Error> {
let ret = match id {
expr::Id::Local(local) => {
@@ -251,6 +305,21 @@ impl<'referred, 'df> Context<'referred, 'df> {
Ok(ret)
}
/// Eval `Let` operator, useful for assigning a value to a local variable
pub fn eval_batch_let(
&mut self,
id: LocalId,
value: Box<TypedPlan>,
body: Box<TypedPlan>,
) -> Result<CollectionBundle<Batch>, Error> {
let value = self.render_plan_batch(*value)?;
self.local_scope_batch.push(Default::default());
self.insert_local_batch(id, value);
let ret = self.render_plan_batch(*body)?;
Ok(ret)
}
/// Eval `Let` operator, useful for assigning a value to a local variable
pub fn eval_let(
&mut self,
@@ -268,11 +337,11 @@ impl<'referred, 'df> Context<'referred, 'df> {
}
/// The Common argument for all `Subgraph` in the render process
struct SubgraphArg<'a> {
struct SubgraphArg<'a, T = Toff> {
now: repr::Timestamp,
err_collector: &'a ErrCollector,
scheduler: &'a Scheduler,
send: &'a PortCtx<SEND, Toff>,
send: &'a PortCtx<SEND, T>,
}
#[cfg(test)]
@@ -345,6 +414,8 @@ mod test {
compute_state: state,
input_collection: BTreeMap::new(),
local_scope: Default::default(),
input_collection_batch: BTreeMap::new(),
local_scope_batch: Default::default(),
err_collector,
}
}
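To illustrate the Get/Let scoping that get_batch_by_id and eval_batch_let implement above, here is a toy, self-contained sketch of the innermost-scope-wins lookup over a stack of maps (plain std types, not flow-crate code):
use std::collections::BTreeMap;
fn lookup<'a>(scopes: &'a [BTreeMap<u64, &'a str>], id: u64) -> Option<&'a str> {
    // Search from the innermost scope outwards, as get_batch_by_id does.
    scopes.iter().rev().find_map(|scope| scope.get(&id).copied())
}
fn main() {
    let outer = BTreeMap::from([(0u64, "outer binding")]);
    // Let pushes a fresh scope before rendering its body, so the inner
    // binding shadows the outer one for the same id.
    let inner = BTreeMap::from([(0u64, "inner binding")]);
    assert_eq!(lookup(&[outer, inner], 0), Some("inner binding"));
}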

View File

@@ -12,14 +12,15 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::BTreeMap;
use std::collections::{BTreeMap, BTreeSet};
use std::ops::Range;
use std::sync::Arc;
use common_telemetry::trace;
use datatypes::data_type::ConcreteDataType;
use datatypes::prelude::DataType;
use datatypes::value::{ListValue, Value};
use datatypes::vectors::NullVector;
use datatypes::vectors::{BooleanVector, NullVector};
use hydroflow::scheduled::graph_ext::GraphExt;
use itertools::Itertools;
use snafu::{ensure, OptionExt, ResultExt};
@@ -27,8 +28,8 @@ use snafu::{ensure, OptionExt, ResultExt};
use crate::compute::render::{Context, SubgraphArg};
use crate::compute::types::{Arranged, Collection, CollectionBundle, ErrCollector, Toff};
use crate::error::{Error, NotImplementedSnafu, PlanSnafu};
use crate::expr::error::{DataAlreadyExpiredSnafu, DataTypeSnafu, InternalSnafu};
use crate::expr::{Batch, EvalError, ScalarExpr};
use crate::expr::error::{ArrowSnafu, DataAlreadyExpiredSnafu, DataTypeSnafu, InternalSnafu};
use crate::expr::{Accum, Accumulator, Batch, EvalError, ScalarExpr, VectorDiff};
use crate::plan::{AccumulablePlan, AggrWithIndex, KeyValPlan, ReducePlan, TypedPlan};
use crate::repr::{self, DiffRow, KeyValDiffRow, RelationType, Row};
use crate::utils::{ArrangeHandler, ArrangeReader, ArrangeWriter, KeyExpiryManager};
@@ -93,152 +94,39 @@ impl<'referred, 'df> Context<'referred, 'df> {
// TODO(discord9): better way to schedule future run
let scheduler = self.compute_state.get_scheduler();
let scheduler_inner = scheduler.clone();
let (out_send_port, out_recv_port) =
self.df.make_edge::<_, Toff<Batch>>(Self::REDUCE_BATCH);
let subgraph =
self.df.add_subgraph_in_out(
Self::REDUCE_BATCH,
input.collection.into_inner(),
out_send_port,
move |_ctx, recv, send| {
let now = *(now.borrow());
let arrange = arrange_handler_inner.clone();
// mfp only need to passively receive updates from recvs
let src_data = recv
.take_inner()
.into_iter()
.flat_map(|v| v.into_iter())
.collect_vec();
let subgraph = self.df.add_subgraph_in_out(
Self::REDUCE_BATCH,
input.collection.into_inner(),
out_send_port,
move |_ctx, recv, send| {
let now = *(now.borrow());
let arrange = arrange_handler_inner.clone();
// mfp only need to passively receive updates from recvs
let src_data = recv
.take_inner()
.into_iter()
.flat_map(|v| v.into_iter())
.collect_vec();
let mut key_to_many_vals = BTreeMap::<Row, Batch>::new();
for batch in src_data {
err_collector.run(|| {
let (key_batch, val_batch) =
batch_split_by_key_val(&batch, &key_val_plan, &err_collector);
ensure!(
key_batch.row_count() == val_batch.row_count(),
InternalSnafu {
reason: format!(
"Key and val batch should have the same row count, found {} and {}",
key_batch.row_count(),
val_batch.row_count()
)
}
);
for row_idx in 0..key_batch.row_count() {
let key_row = key_batch.get_row(row_idx).unwrap();
let val_row = val_batch.slice(row_idx, 1)?;
let val_batch =
key_to_many_vals.entry(Row::new(key_row)).or_default();
val_batch.append_batch(val_row)?;
}
Ok(())
});
}
// write lock the arrange for the rest of the function body
// to prevent wired race condition
let mut arrange = arrange.write();
let mut all_arrange_updates = Vec::with_capacity(key_to_many_vals.len());
let mut all_output_rows = Vec::with_capacity(key_to_many_vals.len());
for (key, val_batch) in key_to_many_vals {
err_collector.run(|| -> Result<(), _> {
let (accums, _, _) = arrange.get(now, &key).unwrap_or_default();
let accum_list = from_accum_values_to_live_accums(
accums.unpack(),
accum_plan.simple_aggrs.len(),
)?;
let mut accum_output = AccumOutput::new();
for AggrWithIndex {
expr,
input_idx,
output_idx,
} in accum_plan.simple_aggrs.iter()
{
let cur_old_accum = accum_list.get(*output_idx).cloned().unwrap_or_default();
// if batch is empty, input null instead
let cur_input = val_batch.batch().get(*input_idx).cloned().unwrap_or_else(||Arc::new(NullVector::new(val_batch.row_count())));
let (output, new_accum) =
expr.func.eval_batch(cur_old_accum, cur_input, None)?;
accum_output.insert_accum(*output_idx, new_accum);
accum_output.insert_output(*output_idx, output);
}
let (new_accums, res_val_row) = accum_output.into_accum_output()?;
let arrange_update = ((key.clone(), Row::new(new_accums)), now, 1);
all_arrange_updates.push(arrange_update);
let mut key_val = key;
key_val.extend(res_val_row);
all_output_rows.push((key_val, now, 1));
Ok(())
});
}
err_collector.run(|| {
arrange.apply_updates(now, all_arrange_updates)?;
arrange.compact_to(now)
});
// this output part is not supposed to be resource intensive
// (because for every batch there usually wouldn't be that many output rows),
// so we can do some costly operations here
let output_types = all_output_rows.first().map(|(row, _, _)| {
row.iter()
.map(|v| v.data_type())
.collect::<Vec<ConcreteDataType>>()
});
if let Some(output_types) = output_types {
err_collector.run(|| {
let column_cnt = output_types.len();
let row_cnt = all_output_rows.len();
let mut output_builder = output_types
.into_iter()
.map(|t| t.create_mutable_vector(row_cnt))
.collect_vec();
for (row, _, _) in all_output_rows {
for (i, v) in row.into_iter().enumerate() {
output_builder
.get_mut(i)
.context(InternalSnafu{
reason: format!(
"Output builder should have the same length as the row, expected at most {} but got {}",
column_cnt-1,
i
)
})?
.try_push_value_ref(v.as_value_ref())
.context(DataTypeSnafu {
msg: "Failed to push value",
})?;
}
}
let output_columns = output_builder
.into_iter()
.map(|mut b| b.to_vector())
.collect_vec();
let output_batch = Batch::try_new(output_columns, row_cnt)?;
send.give(vec![output_batch]);
Ok(())
});
}
},
);
reduce_batch_subgraph(
&arrange,
src_data,
&key_val_plan,
&accum_plan,
SubgraphArg {
now,
err_collector: &err_collector,
scheduler: &scheduler_inner,
send,
},
)
},
);
scheduler.set_cur_subgraph(subgraph);
@@ -461,6 +349,245 @@ fn split_rows_to_key_val(
)
}
fn reduce_batch_subgraph(
arrange: &ArrangeHandler,
src_data: impl IntoIterator<Item = Batch>,
key_val_plan: &KeyValPlan,
accum_plan: &AccumulablePlan,
SubgraphArg {
now,
err_collector,
scheduler: _,
send,
}: SubgraphArg<Toff<Batch>>,
) {
let mut key_to_many_vals = BTreeMap::<Row, Vec<Batch>>::new();
let mut input_row_count = 0;
let mut input_batch_count = 0;
for batch in src_data {
input_batch_count += 1;
input_row_count += batch.row_count();
err_collector.run(|| {
let (key_batch, val_batch) =
batch_split_by_key_val(&batch, key_val_plan, err_collector);
ensure!(
key_batch.row_count() == val_batch.row_count(),
InternalSnafu {
reason: format!(
"Key and val batch should have the same row count, found {} and {}",
key_batch.row_count(),
val_batch.row_count()
)
}
);
let mut distinct_keys = BTreeSet::new();
for row_idx in 0..key_batch.row_count() {
let key_row = key_batch.get_row(row_idx)?;
let key_row = Row::new(key_row);
if distinct_keys.contains(&key_row) {
continue;
} else {
distinct_keys.insert(key_row.clone());
}
}
// TODO: reduce the number of `eq` comparisons to a minimum by progressively slicing the key/val batch
for key_row in distinct_keys {
let key_scalar_value = {
let mut key_scalar_value = Vec::with_capacity(key_row.len());
for key in key_row.iter() {
let v =
key.try_to_scalar_value(&key.data_type())
.context(DataTypeSnafu {
msg: "can't convert key values to datafusion value",
})?;
let arrow_value =
v.to_scalar().context(crate::expr::error::DatafusionSnafu {
context: "can't convert key values to arrow value",
})?;
key_scalar_value.push(arrow_value);
}
key_scalar_value
};
// first compute equal from separate columns
let eq_results = key_scalar_value
.into_iter()
.zip(key_batch.batch().iter())
.map(|(key, col)| {
// TODO(discord9): this takes half of the CPU! And this is a redundant amount of `eq` calls!
arrow::compute::kernels::cmp::eq(&key, &col.to_arrow_array().as_ref() as _)
})
.try_collect::<_, Vec<_>, _>()
.context(ArrowSnafu {
context: "Failed to compare key values",
})?;
// then combine all equal results to finally found equal key rows
let opt_eq_mask = eq_results
.into_iter()
.fold(None, |acc, v| match acc {
Some(Ok(acc)) => Some(arrow::compute::kernels::boolean::and(&acc, &v)),
Some(Err(_)) => acc,
None => Some(Ok(v)),
})
.transpose()
.context(ArrowSnafu {
context: "Failed to combine key comparison results",
})?;
let key_eq_mask = if let Some(eq_mask) = opt_eq_mask {
BooleanVector::from(eq_mask)
} else {
// if None, the key_batch has zero columns, which means the key is empty,
// so we just return an all-true mask,
// i.e. take all values
BooleanVector::from(vec![true; key_batch.row_count()])
};
// TODO: both slice and mutate remaining batch
let cur_val_batch = val_batch.filter(&key_eq_mask)?;
key_to_many_vals
.entry(key_row)
.or_default()
.push(cur_val_batch);
}
Ok(())
});
}
trace!(
"Reduce take {} batches, {} rows",
input_batch_count,
input_row_count
);
// write lock the arrange for the rest of the function body
// to prevent a weird race condition
let mut arrange = arrange.write();
let mut all_arrange_updates = Vec::with_capacity(key_to_many_vals.len());
let mut all_output_dict = BTreeMap::new();
for (key, val_batches) in key_to_many_vals {
err_collector.run(|| -> Result<(), _> {
let (accums, _, _) = arrange.get(now, &key).unwrap_or_default();
let accum_list =
from_accum_values_to_live_accums(accums.unpack(), accum_plan.simple_aggrs.len())?;
let mut accum_output = AccumOutput::new();
for AggrWithIndex {
expr,
input_idx,
output_idx,
} in accum_plan.simple_aggrs.iter()
{
let cur_accum_value = accum_list.get(*output_idx).cloned().unwrap_or_default();
let mut cur_accum = if cur_accum_value.is_empty() {
Accum::new_accum(&expr.func.clone())?
} else {
Accum::try_into_accum(&expr.func, cur_accum_value)?
};
for val_batch in val_batches.iter() {
// if batch is empty, input null instead
let cur_input = val_batch
.batch()
.get(*input_idx)
.cloned()
.unwrap_or_else(|| Arc::new(NullVector::new(val_batch.row_count())));
let len = cur_input.len();
cur_accum.update_batch(&expr.func, VectorDiff::from(cur_input))?;
trace!("Reduce accum after take {} rows: {:?}", len, cur_accum);
}
let final_output = cur_accum.eval(&expr.func)?;
trace!("Reduce accum final output: {:?}", final_output);
accum_output.insert_output(*output_idx, final_output);
let cur_accum_value = cur_accum.into_state();
accum_output.insert_accum(*output_idx, cur_accum_value);
}
let (new_accums, res_val_row) = accum_output.into_accum_output()?;
let arrange_update = ((key.clone(), Row::new(new_accums)), now, 1);
all_arrange_updates.push(arrange_update);
all_output_dict.insert(key, Row::from(res_val_row));
Ok(())
});
}
err_collector.run(|| {
arrange.apply_updates(now, all_arrange_updates)?;
arrange.compact_to(now)
});
// release the lock
drop(arrange);
// this output part is not supposed to be resource intensive
// (each batch usually doesn't produce that many output rows),
// so we can afford some costly operations here
let output_types = all_output_dict.first_entry().map(|entry| {
entry
.key()
.iter()
.chain(entry.get().iter())
.map(|v| v.data_type())
.collect::<Vec<ConcreteDataType>>()
});
if let Some(output_types) = output_types {
err_collector.run(|| {
let column_cnt = output_types.len();
let row_cnt = all_output_dict.len();
let mut output_builder = output_types
.into_iter()
.map(|t| t.create_mutable_vector(row_cnt))
.collect_vec();
for (key, val) in all_output_dict {
for (i, v) in key.into_iter().chain(val.into_iter()).enumerate() {
output_builder
.get_mut(i)
.context(InternalSnafu{
reason: format!(
"Output builder should have the same length as the row, expected at most {} but got {}",
column_cnt - 1,
i
)
})?
.try_push_value_ref(v.as_value_ref())
.context(DataTypeSnafu {
msg: "Failed to push value",
})?;
}
}
let output_columns = output_builder
.into_iter()
.map(|mut b| b.to_vector())
.collect_vec();
let output_batch = Batch::try_new(output_columns, row_cnt)?;
trace!("Reduce output batch: {:?}", output_batch);
send.give(vec![output_batch]);
Ok(())
});
}
}
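// A minimal, standalone sketch (plain arrow-rs arrays rather than this crate's
// `Batch`/`Row` types) of the mask-per-distinct-key idea used above: compare each
// key column against the key's scalar with `cmp::eq`, AND the per-column masks
// together with the boolean kernel, then filter the value column with the result.
use arrow::array::{Array, BooleanArray, Int64Array, Scalar, StringArray};
use arrow::compute::filter;
use arrow::compute::kernels::{boolean::and, cmp::eq};
fn main() -> Result<(), arrow::error::ArrowError> {
    // two key columns and one value column, three rows
    let key_a = StringArray::from(vec!["x", "x", "y"]);
    let key_b = Int64Array::from(vec![1, 2, 1]);
    let vals = Int64Array::from(vec![10, 20, 30]);
    // the distinct key currently being processed: ("x", 1)
    let want_a = Scalar::new(StringArray::from(vec!["x"]));
    let want_b = Scalar::new(Int64Array::from(vec![1]));
    // per-column equality, combined into a single boolean mask
    let mask: BooleanArray = and(&eq(&key_a, &want_a)?, &eq(&key_b, &want_b)?)?;
    // rows of the value column that belong to this key
    let for_this_key = filter(&vals, &mask)?;
    assert_eq!(for_this_key.len(), 1);
    Ok(())
}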
/// Reduce subgraph: reduces the input data into a single row.
/// The output is the concatenation of key and val.
fn reduce_subgraph(
@@ -856,6 +983,9 @@ impl AccumOutput {
/// return (accums, output)
fn into_accum_output(self) -> Result<(Vec<Value>, Vec<Value>), EvalError> {
if self.accum.is_empty() && self.output.is_empty() {
return Ok((vec![], vec![]));
}
ensure!(
!self.accum.is_empty() && self.accum.len() == self.output.len(),
InternalSnafu {


@@ -16,7 +16,7 @@
use std::collections::{BTreeMap, VecDeque};
use common_telemetry::debug;
use common_telemetry::{debug, trace};
use hydroflow::scheduled::graph_ext::GraphExt;
use itertools::Itertools;
use snafu::OptionExt;
@@ -48,10 +48,13 @@ impl<'referred, 'df> Context<'referred, 'df> {
let sub = self
.df
.add_subgraph_source("source_batch", send_port, move |_ctx, send| {
let mut total_batches = vec![];
let mut total_row_count = 0;
loop {
match src_recv.try_recv() {
Ok(batch) => {
send.give(vec![batch]);
total_row_count += batch.row_count();
total_batches.push(batch);
}
Err(TryRecvError::Empty) => {
break;
@@ -78,6 +81,13 @@ impl<'referred, 'df> Context<'referred, 'df> {
}
}
trace!(
"Send {} rows in {} batches",
total_row_count,
total_batches.len()
);
send.give(total_batches);
let now = *now.borrow();
// always schedule source to run at now so we can
// repeatedly run source if needed
@@ -185,13 +195,18 @@ impl<'referred, 'df> Context<'referred, 'df> {
collection.into_inner(),
move |_ctx, recv| {
let data = recv.take_inner();
let mut row_count = 0;
let mut batch_count = 0;
for batch in data.into_iter().flat_map(|i| i.into_iter()) {
row_count += batch.row_count();
batch_count += 1;
// if the sender is closed unexpectedly, stop sending
if sender.is_closed() || sender.send(batch).is_err() {
common_telemetry::error!("UnboundedSinkBatch is closed");
break;
}
}
trace!("sink send {} rows in {} batches", row_count, batch_count);
},
);
}


@@ -24,7 +24,7 @@ use hydroflow::scheduled::SubgraphId;
use itertools::Itertools;
use tokio::sync::Mutex;
use crate::expr::{EvalError, ScalarExpr};
use crate::expr::{Batch, EvalError, ScalarExpr};
use crate::repr::DiffRow;
use crate::utils::ArrangeHandler;
@@ -123,6 +123,38 @@ pub struct CollectionBundle<T: 'static = DiffRow> {
pub arranged: BTreeMap<Vec<ScalarExpr>, Arranged>,
}
pub trait GenericBundle {
fn is_batch(&self) -> bool;
fn try_as_batch(&self) -> Option<&CollectionBundle<Batch>> {
None
}
fn try_as_row(&self) -> Option<&CollectionBundle<DiffRow>> {
None
}
}
impl GenericBundle for CollectionBundle<Batch> {
fn is_batch(&self) -> bool {
true
}
fn try_as_batch(&self) -> Option<&CollectionBundle<Batch>> {
Some(self)
}
}
impl GenericBundle for CollectionBundle<DiffRow> {
fn is_batch(&self) -> bool {
false
}
fn try_as_row(&self) -> Option<&CollectionBundle<DiffRow>> {
Some(self)
}
}
impl<T: 'static> CollectionBundle<T> {
pub fn from_collection(collection: Collection<T>) -> Self {
Self {


@@ -16,27 +16,29 @@
mod df_func;
pub(crate) mod error;
mod func;
pub(crate) mod func;
mod id;
mod linear;
mod relation;
pub(crate) mod relation;
mod scalar;
mod signature;
use arrow::compute::FilterBuilder;
use datatypes::prelude::DataType;
use datatypes::value::Value;
use datatypes::vectors::VectorRef;
use datatypes::vectors::{BooleanVector, Helper, VectorRef};
pub(crate) use df_func::{DfScalarFunction, RawDfScalarFn};
pub(crate) use error::{EvalError, InvalidArgumentSnafu};
pub(crate) use func::{BinaryFunc, UnaryFunc, UnmaterializableFunc, VariadicFunc};
pub(crate) use id::{GlobalId, Id, LocalId};
use itertools::Itertools;
pub(crate) use linear::{MapFilterProject, MfpPlan, SafeMfpPlan};
pub(crate) use relation::{AggregateExpr, AggregateFunc};
pub(crate) use relation::{Accum, Accumulator, AggregateExpr, AggregateFunc};
pub(crate) use scalar::{ScalarExpr, TypedExpr};
use snafu::{ensure, ResultExt};
use crate::expr::error::DataTypeSnafu;
use crate::expr::error::{ArrowSnafu, DataTypeSnafu};
use crate::repr::Diff;
pub const TUMBLE_START: &str = "tumble_start";
pub const TUMBLE_END: &str = "tumble_end";
@@ -179,7 +181,9 @@ impl Batch {
)
}
);
Ok(self.batch.iter().map(|v| v.get(idx)).collect_vec())
let mut ret = Vec::with_capacity(self.column_count());
ret.extend(self.batch.iter().map(|v| v.get(idx)));
Ok(ret)
}
/// Slices the `Batch`, returning a new `Batch`.
@@ -248,4 +252,97 @@ impl Batch {
self.row_count = self_row_count + other_row_count;
Ok(())
}
/// Filters the batch with the given predicate
pub fn filter(&self, predicate: &BooleanVector) -> Result<Self, EvalError> {
let len = predicate.as_boolean_array().true_count();
let filter_builder = FilterBuilder::new(predicate.as_boolean_array()).optimize();
let filter_pred = filter_builder.build();
let filtered = self
.batch()
.iter()
.map(|col| filter_pred.filter(col.to_arrow_array().as_ref()))
.try_collect::<_, Vec<_>, _>()
.context(ArrowSnafu {
context: "Failed to filter val batches",
})?;
let res_vector = Helper::try_into_vectors(&filtered).context(DataTypeSnafu {
msg: "can't convert arrow array to vector",
})?;
Self::try_new(res_vector, len)
}
}
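// A small standalone sketch of the `FilterBuilder` pattern the `filter` method
// above relies on: build (and optimize) the predicate once, then apply the same
// `FilterPredicate` to every column of the batch. Plain arrow arrays stand in
// for this crate's vector types here.
use arrow::array::{Array, BooleanArray, Int64Array, StringArray};
use arrow::compute::FilterBuilder;
fn main() -> Result<(), arrow::error::ArrowError> {
    let predicate = BooleanArray::from(vec![true, false, true, false]);
    // `optimize` computes an optimized representation of the predicate up front
    let filter = FilterBuilder::new(&predicate).optimize().build();
    let col_a = Int64Array::from(vec![1, 2, 3, 4]);
    let col_b = StringArray::from(vec!["a", "b", "c", "d"]);
    // the same predicate is applied column by column, as in `Batch::filter`
    let filtered_a = filter.filter(&col_a)?;
    let filtered_b = filter.filter(&col_b)?;
    assert_eq!((filtered_a.len(), filtered_b.len()), (2, 2));
    Ok(())
}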
/// Vector with an optional diff column marking inserts (+1) and deletes (-1)
pub(crate) struct VectorDiff {
vector: VectorRef,
diff: Option<VectorRef>,
}
impl From<VectorRef> for VectorDiff {
fn from(vector: VectorRef) -> Self {
Self { vector, diff: None }
}
}
impl VectorDiff {
fn len(&self) -> usize {
self.vector.len()
}
fn try_new(vector: VectorRef, diff: Option<VectorRef>) -> Result<Self, EvalError> {
ensure!(
diff.as_ref()
.map_or(true, |diff| diff.len() == vector.len()),
InvalidArgumentSnafu {
reason: "Length of vector and diff should be the same"
}
);
Ok(Self { vector, diff })
}
}
impl IntoIterator for VectorDiff {
type Item = (Value, Diff);
type IntoIter = VectorDiffIter;
fn into_iter(self) -> Self::IntoIter {
VectorDiffIter {
vector: self.vector,
diff: self.diff,
idx: 0,
}
}
}
/// iterator for VectorDiff
pub(crate) struct VectorDiffIter {
vector: VectorRef,
diff: Option<VectorRef>,
idx: usize,
}
impl std::iter::Iterator for VectorDiffIter {
type Item = (Value, Diff);
fn next(&mut self) -> Option<Self::Item> {
if self.idx >= self.vector.len() {
return None;
}
let value = self.vector.get(self.idx);
// +1 means insert, -1 means delete; defaults to +1 (insert) when no diff is provided
let diff = if let Some(diff) = self.diff.as_ref() {
if let Ok(diff_at) = diff.get(self.idx).try_into() {
diff_at
} else {
common_telemetry::warn!("Invalid diff value at index {}", self.idx);
return None;
}
} else {
1
};
self.idx += 1;
Some((value, diff))
}
}
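// A plain-Rust sketch of the (value, diff) contract `VectorDiff` iterates over,
// using hypothetical stand-in slices instead of `VectorRef`: one way a sum-style
// accumulator could consume the pairs is to apply +1 diffs as inserts and -1
// diffs as retractions, defaulting to +1 when no diff column is provided.
fn accumulate_sum(values: &[i64], diffs: Option<&[i64]>) -> i64 {
    values
        .iter()
        .enumerate()
        // default every row to an insert (+1) when there is no diff column
        .map(|(idx, v)| v * diffs.map_or(1, |d| d[idx]))
        .sum()
}
fn main() {
    // two inserts and one retraction of 10: 10 + 5 - 10 = 5
    assert_eq!(accumulate_sum(&[10, 5, 10], Some(&[1, 1, -1])), 5);
    // without a diff column every row counts as an insert: 10 + 5 + 10 = 25
    assert_eq!(accumulate_sum(&[10, 5, 10], None), 25);
}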


@@ -92,12 +92,8 @@ impl DfScalarFunction {
let len = rb.num_rows();
let res = self.fn_impl.evaluate(&rb).map_err(|err| {
EvalDatafusionSnafu {
raw: err,
context: "Failed to evaluate datafusion scalar function",
}
.build()
let res = self.fn_impl.evaluate(&rb).context(EvalDatafusionSnafu {
context: "Failed to evaluate datafusion scalar function",
})?;
let res = common_query::columnar_value::ColumnarValue::try_from(&res)
.map_err(BoxedError::new)
@@ -157,12 +153,8 @@ impl DfScalarFunction {
.into_error(err)
})?;
let res = self.fn_impl.evaluate(&rb).map_err(|err| {
EvalDatafusionSnafu {
raw: err,
context: "Failed to evaluate datafusion scalar function",
}
.build()
let res = self.fn_impl.evaluate(&rb).context(EvalDatafusionSnafu {
context: "Failed to evaluate datafusion scalar function",
})?;
let res = common_query::columnar_value::ColumnarValue::try_from(&res)
.map_err(BoxedError::new)


@@ -106,18 +106,19 @@ pub enum EvalError {
location: Location,
},
#[snafu(display("Arrow error: {raw:?}, context: {context}"))]
#[snafu(display("Arrow error: {error:?}, context: {context}"))]
Arrow {
#[snafu(source)]
raw: ArrowError,
error: ArrowError,
context: String,
#[snafu(implicit)]
location: Location,
},
#[snafu(display("DataFusion error: {raw:?}, context: {context}"))]
#[snafu(display("DataFusion error: {error:?}, context: {context}"))]
Datafusion {
raw: DataFusionError,
#[snafu(source)]
error: DataFusionError,
context: String,
#[snafu(implicit)]
location: Location,


@@ -967,7 +967,7 @@ impl BinaryFunc {
| Self::DivUInt32
| Self::DivUInt64
| Self::DivFloat32
| Self::DivFloat64 => arrow::compute::kernels::numeric::mul(&left, &right)
| Self::DivFloat64 => arrow::compute::kernels::numeric::div(&left, &right)
.context(ArrowSnafu { context: "div" })?,
Self::ModInt16
@@ -1280,119 +1280,183 @@ where
Ok(Value::from(left % right))
}
#[test]
fn test_num_ops() {
let left = Value::from(10);
let right = Value::from(3);
let res = add::<i32>(left.clone(), right.clone()).unwrap();
assert_eq!(res, Value::from(13));
let res = sub::<i32>(left.clone(), right.clone()).unwrap();
assert_eq!(res, Value::from(7));
let res = mul::<i32>(left.clone(), right.clone()).unwrap();
assert_eq!(res, Value::from(30));
let res = div::<i32>(left.clone(), right.clone()).unwrap();
assert_eq!(res, Value::from(3));
let res = rem::<i32>(left, right).unwrap();
assert_eq!(res, Value::from(1));
#[cfg(test)]
mod test {
use std::sync::Arc;
let values = vec![Value::from(true), Value::from(false)];
let exprs = vec![ScalarExpr::Column(0), ScalarExpr::Column(1)];
let res = and(&values, &exprs).unwrap();
assert_eq!(res, Value::from(false));
let res = or(&values, &exprs).unwrap();
assert_eq!(res, Value::from(true));
}
use common_time::Interval;
use datatypes::vectors::Vector;
use pretty_assertions::assert_eq;
/// Test that the binary function specialization works,
/// whether from a direct type or from a literal expression
#[test]
fn test_binary_func_spec() {
assert_eq!(
BinaryFunc::from_str_expr_and_type(
"add",
&[ScalarExpr::Column(0), ScalarExpr::Column(0)],
&[
Some(ConcreteDataType::int32_datatype()),
Some(ConcreteDataType::int32_datatype())
]
)
.unwrap(),
(BinaryFunc::AddInt32, BinaryFunc::AddInt32.signature())
);
use super::*;
assert_eq!(
BinaryFunc::from_str_expr_and_type(
"add",
&[ScalarExpr::Column(0), ScalarExpr::Column(0)],
&[Some(ConcreteDataType::int32_datatype()), None]
)
.unwrap(),
(BinaryFunc::AddInt32, BinaryFunc::AddInt32.signature())
);
#[test]
fn test_tumble_batch() {
let datetime_vector = DateTimeVector::from_vec(vec![1, 2, 10, 13, 14, 20, 25]);
let tumble_start = UnaryFunc::TumbleWindowFloor {
window_size: Interval::from_day_time(0, 10),
start_time: None,
};
let tumble_end = UnaryFunc::TumbleWindowCeiling {
window_size: Interval::from_day_time(0, 10),
start_time: None,
};
assert_eq!(
BinaryFunc::from_str_expr_and_type(
"add",
&[ScalarExpr::Column(0), ScalarExpr::Column(0)],
&[Some(ConcreteDataType::int32_datatype()), None]
)
.unwrap(),
(BinaryFunc::AddInt32, BinaryFunc::AddInt32.signature())
);
let len = datetime_vector.len();
let batch = Batch::try_new(vec![Arc::new(datetime_vector)], len).unwrap();
let arg = ScalarExpr::Column(0);
assert_eq!(
BinaryFunc::from_str_expr_and_type(
"add",
&[ScalarExpr::Column(0), ScalarExpr::Column(0)],
&[Some(ConcreteDataType::int32_datatype()), None]
)
.unwrap(),
(BinaryFunc::AddInt32, BinaryFunc::AddInt32.signature())
);
let start = tumble_start.eval_batch(&batch, &arg).unwrap();
let end = tumble_end.eval_batch(&batch, &arg).unwrap();
assert_eq!(
start.to_arrow_array().as_ref(),
TimestampMillisecondVector::from_vec(vec![0, 0, 10, 10, 10, 20, 20])
.to_arrow_array()
.as_ref()
);
assert_eq!(
BinaryFunc::from_str_expr_and_type(
"add",
&[
ScalarExpr::Literal(Value::from(1i32), ConcreteDataType::int32_datatype()),
ScalarExpr::Column(0)
],
&[None, None]
)
.unwrap(),
(BinaryFunc::AddInt32, BinaryFunc::AddInt32.signature())
);
assert_eq!(
end.to_arrow_array().as_ref(),
TimestampMillisecondVector::from_vec(vec![10, 10, 20, 20, 20, 30, 30])
.to_arrow_array()
.as_ref()
);
// this testcase makes sure the specialization can find the actual type from the expression and fill in the signature
assert_eq!(
BinaryFunc::from_str_expr_and_type(
"equal",
&[
ScalarExpr::Literal(Value::from(1i32), ConcreteDataType::int32_datatype()),
ScalarExpr::Column(0)
],
&[None, None]
)
.unwrap(),
(
BinaryFunc::Eq,
Signature {
input: smallvec![
ConcreteDataType::int32_datatype(),
ConcreteDataType::int32_datatype()
let ts_ms_vector = TimestampMillisecondVector::from_vec(vec![1, 2, 10, 13, 14, 20, 25]);
let batch = Batch::try_new(vec![Arc::new(ts_ms_vector)], len).unwrap();
let start = tumble_start.eval_batch(&batch, &arg).unwrap();
let end = tumble_end.eval_batch(&batch, &arg).unwrap();
assert_eq!(
start.to_arrow_array().as_ref(),
TimestampMillisecondVector::from_vec(vec![0, 0, 10, 10, 10, 20, 20])
.to_arrow_array()
.as_ref()
);
assert_eq!(
end.to_arrow_array().as_ref(),
TimestampMillisecondVector::from_vec(vec![10, 10, 20, 20, 20, 30, 30])
.to_arrow_array()
.as_ref()
);
}
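// A tiny worked example of the tumble-window arithmetic the assertions above
// imply, assuming a 10 ms window aligned to 0 (start_time: None):
// floor = ts - ts % window and ceiling = floor + window.
fn main() {
    let window = 10i64;
    for (ts, expected_start, expected_end) in [(1i64, 0, 10), (13, 10, 20), (25, 20, 30)] {
        let start = ts - ts % window;
        let end = start + window;
        assert_eq!((start, end), (expected_start, expected_end));
    }
}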
#[test]
fn test_num_ops() {
let left = Value::from(10);
let right = Value::from(3);
let res = add::<i32>(left.clone(), right.clone()).unwrap();
assert_eq!(res, Value::from(13));
let res = sub::<i32>(left.clone(), right.clone()).unwrap();
assert_eq!(res, Value::from(7));
let res = mul::<i32>(left.clone(), right.clone()).unwrap();
assert_eq!(res, Value::from(30));
let res = div::<i32>(left.clone(), right.clone()).unwrap();
assert_eq!(res, Value::from(3));
let res = rem::<i32>(left, right).unwrap();
assert_eq!(res, Value::from(1));
let values = vec![Value::from(true), Value::from(false)];
let exprs = vec![ScalarExpr::Column(0), ScalarExpr::Column(1)];
let res = and(&values, &exprs).unwrap();
assert_eq!(res, Value::from(false));
let res = or(&values, &exprs).unwrap();
assert_eq!(res, Value::from(true));
}
/// Test that the binary function specialization works,
/// whether from a direct type or from a literal expression
#[test]
fn test_binary_func_spec() {
assert_eq!(
BinaryFunc::from_str_expr_and_type(
"add",
&[ScalarExpr::Column(0), ScalarExpr::Column(0)],
&[
Some(ConcreteDataType::int32_datatype()),
Some(ConcreteDataType::int32_datatype())
]
)
.unwrap(),
(BinaryFunc::AddInt32, BinaryFunc::AddInt32.signature())
);
assert_eq!(
BinaryFunc::from_str_expr_and_type(
"add",
&[ScalarExpr::Column(0), ScalarExpr::Column(0)],
&[Some(ConcreteDataType::int32_datatype()), None]
)
.unwrap(),
(BinaryFunc::AddInt32, BinaryFunc::AddInt32.signature())
);
assert_eq!(
BinaryFunc::from_str_expr_and_type(
"add",
&[ScalarExpr::Column(0), ScalarExpr::Column(0)],
&[Some(ConcreteDataType::int32_datatype()), None]
)
.unwrap(),
(BinaryFunc::AddInt32, BinaryFunc::AddInt32.signature())
);
assert_eq!(
BinaryFunc::from_str_expr_and_type(
"add",
&[ScalarExpr::Column(0), ScalarExpr::Column(0)],
&[Some(ConcreteDataType::int32_datatype()), None]
)
.unwrap(),
(BinaryFunc::AddInt32, BinaryFunc::AddInt32.signature())
);
assert_eq!(
BinaryFunc::from_str_expr_and_type(
"add",
&[
ScalarExpr::Literal(Value::from(1i32), ConcreteDataType::int32_datatype()),
ScalarExpr::Column(0)
],
output: ConcreteDataType::boolean_datatype(),
generic_fn: GenericFn::Eq
}
)
);
&[None, None]
)
.unwrap(),
(BinaryFunc::AddInt32, BinaryFunc::AddInt32.signature())
);
matches!(
BinaryFunc::from_str_expr_and_type(
"add",
&[ScalarExpr::Column(0), ScalarExpr::Column(0)],
&[None, None]
),
Err(Error::InvalidQuery { .. })
);
// this testcase makes sure the specialization can find the actual type from the expression and fill in the signature
assert_eq!(
BinaryFunc::from_str_expr_and_type(
"equal",
&[
ScalarExpr::Literal(Value::from(1i32), ConcreteDataType::int32_datatype()),
ScalarExpr::Column(0)
],
&[None, None]
)
.unwrap(),
(
BinaryFunc::Eq,
Signature {
input: smallvec![
ConcreteDataType::int32_datatype(),
ConcreteDataType::int32_datatype()
],
output: ConcreteDataType::boolean_datatype(),
generic_fn: GenericFn::Eq
}
)
);
matches!(
BinaryFunc::from_str_expr_and_type(
"add",
&[ScalarExpr::Column(0), ScalarExpr::Column(0)],
&[None, None]
),
Err(Error::InvalidQuery { .. })
);
}
}


@@ -17,8 +17,9 @@
use std::collections::{BTreeMap, BTreeSet};
use arrow::array::BooleanArray;
use arrow::buffer::BooleanBuffer;
use arrow::compute::FilterBuilder;
use common_telemetry::debug;
use common_telemetry::trace;
use datatypes::prelude::ConcreteDataType;
use datatypes::value::Value;
use datatypes::vectors::{BooleanVector, Helper};
@@ -500,7 +501,7 @@ impl SafeMfpPlan {
for col in batch.batch() {
let filtered = pred
.filter(col.to_arrow_array().as_ref())
.context(ArrowSnafu {
.with_context(|_| ArrowSnafu {
context: format!("failed to filter column for mfp operator {:?}", self),
})?;
result.push(Helper::try_into_vector(filtered).context(DataTypeSnafu {
@@ -523,7 +524,9 @@ impl SafeMfpPlan {
// mark the columns that have been evaluated and appended to the `batch`
let mut expression = 0;
// preds default to true and will be updated as we evaluate each predicate
let mut all_preds = BooleanVector::from(vec![Some(true); batch.row_count()]);
let buf = BooleanBuffer::new_set(batch.row_count());
let arr = BooleanArray::new(buf, None);
let mut all_preds = BooleanVector::from(arr);
// to compute predicate, need to first compute all expressions used in predicates
for (support, predicate) in self.mfp.predicates.iter() {
@@ -793,7 +796,7 @@ impl MfpPlan {
if Some(lower_bound) != upper_bound && !null_eval {
if self.mfp.mfp.projection.iter().any(|c| values.len() <= *c) {
debug!("values={:?}, mfp={:?}", &values, &self.mfp.mfp);
trace!("values={:?}, mfp={:?}", &values, &self.mfp.mfp);
let err = InternalSnafu {
reason: format!(
"Index out of bound for mfp={:?} and values={:?}",


@@ -14,6 +14,7 @@
//! Describes an aggregation function and its input expression.
pub(crate) use accum::{Accum, Accumulator};
pub(crate) use func::AggregateFunc;
use crate::expr::ScalarExpr;


@@ -21,14 +21,14 @@ use datatypes::value::Value;
use datatypes::vectors::VectorRef;
use serde::{Deserialize, Serialize};
use smallvec::smallvec;
use snafu::{ensure, IntoError, OptionExt};
use snafu::{IntoError, OptionExt};
use strum::{EnumIter, IntoEnumIterator};
use crate::error::{DatafusionSnafu, Error, InvalidQuerySnafu};
use crate::expr::error::EvalError;
use crate::expr::relation::accum::{Accum, Accumulator};
use crate::expr::signature::{GenericFn, Signature};
use crate::expr::InvalidArgumentSnafu;
use crate::expr::VectorDiff;
use crate::repr::Diff;
/// Aggregate functions that can be applied to a group of rows.
@@ -161,72 +161,6 @@ impl AggregateFunc {
}
}
struct VectorDiff {
vector: VectorRef,
diff: Option<VectorRef>,
}
impl VectorDiff {
fn len(&self) -> usize {
self.vector.len()
}
fn try_new(vector: VectorRef, diff: Option<VectorRef>) -> Result<Self, EvalError> {
ensure!(
diff.as_ref()
.map_or(true, |diff| diff.len() == vector.len()),
InvalidArgumentSnafu {
reason: "Length of vector and diff should be the same"
}
);
Ok(Self { vector, diff })
}
}
impl IntoIterator for VectorDiff {
type Item = (Value, Diff);
type IntoIter = VectorDiffIter;
fn into_iter(self) -> Self::IntoIter {
VectorDiffIter {
vector: self.vector,
diff: self.diff,
idx: 0,
}
}
}
struct VectorDiffIter {
vector: VectorRef,
diff: Option<VectorRef>,
idx: usize,
}
impl std::iter::Iterator for VectorDiffIter {
type Item = (Value, Diff);
fn next(&mut self) -> Option<Self::Item> {
if self.idx >= self.vector.len() {
return None;
}
let value = self.vector.get(self.idx);
// +1 means insert, -1 means delete; defaults to +1 (insert) when no diff is provided
let diff = if let Some(diff) = self.diff.as_ref() {
if let Ok(diff_at) = diff.get(self.idx).try_into() {
diff_at
} else {
common_telemetry::warn!("Invalid diff value at index {}", self.idx);
return None;
}
} else {
1
};
self.idx += 1;
Some((value, diff))
}
}
/// Generate signature for each aggregate function
macro_rules! generate_signature {
($value:ident,


@@ -16,17 +16,20 @@
use std::collections::{BTreeMap, BTreeSet};
use arrow::array::{make_array, ArrayData, ArrayRef};
use common_error::ext::BoxedError;
use datatypes::prelude::{ConcreteDataType, DataType};
use datatypes::value::Value;
use datatypes::vectors::{BooleanVector, Helper, NullVector, Vector, VectorRef};
use datatypes::vectors::{BooleanVector, Helper, VectorRef};
use hydroflow::lattices::cc_traits::Iter;
use itertools::Itertools;
use snafu::{ensure, OptionExt, ResultExt};
use crate::error::{
DatafusionSnafu, Error, InvalidQuerySnafu, UnexpectedSnafu, UnsupportedTemporalFilterSnafu,
};
use crate::expr::error::{
DataTypeSnafu, EvalError, InternalSnafu, InvalidArgumentSnafu, OptimizeSnafu, TypeMismatchSnafu,
ArrowSnafu, DataTypeSnafu, EvalError, InvalidArgumentSnafu, OptimizeSnafu, TypeMismatchSnafu,
};
use crate::expr::func::{BinaryFunc, UnaryFunc, UnmaterializableFunc, VariadicFunc};
use crate::expr::{Batch, DfScalarFunction};
@@ -222,6 +225,8 @@ impl ScalarExpr {
}
}
/// NOTE: this if-then eval impl assumes all given exprs are pure and will not change the state of the world,
/// since it evaluates both the then and else branches and filters the result
fn eval_if_then(
batch: &Batch,
cond: &ScalarExpr,
@@ -240,130 +245,69 @@ impl ScalarExpr {
})?
.as_boolean_array();
let mut then_input_batch = None;
let mut else_input_batch = None;
let mut null_input_batch = None;
let indices = bool_conds
.into_iter()
.enumerate()
.map(|(idx, b)| {
(
match b {
Some(true) => 0, // then branch vector
Some(false) => 1, // else branch vector
None => 2, // null vector
},
idx,
)
})
.collect_vec();
// instructions for how to reassemble the result vector:
// iterate over (type of vec, offset, length) and append to the resulting vec
let mut assembly_idx = vec![];
let then_input_vec = then.eval_batch(batch)?;
let else_input_vec = els.eval_batch(batch)?;
// append batch, returning appended batch's slice in (offset, length)
fn append_batch(
batch: &mut Option<Batch>,
to_be_append: Batch,
) -> Result<(usize, usize), EvalError> {
let len = to_be_append.row_count();
if let Some(batch) = batch {
let offset = batch.row_count();
batch.append_batch(to_be_append)?;
Ok((offset, len))
} else {
*batch = Some(to_be_append);
Ok((0, len))
ensure!(
then_input_vec.data_type() == else_input_vec.data_type(),
TypeMismatchSnafu {
expected: then_input_vec.data_type(),
actual: else_input_vec.data_type(),
}
);
ensure!(
then_input_vec.len() == else_input_vec.len() && then_input_vec.len() == batch.row_count(),
InvalidArgumentSnafu {
reason: format!(
"then and else branch must have the same length(found {} and {}) which equals input batch's row count(which is {})",
then_input_vec.len(),
else_input_vec.len(),
batch.row_count()
)
}
);
fn new_nulls(dt: &arrow_schema::DataType, len: usize) -> ArrayRef {
let data = ArrayData::new_null(dt, len);
make_array(data)
}
let mut prev_cond: Option<Option<bool>> = None;
let mut prev_start_idx: Option<usize> = None;
// first put different conds' vector into different batches
for (idx, cond) in bool_conds.iter().enumerate() {
// if it belongs to the same slice and is not the last one, continue
if prev_cond == Some(cond) {
continue;
} else if let Some(prev_cond_idx) = prev_start_idx {
let prev_cond = prev_cond.unwrap();
let null_input_vec = new_nulls(
&then_input_vec.data_type().as_arrow_type(),
batch.row_count(),
);
// put a slice to corresponding batch
let slice_offset = prev_cond_idx;
let slice_length = idx - prev_cond_idx;
let to_be_append = batch.slice(slice_offset, slice_length)?;
let interleave_values = vec![
then_input_vec.to_arrow_array(),
else_input_vec.to_arrow_array(),
null_input_vec,
];
let int_ref: Vec<_> = interleave_values.iter().map(|x| x.as_ref()).collect();
let to_put_back = match prev_cond {
Some(true) => (
Some(true),
append_batch(&mut then_input_batch, to_be_append)?,
),
Some(false) => (
Some(false),
append_batch(&mut else_input_batch, to_be_append)?,
),
None => (None, append_batch(&mut null_input_batch, to_be_append)?),
};
assembly_idx.push(to_put_back);
}
prev_cond = Some(cond);
prev_start_idx = Some(idx);
}
// deal with empty and last slice case
if let Some(slice_offset) = prev_start_idx {
let prev_cond = prev_cond.unwrap();
let slice_length = bool_conds.len() - slice_offset;
let to_be_append = batch.slice(slice_offset, slice_length)?;
let to_put_back = match prev_cond {
Some(true) => (
Some(true),
append_batch(&mut then_input_batch, to_be_append)?,
),
Some(false) => (
Some(false),
append_batch(&mut else_input_batch, to_be_append)?,
),
None => (None, append_batch(&mut null_input_batch, to_be_append)?),
};
assembly_idx.push(to_put_back);
}
let then_output_vec = then_input_batch
.map(|batch| then.eval_batch(&batch))
.transpose()?;
let else_output_vec = else_input_batch
.map(|batch| els.eval_batch(&batch))
.transpose()?;
let null_output_vec = null_input_batch
.map(|null| NullVector::new(null.row_count()).slice(0, null.row_count()));
let dt = then_output_vec
.as_ref()
.map(|v| v.data_type())
.or(else_output_vec.as_ref().map(|v| v.data_type()))
.unwrap_or(ConcreteDataType::null_datatype());
let mut builder = dt.create_mutable_vector(conds.len());
for (cond, (offset, length)) in assembly_idx {
let slice = match cond {
Some(true) => then_output_vec.as_ref(),
Some(false) => else_output_vec.as_ref(),
None => null_output_vec.as_ref(),
}
.context(InternalSnafu {
reason: "Expect corresponding output vector to exist",
let interleave_res_arr =
arrow::compute::interleave(&int_ref, &indices).context(ArrowSnafu {
context: "Failed to interleave output arrays",
})?;
// TODO(discord9): seems `extend_slice_of` doesn't support NullVector or ConstantVector
// consider adding it maybe?
if slice.data_type().is_null() {
builder.push_nulls(length);
} else if slice.is_const() {
let arr = slice.slice(offset, length).to_arrow_array();
let vector = Helper::try_into_vector(arr).context(DataTypeSnafu {
msg: "Failed to convert arrow array to vector",
})?;
builder
.extend_slice_of(vector.as_ref(), 0, vector.len())
.context(DataTypeSnafu {
msg: "Failed to build result vector for if-then expression",
})?;
} else {
builder
.extend_slice_of(slice.as_ref(), offset, length)
.context(DataTypeSnafu {
msg: "Failed to build result vector for if-then expression",
})?;
}
}
let result_vec = builder.to_vector();
Ok(result_vec)
let res_vec = Helper::try_into_vector(interleave_res_arr).context(DataTypeSnafu {
msg: "Failed to convert arrow array to vector",
})?;
Ok(res_vec)
}
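// A minimal standalone sketch (plain arrow-rs, hypothetical inputs) of the
// interleave-based if-then strategy above: evaluate the then/else branches over
// the whole batch, add a null array for rows whose condition is NULL, and let
// `arrow::compute::interleave` pick a (source array, row) pair per output row.
use std::sync::Arc;
use arrow::array::{new_null_array, Array, ArrayRef, BooleanArray, Int64Array};
use arrow::compute::interleave;
use arrow::datatypes::DataType;
fn main() -> Result<(), arrow::error::ArrowError> {
    // condition per row: Some(true) -> then, Some(false) -> else, None -> null
    let cond = BooleanArray::from(vec![Some(true), None, Some(false), Some(true)]);
    let then_vals: ArrayRef = Arc::new(Int64Array::from(vec![1, 2, 3, 4]));
    let else_vals: ArrayRef = Arc::new(Int64Array::from(vec![-1, -2, -3, -4]));
    let null_vals: ArrayRef = new_null_array(&DataType::Int64, cond.len());
    // (index of the source array, row index) for every output row
    let indices: Vec<(usize, usize)> = cond
        .iter()
        .enumerate()
        .map(|(row, b)| match b {
            Some(true) => (0, row),
            Some(false) => (1, row),
            None => (2, row),
        })
        .collect();
    let sources: Vec<&dyn Array> = vec![then_vals.as_ref(), else_vals.as_ref(), null_vals.as_ref()];
    let out = interleave(&sources, &indices)?;
    assert_eq!(out.len(), cond.len());
    Ok(())
}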
/// Eval this expression with the given values.
@@ -685,7 +629,7 @@ impl ScalarExpr {
#[cfg(test)]
mod test {
use datatypes::vectors::Int32Vector;
use datatypes::vectors::{Int32Vector, Vector};
use pretty_assertions::assert_eq;
use super::*;
@@ -781,7 +725,7 @@ mod test {
}
#[test]
fn test_eval_batch() {
fn test_eval_batch_if_then() {
// TODO(discord9): add more tests
{
let expr = ScalarExpr::If {
@@ -840,7 +784,7 @@ mod test {
let vectors = vec![Int32Vector::from(raw).slice(0, raw_len)];
let batch = Batch::try_new(vectors, raw_len).unwrap();
let expected = NullVector::new(raw_len).slice(0, raw_len);
let expected = Int32Vector::from(vec![]).slice(0, raw_len);
assert_eq!(expr.eval_batch(&batch).unwrap(), expected);
}
}


@@ -50,12 +50,13 @@ pub type KeyValDiffRow = ((Row, Row), Timestamp, Diff);
/// Broadcast channel capacity; this matters for memory consumption, since it influences how many
/// updates can be buffered in memory in the entire dataflow
/// TODO(discord9): add config for this, so cpu&mem usage can be balanced and configured by this
pub const BROADCAST_CAP: usize = 65535;
pub const BROADCAST_CAP: usize = 1024;
/// The maximum capacity of the send buffer, to prevent the buffer from growing too large
pub const SEND_BUF_CAP: usize = BROADCAST_CAP * 2;
pub const BATCH_SIZE: usize = BROADCAST_CAP / 2;
/// Flow worker will try to accumulate at least this many rows before processing them (if one second hasn't passed)
pub const BATCH_SIZE: usize = 32 * 16384;
/// Convert a value that is or can be converted to Datetime to internal timestamp
///


@@ -18,7 +18,7 @@ use std::collections::{BTreeMap, BTreeSet};
use std::ops::Bound;
use std::sync::Arc;
use common_telemetry::debug;
use common_telemetry::trace;
use smallvec::{smallvec, SmallVec};
use tokio::sync::RwLock;
@@ -235,9 +235,11 @@ impl Arrangement {
if let Some(s) = &mut self.expire_state {
if let Some(expired_by) = s.get_expire_duration_and_update_event_ts(now, &key)? {
max_expired_by = max_expired_by.max(Some(expired_by));
debug!(
trace!(
"Expired key: {:?}, expired by: {:?} with time being now={}",
key, expired_by, now
key,
expired_by,
now
);
continue;
}


@@ -356,9 +356,10 @@ impl SqlQueryHandler for Instance {
async fn is_valid_schema(&self, catalog: &str, schema: &str) -> Result<bool> {
self.catalog_manager
.schema_exists(catalog, schema)
.schema_exists(catalog, schema, None)
.await
.context(error::CatalogSnafu)
.map(|b| b && !self.catalog_manager.is_reserved_schema_name(schema))
}
}


@@ -87,7 +87,7 @@ impl OpenTelemetryProtocolHandler for Instance {
OTLP_TRACES_ROWS.inc_by(rows as u64);
self.handle_row_inserts(requests, ctx)
self.handle_log_inserts(requests, ctx)
.await
.map_err(BoxedError::new)
.context(error::ExecuteGrpcQuerySnafu)


@@ -102,7 +102,7 @@ impl Instance {
) -> Result<Output> {
let table = self
.catalog_manager
.table(catalog_name, schema_name, table_name)
.table(catalog_name, schema_name, table_name, Some(ctx))
.await
.context(CatalogSnafu)?
.with_context(|| TableNotFoundSnafu {


@@ -152,7 +152,12 @@ mod python {
if let Some(table) = self
.catalog_manager
.table(&expr.catalog_name, &expr.schema_name, &expr.table_name)
.table(
&expr.catalog_name,
&expr.schema_name,
&expr.table_name,
None,
)
.await
.context(CatalogSnafu)?
{
@@ -185,6 +190,7 @@ mod python {
&table_name.catalog_name,
&table_name.schema_name,
&table_name.table_name,
None,
)
.await
.context(CatalogSnafu)?


@@ -12,15 +12,14 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::io::SeekFrom;
use std::sync::Arc;
use async_trait::async_trait;
use futures::{AsyncRead, AsyncReadExt, AsyncSeek, AsyncSeekExt};
use common_base::range_read::RangeReader;
use greptime_proto::v1::index::InvertedIndexMetas;
use snafu::{ensure, ResultExt};
use crate::inverted_index::error::{ReadSnafu, Result, SeekSnafu, UnexpectedBlobSizeSnafu};
use crate::inverted_index::error::{CommonIoSnafu, Result, UnexpectedBlobSizeSnafu};
use crate::inverted_index::format::reader::footer::InvertedIndeFooterReader;
use crate::inverted_index::format::reader::InvertedIndexReader;
use crate::inverted_index::format::MIN_BLOB_SIZE;
@@ -49,28 +48,28 @@ impl<R> InvertedIndexBlobReader<R> {
}
#[async_trait]
impl<R: AsyncRead + AsyncSeek + Unpin + Send> InvertedIndexReader for InvertedIndexBlobReader<R> {
impl<R: RangeReader> InvertedIndexReader for InvertedIndexBlobReader<R> {
async fn read_all(&mut self, dest: &mut Vec<u8>) -> Result<usize> {
let metadata = self.source.metadata().await.context(CommonIoSnafu)?;
self.source
.seek(SeekFrom::Start(0))
.read_into(0..metadata.content_length, dest)
.await
.context(SeekSnafu)?;
self.source.read_to_end(dest).await.context(ReadSnafu)
.context(CommonIoSnafu)?;
Ok(metadata.content_length as usize)
}
async fn seek_read(&mut self, offset: u64, size: u32) -> Result<Vec<u8>> {
self.source
.seek(SeekFrom::Start(offset))
let buf = self
.source
.read(offset..offset + size as u64)
.await
.context(SeekSnafu)?;
let mut buf = vec![0u8; size as usize];
self.source.read(&mut buf).await.context(ReadSnafu)?;
Ok(buf)
.context(CommonIoSnafu)?;
Ok(buf.into())
}
async fn metadata(&mut self) -> Result<Arc<InvertedIndexMetas>> {
let end = SeekFrom::End(0);
let blob_size = self.source.seek(end).await.context(SeekSnafu)?;
let metadata = self.source.metadata().await.context(CommonIoSnafu)?;
let blob_size = metadata.content_length;
Self::validate_blob_size(blob_size)?;
let mut footer_reader = InvertedIndeFooterReader::new(&mut self.source, blob_size);
@@ -81,6 +80,7 @@ impl<R: AsyncRead + AsyncSeek + Unpin + Send> InvertedIndexReader for InvertedIn
#[cfg(test)]
mod tests {
use common_base::bit_vec::prelude::*;
use common_base::range_read::RangeReaderAdapter;
use fst::MapBuilder;
use futures::io::Cursor;
use greptime_proto::v1::index::{InvertedIndexMeta, InvertedIndexMetas};
@@ -163,7 +163,8 @@ mod tests {
#[tokio::test]
async fn test_inverted_index_blob_reader_metadata() {
let blob = create_inverted_index_blob();
let mut blob_reader = InvertedIndexBlobReader::new(Cursor::new(blob));
let cursor = RangeReaderAdapter(Cursor::new(blob));
let mut blob_reader = InvertedIndexBlobReader::new(cursor);
let metas = blob_reader.metadata().await.unwrap();
assert_eq!(metas.metas.len(), 2);
@@ -190,7 +191,8 @@ mod tests {
#[tokio::test]
async fn test_inverted_index_blob_reader_fst() {
let blob = create_inverted_index_blob();
let mut blob_reader = InvertedIndexBlobReader::new(Cursor::new(blob));
let cursor = RangeReaderAdapter(Cursor::new(blob));
let mut blob_reader = InvertedIndexBlobReader::new(cursor);
let metas = blob_reader.metadata().await.unwrap();
let meta = metas.metas.get("tag0").unwrap();
@@ -222,7 +224,8 @@ mod tests {
#[tokio::test]
async fn test_inverted_index_blob_reader_bitmap() {
let blob = create_inverted_index_blob();
let mut blob_reader = InvertedIndexBlobReader::new(Cursor::new(blob));
let cursor = RangeReaderAdapter(Cursor::new(blob));
let mut blob_reader = InvertedIndexBlobReader::new(cursor);
let metas = blob_reader.metadata().await.unwrap();
let meta = metas.metas.get("tag0").unwrap();


@@ -12,15 +12,13 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::io::SeekFrom;
use futures::{AsyncRead, AsyncReadExt, AsyncSeek, AsyncSeekExt};
use common_base::range_read::RangeReader;
use greptime_proto::v1::index::{InvertedIndexMeta, InvertedIndexMetas};
use prost::Message;
use snafu::{ensure, ResultExt};
use crate::inverted_index::error::{
DecodeProtoSnafu, ReadSnafu, Result, SeekSnafu, UnexpectedFooterPayloadSizeSnafu,
CommonIoSnafu, DecodeProtoSnafu, Result, UnexpectedFooterPayloadSizeSnafu,
UnexpectedOffsetSizeSnafu, UnexpectedZeroSegmentRowCountSnafu,
};
use crate::inverted_index::format::FOOTER_PAYLOAD_SIZE_SIZE;
@@ -37,7 +35,7 @@ impl<R> InvertedIndeFooterReader<R> {
}
}
impl<R: AsyncRead + AsyncSeek + Unpin> InvertedIndeFooterReader<R> {
impl<R: RangeReader> InvertedIndeFooterReader<R> {
pub async fn metadata(&mut self) -> Result<InvertedIndexMetas> {
let payload_size = self.read_payload_size().await?;
let metas = self.read_payload(payload_size).await?;
@@ -45,26 +43,26 @@ impl<R: AsyncRead + AsyncSeek + Unpin> InvertedIndeFooterReader<R> {
}
async fn read_payload_size(&mut self) -> Result<u64> {
let size_offset = SeekFrom::Start(self.blob_size - FOOTER_PAYLOAD_SIZE_SIZE);
self.source.seek(size_offset).await.context(SeekSnafu)?;
let size_buf = &mut [0u8; FOOTER_PAYLOAD_SIZE_SIZE as usize];
self.source.read_exact(size_buf).await.context(ReadSnafu)?;
let mut size_buf = [0u8; FOOTER_PAYLOAD_SIZE_SIZE as usize];
let end = self.blob_size;
let start = end - FOOTER_PAYLOAD_SIZE_SIZE;
self.source
.read_into(start..end, &mut &mut size_buf[..])
.await
.context(CommonIoSnafu)?;
let payload_size = u32::from_le_bytes(*size_buf) as u64;
let payload_size = u32::from_le_bytes(size_buf) as u64;
self.validate_payload_size(payload_size)?;
Ok(payload_size)
}
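// An in-memory sketch of the footer layout the reads above assume, as a
// hypothetical helper rather than this crate's API: the blob ends with the
// payload followed by its size as a little-endian u32 (the 4-byte field implied
// by `u32::from_le_bytes`), so the size is read from the last 4 bytes and the
// payload from the `size` bytes immediately before it.
fn footer_payload(blob: &[u8]) -> Option<&[u8]> {
    const FOOTER_PAYLOAD_SIZE_SIZE: usize = 4;
    let end = blob.len().checked_sub(FOOTER_PAYLOAD_SIZE_SIZE)?;
    // trailing bytes: payload size as little-endian u32
    let size = u32::from_le_bytes(blob[end..].try_into().ok()?) as usize;
    // the payload sits immediately before the size field
    let start = end.checked_sub(size)?;
    Some(&blob[start..end])
}
fn main() {
    let payload = b"protobuf-metas".to_vec();
    let mut blob = payload.clone();
    blob.extend_from_slice(&(payload.len() as u32).to_le_bytes());
    assert_eq!(footer_payload(&blob), Some(&payload[..]));
}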
async fn read_payload(&mut self, payload_size: u64) -> Result<InvertedIndexMetas> {
let payload_offset =
SeekFrom::Start(self.blob_size - FOOTER_PAYLOAD_SIZE_SIZE - payload_size);
self.source.seek(payload_offset).await.context(SeekSnafu)?;
let end = self.blob_size - FOOTER_PAYLOAD_SIZE_SIZE;
let start = end - payload_size;
let bytes = self.source.read(start..end).await.context(CommonIoSnafu)?;
let payload = &mut vec![0u8; payload_size as usize];
self.source.read_exact(payload).await.context(ReadSnafu)?;
let metas = InvertedIndexMetas::decode(&payload[..]).context(DecodeProtoSnafu)?;
let metas = InvertedIndexMetas::decode(&*bytes).context(DecodeProtoSnafu)?;
self.validate_metas(&metas, payload_size)?;
Ok(metas)
@@ -115,6 +113,7 @@ impl<R: AsyncRead + AsyncSeek + Unpin> InvertedIndeFooterReader<R> {
#[cfg(test)]
mod tests {
use common_base::range_read::RangeReaderAdapter;
use futures::io::Cursor;
use prost::Message;
@@ -144,7 +143,8 @@ mod tests {
let payload_buf = create_test_payload(meta);
let blob_size = payload_buf.len() as u64;
let mut reader = InvertedIndeFooterReader::new(Cursor::new(payload_buf), blob_size);
let cursor = RangeReaderAdapter(Cursor::new(payload_buf));
let mut reader = InvertedIndeFooterReader::new(cursor, blob_size);
let payload_size = reader.read_payload_size().await.unwrap();
let metas = reader.read_payload(payload_size).await.unwrap();
@@ -164,7 +164,8 @@ mod tests {
let mut payload_buf = create_test_payload(meta);
payload_buf.push(0xff); // Add an extra byte to corrupt the footer
let blob_size = payload_buf.len() as u64;
let mut reader = InvertedIndeFooterReader::new(Cursor::new(payload_buf), blob_size);
let cursor = RangeReaderAdapter(Cursor::new(payload_buf));
let mut reader = InvertedIndeFooterReader::new(cursor, blob_size);
let payload_size_result = reader.read_payload_size().await;
assert!(payload_size_result.is_err());
@@ -181,7 +182,8 @@ mod tests {
let payload_buf = create_test_payload(meta);
let blob_size = payload_buf.len() as u64;
let mut reader = InvertedIndeFooterReader::new(Cursor::new(payload_buf), blob_size);
let cursor = RangeReaderAdapter(Cursor::new(payload_buf));
let mut reader = InvertedIndeFooterReader::new(cursor, blob_size);
let payload_size = reader.read_payload_size().await.unwrap();
let payload_result = reader.read_payload(payload_size).await;


@@ -99,6 +99,7 @@ impl<W: AsyncWrite + Send + Unpin> InvertedIndexBlobWriter<W> {
#[cfg(test)]
mod tests {
use common_base::range_read::RangeReaderAdapter;
use futures::io::Cursor;
use futures::stream;
@@ -119,7 +120,7 @@ mod tests {
.await
.unwrap();
let cursor = Cursor::new(blob);
let cursor = RangeReaderAdapter(Cursor::new(blob));
let mut reader = InvertedIndexBlobReader::new(cursor);
let metadata = reader.metadata().await.unwrap();
assert_eq!(metadata.total_row_count, 8);
@@ -160,7 +161,7 @@ mod tests {
.await
.unwrap();
let cursor = Cursor::new(blob);
let cursor = RangeReaderAdapter(Cursor::new(blob));
let mut reader = InvertedIndexBlobReader::new(cursor);
let metadata = reader.metadata().await.unwrap();
assert_eq!(metadata.total_row_count, 8);


@@ -16,6 +16,7 @@ use std::collections::{hash_map, HashMap};
use std::fmt::{Debug, Formatter};
use std::sync::atomic::{AtomicI64, Ordering};
use std::sync::Arc;
use std::time::Duration;
use async_stream::stream;
use common_runtime::{RepeatedTask, TaskFunction};
@@ -40,7 +41,9 @@ use crate::raft_engine::protos::logstore::{EntryImpl, NamespaceImpl};
const NAMESPACE_PREFIX: &str = "$sys/";
pub struct RaftEngineLogStore {
config: RaftEngineConfig,
sync_write: bool,
sync_period: Option<Duration>,
read_batch_size: usize,
engine: Arc<Engine>,
gc_task: RepeatedTask<Error>,
last_sync_time: AtomicI64,
@@ -76,7 +79,7 @@ impl TaskFunction<Error> for PurgeExpiredFilesFunction {
}
impl RaftEngineLogStore {
pub async fn try_new(dir: String, config: RaftEngineConfig) -> Result<Self> {
pub async fn try_new(dir: String, config: &RaftEngineConfig) -> Result<Self> {
let raft_engine_config = Config {
dir,
purge_threshold: ReadableSize(config.purge_threshold.0),
@@ -85,6 +88,7 @@ impl RaftEngineLogStore {
target_file_size: ReadableSize(config.file_size.0),
enable_log_recycle: config.enable_log_recycle,
prefill_for_recycle: config.prefill_log_files,
recovery_threads: config.recovery_parallelism,
..Default::default()
};
let engine = Arc::new(Engine::open(raft_engine_config).context(RaftEngineSnafu)?);
@@ -96,7 +100,9 @@ impl RaftEngineLogStore {
);
let log_store = Self {
config,
sync_write: config.sync_write,
sync_period: config.sync_period,
read_batch_size: config.read_batch_size,
engine,
gc_task,
last_sync_time: AtomicI64::new(0),
@@ -196,7 +202,9 @@ impl RaftEngineLogStore {
impl Debug for RaftEngineLogStore {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
f.debug_struct("RaftEngineLogsStore")
.field("config", &self.config)
.field("sync_write", &self.sync_write)
.field("sync_period", &self.sync_period)
.field("read_batch_size", &self.read_batch_size)
.field("started", &self.gc_task.started())
.finish()
}
@@ -228,9 +236,9 @@ impl LogStore for RaftEngineLogStore {
let (mut batch, last_entry_ids) = self.entries_to_batch(entries)?;
let mut sync = self.config.sync_write;
let mut sync = self.sync_write;
if let Some(sync_period) = &self.config.sync_period {
if let Some(sync_period) = &self.sync_period {
let now = common_time::util::current_time_millis();
if now - self.last_sync_time.load(Ordering::Relaxed) >= sync_period.as_millis() as i64 {
self.last_sync_time.store(now, Ordering::Relaxed);
@@ -276,7 +284,7 @@ impl LogStore for RaftEngineLogStore {
entry_id,
self.span(ns)
);
let max_batch_size = self.config.read_batch_size;
let max_batch_size = self.read_batch_size;
let (tx, mut rx) = tokio::sync::mpsc::channel(max_batch_size);
let _handle = common_runtime::spawn_global(async move {
while start_index <= last_index {
@@ -489,7 +497,7 @@ mod tests {
let dir = create_temp_dir("raft-engine-logstore-test");
let logstore = RaftEngineLogStore::try_new(
dir.path().to_str().unwrap().to_string(),
RaftEngineConfig::default(),
&RaftEngineConfig::default(),
)
.await
.unwrap();
@@ -502,7 +510,7 @@ mod tests {
let dir = create_temp_dir("raft-engine-logstore-test");
let logstore = RaftEngineLogStore::try_new(
dir.path().to_str().unwrap().to_string(),
RaftEngineConfig::default(),
&RaftEngineConfig::default(),
)
.await
.unwrap();
@@ -528,7 +536,7 @@ mod tests {
let dir = create_temp_dir("raft-engine-logstore-test");
let logstore = RaftEngineLogStore::try_new(
dir.path().to_str().unwrap().to_string(),
RaftEngineConfig::default(),
&RaftEngineConfig::default(),
)
.await
.unwrap();
@@ -570,7 +578,7 @@ mod tests {
{
let logstore = RaftEngineLogStore::try_new(
dir.path().to_str().unwrap().to_string(),
RaftEngineConfig::default(),
&RaftEngineConfig::default(),
)
.await
.unwrap();
@@ -590,7 +598,7 @@ mod tests {
let logstore = RaftEngineLogStore::try_new(
dir.path().to_str().unwrap().to_string(),
RaftEngineConfig::default(),
&RaftEngineConfig::default(),
)
.await
.unwrap();
@@ -634,7 +642,7 @@ mod tests {
..Default::default()
};
RaftEngineLogStore::try_new(path, config).await.unwrap()
RaftEngineLogStore::try_new(path, &config).await.unwrap()
}
#[tokio::test]


@@ -29,7 +29,7 @@ pub async fn create_tmp_local_file_log_store<P: AsRef<Path>>(path: P) -> RaftEng
file_size: ReadableSize::kb(128),
..Default::default()
};
RaftEngineLogStore::try_new(path, cfg).await.unwrap()
RaftEngineLogStore::try_new(path, &cfg).await.unwrap()
}
/// Create a [KafkaLogStore].


@@ -655,13 +655,6 @@ pub enum Error {
location: Location,
},
#[snafu(display("Invalid heartbeat request: {}", err_msg))]
InvalidHeartbeatRequest {
err_msg: String,
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Failed to publish message"))]
PublishMessage {
#[snafu(source)]
@@ -809,7 +802,6 @@ impl ErrorExt for Error {
| Error::UnsupportedSelectorType { .. }
| Error::InvalidArguments { .. }
| Error::InitExportMetricsTask { .. }
| Error::InvalidHeartbeatRequest { .. }
| Error::ProcedureNotFound { .. }
| Error::TooManyPartitions { .. }
| Error::TomlFormat { .. } => StatusCode::InvalidArguments,

View File

@@ -15,8 +15,12 @@
use std::cmp::Ordering;
use api::v1::meta::{HeartbeatRequest, Role};
use common_meta::instruction::CacheIdent;
use common_meta::key::node_address::{NodeAddressKey, NodeAddressValue};
use common_meta::key::{MetadataKey, MetadataValue};
use common_meta::peer::Peer;
use common_meta::rpc::store::PutRequest;
use common_telemetry::warn;
use common_telemetry::{error, warn};
use dashmap::DashMap;
use snafu::ResultExt;
@@ -120,6 +124,13 @@ impl HeartbeatHandler for CollectStatsHandler {
true
};
// Need to refresh the [datanode -> address] mapping
if refresh {
// Safety: `epoch_stats.stats` is not empty
let last = epoch_stats.stats.last().unwrap();
rewrite_node_address(ctx, last).await;
}
if !refresh && epoch_stats.len() < MAX_CACHED_STATS_PER_KEY {
return Ok(HandleControl::Continue);
}
@@ -131,7 +142,7 @@ impl HeartbeatHandler for CollectStatsHandler {
let put = PutRequest {
key,
value,
..Default::default()
prev_kv: false,
};
let _ = ctx
@@ -144,6 +155,44 @@ impl HeartbeatHandler for CollectStatsHandler {
}
}
async fn rewrite_node_address(ctx: &mut Context, stat: &Stat) {
let peer = Peer {
id: stat.id,
addr: stat.addr.clone(),
};
let key = NodeAddressKey::with_datanode(peer.id).to_bytes();
if let Ok(value) = NodeAddressValue::new(peer.clone()).try_as_raw_value() {
let put = PutRequest {
key,
value,
prev_kv: false,
};
match ctx.leader_cached_kv_backend.put(put).await {
Ok(_) => {
// broadcast cache invalidation
let cache_idents = stat
.table_ids()
.into_iter()
.map(CacheIdent::TableId)
.collect::<Vec<_>>();
if let Err(e) = ctx
.cache_invalidator
.invalidate(&Default::default(), &cache_idents)
.await
{
error!(e; "Failed to invalidate {} `NodeAddressKey` cache, peer: {:?}", cache_idents.len(), peer);
}
}
Err(e) => {
error!(e; "Failed to update NodeAddressValue: {:?}", peer);
}
}
} else {
warn!("Failed to serialize NodeAddressValue: {:?}", peer);
}
}
#[cfg(test)]
mod tests {
use std::sync::Arc;


@@ -13,7 +13,7 @@
// limitations under the License.
use api::v1::meta::{HeartbeatRequest, Role};
use common_telemetry::warn;
use common_telemetry::{info, warn};
use super::node_stat::Stat;
use crate::error::Result;
@@ -40,12 +40,15 @@ impl HeartbeatHandler for ExtractStatHandler {
return Ok(HandleControl::Continue);
}
match Stat::try_from(req.clone()) {
match Stat::try_from(req) {
Ok(stat) => {
let _ = acc.stat.insert(stat);
}
Err(err) => {
warn!(err; "Incomplete heartbeat data: {:?}", req);
Err(Some(header)) => {
info!("New handshake request: {:?}", header);
}
Err(_) => {
warn!("Incomplete heartbeat data: {:?}", req);
}
};


@@ -93,6 +93,7 @@ mod tests {
approximate_bytes: 0,
engine: default_engine().to_string(),
role: RegionRole::Follower,
extensions: Default::default(),
}
}
acc.stat = Some(Stat {


@@ -12,16 +12,16 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::HashSet;
use std::collections::{HashMap, HashSet};
use api::v1::meta::HeartbeatRequest;
use api::v1::meta::{HeartbeatRequest, RequestHeader};
use common_meta::ClusterId;
use common_time::util as time_util;
use serde::{Deserialize, Serialize};
use store_api::region_engine::RegionRole;
use store_api::storage::RegionId;
use table::metadata::TableId;
use crate::error::{Error, InvalidHeartbeatRequestSnafu};
use crate::key::DatanodeStatKey;
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
@@ -57,6 +57,8 @@ pub struct RegionStat {
pub engine: String,
/// The region role.
pub role: RegionRole,
/// The extension info of this region
pub extensions: HashMap<String, Vec<u8>>,
}
impl Stat {
@@ -77,6 +79,11 @@ impl Stat {
self.region_stats.iter().map(|s| (s.id, s.role)).collect()
}
/// Returns all table ids in the region stats.
pub fn table_ids(&self) -> HashSet<TableId> {
self.region_stats.iter().map(|s| s.id.table_id()).collect()
}
pub fn retain_active_region_stats(&mut self, inactive_region_ids: &HashSet<RegionId>) {
if inactive_region_ids.is_empty() {
return;
@@ -90,10 +97,10 @@ impl Stat {
}
}
impl TryFrom<HeartbeatRequest> for Stat {
type Error = Error;
impl TryFrom<&HeartbeatRequest> for Stat {
type Error = Option<RequestHeader>;
fn try_from(value: HeartbeatRequest) -> Result<Self, Self::Error> {
fn try_from(value: &HeartbeatRequest) -> Result<Self, Self::Error> {
let HeartbeatRequest {
header,
peer,
@@ -105,9 +112,9 @@ impl TryFrom<HeartbeatRequest> for Stat {
match (header, peer) {
(Some(header), Some(peer)) => {
let region_stats = region_stats
.into_iter()
.map(RegionStat::try_from)
.collect::<Result<Vec<_>, _>>()?;
.iter()
.map(RegionStat::from)
.collect::<Vec<_>>();
Ok(Self {
timestamp_millis: time_util::current_time_millis(),
@@ -115,34 +122,30 @@ impl TryFrom<HeartbeatRequest> for Stat {
// datanode id
id: peer.id,
// datanode address
addr: peer.addr,
addr: peer.addr.clone(),
rcus: region_stats.iter().map(|s| s.rcus).sum(),
wcus: region_stats.iter().map(|s| s.wcus).sum(),
region_num: region_stats.len() as u64,
region_stats,
node_epoch,
node_epoch: *node_epoch,
})
}
_ => InvalidHeartbeatRequestSnafu {
err_msg: "missing header or peer",
}
.fail(),
(header, _) => Err(header.clone()),
}
}
}
impl TryFrom<api::v1::meta::RegionStat> for RegionStat {
type Error = Error;
fn try_from(value: api::v1::meta::RegionStat) -> Result<Self, Self::Error> {
Ok(Self {
impl From<&api::v1::meta::RegionStat> for RegionStat {
fn from(value: &api::v1::meta::RegionStat) -> Self {
Self {
id: RegionId::from_u64(value.region_id),
rcus: value.rcus,
wcus: value.wcus,
approximate_bytes: value.approximate_bytes,
engine: value.engine.to_string(),
role: RegionRole::from(value.role()),
})
extensions: value.extensions.clone(),
}
}
}


@@ -135,6 +135,7 @@ mod test {
wcus: 0,
approximate_bytes: 0,
engine: String::new(),
extensions: Default::default(),
}
}

Some files were not shown because too many files have changed in this diff.