Compare commits

...

35 Commits

Author SHA1 Message Date
Ruihang Xia
d4aa4159d4 feat: support windowed sort with where condition
Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2024-11-04 19:34:03 +08:00
evenyag
960f6d821b feat: spawn blocking WAL write 2024-11-04 17:35:12 +08:00
Ruihang Xia
9c5d044238 Merge branch 'main' into transform-count-min-max
Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2024-11-01 17:45:28 +08:00
Ruihang Xia
be72d3bedb feat: simple limit impl in PartSort (#4922)
* feat: simple limit impl in PartSort

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix: update time_index method to return a non-optional String

Co-authored-by: Yingwen <realevenyag@gmail.com>
Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* use builtin limit

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* add more info to analyze display

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* update sqlness

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
Co-authored-by: Yingwen <realevenyag@gmail.com>
2024-11-01 09:25:03 +00:00
discord9
1ff29d8fde chore: short desc markdown about change log level (#4921)
* chore: tiny doc about change log level

* chore: per review

* chore
2024-11-01 07:10:57 +00:00
Yingwen
39ab1a6415 feat: get row group time range from cached metadata (#4869)
* feat: get part range min-max from cache for unordered scan

* feat: seq scan push row groups if num_row_groups > 0

* test: test split

* feat: update comment

* test: fix split test

* refactor: rename get meta data method
2024-11-01 06:35:03 +00:00
Ruihang Xia
70c354eed6 fix: the way to retrieve time index column
Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2024-11-01 12:10:12 +08:00
Ruihang Xia
23bf663d58 feat: handle sort that won't preserve partitioning
Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2024-10-31 22:13:36 +08:00
Ruihang Xia
817648eac5 Merge branch 'main' into transform-count-min-max
Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2024-10-31 15:38:12 +08:00
Weny Xu
758ad0a8c5 refactor: simplify WeightedChoose (#4916)
* refactor: simplify WeightedChoose

* chore: remove unused errors
2024-10-31 06:22:30 +00:00
Ruihang Xia
8b60c27c2e feat: enhance windowed-sort optimizer rule (#4910)
* add RegionScanner::metadata

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* skip PartSort when there is no tag column

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* add more sqlness test

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* handle desc

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix: should keep part sort on DESC

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix clippy

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2024-10-31 06:15:45 +00:00
Yingwen
ea6df9ba49 fix: prune batches from memtable by time range (#4913)
* feat: add an iter to prune by time range

* feat: filter rows from mem range
2024-10-31 05:13:35 +00:00
Ning Sun
69420793e2 feat: implement parse_query api (#4860)
* feat: implement parse_query api

* chore: switch to upstream

* fix: add post method for parse_query

* chore: bump promql-parser

* test: use latest promql ast serialization
2024-10-30 12:16:22 +00:00
Yingwen
0da112b335 chore: provide more info in check batch message (#4906)
* chore: provide more info in check message

* chore: set timeout to 240s

---------

Co-authored-by: WenyXu <wenymedia@gmail.com>
2024-10-30 11:56:10 +00:00
dennis zhuang
dcc08f6b3e feat: adds the number of rows and index files size to region_statistics table (#4909)
* feat: adds index size to region statistics

* feat: adds the number of rows for region statistics

* test: adds sqlness test for region_statistics

* fix: test
2024-10-30 11:12:58 +00:00
dennis zhuang
a34035a1f2 fix: set transaction variables not working in mysql protocol (#4912) 2024-10-30 10:59:13 +00:00
LFC
fd8eba36a8 refactor: make use of the "pre_execute" in sql execution interceptor (#4875)
* feat: dynamic definition of plugin options

* rebase

* revert

* fix ci
2024-10-30 09:16:46 +00:00
Ruihang Xia
9712295177 fix(config): update tracing section headers in example TOML files (#4898)
Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2024-10-30 08:31:31 +00:00
Lei, HUANG
d275cdd570 feat: Support altering table TTL (#4848)
* feat/alter-ttl:
 Update greptime-proto source and add ChangeTableOptions handling

 - Change greptime-proto source repository and revision in Cargo.lock and Cargo.toml
 - Implement handling for ChangeTableOptions in grpc-expr and meta modules
 - Add support for parsing and applying region option changes in mito2
 - Introduce new error type for invalid change table option requests
 - Add humantime dependency to store-api
 - Fix SQL syntax in tests for changing column types

* chore: remove write buffer size option handling since we don't support specifying write_buffer_size for single table or region

* persist ttl to manifest

* chore: add sqlness

* fix: sqlness

* fix: typo and toml format

* fix: tests

* update: change alter syntax

* feat/alter-ttl: Add Clone trait to RegionFlushRequest and remove redundant Default derive in region_request.rs.

* feat/alter-ttl: Refactor code to replace 'ChangeTableOption' with 'ChangeRegionOption' and handle TTL as a region option

 • Rename ChangeTableOption to ChangeRegionOption across various files.
 • Update AlterKind::ChangeTableOptions to AlterKind::ChangeRegionOptions.
 • Modify TTL handling to treat '0d' as None (i.e. no TTL) in table options; a sketch of this convention follows this commit entry.
 • Adjust related function names and comments to reflect the change from table to region options.
 • Include test case updates to verify the new TTL handling behavior.

* chore: update format

* refactor: update region options in DatanodeTableValue

* feat/alter-ttl:
 Remove TTL handling from RegionManifest and related structures

 - Eliminate TTL fields from `RegionManifest`, `RegionChange`, and associated handling logic.
 - Update tests and checksums to reflect removal of TTL.
 - Refactor `RegionOpener` and `handle_alter` to adjust to TTL removal.
 - Simplify `RegionChangeResult` by replacing `change` with `new_meta`.

* chore: fmt

* remove useless delete op

* feat/alter-ttl: Updated Cargo.lock and the gRPC expression Cargo.toml to include the store-api dependency. Refactored alter.rs to use ChangeOption from store-api instead of ChangeTableOptionRequest. Adjusted error handling in error.rs to use MetadataError. Modified handle_alter.rs to handle TTL changes with ChangeOption. Simplified region_request.rs by replacing ChangeRegionOption with ChangeOption and removing redundant code. Removed the UnsupportedTableOptionChange error in table/src/error.rs. Updated metadata.rs to use ChangeOption for table options. Removed the ChangeTableOptionRequest enum and related conversion code from requests.rs.

* feat/alter-ttl: Update greptime-proto dependency to revision 53ab9a9553

* chore: format code

* chore: update greptime-proto
2024-10-30 04:39:48 +00:00
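
As an illustration of the "'0d' means no TTL" convention described in the bullet above (not part of this diff; the helper is hypothetical and only assumes the `humantime` crate that this commit adds as a dependency):

```rust
use std::time::Duration;

/// Hypothetical helper: parse a TTL string, mapping a zero duration such as "0d"
/// to `None` ("no TTL"), as the commit message describes for table options.
fn parse_ttl(s: &str) -> Result<Option<Duration>, humantime::DurationError> {
    let d = humantime::parse_duration(s)?;
    Ok(if d.is_zero() { None } else { Some(d) })
}

fn main() {
    assert_eq!(parse_ttl("0d").unwrap(), None);
    assert_eq!(parse_ttl("1d").unwrap(), Some(Duration::from_secs(86_400)));
}
```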
Weny Xu
83eb777d21 test: add fuzz test for metric region migration (#4862)
* test: add fuzz tests for migrate metric regions

* test: insert values before migrating metric region

* feat: correct table num

* chore: apply suggestions from CR
2024-10-29 15:47:48 +00:00
Yohan Wal
8ed5bc5305 refactor: json conversion (#4893)
* refactor: json type update

* test: update test

* fix: convert when needed

* revert: leave sqlness tests unchanged

* fix: fmt

* refactor: just refactor

* Apply suggestions from code review

Co-authored-by: Weny Xu <wenymedia@gmail.com>

* refactor: parse jsonb first

* test: add bad cases

* Update src/datatypes/src/vectors/binary.rs

Co-authored-by: Weny Xu <wenymedia@gmail.com>

* fix: fmt

* fix: fix clippy/check

---------

Co-authored-by: Weny Xu <wenymedia@gmail.com>
2024-10-29 15:46:24 +00:00
Weny Xu
9ded314905 feat: add json datatype for grpc protocol (#4897)
* chore: update greptime-proto

* feat: add json datatype for grpc protocol
2024-10-29 12:37:53 +00:00
discord9
702a55a235 chore: update proto depend (#4899) 2024-10-29 09:32:28 +00:00
discord9
f3e5a5a7aa ci: install numpy in CI (#4895)
chore: install numpy in CI
2024-10-29 07:57:40 +00:00
Zhenchi
9c79baca4b feat(index): support building inverted index for the field column on Mito (#4887)
feat(index): support building inverted index for the field column

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>
2024-10-29 07:57:17 +00:00
Ruihang Xia
03f2fa219d feat: optimizer rule for windowed sort (#4874)
* basic impl

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* implement physical rule

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* feat: install windowed sort physical rule and optimize partition ranges

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* add logs and sqlness test

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* feat: introduce PartSortExec for partitioned sorting

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* tune exec nodes' properties and metrics

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* clean up

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix typo

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* debug: add more info on very wrong

* debug: also print overlap ranges

* feat: add check when emit PartSort Stream

* dbg: info on overlap working range

* feat: check batch range is inside part range

* set distinguish partition range param

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* chore: more logs

* update sqlness

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* tune optimizer

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* clean up

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix lints

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix windowed sort rule

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix: early terminate sort stream

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* chore: remove min/max check

* chore: remove unused windowed_sort module, uuid feature and refactor region_scanner to synchronous

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* chore: print more fuzz log

* chore: more log

* fix: part sort should skip empty part

* chore: remove insert logs

* tests: empty PartitionRange

* refactor: testcase

* docs: update comment & tests: all empty

* ci: enlarge etcd cpu limit

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
Co-authored-by: discord9 <discord9@163.com>
Co-authored-by: evenyag <realevenyag@gmail.com>
2024-10-29 07:46:05 +00:00
Lei, HUANG
0ee455a980 fix: pyo3 ut (#4894) 2024-10-29 04:47:57 +00:00
Lei, HUANG
eab9e3a48d chore: remove struct size assertion (#4885)
chore/remove-struct-size-assertion: Remove unit tests for parquet_meta_size function in cache_size.rs
2024-10-28 08:50:10 +00:00
Ruihang Xia
03b29439e2 Merge branch 'main' into transform-count-min-max
Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2024-09-11 11:09:07 +08:00
Ruihang Xia
712f4ca0ef try sort partial commutative
Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2024-09-09 21:08:59 +08:00
Ruihang Xia
60bacff57e ignore unmatched left and right greater
Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2024-09-08 11:12:21 +08:00
Ruihang Xia
6208772ba4 Merge branch 'main' into transform-count-min-max
Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2024-09-08 11:02:04 +08:00
Ruihang Xia
67184c0498 Merge branch 'main' into transform-count-min-max
Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2024-09-05 14:30:47 +08:00
Ruihang Xia
1dd908fdf7 handle group by
Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2024-09-05 12:50:13 +08:00
Ruihang Xia
8179b4798e feat: support transforming min/max/count aggr fn
Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2024-09-04 22:17:31 +08:00
149 changed files with 4415 additions and 889 deletions

View File

@@ -40,7 +40,7 @@ runs:
- name: Install PyArrow Package
shell: pwsh
run: pip install pyarrow
run: pip install pyarrow numpy
- name: Install WSL distribution
uses: Vampire/setup-wsl@v2

View File

@@ -18,7 +18,7 @@ runs:
--set replicaCount=${{ inputs.etcd-replicas }} \
--set resources.requests.cpu=50m \
--set resources.requests.memory=128Mi \
--set resources.limits.cpu=1000m \
--set resources.limits.cpu=1500m \
--set resources.limits.memory=2Gi \
--set auth.rbac.create=false \
--set auth.rbac.token.enabled=false \

View File

@@ -436,7 +436,7 @@ jobs:
timeout-minutes: 60
strategy:
matrix:
target: ["fuzz_migrate_mito_regions", "fuzz_failover_mito_regions", "fuzz_failover_metric_regions"]
target: ["fuzz_migrate_mito_regions", "fuzz_migrate_metric_regions", "fuzz_failover_mito_regions", "fuzz_failover_metric_regions"]
mode:
- name: "Remote WAL"
minio: true
@@ -449,6 +449,12 @@ jobs:
minio: true
kafka: false
values: "with-minio.yaml"
- target: "fuzz_migrate_metric_regions"
mode:
name: "Local WAL"
minio: true
kafka: false
values: "with-minio.yaml"
steps:
- name: Remove unused software
run: |
@@ -688,7 +694,7 @@ jobs:
with:
python-version: '3.10'
- name: Install PyArrow Package
run: pip install pyarrow
run: pip install pyarrow numpy
- name: Setup etcd server
working-directory: tests-integration/fixtures/etcd
run: docker compose -f docker-compose-standalone.yml up -d --wait

View File

@@ -92,7 +92,7 @@ jobs:
with:
python-version: "3.10"
- name: Install PyArrow Package
run: pip install pyarrow
run: pip install pyarrow numpy
- name: Install WSL distribution
uses: Vampire/setup-wsl@v2
with:

Cargo.lock generated
View File

@@ -1788,7 +1788,6 @@ dependencies = [
"tokio-stream",
"tonic 0.11.0",
"tracing",
"tracing-subscriber",
]
[[package]]
@@ -2149,6 +2148,7 @@ dependencies = [
"paste",
"prost 0.12.6",
"snafu 0.8.5",
"store-api",
"table",
]
@@ -4531,7 +4531,7 @@ dependencies = [
[[package]]
name = "greptime-proto"
version = "0.1.0"
source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=b4d301184eb0d01fd4d1042fcc7c5dfb54f3c1e3#b4d301184eb0d01fd4d1042fcc7c5dfb54f3c1e3"
source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=255f87a3318ace3f88a67f76995a0e14910983f4#255f87a3318ace3f88a67f76995a0e14910983f4"
dependencies = [
"prost 0.12.6",
"serde",
@@ -5524,7 +5524,7 @@ dependencies = [
[[package]]
name = "jsonb"
version = "0.4.1"
source = "git+https://github.com/datafuselabs/jsonb.git?rev=46ad50fc71cf75afbf98eec455f7892a6387c1fc#46ad50fc71cf75afbf98eec455f7892a6387c1fc"
source = "git+https://github.com/databendlabs/jsonb.git?rev=46ad50fc71cf75afbf98eec455f7892a6387c1fc#46ad50fc71cf75afbf98eec455f7892a6387c1fc"
dependencies = [
"byteorder",
"fast-float",
@@ -8627,9 +8627,9 @@ dependencies = [
[[package]]
name = "promql-parser"
version = "0.4.1"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0c1ad4a4cfa84ec4aa5831c82e57af0a3faf3f0af83bee13fa1390b2d0a32dc9"
checksum = "7fe99e6f80a79abccf1e8fb48dd63473a36057e600cc6ea36147c8318698ae6f"
dependencies = [
"cfgrammar",
"chrono",
@@ -8637,6 +8637,8 @@ dependencies = [
"lrlex",
"lrpar",
"regex",
"serde",
"serde_json",
]
[[package]]
@@ -9030,6 +9032,7 @@ dependencies = [
"table",
"tokio",
"tokio-stream",
"uuid",
]
[[package]]
@@ -11496,6 +11499,7 @@ dependencies = [
"datatypes",
"derive_builder 0.12.0",
"futures",
"humantime",
"serde",
"serde_json",
"snafu 0.8.5",

View File

@@ -121,11 +121,11 @@ etcd-client = { version = "0.13" }
fst = "0.4.7"
futures = "0.3"
futures-util = "0.3"
greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "b4d301184eb0d01fd4d1042fcc7c5dfb54f3c1e3" }
greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "255f87a3318ace3f88a67f76995a0e14910983f4" }
humantime = "2.1"
humantime-serde = "1.1"
itertools = "0.10"
jsonb = { git = "https://github.com/datafuselabs/jsonb.git", rev = "46ad50fc71cf75afbf98eec455f7892a6387c1fc", default-features = false }
jsonb = { git = "https://github.com/databendlabs/jsonb.git", rev = "46ad50fc71cf75afbf98eec455f7892a6387c1fc", default-features = false }
lazy_static = "1.4"
meter-core = { git = "https://github.com/GreptimeTeam/greptime-meter.git", rev = "a10facb353b41460eeb98578868ebf19c2084fac" }
mockall = "0.11.4"
@@ -145,7 +145,7 @@ parquet = { version = "51.0.0", default-features = false, features = ["arrow", "
paste = "1.0"
pin-project = "1.0"
prometheus = { version = "0.13.3", features = ["process"] }
promql-parser = { version = "0.4.1" }
promql-parser = { version = "0.4.3", features = ["ser"] }
prost = "0.12"
raft-engine = { version = "0.4.1", default-features = false }
rand = "0.8"
@@ -180,13 +180,16 @@ sqlparser = { git = "https://github.com/GreptimeTeam/sqlparser-rs.git", rev = "5
] }
strum = { version = "0.25", features = ["derive"] }
tempfile = "3"
tokio = { version = "1.36", features = ["full"] }
tokio = { version = "1.40", features = ["full"] }
tokio-postgres = "0.7"
tokio-stream = { version = "0.1" }
tokio-util = { version = "0.7", features = ["io-util", "compat"] }
toml = "0.8.8"
tonic = { version = "0.11", features = ["tls", "gzip", "zstd"] }
tower = { version = "0.4" }
tracing-appender = "0.2"
tracing-subscriber = { version = "0.3", features = ["env-filter", "json", "fmt"] }
typetag = "0.2"
uuid = { version = "1.7", features = ["serde", "v4", "fast-rng"] }
zstd = "0.13"

View File

@@ -646,7 +646,7 @@ url = ""
headers = { }
## The tracing options. Only effect when compiled with `tokio-console` feature.
[tracing]
#+ [tracing]
## The tokio console address.
## @toml2docs:none-default
tokio_console_addr = "127.0.0.1"
#+ tokio_console_addr = "127.0.0.1"

View File

@@ -101,8 +101,8 @@ threshold = "10s"
sample_ratio = 1.0
## The tracing options. Only effect when compiled with `tokio-console` feature.
[tracing]
#+ [tracing]
## The tokio console address.
## @toml2docs:none-default
tokio_console_addr = "127.0.0.1"
#+ tokio_console_addr = "127.0.0.1"

View File

@@ -231,7 +231,7 @@ url = ""
headers = { }
## The tracing options. Only effect when compiled with `tokio-console` feature.
[tracing]
#+ [tracing]
## The tokio console address.
## @toml2docs:none-default
tokio_console_addr = "127.0.0.1"
#+ tokio_console_addr = "127.0.0.1"

View File

@@ -218,7 +218,7 @@ url = ""
headers = { }
## The tracing options. Only effect when compiled with `tokio-console` feature.
[tracing]
#+ [tracing]
## The tokio console address.
## @toml2docs:none-default
tokio_console_addr = "127.0.0.1"
#+ tokio_console_addr = "127.0.0.1"

View File

@@ -690,7 +690,7 @@ url = ""
headers = { }
## The tracing options. Only effect when compiled with `tokio-console` feature.
[tracing]
#+ [tracing]
## The tokio console address.
## @toml2docs:none-default
tokio_console_addr = "127.0.0.1"
#+ tokio_console_addr = "127.0.0.1"

View File

@@ -0,0 +1,16 @@
# Change Log Level on the Fly
## HTTP API
example:
```bash
curl --data "trace;flow=debug" 127.0.0.1:4000/debug/log_level
```
And the database will reply with something like:
```bash
Log Level changed from Some("info") to "trace;flow=debug"%
```
The data is a string in the format `global_level;module1=level1;module2=level2;...`, following the same rules as `RUST_LOG`.
Each module is the module name of the log target, and each level is the log level to apply to it. The log level can be one of `trace`, `debug`, `info`, `warn`, `error`, `off` (case-insensitive).
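
For illustration only (this is not part of the diff, and not necessarily how the handler is implemented): one way to apply such a directive string at runtime is to translate the `;` separators into the comma-separated form that `tracing_subscriber::EnvFilter` expects and swap the filter through a reload handle.

```rust
use tracing_subscriber::{prelude::*, reload, EnvFilter, Registry};

// Install a reloadable filter once at startup and keep the handle around.
fn init_logging() -> reload::Handle<EnvFilter, Registry> {
    let (filter, handle) = reload::Layer::new(EnvFilter::new("info"));
    tracing_subscriber::registry()
        .with(filter)
        .with(tracing_subscriber::fmt::layer())
        .init();
    handle
}

// Apply a request body such as "trace;flow=debug".
fn change_log_level(
    handle: &reload::Handle<EnvFilter, Registry>,
    body: &str,
) -> Result<(), Box<dyn std::error::Error>> {
    // EnvFilter expects comma-separated directives, so translate the `;` separators.
    let directives = body.replace(';', ",");
    handle.reload(EnvFilter::try_new(directives)?)?;
    Ok(())
}
```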

View File

@@ -116,6 +116,7 @@ impl From<ColumnDataTypeWrapper> for ConcreteDataType {
ConcreteDataType::binary_datatype()
}
}
ColumnDataType::Json => ConcreteDataType::json_datatype(),
ColumnDataType::String => ConcreteDataType::string_datatype(),
ColumnDataType::Date => ConcreteDataType::date_datatype(),
ColumnDataType::Datetime => ConcreteDataType::datetime_datatype(),
@@ -417,6 +418,10 @@ pub fn values_with_capacity(datatype: ColumnDataType, capacity: usize) -> Values
decimal128_values: Vec::with_capacity(capacity),
..Default::default()
},
ColumnDataType::Json => Values {
string_values: Vec::with_capacity(capacity),
..Default::default()
},
}
}

View File

@@ -39,9 +39,12 @@ use crate::CatalogManager;
const REGION_ID: &str = "region_id";
const TABLE_ID: &str = "table_id";
const REGION_NUMBER: &str = "region_number";
const REGION_ROWS: &str = "region_rows";
const DISK_SIZE: &str = "disk_size";
const MEMTABLE_SIZE: &str = "memtable_size";
const MANIFEST_SIZE: &str = "manifest_size";
const SST_SIZE: &str = "sst_size";
const INDEX_SIZE: &str = "index_size";
const ENGINE: &str = "engine";
const REGION_ROLE: &str = "region_role";
@@ -52,9 +55,12 @@ const INIT_CAPACITY: usize = 42;
/// - `region_id`: The region id.
/// - `table_id`: The table id.
/// - `region_number`: The region number.
/// - `region_rows`: The number of rows in region.
/// - `memtable_size`: The memtable size in bytes.
/// - `disk_size`: The approximate disk size in bytes.
/// - `manifest_size`: The manifest size in bytes.
/// - `sst_size`: The sst size in bytes.
/// - `sst_size`: The sst data files size in bytes.
/// - `index_size`: The sst index files size in bytes.
/// - `engine`: The engine type.
/// - `region_role`: The region role.
///
@@ -76,9 +82,12 @@ impl InformationSchemaRegionStatistics {
ColumnSchema::new(REGION_ID, ConcreteDataType::uint64_datatype(), false),
ColumnSchema::new(TABLE_ID, ConcreteDataType::uint32_datatype(), false),
ColumnSchema::new(REGION_NUMBER, ConcreteDataType::uint32_datatype(), false),
ColumnSchema::new(REGION_ROWS, ConcreteDataType::uint64_datatype(), true),
ColumnSchema::new(DISK_SIZE, ConcreteDataType::uint64_datatype(), true),
ColumnSchema::new(MEMTABLE_SIZE, ConcreteDataType::uint64_datatype(), true),
ColumnSchema::new(MANIFEST_SIZE, ConcreteDataType::uint64_datatype(), true),
ColumnSchema::new(SST_SIZE, ConcreteDataType::uint64_datatype(), true),
ColumnSchema::new(INDEX_SIZE, ConcreteDataType::uint64_datatype(), true),
ColumnSchema::new(ENGINE, ConcreteDataType::string_datatype(), true),
ColumnSchema::new(REGION_ROLE, ConcreteDataType::string_datatype(), true),
]))
@@ -135,9 +144,12 @@ struct InformationSchemaRegionStatisticsBuilder {
region_ids: UInt64VectorBuilder,
table_ids: UInt32VectorBuilder,
region_numbers: UInt32VectorBuilder,
region_rows: UInt64VectorBuilder,
disk_sizes: UInt64VectorBuilder,
memtable_sizes: UInt64VectorBuilder,
manifest_sizes: UInt64VectorBuilder,
sst_sizes: UInt64VectorBuilder,
index_sizes: UInt64VectorBuilder,
engines: StringVectorBuilder,
region_roles: StringVectorBuilder,
}
@@ -150,9 +162,12 @@ impl InformationSchemaRegionStatisticsBuilder {
region_ids: UInt64VectorBuilder::with_capacity(INIT_CAPACITY),
table_ids: UInt32VectorBuilder::with_capacity(INIT_CAPACITY),
region_numbers: UInt32VectorBuilder::with_capacity(INIT_CAPACITY),
region_rows: UInt64VectorBuilder::with_capacity(INIT_CAPACITY),
disk_sizes: UInt64VectorBuilder::with_capacity(INIT_CAPACITY),
memtable_sizes: UInt64VectorBuilder::with_capacity(INIT_CAPACITY),
manifest_sizes: UInt64VectorBuilder::with_capacity(INIT_CAPACITY),
sst_sizes: UInt64VectorBuilder::with_capacity(INIT_CAPACITY),
index_sizes: UInt64VectorBuilder::with_capacity(INIT_CAPACITY),
engines: StringVectorBuilder::with_capacity(INIT_CAPACITY),
region_roles: StringVectorBuilder::with_capacity(INIT_CAPACITY),
}
@@ -177,9 +192,12 @@ impl InformationSchemaRegionStatisticsBuilder {
(REGION_ID, &Value::from(region_stat.id.as_u64())),
(TABLE_ID, &Value::from(region_stat.id.table_id())),
(REGION_NUMBER, &Value::from(region_stat.id.region_number())),
(REGION_ROWS, &Value::from(region_stat.num_rows)),
(DISK_SIZE, &Value::from(region_stat.approximate_bytes)),
(MEMTABLE_SIZE, &Value::from(region_stat.memtable_size)),
(MANIFEST_SIZE, &Value::from(region_stat.manifest_size)),
(SST_SIZE, &Value::from(region_stat.sst_size)),
(INDEX_SIZE, &Value::from(region_stat.index_size)),
(ENGINE, &Value::from(region_stat.engine.as_str())),
(REGION_ROLE, &Value::from(region_stat.role.to_string())),
];
@@ -192,9 +210,12 @@ impl InformationSchemaRegionStatisticsBuilder {
self.table_ids.push(Some(region_stat.id.table_id()));
self.region_numbers
.push(Some(region_stat.id.region_number()));
self.region_rows.push(Some(region_stat.num_rows));
self.disk_sizes.push(Some(region_stat.approximate_bytes));
self.memtable_sizes.push(Some(region_stat.memtable_size));
self.manifest_sizes.push(Some(region_stat.manifest_size));
self.sst_sizes.push(Some(region_stat.sst_size));
self.index_sizes.push(Some(region_stat.index_size));
self.engines.push(Some(&region_stat.engine));
self.region_roles.push(Some(&region_stat.role.to_string()));
}
@@ -204,9 +225,12 @@ impl InformationSchemaRegionStatisticsBuilder {
Arc::new(self.region_ids.finish()),
Arc::new(self.table_ids.finish()),
Arc::new(self.region_numbers.finish()),
Arc::new(self.region_rows.finish()),
Arc::new(self.disk_sizes.finish()),
Arc::new(self.memtable_sizes.finish()),
Arc::new(self.manifest_sizes.finish()),
Arc::new(self.sst_sizes.finish()),
Arc::new(self.index_sizes.finish()),
Arc::new(self.engines.finish()),
Arc::new(self.region_roles.finish()),
];

View File

@@ -45,7 +45,6 @@ common-grpc-expr.workspace = true
datanode.workspace = true
derive-new = "0.5"
tracing = "0.1"
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
[dev-dependencies.substrait_proto]
package = "substrait"

View File

@@ -78,7 +78,7 @@ table.workspace = true
tokio.workspace = true
toml.workspace = true
tonic.workspace = true
tracing-appender = "0.2"
tracing-appender.workspace = true
[target.'cfg(not(windows))'.dependencies]
tikv-jemallocator = "0.6"

View File

@@ -174,7 +174,7 @@ impl Repl {
let plan = query_engine
.planner()
.plan(stmt, query_ctx.clone())
.plan(&stmt, query_ctx.clone())
.await
.context(PlanStatementSnafu)?;

View File

@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::net::SocketAddr;
use std::sync::Arc;
use std::{fs, path};
@@ -250,6 +251,13 @@ pub struct Instance {
_guard: Vec<WorkerGuard>,
}
impl Instance {
/// Find the socket addr of a server by its `name`.
pub async fn server_addr(&self, name: &str) -> Option<SocketAddr> {
self.frontend.server_handlers().addr(name).await
}
}
#[async_trait]
impl App for Instance {
fn name(&self) -> &str {
@@ -340,7 +348,8 @@ pub struct StartCommand {
}
impl StartCommand {
fn load_options(
/// Load the GreptimeDB options from various sources (command line, config file or env).
pub fn load_options(
&self,
global_options: &GlobalOptions,
) -> Result<GreptimeOptions<StandaloneOptions>> {
@@ -430,7 +439,8 @@ impl StartCommand {
#[allow(unreachable_code)]
#[allow(unused_variables)]
#[allow(clippy::diverging_sub_expression)]
async fn build(&self, opts: GreptimeOptions<StandaloneOptions>) -> Result<Instance> {
/// Build GreptimeDB instance with the loaded options.
pub async fn build(&self, opts: GreptimeOptions<StandaloneOptions>) -> Result<Instance> {
common_runtime::init_global_runtimes(&opts.runtime);
let guard = common_telemetry::init_global_logging(
@@ -726,12 +736,14 @@ impl InformationExtension for StandaloneInformationExtension {
id: stat.region_id,
rcus: 0,
wcus: 0,
approximate_bytes: region_stat.estimated_disk_size() as i64,
approximate_bytes: region_stat.estimated_disk_size(),
engine: stat.engine,
role: RegionRole::from(stat.role).into(),
num_rows: region_stat.num_rows,
memtable_size: region_stat.memtable_size,
manifest_size: region_stat.manifest_size,
sst_size: region_stat.sst_size,
index_size: region_stat.index_size,
}
})
.collect::<Vec<_>>();

View File

@@ -18,6 +18,7 @@ common-time.workspace = true
datatypes.workspace = true
prost.workspace = true
snafu.workspace = true
store-api.workspace = true
table.workspace = true
[dev-dependencies]

View File

@@ -22,12 +22,13 @@ use api::v1::{
use common_query::AddColumnLocation;
use datatypes::schema::{ColumnSchema, RawSchema};
use snafu::{ensure, OptionExt, ResultExt};
use store_api::region_request::ChangeOption;
use table::metadata::TableId;
use table::requests::{AddColumnRequest, AlterKind, AlterTableRequest, ChangeColumnTypeRequest};
use crate::error::{
InvalidColumnDefSnafu, MissingFieldSnafu, MissingTimestampColumnSnafu, Result,
UnknownLocationTypeSnafu,
InvalidChangeTableOptionRequestSnafu, InvalidColumnDefSnafu, MissingFieldSnafu,
MissingTimestampColumnSnafu, Result, UnknownLocationTypeSnafu,
};
const LOCATION_TYPE_FIRST: i32 = LocationType::First as i32;
@@ -92,6 +93,15 @@ pub fn alter_expr_to_request(table_id: TableId, expr: AlterExpr) -> Result<Alter
Kind::RenameTable(RenameTable { new_table_name }) => {
AlterKind::RenameTable { new_table_name }
}
Kind::ChangeTableOptions(api::v1::ChangeTableOptions {
change_table_options,
}) => AlterKind::ChangeTableOptions {
options: change_table_options
.iter()
.map(ChangeOption::try_from)
.collect::<std::result::Result<Vec<_>, _>>()
.context(InvalidChangeTableOptionRequestSnafu)?,
},
};
let request = AlterTableRequest {

View File

@@ -19,6 +19,7 @@ use common_error::ext::ErrorExt;
use common_error::status_code::StatusCode;
use common_macro::stack_trace_debug;
use snafu::{Location, Snafu};
use store_api::metadata::MetadataError;
#[derive(Snafu)]
#[snafu(visibility(pub))]
@@ -118,6 +119,12 @@ pub enum Error {
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Invalid change table option request"))]
InvalidChangeTableOptionRequest {
#[snafu(source)]
error: MetadataError,
},
}
pub type Result<T> = std::result::Result<T, Error>;
@@ -141,6 +148,7 @@ impl ErrorExt for Error {
Error::UnknownColumnDataType { .. } | Error::InvalidFulltextColumnType { .. } => {
StatusCode::InvalidArguments
}
Error::InvalidChangeTableOptionRequest { .. } => StatusCode::InvalidArguments,
}
}

View File

@@ -60,7 +60,7 @@ table.workspace = true
tokio.workspace = true
tokio-postgres = { workspace = true, optional = true }
tonic.workspace = true
typetag = "0.2"
typetag.workspace = true
[dev-dependencies]
chrono.workspace = true

View File

@@ -78,17 +78,21 @@ pub struct RegionStat {
/// The write capacity units during this period
pub wcus: i64,
/// Approximate bytes of this region
pub approximate_bytes: i64,
pub approximate_bytes: u64,
/// The engine name.
pub engine: String,
/// The region role.
pub role: RegionRole,
/// The number of rows
pub num_rows: u64,
/// The size of the memtable in bytes.
pub memtable_size: u64,
/// The size of the manifest in bytes.
pub manifest_size: u64,
/// The size of the SST files in bytes.
/// The size of the SST data files in bytes.
pub sst_size: u64,
/// The size of the SST index files in bytes.
pub index_size: u64,
}
impl Stat {
@@ -178,12 +182,14 @@ impl From<&api::v1::meta::RegionStat> for RegionStat {
id: RegionId::from_u64(value.region_id),
rcus: value.rcus,
wcus: value.wcus,
approximate_bytes: value.approximate_bytes,
approximate_bytes: value.approximate_bytes as u64,
engine: value.engine.to_string(),
role: RegionRole::from(value.role()),
num_rows: region_stat.num_rows,
memtable_size: region_stat.memtable_size,
manifest_size: region_stat.manifest_size,
sst_size: region_stat.sst_size,
index_size: region_stat.index_size,
}
}
}

View File

@@ -43,10 +43,10 @@ impl AlterLogicalTablesProcedure {
&self.data.physical_columns,
);
// Updates physical table's metadata
// Updates physical table's metadata, and we don't need to touch per-region settings.
self.context
.table_metadata_manager
.update_table_info(physical_table_info, new_raw_table_info)
.update_table_info(physical_table_info, None, new_raw_table_info)
.await?;
Ok(())

View File

@@ -43,10 +43,10 @@ use crate::ddl::DdlContext;
use crate::error::{Error, Result};
use crate::instruction::CacheIdent;
use crate::key::table_info::TableInfoValue;
use crate::key::DeserializedValueWithBytes;
use crate::key::{DeserializedValueWithBytes, RegionDistribution};
use crate::lock_key::{CatalogLock, SchemaLock, TableLock, TableNameLock};
use crate::rpc::ddl::AlterTableTask;
use crate::rpc::router::{find_leader_regions, find_leaders};
use crate::rpc::router::{find_leader_regions, find_leaders, region_distribution};
use crate::{metrics, ClusterId};
/// The alter table procedure
@@ -101,6 +101,9 @@ impl AlterTableProcedure {
.get_physical_table_route(table_id)
.await?;
self.data.region_distribution =
Some(region_distribution(&physical_table_route.region_routes));
let leaders = find_leaders(&physical_table_route.region_routes);
let mut alter_region_tasks = Vec::with_capacity(leaders.len());
@@ -161,8 +164,14 @@ impl AlterTableProcedure {
self.on_update_metadata_for_rename(new_table_name.to_string(), table_info_value)
.await?;
} else {
self.on_update_metadata_for_alter(new_info.into(), table_info_value)
.await?;
// region distribution is set in submit_alter_region_requests
let region_distribution = self.data.region_distribution.as_ref().unwrap().clone();
self.on_update_metadata_for_alter(
new_info.into(),
region_distribution,
table_info_value,
)
.await?;
}
info!("Updated table metadata for table {table_ref}, table_id: {table_id}");
@@ -271,6 +280,8 @@ pub struct AlterTableData {
table_id: TableId,
/// Table info value before alteration.
table_info_value: Option<DeserializedValueWithBytes<TableInfoValue>>,
/// Region distribution for table in case we need to update region options.
region_distribution: Option<RegionDistribution>,
}
impl AlterTableData {
@@ -281,6 +292,7 @@ impl AlterTableData {
table_id,
cluster_id,
table_info_value: None,
region_distribution: None,
}
}

View File

@@ -106,6 +106,7 @@ fn create_proto_alter_kind(
})))
}
Kind::RenameTable(_) => Ok(None),
Kind::ChangeTableOptions(v) => Ok(Some(alter_request::Kind::ChangeTableOptions(v.clone()))),
}
}

View File

@@ -20,7 +20,7 @@ use table::requests::AlterKind;
use crate::ddl::alter_table::AlterTableProcedure;
use crate::error::{self, Result};
use crate::key::table_info::TableInfoValue;
use crate::key::DeserializedValueWithBytes;
use crate::key::{DeserializedValueWithBytes, RegionDistribution};
impl AlterTableProcedure {
/// Builds new_meta
@@ -51,7 +51,9 @@ impl AlterTableProcedure {
AlterKind::RenameTable { new_table_name } => {
new_info.name = new_table_name.to_string();
}
AlterKind::DropColumns { .. } | AlterKind::ChangeColumnTypes { .. } => {}
AlterKind::DropColumns { .. }
| AlterKind::ChangeColumnTypes { .. }
| AlterKind::ChangeTableOptions { .. } => {}
}
Ok(new_info)
@@ -75,11 +77,16 @@ impl AlterTableProcedure {
pub(crate) async fn on_update_metadata_for_alter(
&self,
new_table_info: RawTableInfo,
region_distribution: RegionDistribution,
current_table_info_value: &DeserializedValueWithBytes<TableInfoValue>,
) -> Result<()> {
let table_metadata_manager = &self.context.table_metadata_manager;
table_metadata_manager
.update_table_info(current_table_info_value, new_table_info)
.update_table_info(
current_table_info_value,
Some(region_distribution),
new_table_info,
)
.await?;
Ok(())

View File

@@ -58,10 +58,10 @@ impl CreateLogicalTablesProcedure {
&new_table_info.name,
);
// Update physical table's metadata
// Update physical table's metadata and we don't need to touch per-region settings.
self.context
.table_metadata_manager
.update_table_info(&physical_table_info, new_table_info)
.update_table_info(&physical_table_info, None, new_table_info)
.await?;
// Invalid physical table cache

View File

@@ -29,7 +29,10 @@ use crate::test_util::MockDatanodeHandler;
#[async_trait::async_trait]
impl MockDatanodeHandler for () {
async fn handle(&self, _peer: &Peer, _request: RegionRequest) -> Result<RegionResponse> {
unreachable!()
Ok(RegionResponse {
affected_rows: 0,
extensions: Default::default(),
})
}
async fn handle_query(

View File

@@ -19,13 +19,14 @@ use std::sync::Arc;
use api::v1::alter_expr::Kind;
use api::v1::region::{region_request, RegionRequest};
use api::v1::{
AddColumn, AddColumns, AlterExpr, ColumnDataType, ColumnDef as PbColumnDef, DropColumn,
DropColumns, SemanticType,
AddColumn, AddColumns, AlterExpr, ChangeTableOption, ChangeTableOptions, ColumnDataType,
ColumnDef as PbColumnDef, DropColumn, DropColumns, SemanticType,
};
use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
use common_error::ext::ErrorExt;
use common_error::status_code::StatusCode;
use store_api::storage::RegionId;
use table::requests::TTL_KEY;
use tokio::sync::mpsc::{self};
use crate::ddl::alter_table::AlterTableProcedure;
@@ -34,6 +35,7 @@ use crate::ddl::test_util::create_table::test_create_table_task;
use crate::ddl::test_util::datanode_handler::{
DatanodeWatcher, RequestOutdatedErrorDatanodeHandler,
};
use crate::key::datanode_table::DatanodeTableKey;
use crate::key::table_name::TableNameKey;
use crate::key::table_route::TableRouteValue;
use crate::peer::Peer;
@@ -293,12 +295,21 @@ async fn test_on_update_metadata_add_columns() {
let table_name = "foo";
let table_id = 1024;
let task = test_create_table_task(table_name, table_id);
let region_id = RegionId::new(table_id, 0);
let mock_table_routes = vec![RegionRoute {
region: Region::new_test(region_id),
leader_peer: Some(Peer::default()),
follower_peers: vec![],
leader_state: None,
leader_down_since: None,
}];
// Puts a value to table name key.
ddl_context
.table_metadata_manager
.create_table_metadata(
task.table_info.clone(),
TableRouteValue::physical(vec![]),
TableRouteValue::physical(mock_table_routes),
HashMap::new(),
)
.await
@@ -326,6 +337,7 @@ async fn test_on_update_metadata_add_columns() {
let mut procedure =
AlterTableProcedure::new(cluster_id, table_id, task, ddl_context.clone()).unwrap();
procedure.on_prepare().await.unwrap();
procedure.submit_alter_region_requests().await.unwrap();
procedure.on_update_metadata().await.unwrap();
let table_info = ddl_context
@@ -343,3 +355,76 @@ async fn test_on_update_metadata_add_columns() {
table_info.meta.next_column_id
);
}
#[tokio::test]
async fn test_on_update_table_options() {
let node_manager = Arc::new(MockDatanodeManager::new(()));
let ddl_context = new_ddl_context(node_manager);
let cluster_id = 1;
let table_name = "foo";
let table_id = 1024;
let task = test_create_table_task(table_name, table_id);
let region_id = RegionId::new(table_id, 0);
let mock_table_routes = vec![RegionRoute {
region: Region::new_test(region_id),
leader_peer: Some(Peer::default()),
follower_peers: vec![],
leader_state: None,
leader_down_since: None,
}];
// Puts a value to table name key.
ddl_context
.table_metadata_manager
.create_table_metadata(
task.table_info.clone(),
TableRouteValue::physical(mock_table_routes),
HashMap::new(),
)
.await
.unwrap();
let task = AlterTableTask {
alter_table: AlterExpr {
catalog_name: DEFAULT_CATALOG_NAME.to_string(),
schema_name: DEFAULT_SCHEMA_NAME.to_string(),
table_name: table_name.to_string(),
kind: Some(Kind::ChangeTableOptions(ChangeTableOptions {
change_table_options: vec![ChangeTableOption {
key: TTL_KEY.to_string(),
value: "1d".to_string(),
}],
})),
},
};
let mut procedure =
AlterTableProcedure::new(cluster_id, table_id, task, ddl_context.clone()).unwrap();
procedure.on_prepare().await.unwrap();
procedure.submit_alter_region_requests().await.unwrap();
procedure.on_update_metadata().await.unwrap();
let table_info = ddl_context
.table_metadata_manager
.table_info_manager()
.get(table_id)
.await
.unwrap()
.unwrap()
.into_inner()
.table_info;
let datanode_key = DatanodeTableKey::new(0, table_id);
let region_info = ddl_context
.table_metadata_manager
.datanode_table_manager()
.get(&datanode_key)
.await
.unwrap()
.unwrap()
.region_info;
assert_eq!(
region_info.region_options,
HashMap::from(&table_info.meta.options)
);
}

View File

@@ -652,6 +652,18 @@ pub enum Error {
#[snafu(implicit)]
location: Location,
},
#[snafu(display(
"Datanode table info not found, table id: {}, datanode id: {}",
table_id,
datanode_id
))]
DatanodeTableInfoNotFound {
datanode_id: DatanodeId,
table_id: TableId,
#[snafu(implicit)]
location: Location,
},
}
pub type Result<T> = std::result::Result<T, Error>;
@@ -752,6 +764,7 @@ impl ErrorExt for Error {
PostgresExecution { .. } => StatusCode::Internal,
#[cfg(feature = "pg_kvbackend")]
ConnectPostgres { .. } => StatusCode::Internal,
Error::DatanodeTableInfoNotFound { .. } => StatusCode::Internal,
}
}

View File

@@ -133,7 +133,6 @@ use self::flow::flow_name::FlowNameValue;
use self::schema_name::{SchemaManager, SchemaNameKey, SchemaNameValue};
use self::table_route::{TableRouteManager, TableRouteValue};
use self::tombstone::TombstoneManager;
use crate::ddl::utils::region_storage_path;
use crate::error::{self, Result, SerdeJsonSnafu};
use crate::key::node_address::NodeAddressValue;
use crate::key::table_route::TableRouteKey;
@@ -593,8 +592,6 @@ impl TableMetadataManager {
table_info.meta.region_numbers = region_numbers;
let table_id = table_info.ident.table_id;
let engine = table_info.meta.engine.clone();
let region_storage_path =
region_storage_path(&table_info.catalog_name, &table_info.schema_name);
// Creates table name.
let table_name = TableNameKey::new(
@@ -606,7 +603,7 @@ impl TableMetadataManager {
.table_name_manager()
.build_create_txn(&table_name, table_id)?;
let region_options = (&table_info.meta.options).into();
let region_options = table_info.to_region_options();
// Creates table info.
let table_info_value = TableInfoValue::new(table_info);
let (create_table_info_txn, on_create_table_info_failure) = self
@@ -625,6 +622,7 @@ impl TableMetadataManager {
]);
if let TableRouteValue::Physical(x) = &table_route_value {
let region_storage_path = table_info_value.region_storage_path();
let create_datanode_table_txn = self.datanode_table_manager().build_create_txn(
table_id,
&engine,
@@ -926,13 +924,15 @@ impl TableMetadataManager {
}
/// Updates table info and returns an error if different metadata exists.
/// It also cascades the update to the redundant table options of every region
/// if `region_distribution` is present.
pub async fn update_table_info(
&self,
current_table_info_value: &DeserializedValueWithBytes<TableInfoValue>,
region_distribution: Option<RegionDistribution>,
new_table_info: RawTableInfo,
) -> Result<()> {
let table_id = current_table_info_value.table_info.ident.table_id;
let new_table_info_value = current_table_info_value.update(new_table_info);
// Updates table info.
@@ -940,8 +940,19 @@ impl TableMetadataManager {
.table_info_manager()
.build_update_txn(table_id, current_table_info_value, &new_table_info_value)?;
let mut r = self.kv_backend.txn(update_table_info_txn).await?;
let txn = if let Some(region_distribution) = region_distribution {
// region options induced from table info.
let new_region_options = new_table_info_value.table_info.to_region_options();
let update_datanode_table_options_txn = self
.datanode_table_manager
.build_update_table_options_txn(table_id, region_distribution, new_region_options)
.await?;
Txn::merge_all([update_table_info_txn, update_datanode_table_options_txn])
} else {
update_table_info_txn
};
let mut r = self.kv_backend.txn(txn).await?;
// Checks whether metadata was already updated.
if !r.succeeded {
let mut set = TxnOpGetResponseSet::from(&mut r.responses);
@@ -1669,12 +1680,12 @@ mod tests {
DeserializedValueWithBytes::from_inner(TableInfoValue::new(table_info.clone()));
// should be ok.
table_metadata_manager
.update_table_info(&current_table_info_value, new_table_info.clone())
.update_table_info(&current_table_info_value, None, new_table_info.clone())
.await
.unwrap();
// if table info was updated, it should be ok.
table_metadata_manager
.update_table_info(&current_table_info_value, new_table_info.clone())
.update_table_info(&current_table_info_value, None, new_table_info.clone())
.await
.unwrap();
@@ -1696,7 +1707,7 @@ mod tests {
// if the current_table_info_value is wrong, it should return an error.
// The ABA problem.
assert!(table_metadata_manager
.update_table_info(&wrong_table_info_value, new_table_info)
.update_table_info(&wrong_table_info_value, None, new_table_info)
.await
.is_err())
}

View File

@@ -23,7 +23,7 @@ use store_api::storage::RegionNumber;
use table::metadata::TableId;
use super::MetadataKey;
use crate::error::{InvalidMetadataSnafu, Result};
use crate::error::{DatanodeTableInfoNotFoundSnafu, InvalidMetadataSnafu, Result};
use crate::key::{
MetadataValue, RegionDistribution, DATANODE_TABLE_KEY_PATTERN, DATANODE_TABLE_KEY_PREFIX,
};
@@ -209,6 +209,49 @@ impl DatanodeTableManager {
Ok(txn)
}
/// Builds a transaction that updates the redundant table options (including WAL options)
/// for the given table id.
///
/// Note that the provided `new_region_options` must be a
/// complete set of all options rather than incremental changes.
pub(crate) async fn build_update_table_options_txn(
&self,
table_id: TableId,
region_distribution: RegionDistribution,
new_region_options: HashMap<String, String>,
) -> Result<Txn> {
assert!(!region_distribution.is_empty());
// safety: region_distribution must not be empty
let (any_datanode, _) = region_distribution.first_key_value().unwrap();
let mut region_info = self
.kv_backend
.get(&DatanodeTableKey::new(*any_datanode, table_id).to_bytes())
.await
.transpose()
.context(DatanodeTableInfoNotFoundSnafu {
datanode_id: *any_datanode,
table_id,
})?
.and_then(|r| DatanodeTableValue::try_from_raw_value(&r.value))?
.region_info;
// substitute region options only.
region_info.region_options = new_region_options;
let mut txns = Vec::with_capacity(region_distribution.len());
for (datanode, regions) in region_distribution.into_iter() {
let key = DatanodeTableKey::new(datanode, table_id);
let key_bytes = key.to_bytes();
let value_bytes = DatanodeTableValue::new(table_id, regions, region_info.clone())
.try_as_raw_value()?;
txns.push(TxnOp::Put(key_bytes, value_bytes));
}
let txn = Txn::new().and_then(txns);
Ok(txn)
}
/// Builds the update datanode table transactions. It only executes while the primary keys comparing successes.
pub(crate) fn build_update_txn(
&self,

View File

@@ -23,6 +23,7 @@ use table::table_name::TableName;
use table::table_reference::TableReference;
use super::TABLE_INFO_KEY_PATTERN;
use crate::ddl::utils::region_storage_path;
use crate::error::{InvalidMetadataSnafu, Result};
use crate::key::txn_helper::TxnOpGetResponseSet;
use crate::key::{DeserializedValueWithBytes, MetadataKey, MetadataValue, TABLE_INFO_KEY_PREFIX};
@@ -125,6 +126,11 @@ impl TableInfoValue {
table_name: self.table_info.name.to_string(),
}
}
/// Builds storage path for all regions in table.
pub fn region_storage_path(&self) -> String {
region_storage_path(&self.table_info.catalog_name, &self.table_info.schema_name)
}
}
pub type TableInfoManagerRef = Arc<TableInfoManager>;

View File

@@ -32,7 +32,7 @@ serde.workspace = true
serde_json.workspace = true
tokio.workspace = true
tracing = "0.1"
tracing-appender = "0.2"
tracing-appender.workspace = true
tracing-log = "0.1"
tracing-opentelemetry = "0.22.0"
tracing-subscriber = { version = "0.3", features = ["env-filter", "json", "fmt"] }
tracing-subscriber.workspace = true

View File

@@ -189,6 +189,13 @@ pub enum Error {
location: Location,
},
#[snafu(display("Invalid JSON text: {}", value))]
InvalidJson {
value: String,
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Value exceeds the precision {} bound", precision))]
ValueExceedsPrecision {
precision: u8,
@@ -222,7 +229,8 @@ impl ErrorExt for Error {
| DefaultValueType { .. }
| DuplicateMeta { .. }
| InvalidTimestampPrecision { .. }
| InvalidPrecisionOrScale { .. } => StatusCode::InvalidArguments,
| InvalidPrecisionOrScale { .. }
| InvalidJson { .. } => StatusCode::InvalidArguments,
ValueExceedsPrecision { .. }
| CastType { .. }

View File

@@ -13,6 +13,7 @@
// limitations under the License.
#![feature(let_chains)]
#![feature(assert_matches)]
pub mod arrow_array;
pub mod data_type;

View File

@@ -36,6 +36,36 @@ impl BinaryVector {
pub(crate) fn as_arrow(&self) -> &dyn Array {
&self.array
}
/// Creates a new binary vector of JSONB from a binary vector.
/// The binary vector must contain valid JSON strings.
pub fn convert_binary_to_json(&self) -> Result<BinaryVector> {
let arrow_array = self.to_arrow_array();
let mut vector = vec![];
for binary in arrow_array
.as_any()
.downcast_ref::<BinaryArray>()
.unwrap()
.iter()
{
let jsonb = if let Some(binary) = binary {
match jsonb::from_slice(binary) {
Ok(jsonb) => Some(jsonb.to_vec()),
Err(_) => {
let s = String::from_utf8_lossy(binary);
return error::InvalidJsonSnafu {
value: s.to_string(),
}
.fail();
}
}
} else {
None
};
vector.push(jsonb);
}
Ok(BinaryVector::from(vector))
}
}
impl From<BinaryArray> for BinaryVector {
@@ -233,6 +263,8 @@ vectors::impl_try_from_arrow_array_for_vector!(BinaryArray, BinaryVector);
#[cfg(test)]
mod tests {
use std::assert_matches::assert_matches;
use arrow::datatypes::DataType as ArrowDataType;
use common_base::bytes::Bytes;
use serde_json;
@@ -383,4 +415,52 @@ mod tests {
assert_eq!(b"four", vector.get_data(3).unwrap());
assert_eq!(builder.len(), 4);
}
#[test]
fn test_binary_json_conversion() {
// json strings
let json_strings = vec![
b"{\"hello\": \"world\"}".to_vec(),
b"{\"foo\": 1}".to_vec(),
b"123".to_vec(),
];
let json_vector = BinaryVector::from(json_strings.clone())
.convert_binary_to_json()
.unwrap();
let jsonbs = json_strings
.iter()
.map(|v| jsonb::parse_value(v).unwrap().to_vec())
.collect::<Vec<_>>();
for i in 0..3 {
assert_eq!(
json_vector.get_ref(i).as_binary().unwrap().unwrap(),
jsonbs.get(i).unwrap().as_slice()
);
}
// jsonb
let json_vector = BinaryVector::from(jsonbs.clone())
.convert_binary_to_json()
.unwrap();
for i in 0..3 {
assert_eq!(
json_vector.get_ref(i).as_binary().unwrap().unwrap(),
jsonbs.get(i).unwrap().as_slice()
);
}
// binary with jsonb header (0x80, 0x40, 0x20)
let binary_with_jsonb_header: Vec<u8> = [0x80, 0x23, 0x40, 0x22].to_vec();
let error = BinaryVector::from(vec![binary_with_jsonb_header])
.convert_binary_to_json()
.unwrap_err();
assert_matches!(error, error::Error::InvalidJson { .. });
// invalid json string
let json_strings = vec![b"{\"hello\": \"world\"".to_vec()];
let error = BinaryVector::from(json_strings)
.convert_binary_to_json()
.unwrap_err();
assert_matches!(error, error::Error::InvalidJson { .. });
}
}
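
A standalone illustration of the encoding step the new `convert_binary_to_json` relies on, using the same `jsonb` crate as the hunk above (this is not GreptimeDB code):

```rust
fn main() {
    // JSON text -> JSONB bytes, as `convert_binary_to_json` does per element.
    let encoded: Vec<u8> = jsonb::parse_value(br#"{"hello": "world"}"#)
        .unwrap()
        .to_vec();
    println!("jsonb payload is {} bytes", encoded.len());

    // Malformed JSON text fails to parse; the vector API surfaces this as
    // `Error::InvalidJson`, as the tests above assert.
    assert!(jsonb::parse_value(br#"{"hello": "world""#).is_err());
}
```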

View File

@@ -18,6 +18,8 @@ mod find_unique;
mod replicate;
mod take;
use std::sync::Arc;
use common_base::BitVec;
use crate::error::{self, Result};
@@ -89,6 +91,12 @@ macro_rules! impl_scalar_vector_op {
}
fn cast(&self, to_type: &ConcreteDataType) -> Result<VectorRef> {
if to_type == &ConcreteDataType::json_datatype() {
if let Some(vector) = self.as_any().downcast_ref::<BinaryVector>() {
let json_vector = vector.convert_binary_to_json()?;
return Ok(Arc::new(json_vector) as VectorRef);
}
}
cast::cast_non_constant!(self, to_type)
}

View File

@@ -91,8 +91,9 @@ impl RegionEngine for FileRegionEngine {
request: ScanRequest,
) -> Result<RegionScannerRef, BoxedError> {
let stream = self.handle_query(region_id, request).await?;
let metadata = self.get_metadata(region_id).await?;
// We don't support enabling append mode for file engine.
let scanner = Box::new(SinglePartitionScanner::new(stream, false));
let scanner = Box::new(SinglePartitionScanner::new(stream, false, metadata));
Ok(scanner)
}

View File

@@ -106,7 +106,7 @@ pub async fn sql_to_flow_plan(
.context(ExternalSnafu)?;
let plan = engine
.planner()
.plan(stmt, query_ctx)
.plan(&stmt, query_ctx)
.await
.map_err(BoxedError::new)
.context(ExternalSnafu)?;

View File

@@ -278,7 +278,7 @@ mod test {
let stmt = QueryLanguageParser::parse_sql(sql, &QueryContext::arc()).unwrap();
let plan = engine
.planner()
.plan(stmt, QueryContext::arc())
.plan(&stmt, QueryContext::arc())
.await
.unwrap();
let plan = apply_df_optimizer(plan).await.unwrap();
@@ -300,7 +300,7 @@ mod test {
let stmt = QueryLanguageParser::parse_sql(sql, &QueryContext::arc()).unwrap();
let plan = engine
.planner()
.plan(stmt, QueryContext::arc())
.plan(&stmt, QueryContext::arc())
.await
.unwrap();
let plan = apply_df_optimizer(plan).await;

View File

@@ -313,6 +313,14 @@ pub enum Error {
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Failed to init plugin"))]
// this comment is to bypass the unused snafu check in "check-snafu.py"
InitPlugin {
#[snafu(implicit)]
location: Location,
source: BoxedError,
},
}
pub type Result<T> = std::result::Result<T, Error>;
@@ -375,8 +383,9 @@ impl ErrorExt for Error {
| Error::ExecLogicalPlan { source, .. } => source.status_code(),
Error::InvokeRegionServer { source, .. } => source.status_code(),
Error::External { source, .. } => source.status_code(),
Error::External { source, .. } | Error::InitPlugin { source, .. } => {
source.status_code()
}
Error::FindTableRoute { source, .. } => source.status_code(),
#[cfg(feature = "python")]

View File

@@ -225,11 +225,45 @@ impl Instance {
async fn query_statement(&self, stmt: Statement, query_ctx: QueryContextRef) -> Result<Output> {
check_permission(self.plugins.clone(), &stmt, &query_ctx)?;
let stmt = QueryStatement::Sql(stmt);
self.statement_executor
.execute_stmt(stmt, query_ctx)
.await
.context(TableOperationSnafu)
let query_interceptor = self.plugins.get::<SqlQueryInterceptorRef<Error>>();
let query_interceptor = query_interceptor.as_ref();
let output = match stmt {
Statement::Query(_) | Statement::Explain(_) | Statement::Delete(_) => {
let stmt = QueryStatement::Sql(stmt);
let plan = self
.statement_executor
.plan(&stmt, query_ctx.clone())
.await?;
let QueryStatement::Sql(stmt) = stmt else {
unreachable!()
};
query_interceptor.pre_execute(&stmt, Some(&plan), query_ctx.clone())?;
self.statement_executor.exec_plan(plan, query_ctx).await
}
Statement::Tql(tql) => {
let plan = self
.statement_executor
.plan_tql(tql.clone(), &query_ctx)
.await?;
query_interceptor.pre_execute(
&Statement::Tql(tql),
Some(&plan),
query_ctx.clone(),
)?;
self.statement_executor.exec_plan(plan, query_ctx).await
}
_ => {
query_interceptor.pre_execute(&stmt, None, query_ctx.clone())?;
self.statement_executor.execute_sql(stmt, query_ctx).await
}
};
output.context(TableOperationSnafu)
}
}
@@ -255,14 +289,6 @@ impl SqlQueryHandler for Instance {
Ok(stmts) => {
let mut results = Vec::with_capacity(stmts.len());
for stmt in stmts {
// TODO(sunng87): figure out at which stage we can call
// this hook after ArrowFlight adoption. We need to provide
// LogicalPlan as to this hook.
if let Err(e) = query_interceptor.pre_execute(&stmt, None, query_ctx.clone()) {
results.push(Err(e));
break;
}
if let Err(e) = checker
.check_permission(
query_ctx.current_user(),
@@ -341,7 +367,7 @@ impl SqlQueryHandler for Instance {
let plan = self
.query_engine
.planner()
.plan(QueryStatement::Sql(stmt), query_ctx.clone())
.plan(&QueryStatement::Sql(stmt), query_ctx.clone())
.await
.context(PlanStatementSnafu)?;
self.query_engine

View File

@@ -114,17 +114,17 @@ impl PredicatesIndexApplier {
.partition_in_place(|(_, ps)| ps.iter().any(|p| matches!(p, Predicate::InList(_))));
let mut iter = predicates.into_iter();
for _ in 0..in_list_index {
let (tag_name, predicates) = iter.next().unwrap();
let (column_name, predicates) = iter.next().unwrap();
let fst_applier = Box::new(KeysFstApplier::try_from(predicates)?) as _;
fst_appliers.push((tag_name, fst_applier));
fst_appliers.push((column_name, fst_applier));
}
for (tag_name, predicates) in iter {
for (column_name, predicates) in iter {
if predicates.is_empty() {
continue;
}
let fst_applier = Box::new(IntersectionFstApplier::try_from(predicates)?) as _;
fst_appliers.push((tag_name, fst_applier));
fst_appliers.push((column_name, fst_applier));
}
Ok(PredicatesIndexApplier { fst_appliers })

View File

@@ -17,6 +17,7 @@ use std::any::Any;
use common_error::ext::ErrorExt;
use common_macro::stack_trace_debug;
use common_runtime::error::Error as RuntimeError;
use common_runtime::JoinError;
use serde_json::error::Error as JsonError;
use snafu::{Location, Snafu};
use store_api::storage::RegionId;
@@ -306,6 +307,14 @@ pub enum Error {
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Join error"))]
Join {
#[snafu(implicit)]
location: Location,
#[snafu(source)]
error: JoinError,
},
}
impl ErrorExt for Error {

View File

@@ -31,8 +31,8 @@ use store_api::storage::RegionId;
use crate::error::{
AddEntryLogBatchSnafu, DiscontinuousLogIndexSnafu, Error, FetchEntrySnafu,
IllegalNamespaceSnafu, IllegalStateSnafu, InvalidProviderSnafu, OverrideCompactedEntrySnafu,
RaftEngineSnafu, Result, StartGcTaskSnafu, StopGcTaskSnafu,
IllegalNamespaceSnafu, IllegalStateSnafu, InvalidProviderSnafu, JoinSnafu,
OverrideCompactedEntrySnafu, RaftEngineSnafu, Result, StartGcTaskSnafu, StopGcTaskSnafu,
};
use crate::metrics;
use crate::raft_engine::backend::SYSTEM_NAMESPACE;
@@ -250,6 +250,12 @@ impl LogStore for RaftEngineLogStore {
.engine
.write(&mut batch, sync)
.context(RaftEngineSnafu)?;
let engine = self.engine.clone();
let _ = common_runtime::spawn_blocking_global(move || {
engine.write(&mut batch, sync).context(RaftEngineSnafu)
})
.await
.context(JoinSnafu)?;
Ok(AppendBatchResponse { last_entry_ids })
}
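
The hunk above is the "spawn blocking WAL write" change: the synchronous, possibly fsync-ing raft-engine write is moved onto the blocking thread pool so it cannot stall async executor threads. A minimal sketch of the same pattern with plain tokio (the `Engine` type here is a hypothetical stand-in, and `common_runtime::spawn_blocking_global` is assumed to wrap the same primitive):

```rust
use std::sync::Arc;

/// Hypothetical stand-in for the raft-engine handle used in the hunk above.
struct Engine;

impl Engine {
    /// Synchronous, potentially fsync-ing write.
    fn write(&self, batch: &mut Vec<u8>, _sync: bool) -> std::io::Result<usize> {
        Ok(batch.len())
    }
}

/// Run the blocking write on the blocking thread pool and await its result.
async fn append(engine: Arc<Engine>, mut batch: Vec<u8>, sync: bool) -> std::io::Result<usize> {
    tokio::task::spawn_blocking(move || engine.write(&mut batch, sync))
        .await
        .expect("blocking WAL write task panicked")
}
```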

View File

@@ -29,4 +29,4 @@ futures = "0.3"
meta-srv = { workspace = true, features = ["mock"] }
tower.workspace = true
tracing = "0.1"
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
tracing-subscriber.workspace = true

View File

@@ -59,7 +59,7 @@ tokio-stream = { workspace = true, features = ["net"] }
toml.workspace = true
tonic.workspace = true
tower.workspace = true
typetag = "0.2"
typetag.workspace = true
url = "2.3"
[dev-dependencies]
@@ -69,4 +69,4 @@ common-meta = { workspace = true, features = ["testing"] }
common-procedure-test.workspace = true
session.workspace = true
tracing = "0.1"
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
tracing-subscriber.workspace = true

View File

@@ -18,7 +18,6 @@ use common_error::status_code::StatusCode;
use common_macro::stack_trace_debug;
use common_meta::DatanodeId;
use common_runtime::JoinError;
use rand::distributions::WeightedError;
use snafu::{Location, Snafu};
use store_api::storage::RegionId;
use table::metadata::TableId;
@@ -32,6 +31,14 @@ use crate::pubsub::Message;
#[snafu(visibility(pub))]
#[stack_trace_debug]
pub enum Error {
#[snafu(display("Failed to choose items"))]
ChooseItems {
#[snafu(implicit)]
location: Location,
#[snafu(source)]
error: rand::distributions::WeightedError,
},
#[snafu(display("Exceeded deadline, operation: {}", operation))]
ExceededDeadline {
#[snafu(implicit)]
@@ -643,20 +650,6 @@ pub enum Error {
location: Location,
},
#[snafu(display("Failed to set weight array"))]
WeightArray {
#[snafu(source)]
error: WeightedError,
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Weight array is not set"))]
NotSetWeightArray {
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Unexpected table route type: {}", err_msg))]
UnexpectedLogicalRouteTable {
#[snafu(implicit)]
@@ -759,10 +752,9 @@ impl ErrorExt for Error {
| Error::NoEnoughAvailableNode { .. }
| Error::PublishMessage { .. }
| Error::Join { .. }
| Error::WeightArray { .. }
| Error::NotSetWeightArray { .. }
| Error::PeerUnavailable { .. }
| Error::ExceededDeadline { .. } => StatusCode::Internal,
| Error::ExceededDeadline { .. }
| Error::ChooseItems { .. } => StatusCode::Internal,
Error::Unsupported { .. } => StatusCode::Unsupported,

View File

@@ -93,9 +93,11 @@ mod tests {
approximate_bytes: 0,
engine: default_engine().to_string(),
role: RegionRole::Follower,
num_rows: 0,
memtable_size: 0,
manifest_size: 0,
sst_size: 0,
index_size: 0,
}
}
acc.stat = Some(Stat {

View File

@@ -135,9 +135,11 @@ mod test {
wcus: 0,
approximate_bytes: 0,
engine: String::new(),
num_rows: 0,
memtable_size: 0,
manifest_size: 0,
sst_size: 0,
index_size: 0,
}
}

View File

@@ -12,29 +12,23 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::HashSet;
use common_meta::peer::Peer;
use snafu::ensure;
use super::weighted_choose::{WeightedChoose, WeightedItem};
use super::weighted_choose::WeightedChoose;
use crate::error;
use crate::error::Result;
use crate::metasrv::SelectTarget;
use crate::selector::SelectorOptions;
/// Chooses peers from the `weighted_choose` according to the `opts`.
pub fn choose_peers<W>(
mut weight_array: Vec<WeightedItem<Peer>>,
opts: &SelectorOptions,
weighted_choose: &mut W,
) -> Result<Vec<Peer>>
pub fn choose_peers<W>(opts: &SelectorOptions, weighted_choose: &mut W) -> Result<Vec<Peer>>
where
W: WeightedChoose<Peer>,
{
let min_required_items = opts.min_required_items;
ensure!(
!weight_array.is_empty(),
!weighted_choose.is_empty(),
error::NoEnoughAvailableNodeSnafu {
required: min_required_items,
available: 0_usize,
@@ -43,12 +37,11 @@ where
);
if opts.allow_duplication {
weighted_choose.set_weight_array(weight_array)?;
(0..min_required_items)
.map(|_| weighted_choose.choose_one())
.collect::<Result<_>>()
} else {
let weight_array_len = weight_array.len();
let weight_array_len = weighted_choose.len();
// When opts.allow_duplication is false, we need to check that the length of the weighted array is greater than
// or equal to min_required_items, otherwise it may cause an infinite loop.
@@ -61,33 +54,7 @@ where
}
);
if weight_array_len == min_required_items {
return Ok(weight_array.into_iter().map(|item| item.item).collect());
}
weighted_choose.set_weight_array(weight_array.clone())?;
// Assume min_required_items is 3, weight_array_len is 100, then we can choose 3 items from the weight array
// and return. But assume min_required_items is 99, weight_array_len is 100. It's not cheap to choose 99 items
// from the weight array. So we can reverse choose 1 item from the weight array, and return the remaining 99
// items.
if min_required_items * 2 > weight_array_len {
let select_num = weight_array_len - min_required_items;
let mut selected = HashSet::with_capacity(select_num);
while selected.len() < select_num {
let item = weighted_choose.reverse_choose_one()?;
selected.insert(item);
}
weight_array.retain(|item| !selected.contains(&item.item));
Ok(weight_array.into_iter().map(|item| item.item).collect())
} else {
let mut selected = HashSet::with_capacity(min_required_items);
while selected.len() < min_required_items {
let item = weighted_choose.choose_one()?;
selected.insert(item);
}
Ok(selected.into_iter().collect())
}
weighted_choose.choose_multiple(min_required_items)
}
}
@@ -110,7 +77,6 @@ mod tests {
addr: "127.0.0.1:3001".to_string(),
},
weight: 1,
reverse_weight: 1,
},
WeightedItem {
item: Peer {
@@ -118,7 +84,6 @@ mod tests {
addr: "127.0.0.1:3001".to_string(),
},
weight: 1,
reverse_weight: 1,
},
WeightedItem {
item: Peer {
@@ -126,7 +91,6 @@ mod tests {
addr: "127.0.0.1:3001".to_string(),
},
weight: 1,
reverse_weight: 1,
},
WeightedItem {
item: Peer {
@@ -134,7 +98,6 @@ mod tests {
addr: "127.0.0.1:3001".to_string(),
},
weight: 1,
reverse_weight: 1,
},
WeightedItem {
item: Peer {
@@ -142,7 +105,6 @@ mod tests {
addr: "127.0.0.1:3001".to_string(),
},
weight: 1,
reverse_weight: 1,
},
];
@@ -152,14 +114,11 @@ mod tests {
allow_duplication: false,
};
let selected_peers: HashSet<_> = choose_peers(
weight_array.clone(),
&opts,
&mut RandomWeightedChoose::default(),
)
.unwrap()
.into_iter()
.collect();
let selected_peers: HashSet<_> =
choose_peers(&opts, &mut RandomWeightedChoose::new(weight_array.clone()))
.unwrap()
.into_iter()
.collect();
assert_eq!(i, selected_peers.len());
}
@@ -169,11 +128,8 @@ mod tests {
allow_duplication: false,
};
let selected_result = choose_peers(
weight_array.clone(),
&opts,
&mut RandomWeightedChoose::default(),
);
let selected_result =
choose_peers(&opts, &mut RandomWeightedChoose::new(weight_array.clone()));
assert!(selected_result.is_err());
for i in 1..=50 {
@@ -182,12 +138,8 @@ mod tests {
allow_duplication: true,
};
let selected_peers = choose_peers(
weight_array.clone(),
&opts,
&mut RandomWeightedChoose::default(),
)
.unwrap();
let selected_peers =
choose_peers(&opts, &mut RandomWeightedChoose::new(weight_array.clone())).unwrap();
assert_eq!(i, selected_peers.len());
}

View File

@@ -48,13 +48,12 @@ impl Selector for LeaseBasedSelector {
addr: v.node_addr.clone(),
},
weight: 1,
reverse_weight: 1,
})
.collect();
// 3. choose peers by weight_array.
let weighted_choose = &mut RandomWeightedChoose::default();
let selected = choose_peers(weight_array, &opts, weighted_choose)?;
let mut weighted_choose = RandomWeightedChoose::new(weight_array);
let selected = choose_peers(&opts, &mut weighted_choose)?;
Ok(selected)
}

View File

@@ -19,7 +19,6 @@ use common_meta::key::TableMetadataManager;
use common_meta::peer::Peer;
use common_meta::rpc::router::find_leaders;
use common_telemetry::{debug, info};
use parking_lot::RwLock;
use snafu::ResultExt;
use table::metadata::TableId;
@@ -29,36 +28,30 @@ use crate::lease;
use crate::metasrv::SelectorContext;
use crate::selector::common::choose_peers;
use crate::selector::weight_compute::{RegionNumsBasedWeightCompute, WeightCompute};
use crate::selector::weighted_choose::{RandomWeightedChoose, WeightedChoose};
use crate::selector::weighted_choose::RandomWeightedChoose;
use crate::selector::{Namespace, Selector, SelectorOptions};
pub struct LoadBasedSelector<W, C> {
weighted_choose: RwLock<W>,
pub struct LoadBasedSelector<C> {
weight_compute: C,
}
impl<W, C> LoadBasedSelector<W, C> {
pub fn new(weighted_choose: W, weight_compute: C) -> Self {
Self {
weighted_choose: RwLock::new(weighted_choose),
weight_compute,
}
impl<C> LoadBasedSelector<C> {
pub fn new(weight_compute: C) -> Self {
Self { weight_compute }
}
}
impl Default for LoadBasedSelector<RandomWeightedChoose<Peer>, RegionNumsBasedWeightCompute> {
impl Default for LoadBasedSelector<RegionNumsBasedWeightCompute> {
fn default() -> Self {
Self {
weighted_choose: RwLock::new(RandomWeightedChoose::default()),
weight_compute: RegionNumsBasedWeightCompute,
}
}
}
#[async_trait::async_trait]
impl<W, C> Selector for LoadBasedSelector<W, C>
impl<C> Selector for LoadBasedSelector<C>
where
W: WeightedChoose<Peer>,
C: WeightCompute<Source = HashMap<DatanodeStatKey, DatanodeStatValue>>,
{
type Context = SelectorContext;
@@ -100,8 +93,8 @@ where
let weight_array = self.weight_compute.compute(&stat_kvs);
// 5. choose peers by weight_array.
let mut weighted_choose = self.weighted_choose.write();
let selected = choose_peers(weight_array, &opts, &mut *weighted_choose)?;
let mut weighted_choose = RandomWeightedChoose::new(weight_array);
let selected = choose_peers(&opts, &mut weighted_choose)?;
debug!(
"LoadBasedSelector select peers: {:?}, namespace: {}, opts: {:?}.",

View File

@@ -85,7 +85,6 @@ impl WeightCompute for RegionNumsBasedWeightCompute {
.map(|(peer, region_num)| WeightedItem {
item: peer,
weight: (max_weight - region_num + base_weight) as usize,
reverse_weight: (region_num - min_weight + base_weight) as usize,
})
.collect()
}
@@ -181,10 +180,6 @@ mod tests {
},
4,
);
for weight in weight_array.iter() {
assert_eq!(weight.reverse_weight, *expected.get(&weight.item).unwrap());
}
}
fn mock_stat_1() -> Stat {
@@ -198,9 +193,11 @@ mod tests {
approximate_bytes: 1,
engine: "mito2".to_string(),
role: RegionRole::Leader,
num_rows: 0,
memtable_size: 0,
manifest_size: 0,
sst_size: 0,
index_size: 0,
}],
..Default::default()
}
@@ -217,9 +214,11 @@ mod tests {
approximate_bytes: 1,
engine: "mito2".to_string(),
role: RegionRole::Leader,
num_rows: 0,
memtable_size: 0,
manifest_size: 0,
sst_size: 0,
index_size: 0,
}],
..Default::default()
}
@@ -236,9 +235,11 @@ mod tests {
approximate_bytes: 1,
engine: "mito2".to_string(),
role: RegionRole::Leader,
num_rows: 0,
memtable_size: 0,
manifest_size: 0,
sst_size: 0,
index_size: 0,
}],
..Default::default()
}

View File

@@ -12,41 +12,37 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use rand::distributions::WeightedIndex;
use rand::prelude::Distribution;
use rand::seq::SliceRandom;
use rand::thread_rng;
use snafu::{ensure, ResultExt};
use snafu::ResultExt;
use crate::error;
use crate::error::Result;
/// A common trait for weighted balancing algorithms.
pub trait WeightedChoose<Item>: Send + Sync {
/// The method will re-set weight array.
///
/// Note:
/// 1. make sure weight_array is not empty.
/// 2. the total weight is greater than 0.
///
/// Otherwise an error will be returned.
fn set_weight_array(&mut self, weight_array: Vec<WeightedItem<Item>>) -> Result<()>;
/// The method will choose one item.
///
/// An error is returned if the weight array is empty or the weights are invalid.
fn choose_one(&mut self) -> Result<Item>;
/// The method will reverse choose one item.
/// The method will choose multiple items.
///
/// If not set weight_array before, an error will be returned.
fn reverse_choose_one(&mut self) -> Result<Item>;
/// Returns fewer than `amount` items if the weight_array does not contain enough items.
fn choose_multiple(&mut self, amount: usize) -> Result<Vec<Item>>;
/// Returns the length of the weight_array.
fn len(&self) -> usize;
/// Returns whether the weight_array is empty.
fn is_empty(&self) -> bool {
self.len() == 0
}
}
/// The struct represents a weighted item.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct WeightedItem<Item> {
pub item: Item,
pub weight: usize,
pub reverse_weight: usize,
}
/// An implementation of weighted balance: random weighted choose.
@@ -64,16 +60,18 @@ pub struct WeightedItem<Item> {
/// ```
pub struct RandomWeightedChoose<Item> {
items: Vec<WeightedItem<Item>>,
weighted_index: Option<WeightedIndex<usize>>,
reverse_weighted_index: Option<WeightedIndex<usize>>,
}
impl<Item> RandomWeightedChoose<Item> {
pub fn new(items: Vec<WeightedItem<Item>>) -> Self {
Self { items }
}
}
impl<Item> Default for RandomWeightedChoose<Item> {
fn default() -> Self {
Self {
items: Vec::default(),
weighted_index: None,
reverse_weighted_index: None,
}
}
}
@@ -82,48 +80,29 @@ impl<Item> WeightedChoose<Item> for RandomWeightedChoose<Item>
where
Item: Clone + Send + Sync,
{
fn set_weight_array(&mut self, weight_array: Vec<WeightedItem<Item>>) -> Result<()> {
self.weighted_index = Some(
WeightedIndex::new(weight_array.iter().map(|item| item.weight))
.context(error::WeightArraySnafu)?,
);
self.reverse_weighted_index = Some(
WeightedIndex::new(weight_array.iter().map(|item| item.reverse_weight))
.context(error::WeightArraySnafu)?,
);
self.items = weight_array;
Ok(())
}
fn choose_one(&mut self) -> Result<Item> {
ensure!(
!self.items.is_empty() && self.weighted_index.is_some(),
error::NotSetWeightArraySnafu
);
// unwrap safety: whether weighted_index is none has been checked before.
let weighted_index = self.weighted_index.as_ref().unwrap();
Ok(self.items[weighted_index.sample(&mut thread_rng())]
let item = self
.items
.choose_weighted(&mut thread_rng(), |item| item.weight as f64)
.context(error::ChooseItemsSnafu)?
.item
.clone())
.clone();
Ok(item)
}
fn reverse_choose_one(&mut self) -> Result<Item> {
ensure!(
!self.items.is_empty() && self.reverse_weighted_index.is_some(),
error::NotSetWeightArraySnafu
);
fn choose_multiple(&mut self, amount: usize) -> Result<Vec<Item>> {
Ok(self
.items
.choose_multiple_weighted(&mut thread_rng(), amount, |item| item.weight as f64)
.context(error::ChooseItemsSnafu)?
.cloned()
.map(|item| item.item)
.collect::<Vec<_>>())
}
// unwrap safety: whether reverse_weighted_index is none has been checked before.
let reverse_weighted_index = self.reverse_weighted_index.as_ref().unwrap();
Ok(self.items[reverse_weighted_index.sample(&mut thread_rng())]
.item
.clone())
fn len(&self) -> usize {
self.items.len()
}
}
@@ -133,45 +112,22 @@ mod tests {
#[test]
fn test_random_weighted_choose() {
let mut choose = RandomWeightedChoose::default();
choose
.set_weight_array(vec![
WeightedItem {
item: 1,
weight: 100,
reverse_weight: 0,
},
WeightedItem {
item: 2,
weight: 0,
reverse_weight: 100,
},
])
.unwrap();
let mut choose = RandomWeightedChoose::new(vec![
WeightedItem {
item: 1,
weight: 100,
},
WeightedItem { item: 2, weight: 0 },
]);
for _ in 0..100 {
let ret = choose.choose_one().unwrap();
assert_eq!(1, ret);
}
for _ in 0..100 {
let ret = choose.reverse_choose_one().unwrap();
assert_eq!(2, ret);
let ret = choose.choose_multiple(3).unwrap();
assert_eq!(vec![1, 2], ret);
}
}
#[test]
#[should_panic]
fn test_random_weighted_choose_should_panic() {
let mut choose: RandomWeightedChoose<u32> = RandomWeightedChoose::default();
choose.set_weight_array(vec![]).unwrap();
let _ = choose.choose_one().unwrap();
}
#[test]
#[should_panic]
fn test_random_reverse_weighted_choose_should_panic() {
let mut choose: RandomWeightedChoose<u32> = RandomWeightedChoose::default();
choose.set_weight_array(vec![]).unwrap();
let _ = choose.reverse_choose_one().unwrap();
}
}
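
The rewritten `RandomWeightedChoose` drops the `WeightedIndex` state and delegates both single and multiple selection to `rand`'s `SliceRandom` helpers, as shown in the hunk. A standalone sketch of those two calls on a plain slice (the calls below are the 0.8-era `SliceRandom` API; `choose_multiple_weighted` samples without replacement and returns at most `amount` items):

```rust
use rand::seq::SliceRandom;
use rand::thread_rng;

#[derive(Clone, Debug)]
struct Weighted<T> {
    item: T,
    weight: usize,
}

fn main() {
    let items = vec![
        Weighted { item: "dn-1", weight: 3 },
        Weighted { item: "dn-2", weight: 1 },
        Weighted { item: "dn-3", weight: 0 },
    ];
    let mut rng = thread_rng();

    // Single weighted pick: errors if the slice is empty or the total weight is zero.
    let one = items
        .choose_weighted(&mut rng, |w| w.weight as f64)
        .expect("non-empty slice with positive total weight");
    println!("picked {}", one.item);

    // Weighted sample without replacement; zero-weight items are effectively never drawn
    // while other items remain available.
    let many: Vec<_> = items
        .choose_multiple_weighted(&mut rng, 2, |w| w.weight as f64)
        .expect("valid weights")
        .map(|w| w.item)
        .collect();
    println!("picked {:?}", many);
}
```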

View File

@@ -80,18 +80,15 @@ impl CacheManager {
CacheManagerBuilder::default()
}
/// Gets cached [ParquetMetaData].
/// Gets cached [ParquetMetaData] from in-memory cache first.
/// If not found, tries to get it from write cache and fill the in-memory cache.
pub async fn get_parquet_meta_data(
&self,
region_id: RegionId,
file_id: FileId,
) -> Option<Arc<ParquetMetaData>> {
// Try to get metadata from sst meta cache
let metadata = self.sst_meta_cache.as_ref().and_then(|sst_meta_cache| {
let value = sst_meta_cache.get(&SstMetaKey(region_id, file_id));
update_hit_miss(value, SST_META_TYPE)
});
let metadata = self.get_parquet_meta_data_from_mem_cache(region_id, file_id);
if metadata.is_some() {
return metadata;
}
@@ -110,6 +107,20 @@ impl CacheManager {
None
}
/// Gets cached [ParquetMetaData] from in-memory cache.
/// This method does not perform I/O.
pub fn get_parquet_meta_data_from_mem_cache(
&self,
region_id: RegionId,
file_id: FileId,
) -> Option<Arc<ParquetMetaData>> {
// Try to get metadata from sst meta cache
self.sst_meta_cache.as_ref().and_then(|sst_meta_cache| {
let value = sst_meta_cache.get(&SstMetaKey(region_id, file_id));
update_hit_miss(value, SST_META_TYPE)
})
}
/// Puts [ParquetMetaData] into the cache.
pub fn put_parquet_meta_data(
&self,

View File

@@ -127,16 +127,3 @@ fn parquet_offset_index_heap_size(offset_index: &ParquetOffsetIndex) -> usize {
})
.sum()
}
#[cfg(test)]
mod tests {
use super::*;
use crate::cache::test_util::parquet_meta;
#[test]
fn test_parquet_meta_size() {
let metadata = parquet_meta();
assert_eq!(956, parquet_meta_size(&metadata));
}
}

View File

@@ -163,13 +163,13 @@ impl MitoEngine {
}
/// Returns a region scanner to scan the region for `request`.
async fn region_scanner(
fn region_scanner(
&self,
region_id: RegionId,
request: ScanRequest,
) -> Result<RegionScannerRef> {
let scanner = self.scanner(region_id, request)?;
scanner.region_scanner().await
scanner.region_scanner()
}
/// Scans a region.
@@ -527,7 +527,6 @@ impl RegionEngine for MitoEngine {
request: ScanRequest,
) -> Result<RegionScannerRef, BoxedError> {
self.region_scanner(region_id, request)
.await
.map_err(BoxedError::new)
}

View File

@@ -580,7 +580,8 @@ async fn test_region_usage() {
flush_region(&engine, region_id, None).await;
let region_stat = region.region_statistic();
assert_eq!(region_stat.sst_size, 3010);
assert_eq!(region_stat.sst_size, 2790);
assert_eq!(region_stat.num_rows, 10);
// region total usage
// Some memtables may share items.

View File

@@ -259,3 +259,56 @@ async fn test_prune_memtable_complex_expr() {
+-------+---------+---------------------+";
assert_eq!(expected, batches.pretty_print().unwrap());
}
#[tokio::test]
async fn test_mem_range_prune() {
let mut env = TestEnv::new();
let engine = env.create_engine(MitoConfig::default()).await;
let region_id = RegionId::new(1, 1);
let request = CreateRequestBuilder::new().build();
let column_schemas = rows_schema(&request);
engine
.handle_request(region_id, RegionRequest::Create(request))
.await
.unwrap();
put_rows(
&engine,
region_id,
Rows {
schema: column_schemas.clone(),
rows: build_rows(5, 8),
},
)
.await;
// Starts scan and gets the memtable time range.
let stream = engine
.scan_to_stream(region_id, ScanRequest::default())
.await
.unwrap();
put_rows(
&engine,
region_id,
Rows {
schema: column_schemas.clone(),
rows: build_rows(10, 12),
},
)
.await;
let batches = RecordBatches::try_collect(stream).await.unwrap();
let expected = "\
+-------+---------+---------------------+
| tag_0 | field_0 | ts |
+-------+---------+---------------------+
| 5 | 5.0 | 1970-01-01T00:00:05 |
| 6 | 6.0 | 1970-01-01T00:00:06 |
| 7 | 7.0 | 1970-01-01T00:00:07 |
+-------+---------+---------------------+";
assert_eq!(expected, batches.pretty_print().unwrap());
}

View File

@@ -34,8 +34,10 @@ pub use crate::memtable::key_values::KeyValues;
use crate::memtable::partition_tree::{PartitionTreeConfig, PartitionTreeMemtableBuilder};
use crate::memtable::time_series::TimeSeriesMemtableBuilder;
use crate::metrics::WRITE_BUFFER_BYTES;
use crate::read::prune::PruneTimeIterator;
use crate::read::Batch;
use crate::region::options::{MemtableOptions, MergeMode};
use crate::sst::file::FileTimeRange;
pub mod bulk;
pub mod key_values;
@@ -355,8 +357,10 @@ impl MemtableRange {
}
/// Builds an iterator to read the range.
pub fn build_iter(&self) -> Result<BoxedBatchIterator> {
self.context.builder.build()
/// Filters the result by the specified time range.
pub fn build_iter(&self, time_range: FileTimeRange) -> Result<BoxedBatchIterator> {
let iter = self.context.builder.build()?;
Ok(Box::new(PruneTimeIterator::new(iter, time_range)))
}
}

View File

@@ -216,6 +216,16 @@ impl TimePartitions {
.sum()
}
/// Returns the number of rows.
pub(crate) fn num_rows(&self) -> u64 {
let inner = self.inner.lock().unwrap();
inner
.parts
.iter()
.map(|part| part.memtable.stats().num_rows as u64)
.sum()
}
/// Append memtables in partitions to small vec.
pub(crate) fn list_memtables_to_small_vec(&self, memtables: &mut SmallMemtableVec) {
let inner = self.inner.lock().unwrap();

View File

@@ -115,6 +115,15 @@ impl MemtableVersion {
.sum()
}
/// Returns the number of rows in memtables.
pub(crate) fn num_rows(&self) -> u64 {
self.immutables
.iter()
.map(|mem| mem.stats().num_rows as u64)
.sum::<u64>()
+ self.mutable.num_rows()
}
/// Returns true if the memtable version is empty.
///
/// The version is empty when mutable memtable is empty and there is no

View File

@@ -494,36 +494,80 @@ impl Batch {
/// Checks that the batch is monotonic by timestamps.
#[cfg(debug_assertions)]
pub(crate) fn check_monotonic(&self) -> bool {
pub(crate) fn check_monotonic(&self) -> Result<(), String> {
use std::cmp::Ordering;
if self.timestamps_native().is_none() {
return true;
return Ok(());
}
let timestamps = self.timestamps_native().unwrap();
let sequences = self.sequences.as_arrow().values();
timestamps.windows(2).enumerate().all(|(i, window)| {
for (i, window) in timestamps.windows(2).enumerate() {
let current = window[0];
let next = window[1];
let current_sequence = sequences[i];
let next_sequence = sequences[i + 1];
if current == next {
current_sequence >= next_sequence
} else {
current < next
match current.cmp(&next) {
Ordering::Less => {
// The current timestamp is less than the next timestamp.
continue;
}
Ordering::Equal => {
// The current timestamp is equal to the next timestamp.
if current_sequence < next_sequence {
return Err(format!(
"sequence are not monotonic: ts {} == {} but current sequence {} < {}, index: {}",
current, next, current_sequence, next_sequence, i
));
}
}
Ordering::Greater => {
// The current timestamp is greater than the next timestamp.
return Err(format!(
"timestamps are not monotonic: {} > {}, index: {}",
current, next, i
));
}
}
})
}
Ok(())
}
/// Returns true if the given batch is behind the current batch.
/// Returns Ok if the given batch is behind the current batch.
#[cfg(debug_assertions)]
pub(crate) fn check_next_batch(&self, other: &Batch) -> bool {
// Checks the primary key and then the timestamp.
use std::cmp::Ordering;
self.primary_key()
.cmp(other.primary_key())
.then_with(|| self.last_timestamp().cmp(&other.first_timestamp()))
.then_with(|| other.first_sequence().cmp(&self.last_sequence()))
<= Ordering::Equal
pub(crate) fn check_next_batch(&self, other: &Batch) -> Result<(), String> {
// Checks the primary key
if self.primary_key() < other.primary_key() {
return Ok(());
}
if self.primary_key() > other.primary_key() {
return Err(format!(
"primary key is not monotonic: {:?} > {:?}",
self.primary_key(),
other.primary_key()
));
}
// Checks the timestamp.
if self.last_timestamp() < other.first_timestamp() {
return Ok(());
}
if self.last_timestamp() > other.first_timestamp() {
return Err(format!(
"timestamps are not monotonic: {:?} > {:?}",
self.last_timestamp(),
other.first_timestamp()
));
}
// Checks the sequence.
if self.last_sequence() >= other.first_sequence() {
return Ok(());
}
Err(format!(
"sequences are not monotonic: {:?} < {:?}",
self.last_sequence(),
other.first_sequence()
))
}
}
@@ -532,26 +576,55 @@ impl Batch {
#[derive(Default)]
pub(crate) struct BatchChecker {
last_batch: Option<Batch>,
start: Option<Timestamp>,
end: Option<Timestamp>,
}
#[cfg(debug_assertions)]
impl BatchChecker {
/// Attaches the given start timestamp to the checker.
pub(crate) fn with_start(mut self, start: Option<Timestamp>) -> Self {
self.start = start;
self
}
/// Attaches the given end timestamp to the checker.
pub(crate) fn with_end(mut self, end: Option<Timestamp>) -> Self {
self.end = end;
self
}
/// Returns Ok if the given batch is monotonic and behind
/// the last batch, otherwise returns an error describing the violation.
pub(crate) fn check_monotonic(&mut self, batch: &Batch) -> bool {
if !batch.check_monotonic() {
return false;
pub(crate) fn check_monotonic(&mut self, batch: &Batch) -> Result<(), String> {
batch.check_monotonic()?;
if let (Some(start), Some(first)) = (self.start, batch.first_timestamp()) {
if start > first {
return Err(format!(
"batch's first timestamp is before the start timestamp: {:?} > {:?}",
start, first
));
}
}
if let (Some(end), Some(last)) = (self.end, batch.last_timestamp()) {
if end <= last {
return Err(format!(
"batch's last timestamp is after the end timestamp: {:?} <= {:?}",
end, last
));
}
}
// Checks that the batch is behind the last batch,
// then updates the last batch.
let is_behind = self
let res = self
.last_batch
.as_ref()
.map(|last| last.check_next_batch(batch))
.unwrap_or(true);
.unwrap_or(Ok(()));
self.last_batch = Some(batch.clone());
is_behind
res
}
/// Formats current batch and last batch for debug.
@@ -590,15 +663,14 @@ impl BatchChecker {
part_range: store_api::region_engine::PartitionRange,
batch: &Batch,
) {
if !self.check_monotonic(batch) {
panic!(
"{}: batch is not sorted, region_id: {}, partition: {}, part_range: {:?}, {}",
scanner,
region_id,
partition,
part_range,
self.format_batch(batch),
if let Err(e) = self.check_monotonic(batch) {
let err_msg = format!(
"{}: batch is not sorted, {}, region_id: {}, partition: {}, part_range: {:?}",
scanner, e, region_id, partition, part_range,
);
common_telemetry::error!("{err_msg}, {}", self.format_batch(batch));
// Only print the number of rows in the panic message.
panic!("{err_msg}, batch rows: {}", batch.num_rows());
}
}
}
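
The checker now reports why a batch violates the expected order instead of a bare `bool`: within a batch, timestamps must be non-decreasing and rows with equal timestamps must carry non-increasing sequences; across batches, ordering is by primary key, then timestamp, then sequence, and the batch must also stay inside the partition range when start/end are attached. A small self-contained sketch of the per-batch rule on plain slices (not the real `Batch` type):

```rust
/// Checks that (timestamp, sequence) pairs are sorted the way the reader expects:
/// timestamps non-decreasing, and for equal timestamps, sequences non-increasing.
fn check_monotonic(timestamps: &[i64], sequences: &[u64]) -> Result<(), String> {
    assert_eq!(timestamps.len(), sequences.len());
    for i in 1..timestamps.len() {
        let (prev_ts, ts) = (timestamps[i - 1], timestamps[i]);
        let (prev_seq, seq) = (sequences[i - 1], sequences[i]);
        if prev_ts > ts {
            return Err(format!("timestamps are not monotonic: {prev_ts} > {ts}, index: {i}"));
        }
        if prev_ts == ts && prev_seq < seq {
            return Err(format!(
                "sequences are not monotonic: ts {prev_ts} == {ts} but {prev_seq} < {seq}, index: {i}"
            ));
        }
    }
    Ok(())
}

fn main() {
    assert!(check_monotonic(&[1, 2, 2, 3], &[9, 5, 4, 1]).is_ok());
    assert!(check_monotonic(&[1, 2, 2], &[9, 4, 5]).is_err()); // equal ts, increasing sequence
    assert!(check_monotonic(&[3, 2], &[1, 1]).is_err()); // decreasing timestamp
}
```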

View File

@@ -12,9 +12,15 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use common_time::Timestamp;
use datatypes::scalars::ScalarVectorBuilder;
use datatypes::vectors::BooleanVectorBuilder;
use crate::error::Result;
use crate::memtable::BoxedBatchIterator;
use crate::read::last_row::RowGroupLastRowCachedReader;
use crate::read::{Batch, BatchReader};
use crate::sst::file::FileTimeRange;
use crate::sst::parquet::file_range::FileRangeContextRef;
use crate::sst::parquet::reader::{ReaderMetrics, RowGroupReader};
@@ -112,3 +118,214 @@ impl PruneReader {
}
}
}
/// An iterator that prunes batches by time range.
pub(crate) struct PruneTimeIterator {
iter: BoxedBatchIterator,
time_range: FileTimeRange,
}
impl PruneTimeIterator {
/// Creates a new `PruneTimeIterator` with the given iterator and time range.
pub(crate) fn new(iter: BoxedBatchIterator, time_range: FileTimeRange) -> Self {
Self { iter, time_range }
}
/// Prune batch by time range.
fn prune(&self, mut batch: Batch) -> Result<Batch> {
if batch.is_empty() {
return Ok(batch);
}
// fast path, the batch is within the time range.
// Note that the time range is inclusive.
if self.time_range.0 <= batch.first_timestamp().unwrap()
&& batch.last_timestamp().unwrap() <= self.time_range.1
{
return Ok(batch);
}
// slow path, prune the batch by time range.
// Note that the timestamp precision may be different from the time range.
// Safety: We know this is the timestamp type.
let unit = batch
.timestamps()
.data_type()
.as_timestamp()
.unwrap()
.unit();
let mut filter_builder = BooleanVectorBuilder::with_capacity(batch.timestamps().len());
let timestamps = batch.timestamps_native().unwrap();
for ts in timestamps {
let ts = Timestamp::new(*ts, unit);
if self.time_range.0 <= ts && ts <= self.time_range.1 {
filter_builder.push(Some(true));
} else {
filter_builder.push(Some(false));
}
}
let filter = filter_builder.finish();
batch.filter(&filter)?;
Ok(batch)
}
// Prune and return the next non-empty batch.
fn next_non_empty_batch(&mut self) -> Result<Option<Batch>> {
while let Some(batch) = self.iter.next() {
let batch = batch?;
let pruned_batch = self.prune(batch)?;
if !pruned_batch.is_empty() {
return Ok(Some(pruned_batch));
}
}
Ok(None)
}
}
impl Iterator for PruneTimeIterator {
type Item = Result<Batch>;
fn next(&mut self) -> Option<Self::Item> {
self.next_non_empty_batch().transpose()
}
}
#[cfg(test)]
mod tests {
use api::v1::OpType;
use super::*;
use crate::test_util::new_batch;
#[test]
fn test_prune_time_iter_empty() {
let input = [];
let iter = input.into_iter().map(Ok);
let iter = PruneTimeIterator::new(
Box::new(iter),
(
Timestamp::new_millisecond(0),
Timestamp::new_millisecond(1000),
),
);
let actual: Vec<_> = iter.map(|batch| batch.unwrap()).collect();
assert!(actual.is_empty());
}
#[test]
fn test_prune_time_iter_filter() {
let input = [
new_batch(
b"k1",
&[10, 11],
&[20, 20],
&[OpType::Put, OpType::Put],
&[110, 111],
),
new_batch(
b"k1",
&[15, 16],
&[20, 20],
&[OpType::Put, OpType::Put],
&[115, 116],
),
new_batch(
b"k1",
&[17, 18],
&[20, 20],
&[OpType::Put, OpType::Put],
&[117, 118],
),
];
let iter = input.clone().into_iter().map(Ok);
let iter = PruneTimeIterator::new(
Box::new(iter),
(
Timestamp::new_millisecond(10),
Timestamp::new_millisecond(15),
),
);
let actual: Vec<_> = iter.map(|batch| batch.unwrap()).collect();
assert_eq!(
actual,
[
new_batch(
b"k1",
&[10, 11],
&[20, 20],
&[OpType::Put, OpType::Put],
&[110, 111],
),
new_batch(b"k1", &[15], &[20], &[OpType::Put], &[115],),
]
);
let iter = input.clone().into_iter().map(Ok);
let iter = PruneTimeIterator::new(
Box::new(iter),
(
Timestamp::new_millisecond(11),
Timestamp::new_millisecond(20),
),
);
let actual: Vec<_> = iter.map(|batch| batch.unwrap()).collect();
assert_eq!(
actual,
[
new_batch(b"k1", &[11], &[20], &[OpType::Put], &[111],),
new_batch(
b"k1",
&[15, 16],
&[20, 20],
&[OpType::Put, OpType::Put],
&[115, 116],
),
new_batch(
b"k1",
&[17, 18],
&[20, 20],
&[OpType::Put, OpType::Put],
&[117, 118],
),
]
);
let iter = input.into_iter().map(Ok);
let iter = PruneTimeIterator::new(
Box::new(iter),
(
Timestamp::new_millisecond(10),
Timestamp::new_millisecond(18),
),
);
let actual: Vec<_> = iter.map(|batch| batch.unwrap()).collect();
assert_eq!(
actual,
[
new_batch(
b"k1",
&[10, 11],
&[20, 20],
&[OpType::Put, OpType::Put],
&[110, 111],
),
new_batch(
b"k1",
&[15, 16],
&[20, 20],
&[OpType::Put, OpType::Put],
&[115, 116],
),
new_batch(
b"k1",
&[17, 18],
&[20, 20],
&[OpType::Put, OpType::Put],
&[117, 118],
),
]
);
}
}

View File

@@ -18,15 +18,17 @@ use common_time::Timestamp;
use smallvec::{smallvec, SmallVec};
use store_api::region_engine::PartitionRange;
use crate::cache::CacheManager;
use crate::memtable::MemtableRef;
use crate::read::scan_region::ScanInput;
use crate::sst::file::{overlaps, FileHandle, FileTimeRange};
use crate::sst::parquet::format::parquet_row_group_time_range;
use crate::sst::parquet::DEFAULT_ROW_GROUP_SIZE;
const ALL_ROW_GROUPS: i64 = -1;
/// Index to access a row group.
#[derive(Clone, Copy, PartialEq)]
#[derive(Debug, Clone, Copy, PartialEq)]
pub(crate) struct RowGroupIndex {
/// Index to the memtable/file.
pub(crate) index: usize,
@@ -38,6 +40,7 @@ pub(crate) struct RowGroupIndex {
/// Meta data of a partition range.
/// If the scanner is [UnorderedScan], each meta only has one row group or memtable.
/// If the scanner is [SeqScan], each meta may have multiple row groups and memtables.
#[derive(Debug, PartialEq)]
pub(crate) struct RangeMeta {
/// The time range of the range.
pub(crate) time_range: FileTimeRange,
@@ -84,7 +87,12 @@ impl RangeMeta {
pub(crate) fn unordered_scan_ranges(input: &ScanInput) -> Vec<RangeMeta> {
let mut ranges = Vec::with_capacity(input.memtables.len() + input.files.len());
Self::push_unordered_mem_ranges(&input.memtables, &mut ranges);
Self::push_unordered_file_ranges(input.memtables.len(), &input.files, &mut ranges);
Self::push_unordered_file_ranges(
input.memtables.len(),
&input.files,
input.cache_manager.as_deref(),
&mut ranges,
);
ranges
}
@@ -164,12 +172,36 @@ impl RangeMeta {
fn push_unordered_file_ranges(
num_memtables: usize,
files: &[FileHandle],
cache: Option<&CacheManager>,
ranges: &mut Vec<RangeMeta>,
) {
// For append mode, we can parallelize reading row groups.
for (i, file) in files.iter().enumerate() {
let file_index = num_memtables + i;
if file.meta_ref().num_row_groups > 0 {
// Get parquet meta from the cache.
let parquet_meta = cache.and_then(|c| {
c.get_parquet_meta_data_from_mem_cache(file.region_id(), file.file_id())
});
if let Some(parquet_meta) = parquet_meta {
// Scans each row group.
for row_group_index in 0..file.meta_ref().num_row_groups {
let time_range = parquet_row_group_time_range(
file.meta_ref(),
&parquet_meta,
row_group_index as usize,
);
let num_rows = parquet_meta.row_group(row_group_index as usize).num_rows();
ranges.push(RangeMeta {
time_range: time_range.unwrap_or_else(|| file.time_range()),
indices: smallvec![file_index],
row_group_indices: smallvec![RowGroupIndex {
index: file_index,
row_group_index: row_group_index as i64,
}],
num_rows: num_rows as usize,
});
}
} else if file.meta_ref().num_row_groups > 0 {
// Scans each row group.
for row_group_index in 0..file.meta_ref().num_row_groups {
ranges.push(RangeMeta {
@@ -217,7 +249,6 @@ impl RangeMeta {
}
}
// TODO(yingwen): Support multiple row groups in a range so we can split them later.
fn push_seq_file_ranges(
num_memtables: usize,
files: &[FileHandle],
@@ -226,15 +257,31 @@ impl RangeMeta {
// For non append-only mode, each range only contains one file.
for (i, file) in files.iter().enumerate() {
let file_index = num_memtables + i;
ranges.push(RangeMeta {
time_range: file.time_range(),
indices: smallvec![file_index],
row_group_indices: smallvec![RowGroupIndex {
index: file_index,
row_group_index: ALL_ROW_GROUPS,
}],
num_rows: file.meta_ref().num_rows as usize,
});
if file.meta_ref().num_row_groups > 0 {
// All row groups share the same time range.
let row_group_indices = (0..file.meta_ref().num_row_groups)
.map(|row_group_index| RowGroupIndex {
index: file_index,
row_group_index: row_group_index as i64,
})
.collect();
ranges.push(RangeMeta {
time_range: file.time_range(),
indices: smallvec![file_index],
row_group_indices,
num_rows: file.meta_ref().num_rows as usize,
});
} else {
ranges.push(RangeMeta {
time_range: file.time_range(),
indices: smallvec![file_index],
row_group_indices: smallvec![RowGroupIndex {
index: file_index,
row_group_index: ALL_ROW_GROUPS,
}],
num_rows: file.meta_ref().num_rows as usize,
});
}
}
}
}
@@ -366,4 +413,212 @@ mod tests {
&[(vec![3], 0, 1000), (vec![1, 2], 3000, 6000)],
);
}
#[test]
fn test_merge_range() {
let mut left = RangeMeta {
time_range: (Timestamp::new_second(1000), Timestamp::new_second(2000)),
indices: smallvec![1],
row_group_indices: smallvec![
RowGroupIndex {
index: 1,
row_group_index: 1
},
RowGroupIndex {
index: 1,
row_group_index: 2
}
],
num_rows: 5,
};
let right = RangeMeta {
time_range: (Timestamp::new_second(800), Timestamp::new_second(1200)),
indices: smallvec![2],
row_group_indices: smallvec![
RowGroupIndex {
index: 2,
row_group_index: 1
},
RowGroupIndex {
index: 2,
row_group_index: 2
}
],
num_rows: 4,
};
left.merge(right);
assert_eq!(
left,
RangeMeta {
time_range: (Timestamp::new_second(800), Timestamp::new_second(2000)),
indices: smallvec![1, 2],
row_group_indices: smallvec![
RowGroupIndex {
index: 1,
row_group_index: 1
},
RowGroupIndex {
index: 1,
row_group_index: 2
},
RowGroupIndex {
index: 2,
row_group_index: 1
},
RowGroupIndex {
index: 2,
row_group_index: 2
},
],
num_rows: 9,
}
);
}
#[test]
fn test_split_range() {
let range = RangeMeta {
time_range: (Timestamp::new_second(1000), Timestamp::new_second(2000)),
indices: smallvec![1],
row_group_indices: smallvec![
RowGroupIndex {
index: 1,
row_group_index: 1
},
RowGroupIndex {
index: 1,
row_group_index: 2
}
],
num_rows: 5,
};
assert!(range.can_split_preserve_order());
let mut output = Vec::new();
range.maybe_split(&mut output);
assert_eq!(
output,
&[
RangeMeta {
time_range: (Timestamp::new_second(1000), Timestamp::new_second(2000)),
indices: smallvec![1],
row_group_indices: smallvec![RowGroupIndex {
index: 1,
row_group_index: 1
},],
num_rows: 2,
},
RangeMeta {
time_range: (Timestamp::new_second(1000), Timestamp::new_second(2000)),
indices: smallvec![1],
row_group_indices: smallvec![RowGroupIndex {
index: 1,
row_group_index: 2
}],
num_rows: 2,
}
]
);
}
#[test]
fn test_not_split_range() {
let range = RangeMeta {
time_range: (Timestamp::new_second(1000), Timestamp::new_second(2000)),
indices: smallvec![1, 2],
row_group_indices: smallvec![
RowGroupIndex {
index: 1,
row_group_index: 1
},
RowGroupIndex {
index: 2,
row_group_index: 1
}
],
num_rows: 5,
};
assert!(!range.can_split_preserve_order());
let mut output = Vec::new();
range.maybe_split(&mut output);
assert_eq!(1, output.len());
}
#[test]
fn test_maybe_split_ranges() {
let ranges = vec![
RangeMeta {
time_range: (Timestamp::new_second(1000), Timestamp::new_second(2000)),
indices: smallvec![1],
row_group_indices: smallvec![
RowGroupIndex {
index: 1,
row_group_index: 0
},
RowGroupIndex {
index: 1,
row_group_index: 1
}
],
num_rows: 4,
},
RangeMeta {
time_range: (Timestamp::new_second(3000), Timestamp::new_second(4000)),
indices: smallvec![2, 3],
row_group_indices: smallvec![
RowGroupIndex {
index: 2,
row_group_index: 0
},
RowGroupIndex {
index: 3,
row_group_index: 0
}
],
num_rows: 5,
},
];
let output = maybe_split_ranges_for_seq_scan(ranges);
assert_eq!(
output,
vec![
RangeMeta {
time_range: (Timestamp::new_second(1000), Timestamp::new_second(2000)),
indices: smallvec![1],
row_group_indices: smallvec![RowGroupIndex {
index: 1,
row_group_index: 0
},],
num_rows: 2,
},
RangeMeta {
time_range: (Timestamp::new_second(1000), Timestamp::new_second(2000)),
indices: smallvec![1],
row_group_indices: smallvec![RowGroupIndex {
index: 1,
row_group_index: 1
}],
num_rows: 2,
},
RangeMeta {
time_range: (Timestamp::new_second(3000), Timestamp::new_second(4000)),
indices: smallvec![2, 3],
row_group_indices: smallvec![
RowGroupIndex {
index: 2,
row_group_index: 0
},
RowGroupIndex {
index: 3,
row_group_index: 0
}
],
num_rows: 5,
},
]
)
}
}
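
The `push_unordered_file_ranges` change above only takes the per-row-group path when the parquet metadata is already in the in-memory cache, so building ranges never triggers I/O, and it falls back to the file-level time range when a row group has no usable statistics. A condensed sketch of that decision with plain structs in place of `FileHandle`/`RangeMeta` (it folds the uncached-but-known-row-group-count case into the whole-file fallback):

```rust
/// Inclusive time range in arbitrary units; stands in for FileTimeRange.
type TimeRange = (i64, i64);

/// Stand-in for the per-file information used when building scan ranges.
struct FileInfo {
    file_index: usize,
    file_time_range: TimeRange,
    // One entry per row group when cached parquet metadata is available, `None` otherwise.
    row_group_time_ranges: Option<Vec<Option<TimeRange>>>,
}

/// Stand-in for RangeMeta: one scan range covering one row group or the whole file.
#[derive(Debug, PartialEq)]
struct Range {
    file_index: usize,
    row_group: Option<usize>, // None means "all row groups"
    time_range: TimeRange,
}

fn unordered_file_ranges(files: &[FileInfo]) -> Vec<Range> {
    let mut ranges = Vec::new();
    for file in files {
        match &file.row_group_time_ranges {
            // Cached metadata: one range per row group, each with its own (or the file's) range.
            Some(groups) => {
                for (i, group_range) in groups.iter().enumerate() {
                    ranges.push(Range {
                        file_index: file.file_index,
                        row_group: Some(i),
                        time_range: group_range.unwrap_or(file.file_time_range),
                    });
                }
            }
            // No cached metadata: a single range that scans the whole file.
            None => ranges.push(Range {
                file_index: file.file_index,
                row_group: None,
                time_range: file.file_time_range,
            }),
        }
    }
    ranges
}

fn main() {
    let files = [
        FileInfo {
            file_index: 0,
            file_time_range: (0, 100),
            row_group_time_ranges: Some(vec![Some((0, 40)), None]),
        },
        FileInfo {
            file_index: 1,
            file_time_range: (50, 90),
            row_group_time_ranges: None,
        },
    ];
    let ranges = unordered_file_ranges(&files);
    assert_eq!(ranges.len(), 3);
    assert_eq!(ranges[1].time_range, (0, 100)); // falls back to the file-level range
}
```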

View File

@@ -74,7 +74,7 @@ impl Scanner {
/// Returns a [RegionScanner] to scan the region.
#[tracing::instrument(level = tracing::Level::DEBUG, skip_all)]
pub(crate) async fn region_scanner(self) -> Result<RegionScannerRef> {
pub(crate) fn region_scanner(self) -> Result<RegionScannerRef> {
match self {
Scanner::Seq(seq_scan) => Ok(Box::new(seq_scan)),
Scanner::Unordered(unordered_scan) => Ok(Box::new(unordered_scan)),
@@ -393,20 +393,29 @@ impl ScanRegion {
.and_then(|c| c.index_cache())
.cloned();
// TODO(zhongzc): currently we only index tag columns, need to support field columns.
let ignore_column_ids = &self
.version
.options
.index_options
.inverted_index
.ignore_column_ids;
let indexed_column_ids = self
.version
.metadata
.primary_key
.iter()
.filter(|id| !ignore_column_ids.contains(id))
.copied()
.collect::<HashSet<_>>();
InvertedIndexApplierBuilder::new(
self.access_layer.region_dir().to_string(),
self.access_layer.object_store().clone(),
file_cache,
index_cache,
self.version.metadata.as_ref(),
self.version
.options
.index_options
.inverted_index
.ignore_column_ids
.iter()
.copied()
.collect(),
indexed_column_ids,
self.access_layer.puffin_manager_factory().clone(),
)
.build(&self.request.filters)

View File

@@ -26,6 +26,7 @@ use crate::error::Result;
use crate::read::range::RowGroupIndex;
use crate::read::scan_region::StreamContext;
use crate::read::{Batch, ScannerMetrics, Source};
use crate::sst::file::FileTimeRange;
use crate::sst::parquet::reader::ReaderMetrics;
struct PartitionMetricsInner {
@@ -128,13 +129,14 @@ pub(crate) fn scan_mem_ranges(
stream_ctx: Arc<StreamContext>,
part_metrics: PartitionMetrics,
index: RowGroupIndex,
time_range: FileTimeRange,
) -> impl Stream<Item = Result<Batch>> {
try_stream! {
let ranges = stream_ctx.build_mem_ranges(index);
part_metrics.inc_num_mem_ranges(ranges.len());
for range in ranges {
let build_reader_start = Instant::now();
let iter = range.build_iter()?;
let iter = range.build_iter(time_range)?;
part_metrics.inc_build_reader_cost(build_reader_start.elapsed());
let mut source = Source::Iter(iter);

View File

@@ -27,6 +27,7 @@ use common_telemetry::tracing;
use datafusion::physical_plan::{DisplayAs, DisplayFormatType};
use datatypes::schema::SchemaRef;
use snafu::ResultExt;
use store_api::metadata::RegionMetadataRef;
use store_api::region_engine::{PartitionRange, RegionScanner, ScannerProperties};
use store_api::storage::TimeSeriesRowSelector;
use tokio::sync::Semaphore;
@@ -232,7 +233,10 @@ impl SeqScan {
let mut metrics = ScannerMetrics::default();
let mut fetch_start = Instant::now();
#[cfg(debug_assertions)]
let mut checker = crate::read::BatchChecker::default();
let mut checker = crate::read::BatchChecker::default()
.with_start(Some(part_range.start))
.with_end(Some(part_range.end));
while let Some(batch) = reader
.next_batch()
.await
@@ -304,8 +308,13 @@ impl RegionScanner for SeqScan {
self.scan_partition_impl(partition)
}
fn prepare(&mut self, ranges: Vec<Vec<PartitionRange>>) -> Result<(), BoxedError> {
fn prepare(
&mut self,
ranges: Vec<Vec<PartitionRange>>,
distinguish_partition_range: bool,
) -> Result<(), BoxedError> {
self.properties.partitions = ranges;
self.properties.distinguish_partition_range = distinguish_partition_range;
Ok(())
}
@@ -313,6 +322,10 @@ impl RegionScanner for SeqScan {
let predicate = self.stream_ctx.input.predicate();
predicate.map(|p| !p.exprs().is_empty()).unwrap_or(false)
}
fn metadata(&self) -> RegionMetadataRef {
self.stream_ctx.input.mapper.metadata().clone()
}
}
impl DisplayAs for SeqScan {
@@ -347,7 +360,12 @@ fn build_sources(
sources.reserve(range_meta.row_group_indices.len());
for index in &range_meta.row_group_indices {
let stream = if stream_ctx.is_mem_range_index(*index) {
let stream = scan_mem_ranges(stream_ctx.clone(), part_metrics.clone(), *index);
let stream = scan_mem_ranges(
stream_ctx.clone(),
part_metrics.clone(),
*index,
range_meta.time_range,
);
Box::pin(stream) as _
} else {
let read_type = if compaction {

View File

@@ -26,6 +26,7 @@ use datafusion::physical_plan::{DisplayAs, DisplayFormatType};
use datatypes::schema::SchemaRef;
use futures::{Stream, StreamExt};
use snafu::ResultExt;
use store_api::metadata::RegionMetadataRef;
use store_api::region_engine::{PartitionRange, RegionScanner, ScannerProperties};
use crate::error::{PartitionOutOfRangeSnafu, Result};
@@ -89,7 +90,7 @@ impl UnorderedScan {
let range_meta = &stream_ctx.ranges[part_range_id];
for index in &range_meta.row_group_indices {
if stream_ctx.is_mem_range_index(*index) {
let stream = scan_mem_ranges(stream_ctx.clone(), part_metrics.clone(), *index);
let stream = scan_mem_ranges(stream_ctx.clone(), part_metrics.clone(), *index, range_meta.time_range);
for await batch in stream {
yield batch;
}
@@ -140,7 +141,9 @@ impl UnorderedScan {
let mut metrics = ScannerMetrics::default();
let mut fetch_start = Instant::now();
#[cfg(debug_assertions)]
let mut checker = crate::read::BatchChecker::default();
let mut checker = crate::read::BatchChecker::default()
.with_start(Some(part_range.start))
.with_end(Some(part_range.end));
let stream = Self::scan_partition_range(
stream_ctx.clone(),
@@ -209,8 +212,13 @@ impl RegionScanner for UnorderedScan {
self.stream_ctx.input.mapper.output_schema()
}
fn prepare(&mut self, ranges: Vec<Vec<PartitionRange>>) -> Result<(), BoxedError> {
fn prepare(
&mut self,
ranges: Vec<Vec<PartitionRange>>,
distinguish_partition_range: bool,
) -> Result<(), BoxedError> {
self.properties.partitions = ranges;
self.properties.distinguish_partition_range = distinguish_partition_range;
Ok(())
}
@@ -222,6 +230,10 @@ impl RegionScanner for UnorderedScan {
let predicate = self.stream_ctx.input.predicate();
predicate.map(|p| !p.exprs().is_empty()).unwrap_or(false)
}
fn metadata(&self) -> RegionMetadataRef {
self.stream_ctx.input.mapper.metadata().clone()
}
}
impl DisplayAs for UnorderedScan {

View File

@@ -277,15 +277,19 @@ impl MitoRegion {
let memtable_usage = (memtables.mutable_usage() + memtables.immutables_usage()) as u64;
let sst_usage = version.ssts.sst_usage();
let index_usage = version.ssts.index_usage();
let wal_usage = self.estimated_wal_usage(memtable_usage);
let manifest_usage = self.stats.total_manifest_size();
let num_rows = version.ssts.num_rows() + version.memtables.num_rows();
RegionStatistic {
num_rows,
memtable_size: memtable_usage,
wal_size: wal_usage,
manifest_size: manifest_usage,
sst_size: sst_usage,
index_size: index_usage,
}
}
@@ -422,15 +426,15 @@ impl ManifestContext {
/// Sets the [`RegionRole`].
///
/// ```
/// +------------------------------------------+
/// | +-----------------+ |
/// | | | |
/// +------------------------------------------+
/// | +-----------------+ |
/// | | | |
/// +---+------+ +-------+-----+ +--v-v---+
/// | Follower | | Downgrading | | Leader |
/// +---^-^----+ +-----+-^-----+ +--+-+---+
/// | | | | | |
/// | +------------------+ +-----------------+ |
/// +------------------------------------------+
/// | | | | | |
/// | +------------------+ +-----------------+ |
/// +------------------------------------------+
///
/// Transition:
/// - Follower -> Leader

View File

@@ -98,6 +98,18 @@ impl VersionControl {
Ok(())
}
/// Applies region option changes and generates a new version.
pub(crate) fn alter_options(&self, options: RegionOptions) {
let version = self.current().version;
let new_version = Arc::new(
VersionBuilder::from_version(version)
.options(options)
.build(),
);
let mut version_data = self.data.write().unwrap();
version_data.version = new_version;
}
/// Apply edit to current version.
pub(crate) fn apply_edit(
&self,
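
`alter_options` builds a new immutable `Version` from the current one and swaps the `Arc` under the write lock, so readers that already cloned the current version keep a consistent snapshot while new readers see the updated options. A generic sketch of that copy-on-write pattern with hypothetical `Options`/`Version` types, not the mito2 ones:

```rust
use std::sync::{Arc, RwLock};

#[derive(Debug)]
struct Options {
    ttl_secs: u64,
}

#[derive(Debug)]
struct Version {
    options: Options,
    files: Vec<String>, // stands in for the SST/memtable state carried over unchanged
}

struct VersionControl {
    current: RwLock<Arc<Version>>,
}

impl VersionControl {
    /// Readers clone the Arc and keep using their snapshot even if a writer swaps it later.
    fn current(&self) -> Arc<Version> {
        self.current.read().unwrap().clone()
    }

    /// Copy-on-write update: derive a new version outside the lock, swap it in under the lock.
    fn alter_options(&self, options: Options) {
        let old = self.current();
        let new_version = Arc::new(Version {
            options,
            files: old.files.clone(),
        });
        *self.current.write().unwrap() = new_version;
    }
}

fn main() {
    let vc = VersionControl {
        current: RwLock::new(Arc::new(Version {
            options: Options { ttl_secs: 60 },
            files: vec!["sst-1".into()],
        })),
    };
    let snapshot = vc.current();
    vc.alter_options(Options { ttl_secs: 3600 });
    assert_eq!(snapshot.options.ttl_secs, 60); // existing readers are unaffected
    assert_eq!(vc.current().options.ttl_secs, 3600);
}
```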

View File

@@ -111,7 +111,8 @@ pub struct FileMeta {
pub region_id: RegionId,
/// Compared to normal file names, FileId ignores the extension
pub file_id: FileId,
/// Timestamp range of file.
/// Timestamp range of file. The timestamps have the same time unit as the
/// data in the SST.
pub time_range: FileTimeRange,
/// SST level of the file.
pub level: Level,

View File

@@ -20,6 +20,7 @@ pub(crate) mod puffin_manager;
mod statistics;
mod store;
use std::collections::HashSet;
use std::num::NonZeroUsize;
use common_telemetry::{debug, warn};
@@ -212,13 +213,28 @@ impl<'a> IndexerBuilder<'a> {
segment_row_count = row_group_size;
}
// TODO(zhongzc): currently we only index tag columns, need to support field columns.
let indexed_column_ids = self
.metadata
.primary_key
.iter()
.filter(|id| {
!self
.index_options
.inverted_index
.ignore_column_ids
.contains(id)
})
.copied()
.collect::<HashSet<_>>();
let indexer = InvertedIndexer::new(
self.file_id,
self.metadata,
self.intermediate_manager.clone(),
self.inverted_index_config.mem_threshold_on_create(),
segment_row_count,
&self.index_options.inverted_index.ignore_column_ids,
indexed_column_ids,
);
Some(indexer)

View File

@@ -20,7 +20,6 @@ mod regex_match;
use std::collections::{HashMap, HashSet};
use api::v1::SemanticType;
use common_telemetry::warn;
use datafusion_common::ScalarValue;
use datafusion_expr::{BinaryExpr, Expr, Operator};
@@ -55,8 +54,8 @@ pub(crate) struct InvertedIndexApplierBuilder<'a> {
/// Metadata of the region, used to get metadata like column type.
metadata: &'a RegionMetadata,
/// Column ids to ignore.
ignore_column_ids: HashSet<ColumnId>,
/// Column ids of the columns that are indexed.
indexed_column_ids: HashSet<ColumnId>,
/// Stores predicates during traversal on the Expr tree.
output: HashMap<ColumnId, Vec<Predicate>>,
@@ -76,7 +75,7 @@ impl<'a> InvertedIndexApplierBuilder<'a> {
file_cache: Option<FileCacheRef>,
index_cache: Option<InvertedIndexCacheRef>,
metadata: &'a RegionMetadata,
ignore_column_ids: HashSet<ColumnId>,
indexed_column_ids: HashSet<ColumnId>,
puffin_manager_factory: PuffinManagerFactory,
) -> Self {
Self {
@@ -84,7 +83,7 @@ impl<'a> InvertedIndexApplierBuilder<'a> {
object_store,
file_cache,
metadata,
ignore_column_ids,
indexed_column_ids,
output: HashMap::default(),
index_cache,
puffin_manager_factory,
@@ -156,9 +155,9 @@ impl<'a> InvertedIndexApplierBuilder<'a> {
self.output.entry(column_id).or_default().push(predicate);
}
/// Helper function to get the column id and the column type of a tag column.
/// Helper function to get the column id and the column type of a column.
/// Returns `None` if the column is not one of the indexed columns.
fn tag_column_id_and_type(
fn column_id_and_type(
&self,
column_name: &str,
) -> Result<Option<(ColumnId, ConcreteDataType)>> {
@@ -169,11 +168,7 @@ impl<'a> InvertedIndexApplierBuilder<'a> {
column: column_name,
})?;
if self.ignore_column_ids.contains(&column.column_id) {
return Ok(None);
}
if column.semantic_type != SemanticType::Tag {
if !self.indexed_column_ids.contains(&column.column_id) {
return Ok(None);
}
@@ -330,7 +325,7 @@ mod tests {
None,
None,
&metadata,
HashSet::default(),
HashSet::from_iter([1, 2, 3]),
facotry,
);

View File

@@ -28,7 +28,7 @@ impl InvertedIndexApplierBuilder<'_> {
let Some(column_name) = Self::column_name(&between.expr) else {
return Ok(());
};
let Some((column_id, data_type)) = self.tag_column_id_and_type(column_name)? else {
let Some((column_id, data_type)) = self.column_id_and_type(column_name)? else {
return Ok(());
};
let Some(low) = Self::nonnull_lit(&between.low) else {
@@ -78,7 +78,7 @@ mod tests {
None,
None,
&metadata,
HashSet::default(),
HashSet::from_iter([1, 2, 3]),
facotry,
);
@@ -121,7 +121,7 @@ mod tests {
None,
None,
&metadata,
HashSet::default(),
HashSet::from_iter([1, 2, 3]),
facotry,
);
@@ -147,7 +147,7 @@ mod tests {
None,
None,
&metadata,
HashSet::default(),
HashSet::from_iter([1, 2, 3]),
facotry,
);
@@ -159,7 +159,24 @@ mod tests {
};
builder.collect_between(&between).unwrap();
assert!(builder.output.is_empty());
let predicates = builder.output.get(&3).unwrap();
assert_eq!(predicates.len(), 1);
assert_eq!(
predicates[0],
Predicate::Range(RangePredicate {
range: Range {
lower: Some(Bound {
inclusive: true,
value: encoded_string("abc"),
}),
upper: Some(Bound {
inclusive: true,
value: encoded_string("def"),
}),
}
})
);
}
#[test]
@@ -173,7 +190,7 @@ mod tests {
None,
None,
&metadata,
HashSet::default(),
HashSet::from_iter([1, 2, 3]),
facotry,
);
@@ -200,7 +217,7 @@ mod tests {
None,
None,
&metadata,
HashSet::default(),
HashSet::from_iter([1, 2, 3]),
facotry,
);

View File

@@ -114,7 +114,7 @@ impl InvertedIndexApplierBuilder<'_> {
let Some(lit) = Self::nonnull_lit(literal) else {
return Ok(());
};
let Some((column_id, data_type)) = self.tag_column_id_and_type(column_name)? else {
let Some((column_id, data_type)) = self.column_id_and_type(column_name)? else {
return Ok(());
};
@@ -234,7 +234,7 @@ mod tests {
None,
None,
&metadata,
HashSet::default(),
HashSet::from_iter([1, 2, 3]),
facotry,
);
@@ -263,7 +263,7 @@ mod tests {
None,
None,
&metadata,
HashSet::default(),
HashSet::from_iter([1, 2, 3]),
facotry,
);
@@ -283,14 +283,28 @@ mod tests {
None,
None,
&metadata,
HashSet::default(),
HashSet::from_iter([1, 2, 3]),
facotry,
);
builder
.collect_comparison_expr(&field_column(), &Operator::Lt, &string_lit("abc"))
.unwrap();
assert!(builder.output.is_empty());
let predicates = builder.output.get(&3).unwrap();
assert_eq!(predicates.len(), 1);
assert_eq!(
predicates[0],
Predicate::Range(RangePredicate {
range: Range {
lower: None,
upper: Some(Bound {
inclusive: false,
value: encoded_string("abc"),
}),
}
})
);
}
#[test]
@@ -304,7 +318,7 @@ mod tests {
None,
None,
&metadata,
HashSet::default(),
HashSet::from_iter([1, 2, 3]),
facotry,
);

View File

@@ -31,7 +31,7 @@ impl InvertedIndexApplierBuilder<'_> {
let Some(lit) = Self::nonnull_lit(right).or_else(|| Self::nonnull_lit(left)) else {
return Ok(());
};
let Some((column_id, data_type)) = self.tag_column_id_and_type(column_name)? else {
let Some((column_id, data_type)) = self.column_id_and_type(column_name)? else {
return Ok(());
};
@@ -59,7 +59,7 @@ impl InvertedIndexApplierBuilder<'_> {
let Some(lit) = Self::nonnull_lit(right).or_else(|| Self::nonnull_lit(left)) else {
return Ok(());
};
let Some((column_id, data_type)) = self.tag_column_id_and_type(column_name)? else {
let Some((column_id, data_type)) = self.column_id_and_type(column_name)? else {
return Ok(());
};
@@ -140,7 +140,7 @@ mod tests {
None,
None,
&metadata,
HashSet::default(),
HashSet::from_iter([1, 2, 3]),
facotry,
);
@@ -178,14 +178,22 @@ mod tests {
None,
None,
&metadata,
HashSet::default(),
HashSet::from_iter([1, 2, 3]),
facotry,
);
builder
.collect_eq(&field_column(), &string_lit("abc"))
.unwrap();
assert!(builder.output.is_empty());
let predicates = builder.output.get(&3).unwrap();
assert_eq!(predicates.len(), 1);
assert_eq!(
predicates[0],
Predicate::InList(InListPredicate {
list: HashSet::from_iter([encoded_string("abc")])
})
);
}
#[test]
@@ -199,7 +207,7 @@ mod tests {
None,
None,
&metadata,
HashSet::default(),
HashSet::from_iter([1, 2, 3]),
facotry,
);
@@ -219,7 +227,7 @@ mod tests {
None,
None,
&metadata,
HashSet::default(),
HashSet::from_iter([1, 2, 3]),
facotry,
);
@@ -239,7 +247,7 @@ mod tests {
None,
None,
&metadata,
HashSet::default(),
HashSet::from_iter([1, 2, 3]),
facotry,
);
@@ -298,7 +306,7 @@ mod tests {
None,
None,
&metadata,
HashSet::default(),
HashSet::from_iter([1, 2, 3]),
facotry,
);
@@ -336,7 +344,7 @@ mod tests {
None,
None,
&metadata,
HashSet::default(),
HashSet::from_iter([1, 2, 3]),
facotry,
);

View File

@@ -29,7 +29,7 @@ impl InvertedIndexApplierBuilder<'_> {
let Some(column_name) = Self::column_name(&inlist.expr) else {
return Ok(());
};
let Some((column_id, data_type)) = self.tag_column_id_and_type(column_name)? else {
let Some((column_id, data_type)) = self.column_id_and_type(column_name)? else {
return Ok(());
};
@@ -71,7 +71,7 @@ mod tests {
None,
None,
&metadata,
HashSet::default(),
HashSet::from_iter([1, 2, 3]),
facotry,
);
@@ -104,7 +104,7 @@ mod tests {
None,
None,
&metadata,
HashSet::default(),
HashSet::from_iter([1, 2, 3]),
facotry,
);
@@ -129,7 +129,7 @@ mod tests {
None,
None,
&metadata,
HashSet::default(),
HashSet::from_iter([1, 2, 3]),
facotry,
);
@@ -140,7 +140,15 @@ mod tests {
};
builder.collect_inlist(&in_list).unwrap();
assert!(builder.output.is_empty());
let predicates = builder.output.get(&3).unwrap();
assert_eq!(predicates.len(), 1);
assert_eq!(
predicates[0],
Predicate::InList(InListPredicate {
list: HashSet::from_iter([encoded_string("foo"), encoded_string("bar")])
})
);
}
#[test]
@@ -154,7 +162,7 @@ mod tests {
None,
None,
&metadata,
HashSet::default(),
HashSet::from_iter([1, 2, 3]),
facotry,
);
@@ -181,7 +189,7 @@ mod tests {
None,
None,
&metadata,
HashSet::default(),
HashSet::from_iter([1, 2, 3]),
facotry,
);

View File

@@ -25,7 +25,7 @@ impl InvertedIndexApplierBuilder<'_> {
let Some(column_name) = Self::column_name(column) else {
return Ok(());
};
let Some((column_id, data_type)) = self.tag_column_id_and_type(column_name)? else {
let Some((column_id, data_type)) = self.column_id_and_type(column_name)? else {
return Ok(());
};
if !data_type.is_string() {
@@ -65,7 +65,7 @@ mod tests {
None,
None,
&metadata,
HashSet::default(),
HashSet::from_iter([1, 2, 3]),
facotry,
);
@@ -94,7 +94,7 @@ mod tests {
None,
None,
&metadata,
HashSet::default(),
HashSet::from_iter([1, 2, 3]),
facotry,
);
@@ -102,7 +102,14 @@ mod tests {
.collect_regex_match(&field_column(), &string_lit("abc"))
.unwrap();
assert!(builder.output.is_empty());
let predicates = builder.output.get(&3).unwrap();
assert_eq!(predicates.len(), 1);
assert_eq!(
predicates[0],
Predicate::RegexMatch(RegexMatchPredicate {
pattern: "abc".to_string()
})
);
}
#[test]
@@ -116,7 +123,7 @@ mod tests {
None,
None,
&metadata,
HashSet::default(),
HashSet::from_iter([1, 2, 3]),
facotry,
);
@@ -138,7 +145,7 @@ mod tests {
None,
None,
&metadata,
HashSet::default(),
HashSet::from_iter([1, 2, 3]),
facotry,
);

View File

@@ -36,6 +36,7 @@ use crate::error::{
PushIndexValueSnafu, Result,
};
use crate::read::Batch;
use crate::row_converter::SortField;
use crate::sst::file::FileId;
use crate::sst::index::intermediate::{IntermediateLocation, IntermediateManager};
use crate::sst::index::inverted_index::codec::{IndexValueCodec, IndexValuesCodec};
@@ -72,7 +73,7 @@ pub struct InvertedIndexer {
memory_usage: Arc<AtomicUsize>,
/// Ids of indexed columns.
column_ids: HashSet<ColumnId>,
indexed_column_ids: HashSet<ColumnId>,
}
impl InvertedIndexer {
@@ -84,7 +85,7 @@ impl InvertedIndexer {
intermediate_manager: IntermediateManager,
memory_usage_threshold: Option<usize>,
segment_row_count: NonZeroUsize,
ignore_column_ids: &[ColumnId],
indexed_column_ids: HashSet<ColumnId>,
) -> Self {
let temp_file_provider = Arc::new(TempFileProvider::new(
IntermediateLocation::new(&metadata.region_id, &sst_file_id),
@@ -102,14 +103,6 @@ impl InvertedIndexer {
let index_creator = Box::new(SortIndexCreator::new(sorter, segment_row_count));
let codec = IndexValuesCodec::from_tag_columns(metadata.primary_key_columns());
let mut column_ids = metadata
.primary_key_columns()
.map(|c| c.column_id)
.collect::<HashSet<_>>();
for id in ignore_column_ids {
column_ids.remove(id);
}
Self {
codec,
index_creator,
@@ -118,7 +111,7 @@ impl InvertedIndexer {
stats: Statistics::new(TYPE_INVERTED_INDEX),
aborted: false,
memory_usage,
column_ids,
indexed_column_ids,
}
}
@@ -189,7 +182,7 @@ impl InvertedIndexer {
guard.inc_row_count(n);
for ((col_id, col_id_str), field, value) in self.codec.decode(batch.primary_key())? {
if !self.column_ids.contains(col_id) {
if !self.indexed_column_ids.contains(col_id) {
continue;
}
@@ -210,6 +203,32 @@ impl InvertedIndexer {
.context(PushIndexValueSnafu)?;
}
for field in batch.fields() {
if !self.indexed_column_ids.contains(&field.column_id) {
continue;
}
let sort_field = SortField::new(field.data.data_type());
let col_id_str = field.column_id.to_string();
for i in 0..n {
self.value_buf.clear();
let value = field.data.get_ref(i);
if value.is_null() {
self.index_creator
.push_with_name(&col_id_str, None)
.await
.context(PushIndexValueSnafu)?;
} else {
IndexValueCodec::encode_nonnull_value(value, &sort_field, &mut self.value_buf)?;
self.index_creator
.push_with_name(&col_id_str, Some(&self.value_buf))
.await
.context(PushIndexValueSnafu)?;
}
}
}
Ok(())
}
@@ -269,7 +288,7 @@ impl InvertedIndexer {
}
pub fn column_ids(&self) -> impl Iterator<Item = ColumnId> + '_ {
self.column_ids.iter().copied()
self.indexed_column_ids.iter().copied()
}
pub fn memory_usage(&self) -> usize {
@@ -297,6 +316,7 @@ mod tests {
use super::*;
use crate::cache::index::InvertedIndexCache;
use crate::read::BatchColumn;
use crate::row_converter::{McmpRowCodec, RowCodec, SortField};
use crate::sst::index::inverted_index::applier::builder::InvertedIndexApplierBuilder;
use crate::sst::index::puffin_manager::PuffinManagerFactory;
@@ -340,12 +360,25 @@ mod tests {
semantic_type: SemanticType::Timestamp,
column_id: 3,
})
.push_column_metadata(ColumnMetadata {
column_schema: ColumnSchema::new(
"field_u64",
ConcreteDataType::uint64_datatype(),
false,
),
semantic_type: SemanticType::Field,
column_id: 4,
})
.primary_key(vec![1, 2]);
Arc::new(builder.build().unwrap())
}
fn new_batch(num_rows: usize, str_tag: impl AsRef<str>, i32_tag: impl Into<i32>) -> Batch {
fn new_batch(
str_tag: impl AsRef<str>,
i32_tag: impl Into<i32>,
u64_field: impl IntoIterator<Item = u64>,
) -> Batch {
let fields = vec![
SortField::new(ConcreteDataType::string_datatype()),
SortField::new(ConcreteDataType::int32_datatype()),
@@ -354,6 +387,12 @@ mod tests {
let row: [ValueRef; 2] = [str_tag.as_ref().into(), i32_tag.into().into()];
let primary_key = codec.encode(row.into_iter()).unwrap();
let u64_field = BatchColumn {
column_id: 4,
data: Arc::new(UInt64Vector::from_iter_values(u64_field)),
};
let num_rows = u64_field.data.len();
Batch::new(
primary_key,
Arc::new(UInt64Vector::from_iter_values(
@@ -365,14 +404,14 @@ mod tests {
Arc::new(UInt8Vector::from_iter_values(
iter::repeat(1).take(num_rows),
)),
vec![],
vec![u64_field],
)
.unwrap()
}
async fn build_applier_factory(
prefix: &str,
tags: BTreeSet<(&'static str, i32)>,
rows: BTreeSet<(&'static str, i32, [u64; 2])>,
) -> impl Fn(DfExpr) -> BoxFuture<'static, Vec<usize>> {
let (d, factory) = PuffinManagerFactory::new_for_test_async(prefix).await;
let region_dir = "region0".to_string();
@@ -383,6 +422,7 @@ mod tests {
let intm_mgr = new_intm_mgr(d.path().to_string_lossy()).await;
let memory_threshold = None;
let segment_row_count = 2;
let indexed_column_ids = HashSet::from_iter([1, 2, 4]);
let mut creator = InvertedIndexer::new(
sst_file_id,
@@ -390,18 +430,18 @@ mod tests {
intm_mgr,
memory_threshold,
NonZeroUsize::new(segment_row_count).unwrap(),
&[],
indexed_column_ids.clone(),
);
for (str_tag, i32_tag) in &tags {
let batch = new_batch(segment_row_count, str_tag, *i32_tag);
for (str_tag, i32_tag, u64_field) in &rows {
let batch = new_batch(str_tag, *i32_tag, u64_field.iter().copied());
creator.update(&batch).await.unwrap();
}
let puffin_manager = factory.build(object_store.clone());
let mut writer = puffin_manager.writer(&file_path).await.unwrap();
let (row_count, _) = creator.finish(&mut writer).await.unwrap();
assert_eq!(row_count, tags.len() * segment_row_count);
assert_eq!(row_count, rows.len() * segment_row_count);
writer.finish().await.unwrap();
move |expr| {
@@ -413,7 +453,7 @@ mod tests {
None,
Some(cache),
&region_metadata,
Default::default(),
indexed_column_ids.clone(),
factory.clone(),
)
.build(&[expr])
@@ -433,19 +473,19 @@ mod tests {
#[tokio::test]
async fn test_create_and_query_get_key() {
let tags = BTreeSet::from_iter([
("aaa", 1),
("aaa", 2),
("aaa", 3),
("aab", 1),
("aab", 2),
("aab", 3),
("abc", 1),
("abc", 2),
("abc", 3),
let rows = BTreeSet::from_iter([
("aaa", 1, [1, 2]),
("aaa", 2, [2, 3]),
("aaa", 3, [3, 4]),
("aab", 1, [4, 5]),
("aab", 2, [5, 6]),
("aab", 3, [6, 7]),
("abc", 1, [7, 8]),
("abc", 2, [8, 9]),
("abc", 3, [9, 10]),
]);
let applier_factory = build_applier_factory("test_create_and_query_get_key_", tags).await;
let applier_factory = build_applier_factory("test_create_and_query_get_key_", rows).await;
let expr = col("tag_str").eq(lit("aaa"));
let res = applier_factory(expr).await;
@@ -468,23 +508,27 @@ mod tests {
let expr = col("tag_str").in_list(vec![lit("aaa"), lit("abc")], false);
let res = applier_factory(expr).await;
assert_eq!(res, vec![0, 1, 2, 6, 7, 8]);
let expr = col("field_u64").eq(lit(2u64));
let res = applier_factory(expr).await;
assert_eq!(res, vec![0, 1]);
}
#[tokio::test]
async fn test_create_and_query_range() {
let tags = BTreeSet::from_iter([
("aaa", 1),
("aaa", 2),
("aaa", 3),
("aab", 1),
("aab", 2),
("aab", 3),
("abc", 1),
("abc", 2),
("abc", 3),
let rows = BTreeSet::from_iter([
("aaa", 1, [1, 2]),
("aaa", 2, [2, 3]),
("aaa", 3, [3, 4]),
("aab", 1, [4, 5]),
("aab", 2, [5, 6]),
("aab", 3, [6, 7]),
("abc", 1, [7, 8]),
("abc", 2, [8, 9]),
("abc", 3, [9, 10]),
]);
let applier_factory = build_applier_factory("test_create_and_query_range_", tags).await;
let applier_factory = build_applier_factory("test_create_and_query_range_", rows).await;
let expr = col("tag_str").between(lit("aaa"), lit("aab"));
let res = applier_factory(expr).await;
@@ -501,24 +545,28 @@ mod tests {
let expr = col("tag_i32").between(lit(2), lit(2));
let res = applier_factory(expr).await;
assert_eq!(res, vec![1, 4, 7]);
let expr = col("field_u64").between(lit(2u64), lit(5u64));
let res = applier_factory(expr).await;
assert_eq!(res, vec![0, 1, 2, 3, 4]);
}
#[tokio::test]
async fn test_create_and_query_comparison() {
let tags = BTreeSet::from_iter([
("aaa", 1),
("aaa", 2),
("aaa", 3),
("aab", 1),
("aab", 2),
("aab", 3),
("abc", 1),
("abc", 2),
("abc", 3),
let rows = BTreeSet::from_iter([
("aaa", 1, [1, 2]),
("aaa", 2, [2, 3]),
("aaa", 3, [3, 4]),
("aab", 1, [4, 5]),
("aab", 2, [5, 6]),
("aab", 3, [6, 7]),
("abc", 1, [7, 8]),
("abc", 2, [8, 9]),
("abc", 3, [9, 10]),
]);
let applier_factory =
build_applier_factory("test_create_and_query_comparison_", tags).await;
build_applier_factory("test_create_and_query_comparison_", rows).await;
let expr = col("tag_str").lt(lit("aab"));
let res = applier_factory(expr).await;
@@ -528,6 +576,10 @@ mod tests {
let res = applier_factory(expr).await;
assert_eq!(res, vec![0, 3, 6]);
let expr = col("field_u64").lt(lit(2u64));
let res = applier_factory(expr).await;
assert_eq!(res, vec![0]);
let expr = col("tag_str").gt(lit("aab"));
let res = applier_factory(expr).await;
assert_eq!(res, vec![6, 7, 8]);
@@ -536,6 +588,10 @@ mod tests {
let res = applier_factory(expr).await;
assert_eq!(res, vec![2, 5, 8]);
let expr = col("field_u64").gt(lit(8u64));
let res = applier_factory(expr).await;
assert_eq!(res, vec![7, 8]);
let expr = col("tag_str").lt_eq(lit("aab"));
let res = applier_factory(expr).await;
assert_eq!(res, vec![0, 1, 2, 3, 4, 5]);
@@ -544,6 +600,10 @@ mod tests {
let res = applier_factory(expr).await;
assert_eq!(res, vec![0, 1, 3, 4, 6, 7]);
let expr = col("field_u64").lt_eq(lit(2u64));
let res = applier_factory(expr).await;
assert_eq!(res, vec![0, 1]);
let expr = col("tag_str").gt_eq(lit("aab"));
let res = applier_factory(expr).await;
assert_eq!(res, vec![3, 4, 5, 6, 7, 8]);
@@ -552,6 +612,10 @@ mod tests {
let res = applier_factory(expr).await;
assert_eq!(res, vec![1, 2, 4, 5, 7, 8]);
let expr = col("field_u64").gt_eq(lit(8u64));
let res = applier_factory(expr).await;
assert_eq!(res, vec![6, 7, 8]);
let expr = col("tag_str")
.gt(lit("aaa"))
.and(col("tag_str").lt(lit("abc")));
@@ -561,23 +625,29 @@ mod tests {
let expr = col("tag_i32").gt(lit(1)).and(col("tag_i32").lt(lit(3)));
let res = applier_factory(expr).await;
assert_eq!(res, vec![1, 4, 7]);
let expr = col("field_u64")
.gt(lit(2u64))
.and(col("field_u64").lt(lit(9u64)));
let res = applier_factory(expr).await;
assert_eq!(res, vec![1, 2, 3, 4, 5, 6, 7]);
}
#[tokio::test]
async fn test_create_and_query_regex() {
let tags = BTreeSet::from_iter([
("aaa", 1),
("aaa", 2),
("aaa", 3),
("aab", 1),
("aab", 2),
("aab", 3),
("abc", 1),
("abc", 2),
("abc", 3),
let rows = BTreeSet::from_iter([
("aaa", 1, [1, 2]),
("aaa", 2, [2, 3]),
("aaa", 3, [3, 4]),
("aab", 1, [4, 5]),
("aab", 2, [5, 6]),
("aab", 3, [6, 7]),
("abc", 1, [7, 8]),
("abc", 2, [8, 9]),
("abc", 3, [9, 10]),
]);
let applier_factory = build_applier_factory("test_create_and_query_regex_", tags).await;
let applier_factory = build_applier_factory("test_create_and_query_regex_", rows).await;
let expr = binary_expr(col("tag_str"), Operator::RegexMatch, lit(".*"));
let res = applier_factory(expr).await;
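Above, InvertedIndexer::new now takes indexed_column_ids directly instead of deriving the set from primary-key columns and an ignore list, and update() also pushes field values (null or encoded) row by row. The helper below is a hypothetical illustration of how a caller might assemble that set to match the test fixture (tags 1 and 2, time index 3, field 4); it is not the engine's actual selection policy:

use std::collections::HashSet;

// Hypothetical policy: primary-key columns plus selected field columns,
// minus any explicitly ignored ids.
fn indexed_column_ids(
    primary_key_ids: &[u32],
    indexed_field_ids: &[u32],
    ignore_ids: &[u32],
) -> HashSet<u32> {
    let mut ids: HashSet<u32> = primary_key_ids
        .iter()
        .chain(indexed_field_ids)
        .copied()
        .collect();
    for id in ignore_ids {
        ids.remove(id);
    }
    ids
}

fn main() {
    // Mirrors the test fixture above: tags 1 and 2, field 4 indexed, nothing ignored.
    assert_eq!(
        indexed_column_ids(&[1, 2], &[4], &[]),
        HashSet::from_iter([1, 2, 4])
    );
}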

View File

@@ -62,7 +62,8 @@ impl Default for WriteOptions {
/// Parquet SST info returned by the writer.
pub struct SstInfo {
/// Time range of the SST.
/// Time range of the SST. The timestamps have the same time unit as the
/// data in the SST.
pub time_range: FileTimeRange,
/// File size in bytes.
pub file_size: u64,

View File

@@ -31,13 +31,14 @@ use std::collections::{HashMap, VecDeque};
use std::sync::Arc;
use api::v1::SemanticType;
use common_time::Timestamp;
use datafusion_common::ScalarValue;
use datatypes::arrow::array::{ArrayRef, BinaryArray, DictionaryArray, UInt32Array, UInt64Array};
use datatypes::arrow::datatypes::{SchemaRef, UInt32Type};
use datatypes::arrow::record_batch::RecordBatch;
use datatypes::prelude::DataType;
use datatypes::vectors::{Helper, Vector};
use parquet::file::metadata::RowGroupMetaData;
use parquet::file::metadata::{ParquetMetaData, RowGroupMetaData};
use parquet::file::statistics::Statistics;
use snafu::{ensure, OptionExt, ResultExt};
use store_api::metadata::{ColumnMetadata, RegionMetadataRef};
@@ -48,6 +49,7 @@ use crate::error::{
};
use crate::read::{Batch, BatchBuilder, BatchColumn};
use crate::row_converter::{McmpRowCodec, RowCodec, SortField};
use crate::sst::file::{FileMeta, FileTimeRange};
use crate::sst::to_sst_arrow_schema;
/// Arrow array type for the primary key dictionary.
@@ -558,6 +560,50 @@ fn new_primary_key_array(primary_key: &[u8], num_rows: usize) -> ArrayRef {
Arc::new(DictionaryArray::new(keys, values))
}
/// Gets the min/max time index of the row group from the parquet meta.
/// It assumes the parquet is created by the mito engine.
pub(crate) fn parquet_row_group_time_range(
file_meta: &FileMeta,
parquet_meta: &ParquetMetaData,
row_group_idx: usize,
) -> Option<FileTimeRange> {
let row_group_meta = parquet_meta.row_group(row_group_idx);
let num_columns = parquet_meta.file_metadata().schema_descr().num_columns();
assert!(
num_columns >= FIXED_POS_COLUMN_NUM,
"file only has {} columns",
num_columns
);
let time_index_pos = num_columns - FIXED_POS_COLUMN_NUM;
let stats = row_group_meta.column(time_index_pos).statistics()?;
if stats.has_min_max_set() {
// The physical type for the timestamp should be i64.
let (min, max) = match stats {
Statistics::Int64(value_stats) => (*value_stats.min(), *value_stats.max()),
Statistics::Int32(_)
| Statistics::Boolean(_)
| Statistics::Int96(_)
| Statistics::Float(_)
| Statistics::Double(_)
| Statistics::ByteArray(_)
| Statistics::FixedLenByteArray(_) => return None,
};
debug_assert!(
min >= file_meta.time_range.0.value() && min <= file_meta.time_range.1.value()
);
debug_assert!(
max >= file_meta.time_range.0.value() && max <= file_meta.time_range.1.value()
);
let unit = file_meta.time_range.0.unit();
Some((Timestamp::new(min, unit), Timestamp::new(max, unit)))
} else {
None
}
}
#[cfg(test)]
mod tests {
use api::v1::OpType;
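parquet_row_group_time_range above returns a row group's (min, max) timestamps when int64 statistics are present and None otherwise. A self-contained sketch of the pruning step such a range enables, with plain i64 pairs standing in for Timestamp and a missing range kept conservatively:

fn overlaps(range: (i64, i64), query: (i64, i64)) -> bool {
    range.0 <= query.1 && range.1 >= query.0
}

fn main() {
    // Pretend these (min, max) pairs came from parquet_row_group_time_range for
    // row groups 0..4; a row group without statistics (None) must be kept.
    let row_group_ranges = [Some((0i64, 9i64)), Some((10, 19)), None, Some((20, 29))];
    let query = (12i64, 25i64);
    let mut selected = Vec::new();
    for (idx, range) in row_group_ranges.iter().enumerate() {
        match range {
            Some(r) => {
                if overlaps(*r, query) {
                    selected.push(idx);
                }
            }
            None => selected.push(idx),
        }
    }
    assert_eq!(selected, vec![1, 2, 3]);
}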

View File

@@ -84,7 +84,25 @@ impl SstVersion {
}
}
/// Returns SST files' space occupied in current version.
/// Returns the number of rows in SST files.
/// For historical reasons, the result is not precise for old SST files.
pub(crate) fn num_rows(&self) -> u64 {
self.levels
.iter()
.map(|level_meta| {
level_meta
.files
.values()
.map(|file_handle| {
let meta = file_handle.meta_ref();
meta.num_rows
})
.sum::<u64>()
})
.sum()
}
/// Returns SST data files' space occupied in current version.
pub(crate) fn sst_usage(&self) -> u64 {
self.levels
.iter()
@@ -94,7 +112,24 @@ impl SstVersion {
.values()
.map(|file_handle| {
let meta = file_handle.meta_ref();
meta.file_size + meta.index_file_size
meta.file_size
})
.sum::<u64>()
})
.sum()
}
/// Returns SST index files' space occupied in current version.
pub(crate) fn index_usage(&self) -> u64 {
self.levels
.iter()
.map(|level_meta| {
level_meta
.files
.values()
.map(|file_handle| {
let meta = file_handle.meta_ref();
meta.index_file_size
})
.sum::<u64>()
})
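Illustrative only: the hunk above splits the old file_size + index_file_size sum into separate data and index totals and adds a row-count total for the region statistics. A tiny stand-alone sketch of the same per-file aggregation, where FileStats is a made-up stand-in for FileMeta:

struct FileStats {
    num_rows: u64,
    file_size: u64,
    index_file_size: u64,
}

fn main() {
    let files = [
        FileStats { num_rows: 100, file_size: 4096, index_file_size: 512 },
        FileStats { num_rows: 250, file_size: 8192, index_file_size: 1024 },
    ];
    // Data and index sizes are summed separately, matching the new
    // sst_usage/index_usage split.
    let num_rows: u64 = files.iter().map(|f| f.num_rows).sum();
    let sst_usage: u64 = files.iter().map(|f| f.file_size).sum();
    let index_usage: u64 = files.iter().map(|f| f.index_file_size).sum();
    assert_eq!((num_rows, sst_usage, index_usage), (350, 12288, 1536));
}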

View File

@@ -124,16 +124,6 @@ impl MemtableBuilder for EmptyMemtableBuilder {
}
}
/// Empty iterator builder.
#[derive(Default)]
pub(crate) struct EmptyIterBuilder {}
impl IterBuilder for EmptyIterBuilder {
fn build(&self) -> Result<BoxedBatchIterator> {
Ok(Box::new(std::iter::empty()))
}
}
/// Creates a region metadata to test memtable with default pk.
///
/// The schema is `k0, k1, ts, v0, v1` and pk is `k0, k1`.

View File

@@ -19,7 +19,7 @@ use std::sync::Arc;
use common_telemetry::{debug, info};
use snafu::ResultExt;
use store_api::metadata::{RegionMetadata, RegionMetadataBuilder, RegionMetadataRef};
use store_api::region_request::RegionAlterRequest;
use store_api::region_request::{AlterKind, ChangeOption, RegionAlterRequest};
use store_api::storage::RegionId;
use crate::error::{
@@ -27,6 +27,8 @@ use crate::error::{
};
use crate::flush::FlushReason;
use crate::manifest::action::RegionChange;
use crate::region::version::VersionRef;
use crate::region::MitoRegionRef;
use crate::request::{DdlRequest, OptionOutputTx, SenderDdlRequest};
use crate::worker::RegionWorkerLoop;
@@ -45,6 +47,13 @@ impl<S> RegionWorkerLoop<S> {
// Get the version before alter.
let version = region.version();
// fast path for memory state changes like options.
if let AlterKind::ChangeRegionOptions { options } = request.kind {
self.handle_alter_region_options(region, version, options, sender);
return;
}
if version.metadata.schema_version != request.schema_version {
// This is possible if we retry the request.
debug!(
@@ -67,6 +76,7 @@ impl<S> RegionWorkerLoop<S> {
sender.send(Err(e).context(InvalidRegionRequestSnafu));
return;
}
// Checks whether we need to alter the region.
if !request.need_alter(&version.metadata) {
debug!(
@@ -111,7 +121,17 @@ impl<S> RegionWorkerLoop<S> {
version.metadata.schema_version,
region.metadata().schema_version
);
self.handle_alter_region_metadata(region, version, request, sender);
}
/// Handles region metadata changes.
fn handle_alter_region_metadata(
&mut self,
region: MitoRegionRef,
version: VersionRef,
request: RegionAlterRequest,
sender: OptionOutputTx,
) {
let new_meta = match metadata_after_alteration(&version.metadata, request) {
Ok(new_meta) => new_meta,
Err(e) => {
@@ -120,11 +140,38 @@ impl<S> RegionWorkerLoop<S> {
}
};
// Persist the metadata to region's manifest.
let change = RegionChange {
metadata: new_meta.clone(),
};
let change = RegionChange { metadata: new_meta };
self.handle_manifest_region_change(region, change, sender)
}
/// Handles requests that change region options, like TTL. It only affects memory state
/// since changes are persisted in the `DatanodeTableValue` in metasrv.
fn handle_alter_region_options(
&mut self,
region: MitoRegionRef,
version: VersionRef,
options: Vec<ChangeOption>,
sender: OptionOutputTx,
) {
let mut current_options = version.options.clone();
for option in options {
match option {
ChangeOption::TTL(new_ttl) => {
info!(
"Update region ttl: {}, previous: {:?} new: {:?}",
region.region_id, current_options.ttl, new_ttl
);
if new_ttl.is_zero() {
current_options.ttl = None;
} else {
current_options.ttl = Some(new_ttl);
}
}
}
}
region.version_control.alter_options(current_options);
sender.send(Ok(0));
}
}
/// Creates a metadata after applying the alter `request` to the old `metadata`.
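handle_alter_region_options above applies option changes in memory only; for TTL the rule is that a zero duration clears the option and any other value replaces it. A small stand-alone sketch of that normalization (apply_ttl is a hypothetical helper, not a function in the worker):

use std::time::Duration;

// A zero TTL clears the option; any other value replaces it.
fn apply_ttl(new_ttl: Duration) -> Option<Duration> {
    if new_ttl.is_zero() {
        None
    } else {
        Some(new_ttl)
    }
}

fn main() {
    assert_eq!(apply_ttl(Duration::ZERO), None);
    assert_eq!(
        apply_ttl(Duration::from_secs(86_400)),
        Some(Duration::from_secs(86_400))
    );
}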

View File

@@ -22,6 +22,7 @@ use snafu::{OptionExt, ResultExt};
use store_api::logstore::LogStore;
use store_api::region_request::RegionOpenRequest;
use store_api::storage::RegionId;
use table::requests::STORAGE_KEY;
use crate::error::{
ObjectStoreNotFoundSnafu, OpenDalSnafu, OpenRegionSnafu, RegionNotFoundSnafu, Result,
@@ -38,7 +39,7 @@ impl<S: LogStore> RegionWorkerLoop<S> {
region_id: RegionId,
request: &RegionOpenRequest,
) -> Result<()> {
let object_store = if let Some(storage_name) = request.options.get("storage") {
let object_store = if let Some(storage_name) = request.options.get(STORAGE_KEY) {
self.object_store_manager
.find(storage_name)
.context(ObjectStoreNotFoundSnafu {

View File

@@ -18,9 +18,9 @@ use api::helper::ColumnDataTypeWrapper;
use api::v1::alter_expr::Kind;
use api::v1::column_def::options_from_column_schema;
use api::v1::{
AddColumn, AddColumns, AlterExpr, ChangeColumnType, ChangeColumnTypes, ColumnDataType,
ColumnDataTypeExtension, CreateFlowExpr, CreateTableExpr, CreateViewExpr, DropColumn,
DropColumns, ExpireAfter, RenameTable, SemanticType, TableName,
AddColumn, AddColumns, AlterExpr, ChangeColumnType, ChangeColumnTypes, ChangeTableOptions,
ColumnDataType, ColumnDataTypeExtension, CreateFlowExpr, CreateTableExpr, CreateViewExpr,
DropColumn, DropColumns, ExpireAfter, RenameTable, SemanticType, TableName,
};
use common_error::ext::BoxedError;
use common_grpc_expr::util::ColumnExpr;
@@ -438,7 +438,7 @@ pub(crate) fn to_alter_expr(
.map_err(BoxedError::new)
.context(ExternalSnafu)?;
let kind = match alter_table.alter_operation() {
let kind = match alter_table.alter_operation {
AlterTableOperation::AddConstraint(_) => {
return NotSupportedSnafu {
feat: "ADD CONSTRAINT",
@@ -451,7 +451,7 @@ pub(crate) fn to_alter_expr(
} => Kind::AddColumns(AddColumns {
add_columns: vec![AddColumn {
column_def: Some(
sql_column_def_to_grpc_column_def(column_def, Some(&query_ctx.timezone()))
sql_column_def_to_grpc_column_def(&column_def, Some(&query_ctx.timezone()))
.map_err(BoxedError::new)
.context(ExternalSnafu)?,
),
@@ -463,13 +463,13 @@ pub(crate) fn to_alter_expr(
target_type,
} => {
let target_type =
sql_data_type_to_concrete_data_type(target_type).context(ParseSqlSnafu)?;
sql_data_type_to_concrete_data_type(&target_type).context(ParseSqlSnafu)?;
let (target_type, target_type_extension) = ColumnDataTypeWrapper::try_from(target_type)
.map(|w| w.to_parts())
.context(ColumnDataTypeSnafu)?;
Kind::ChangeColumnTypes(ChangeColumnTypes {
change_column_types: vec![ChangeColumnType {
column_name: column_name.value.to_string(),
column_name: column_name.value,
target_type: target_type as i32,
target_type_extension,
}],
@@ -483,6 +483,11 @@ pub(crate) fn to_alter_expr(
AlterTableOperation::RenameTable { new_table_name } => Kind::RenameTable(RenameTable {
new_table_name: new_table_name.to_string(),
}),
AlterTableOperation::ChangeTableOptions { options } => {
Kind::ChangeTableOptions(ChangeTableOptions {
change_table_options: options.into_iter().map(Into::into).collect(),
})
}
};
Ok(AlterExpr {
@@ -744,7 +749,7 @@ mod tests {
#[test]
fn test_to_alter_change_column_type_expr() {
let sql = "ALTER TABLE monitor MODIFY mem_usage STRING;";
let sql = "ALTER TABLE monitor MODIFY COLUMN mem_usage STRING;";
let stmt =
ParserContext::create_with_dialect(sql, &GreptimeDbDialect {}, ParseOptions::default())
.unwrap()

View File

@@ -114,6 +114,7 @@ fn push_column_to_rows(column: Column, rows: &mut [Row]) -> Result<()> {
(Float64, F64Value, f64_values),
(Binary, BinaryValue, binary_values),
(String, StringValue, string_values),
(Json, StringValue, string_values),
(Date, DateValue, date_values),
(Datetime, DatetimeValue, datetime_values),
(

View File

@@ -363,7 +363,7 @@ impl StatementExecutor {
pub async fn plan(
&self,
stmt: QueryStatement,
stmt: &QueryStatement,
query_ctx: QueryContextRef,
) -> Result<LogicalPlan> {
self.query_engine
@@ -373,6 +373,14 @@ impl StatementExecutor {
.context(PlanStatementSnafu)
}
/// Execute [`LogicalPlan`] directly.
pub async fn exec_plan(&self, plan: LogicalPlan, query_ctx: QueryContextRef) -> Result<Output> {
self.query_engine
.execute(plan, query_ctx)
.await
.context(ExecLogicalPlanSnafu)
}
pub fn optimize_logical_plan(&self, plan: LogicalPlan) -> Result<LogicalPlan> {
self.query_engine
.planner()
@@ -382,11 +390,8 @@ impl StatementExecutor {
#[tracing::instrument(skip_all)]
async fn plan_exec(&self, stmt: QueryStatement, query_ctx: QueryContextRef) -> Result<Output> {
let plan = self.plan(stmt, query_ctx.clone()).await?;
self.query_engine
.execute(plan, query_ctx)
.await
.context(ExecLogicalPlanSnafu)
let plan = self.plan(&stmt, query_ctx.clone()).await?;
self.exec_plan(plan, query_ctx).await
}
async fn get_table(&self, table_ref: &TableReference<'_>) -> Result<TableRef> {

View File

@@ -391,7 +391,7 @@ impl StatementExecutor {
let logical_plan = match &*create_view.query {
Statement::Query(query) => {
self.plan(
QueryStatement::Sql(Statement::Query(query.clone())),
&QueryStatement::Sql(Statement::Query(query.clone())),
ctx.clone(),
)
.await?

View File

@@ -90,7 +90,7 @@ impl StatementExecutor {
};
self.query_engine
.planner()
.plan(stmt, query_ctx.clone())
.plan(&stmt, query_ctx.clone())
.await
.context(PlanStatementSnafu)
}

View File

@@ -274,7 +274,7 @@ impl<'a> RuleChecker<'a> {
fn check_axis(&self) -> Result<()> {
for (col_index, axis) in self.axis.iter().enumerate() {
for (val, split_point) in axis {
if split_point.less_than_counter != 0 || !split_point.is_equal {
if !split_point.is_equal {
UnclosedValueSnafu {
value: format!("{val:?}"),
column: self.rule.partition_columns[col_index].clone(),
@@ -410,6 +410,7 @@ mod tests {
/// b <= h b >= s
/// ```
#[test]
#[ignore = "don't check unmatched `>` and `<` for now"]
fn empty_expr_case_1() {
// PARTITION ON COLUMNS (b) (
// b <= 'h',
@@ -451,6 +452,7 @@ mod tests {
/// 10 20
/// ```
#[test]
#[ignore = "don't check unmatched `>` and `<` for now"]
fn empty_expr_case_2() {
// PARTITION ON COLUMNS (b) (
// a >= 100 AND b <= 10 OR a > 100 AND a <= 200 AND b <= 10 OR a >= 200 AND b > 10 AND b <= 20 OR a > 200 AND b <= 20
@@ -580,6 +582,7 @@ mod tests {
}
#[test]
#[ignore = "don't check unmatched `>` and `<` for now"]
fn duplicate_expr_case_1() {
// PARTITION ON COLUMNS (a) (
// a <= 20,

View File

@@ -64,6 +64,7 @@ store-api.workspace = true
substrait.workspace = true
table.workspace = true
tokio.workspace = true
uuid.workspace = true
[dev-dependencies]
approx_eq = "0.1"

View File

@@ -570,7 +570,7 @@ mod tests {
let stmt = QueryLanguageParser::parse_sql(sql, &QueryContext::arc()).unwrap();
let plan = engine
.planner()
.plan(stmt, QueryContext::arc())
.plan(&stmt, QueryContext::arc())
.await
.unwrap();
@@ -592,7 +592,7 @@ mod tests {
let stmt = QueryLanguageParser::parse_sql(sql, &QueryContext::arc()).unwrap();
let plan = engine
.planner()
.plan(stmt, QueryContext::arc())
.plan(&stmt, QueryContext::arc())
.await
.unwrap();
@@ -671,7 +671,7 @@ mod tests {
let plan = engine
.planner()
.plan(stmt, QueryContext::arc())
.plan(&stmt, QueryContext::arc())
.await
.unwrap();

View File

@@ -15,8 +15,11 @@
use std::collections::HashSet;
use std::sync::Arc;
use datafusion::functions_aggregate::sum::Sum;
use datafusion_expr::aggregate_function::AggregateFunction as BuiltInAggregateFunction;
use datafusion_expr::expr::{AggregateFunction, AggregateFunctionDefinition};
use datafusion_expr::utils::exprlist_to_columns;
use datafusion_expr::{Expr, LogicalPlan, UserDefinedLogicalNode};
use datafusion_expr::{AggregateUDF, Expr, LogicalPlan, UserDefinedLogicalNode};
use promql::extension_plan::{
EmptyMetric, InstantManipulate, RangeManipulate, SeriesDivide, SeriesNormalize,
};
@@ -25,21 +28,91 @@ use crate::dist_plan::merge_sort::{merge_sort_transformer, MergeSortLogicalPlan}
use crate::dist_plan::MergeScanLogicalPlan;
#[allow(dead_code)]
pub enum Commutativity {
pub enum Commutativity<T> {
Commutative,
PartialCommutative,
ConditionalCommutative(Option<Transformer>),
TransformedCommutative(Option<Transformer>),
ConditionalCommutative(Option<Transformer<T>>),
TransformedCommutative(Option<Transformer<T>>),
NonCommutative,
Unimplemented,
/// For unrelated plans like DDL
Unsupported,
}
impl<T> Commutativity<T> {
/// Check if self is stricter than `lhs`
fn is_stricter_than(&self, lhs: &Self) -> bool {
match (lhs, self) {
(Commutativity::Commutative, Commutativity::Commutative) => false,
(Commutativity::Commutative, _) => true,
(
Commutativity::PartialCommutative,
Commutativity::Commutative | Commutativity::PartialCommutative,
) => false,
(Commutativity::PartialCommutative, _) => true,
(
Commutativity::ConditionalCommutative(_),
Commutativity::Commutative
| Commutativity::PartialCommutative
| Commutativity::ConditionalCommutative(_),
) => false,
(Commutativity::ConditionalCommutative(_), _) => true,
(
Commutativity::TransformedCommutative(_),
Commutativity::Commutative
| Commutativity::PartialCommutative
| Commutativity::ConditionalCommutative(_)
| Commutativity::TransformedCommutative(_),
) => false,
(Commutativity::TransformedCommutative(_), _) => true,
(
Commutativity::NonCommutative
| Commutativity::Unimplemented
| Commutativity::Unsupported,
_,
) => false,
}
}
/// Return a bare commutative level without any transformer
fn bare_level<To>(&self) -> Commutativity<To> {
match self {
Commutativity::Commutative => Commutativity::Commutative,
Commutativity::PartialCommutative => Commutativity::PartialCommutative,
Commutativity::ConditionalCommutative(_) => Commutativity::ConditionalCommutative(None),
Commutativity::TransformedCommutative(_) => Commutativity::TransformedCommutative(None),
Commutativity::NonCommutative => Commutativity::NonCommutative,
Commutativity::Unimplemented => Commutativity::Unimplemented,
Commutativity::Unsupported => Commutativity::Unsupported,
}
}
}
impl<T> std::fmt::Debug for Commutativity<T> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Commutativity::Commutative => write!(f, "Commutative"),
Commutativity::PartialCommutative => write!(f, "PartialCommutative"),
Commutativity::ConditionalCommutative(_) => write!(f, "ConditionalCommutative"),
Commutativity::TransformedCommutative(_) => write!(f, "TransformedCommutative"),
Commutativity::NonCommutative => write!(f, "NonCommutative"),
Commutativity::Unimplemented => write!(f, "Unimplemented"),
Commutativity::Unsupported => write!(f, "Unsupported"),
}
}
}
pub struct Categorizer {}
impl Categorizer {
pub fn check_plan(plan: &LogicalPlan, partition_cols: Option<Vec<String>>) -> Commutativity {
pub fn check_plan(
plan: &LogicalPlan,
partition_cols: Option<Vec<String>>,
) -> Commutativity<LogicalPlan> {
let partition_cols = partition_cols.unwrap_or_default();
match plan {
@@ -47,21 +120,104 @@ impl Categorizer {
for expr in &proj.expr {
let commutativity = Self::check_expr(expr);
if !matches!(commutativity, Commutativity::Commutative) {
return commutativity;
return commutativity.bare_level();
}
}
Commutativity::Commutative
}
// TODO(ruihang): Change this to Commutative once Like is supported in substrait
LogicalPlan::Filter(filter) => Self::check_expr(&filter.predicate),
LogicalPlan::Filter(filter) => Self::check_expr(&filter.predicate).bare_level(),
LogicalPlan::Window(_) => Commutativity::Unimplemented,
LogicalPlan::Aggregate(aggr) => {
// fast path: if the group_expr is a subset of partition_cols
if Self::check_partition(&aggr.group_expr, &partition_cols) {
return Commutativity::Commutative;
}
// check all children exprs and uses the strictest level
Commutativity::Unimplemented
common_telemetry::info!("[DEBUG] aggregate plan expr: {:?}", aggr.aggr_expr);
// get all commutativity levels of aggregate exprs and find the strictest one
let aggr_expr_comm = aggr
.aggr_expr
.iter()
.map(Self::check_expr)
.collect::<Vec<_>>();
let mut strictest = Commutativity::Commutative;
for comm in &aggr_expr_comm {
if comm.is_stricter_than(&strictest) {
strictest = comm.bare_level();
}
}
common_telemetry::info!("[DEBUG] aggr_expr_comm: {:?}", aggr_expr_comm);
common_telemetry::info!("[DEBUG] strictest: {:?}", strictest);
// fast path: if any expr is commutative or non-commutative
if matches!(
strictest,
Commutativity::Commutative
| Commutativity::NonCommutative
| Commutativity::Unimplemented
| Commutativity::Unsupported
) {
return strictest.bare_level();
}
common_telemetry::info!("[DEBUG] continue for strictest",);
// collect expr transformers
let mut expr_transformer = Vec::with_capacity(aggr.aggr_expr.len());
for expr_comm in aggr_expr_comm {
match expr_comm {
Commutativity::Commutative => expr_transformer.push(None),
Commutativity::ConditionalCommutative(transformer) => {
expr_transformer.push(transformer.clone());
}
Commutativity::PartialCommutative => expr_transformer
.push(Some(Arc::new(expr_partial_commutative_transformer))),
_ => expr_transformer.push(None),
}
}
// build plan transformer
let transformer = Arc::new(move |plan: &LogicalPlan| {
if let LogicalPlan::Aggregate(aggr) = plan {
let mut new_plan = aggr.clone();
// transform aggr exprs
for (expr, transformer) in
new_plan.aggr_expr.iter_mut().zip(&expr_transformer)
{
if let Some(transformer) = transformer {
let new_expr = transformer(expr)?;
*expr = new_expr;
}
}
// transform group exprs
for expr in new_plan.group_expr.iter_mut() {
// if let Some(transformer) = transformer {
// let new_expr = transformer(expr)?;
// *expr = new_expr;
// }
let expr_name = expr.name_for_alias().expect("not a sort expr");
*expr = Expr::Column(expr_name.into());
}
common_telemetry::info!(
"[DEBUG] new plan aggr expr: {:?}, group expr: {:?}",
new_plan.aggr_expr,
new_plan.group_expr
);
Some(LogicalPlan::Aggregate(new_plan))
} else {
None
}
});
common_telemetry::info!("[DEBUG] done TransformedCommutative for aggr plan ");
Commutativity::TransformedCommutative(Some(transformer))
}
LogicalPlan::Sort(_) => {
if partition_cols.is_empty() {
@@ -113,7 +269,7 @@ impl Categorizer {
}
}
pub fn check_extension_plan(plan: &dyn UserDefinedLogicalNode) -> Commutativity {
pub fn check_extension_plan(plan: &dyn UserDefinedLogicalNode) -> Commutativity<LogicalPlan> {
match plan.name() {
name if name == EmptyMetric::name()
|| name == InstantManipulate::name()
@@ -129,7 +285,7 @@ impl Categorizer {
}
}
pub fn check_expr(expr: &Expr) -> Commutativity {
pub fn check_expr(expr: &Expr) -> Commutativity<Expr> {
match expr {
Expr::Column(_)
| Expr::ScalarVariable(_, _)
@@ -155,13 +311,14 @@ impl Categorizer {
| Expr::Case(_)
| Expr::Cast(_)
| Expr::TryCast(_)
| Expr::AggregateFunction(_)
| Expr::WindowFunction(_)
| Expr::InList(_)
| Expr::InSubquery(_)
| Expr::ScalarSubquery(_)
| Expr::Wildcard { .. } => Commutativity::Unimplemented,
Expr::AggregateFunction(aggr_fn) => Self::check_aggregate_fn(aggr_fn),
Expr::Alias(_)
| Expr::Unnest(_)
| Expr::GroupingSet(_)
@@ -170,6 +327,59 @@ impl Categorizer {
}
}
fn check_aggregate_fn(aggr_fn: &AggregateFunction) -> Commutativity<Expr> {
common_telemetry::info!("[DEBUG] checking aggr_fn: {:?}", aggr_fn);
match &aggr_fn.func_def {
AggregateFunctionDefinition::BuiltIn(func_def) => match func_def {
BuiltInAggregateFunction::Max | BuiltInAggregateFunction::Min => {
// Commutativity::PartialCommutative
common_telemetry::info!("[DEBUG] checking min/max: {:?}", aggr_fn);
let mut new_fn = aggr_fn.clone();
let col_name = Expr::AggregateFunction(aggr_fn.clone())
.name_for_alias()
.expect("not a sort expr");
let alias = col_name.clone();
new_fn.args = vec![Expr::Column(col_name.into())];
// new_fn.func_def =
// AggregateFunctionDefinition::BuiltIn(BuiltInAggregateFunction::Sum);
Commutativity::ConditionalCommutative(Some(Arc::new(move |_| {
common_telemetry::info!("[DEBUG] transforming min/max fn: {:?}", new_fn);
Some(Expr::AggregateFunction(new_fn.clone()).alias(alias.clone()))
})))
}
BuiltInAggregateFunction::Count => {
common_telemetry::info!("[DEBUG] checking count_fn: {:?}", aggr_fn);
let col_name = Expr::AggregateFunction(aggr_fn.clone())
.name_for_alias()
.expect("not a sort expr");
let sum_udf = Arc::new(AggregateUDF::new_from_impl(Sum::new()));
let alias = col_name.clone();
// let sum_func = Arc::new(AggregateFunction::new_udf(
// sum_udf,
// vec![Expr::Column(col_name.into())],
// false,
// None,
// None,
// None,
// ));
let mut sum_expr = aggr_fn.clone();
sum_expr.func_def = AggregateFunctionDefinition::UDF(sum_udf);
sum_expr.args = vec![Expr::Column(col_name.into())];
// let mut sum_fn = aggr_fn.clone();
// sum_fn.func_def =
// AggregateFunctionDefinition::BuiltIn(BuiltInAggregateFunction::Sum);
Commutativity::ConditionalCommutative(Some(Arc::new(move |_| {
common_telemetry::info!("[DEBUG] transforming sum_fn: {:?}", sum_expr);
Some(Expr::AggregateFunction(sum_expr.clone()).alias(alias.clone()))
})))
}
_ => Commutativity::Unimplemented,
},
AggregateFunctionDefinition::UDF(_) => Commutativity::Unimplemented,
}
}
/// Return true if the given expr and partition cols satisfy the rule.
/// In this case the plan can be treated as fully commutative.
fn check_partition(exprs: &[Expr], partition_cols: &[String]) -> bool {
@@ -191,12 +401,16 @@ impl Categorizer {
}
}
pub type Transformer = Arc<dyn Fn(&LogicalPlan) -> Option<LogicalPlan>>;
pub type Transformer<T> = Arc<dyn for<'a> Fn(&'a T) -> Option<T>>;
pub fn partial_commutative_transformer(plan: &LogicalPlan) -> Option<LogicalPlan> {
Some(plan.clone())
}
pub fn expr_partial_commutative_transformer(expr: &Expr) -> Option<Expr> {
Some(expr.clone())
}
#[cfg(test)]
mod test {
use datafusion_expr::{LogicalPlanBuilder, Sort};
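The Categorizer changes above rewrite count/min/max aggregates so that each partition computes a partial aggregate and the merging node combines them: partial counts are summed, partial minima/maxima are re-min'd/max'd. A self-contained sketch of why that merge is sound, independent of the DataFusion expression types used in the diff:

// Not the datafusion rewrite itself; just the arithmetic it relies on.
fn merge_counts(partial_counts: &[u64]) -> u64 {
    partial_counts.iter().sum()
}

fn merge_maxes(partial_maxes: &[i64]) -> Option<i64> {
    partial_maxes.iter().copied().max()
}

fn main() {
    // Three partitions report count(*) and max(v) over their own rows.
    let counts = [3, 5, 2];
    let maxes = [10, 42, 7];
    // Global count = sum of partial counts; global max = max of partial maxima.
    assert_eq!(merge_counts(&counts), 10);
    assert_eq!(merge_maxes(&maxes), Some(42));
}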

View File

@@ -15,6 +15,7 @@
#![feature(let_chains)]
#![feature(int_roundings)]
#![feature(trait_upcasting)]
#![feature(try_blocks)]
mod analyze;
pub mod dataframe;

View File

@@ -20,6 +20,7 @@ pub mod string_normalization;
#[cfg(test)]
pub(crate) mod test_util;
pub mod type_conversion;
pub mod windowed_sort;
use datafusion_common::config::ConfigOptions;
use datafusion_common::Result;

Some files were not shown because too many files have changed in this diff.