Compare commits

79 Commits

Author SHA1 Message Date
Discord9
f995204060 test: more reduce tests 2023-09-06 16:38:51 +08:00
Discord9
93561291e4 support more binary functions 2023-09-06 16:38:51 +08:00
Discord9
9f59d68391 eval func 2023-09-06 16:37:49 +08:00
Discord9
51083b12bd reduce_bucketed 2023-09-06 16:37:49 +08:00
Discord9
c80165c377 test: simple render 2023-09-06 16:37:49 +08:00
Discord9
76d8709774 sink&source 2023-09-06 16:37:49 +08:00
Discord9
2cf7d6d569 feat: build_accumulable 2023-09-06 16:37:49 +08:00
Discord9
045c8079e6 feat: flow util func 2023-09-06 16:37:49 +08:00
Discord9
54f2f6495f mfp & reduce partially 2023-09-06 16:37:49 +08:00
Discord9
2798d266f5 feat: render plan partially written 2023-09-06 16:37:49 +08:00
Discord9
824d03a642 working on reduce 2023-09-06 16:36:41 +08:00
Discord9
47f41371d0 Arrangement&types 2023-09-06 16:36:41 +08:00
Discord9
d702b6e5c4 use newer DD 2023-09-06 16:36:41 +08:00
Discord9
13c02f3f92 basic skeleton 2023-09-06 16:36:41 +08:00
Discord9
b52eb2313e renamed as greptime-flow 2023-09-06 16:36:41 +08:00
Discord9
d422bc8401 basic demo 2023-09-06 16:36:41 +08:00
Zou Wei
b8c50d00aa feat: sqlness test for interval type (#2265)
* feat: add integration-test for interval type.

* chore: add two cases.

* chore: cr

* chore: Field to Column
2023-09-04 14:30:48 +08:00
Ruihang Xia
a12ee5cab8 fix: qualify inputs on handling join in promql (#2297)
* add qualifier to join inputs

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* add one more case

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* update test results

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2023-09-01 11:51:34 +08:00
ZonaHe
a0d15b489a feat: update dashboard to v0.3.2 (#2295)
Co-authored-by: ZonaHex <ZonaHex@users.noreply.github.com>
2023-08-31 22:05:00 +08:00
shuiyisong
baa372520d fix: json compatibility to null (#2287)
* fix: existing null value for schema name value

* chore: fix null check

* fix: change catalognamevalue and schemanamevalue to option

* fix: fix null case
2023-08-31 14:21:10 +08:00
shuiyisong
5df4d44761 feat: schema level opts (#2283)
* chore: update proto

* chore: add try from for schema name value

* chore: merge schema opts to table opts while creating table

* chore: use table ttl opts first

* chore: add unit test

* chore: update proto version
2023-08-30 08:11:08 +00:00
Weny Xu
8e9f2ffce4 fix: skip procedure if target route is not found (#2277)
* fix: skip procedure if target route is not found

* chore: apply suggestions from CR
2023-08-30 06:59:50 +00:00
Weny Xu
1101e7bb18 fix: deregister table after keeper closes table (#2278)
* fix: deregister table after keeper closes table

* chore: apply suggestions from CR
2023-08-30 03:43:04 +00:00
zyy17
5fbc941023 ci: upload the latest artifacts to 'latest/' directory of S3 bucket in scheduled and formal release (#2276)
Signed-off-by: zyy17 <zyylsxm@gmail.com>
2023-08-29 09:00:45 +00:00
Bamboo1
68600a2cf9 feat(mito2): add file purger and cooperate with scheduler to purge sst files (#2251)
* feat: add file purger and use scheduler

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* chore: code format

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* chore: code format

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* feat: print some information about handling error message

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* fix: resolve conversion

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* chore: code format

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* chore: resolve conversation

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* fix: resolve conflicting files

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* chore: code format

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* chore: code format

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

---------

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>
2023-08-29 07:55:03 +00:00
Yingwen
805f254d15 feat(mito): Flush framework for mito2 (#2262)
* feat: write buffer manager

* feat: skeleton

* feat: add flush logic to write path

* feat: add methods to memtable trait

* feat: freeze memtable

* feat: define flush task

* feat: schedule_flush wip

* feat: adding pending requests/tasks

* feat: separate ddl request and background request

* feat: Remove RegionTask and RequestBody

* feat: handle flush related requests

* feat: make tests pass

* style: fix clippy

* docs: update comment

* refactor: rename background requests

* feat: replace Option<RegionWriteCtx> with an enum MaybeStalling
2023-08-29 07:13:15 +00:00
Zhenchi
2a6c830ca7 refactor(table): remove Table impl for system (#2270)
* refactor(table): remove Table impl for system

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* fix: format & import

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

---------

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>
2023-08-29 03:43:43 +00:00
Weny Xu
22dea02485 fix: use RegionId region number instead (#2273) 2023-08-29 02:52:24 +00:00
LFC
ef75e8f7c3 feat: create distributed Mito2 table (#2246)
* feat: create distributed Mito2 table

* rebase develop
2023-08-28 12:07:52 +00:00
Weny Xu
71fc3c42d9 fix: open region does not register catalog/schema (#2271)
* fix: open region does not register catalog/schema

* fix: fix ci
2023-08-28 12:06:10 +00:00
JeremyHi
c02ac36ce8 feat: avoid confusion in desc table (#2272)
feat: Field to Column to avoid confusion in DESC TABLE
2023-08-28 11:50:33 +00:00
Lei, HUANG
c112b9a763 feat(mito2): WAL replay (#2264)
* feat: replay memtable when opening table

* test: region replay

* refactor: save logstore in TestEnv

* fix: some cr comments

* chore: rebase develop

* chore: update last entry id during replay
2023-08-28 11:45:23 +00:00
Weny Xu
96fd17aa0a fix: fix typos (#2268) 2023-08-28 09:26:00 +00:00
Ruihang Xia
6b8cf0bbf0 feat: impl region engine for mito (#2269)
* update proto

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* convert request

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* update proto

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* import result convertor

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* rename symbols

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2023-08-28 09:24:12 +00:00
Yingwen
e2522dff21 feat(mito): Skeleton for scanning a region (#2230)
* feat: define stream builder

* feat: scan region wip

* feat: create SeqScan in ScanRegion

* feat: scanner

* feat: engine handles scan request

* feat: map projection index to column id

* feat: Impl record batch stream

* refactor: change BatchConverter to ProjectionMapper

* feat: add column_ids to mapper

* feat: implement SeqScan::build()

* chore: fix typo

* docs: add mermaid for ScanRegion

* style: fix clippy

* test: fix record batch test

* fix: update sequence and entry id

* test: test query

* feat: address CR comment

* chore: address CR comments

* chore: Update src/mito2/src/read/scan_region.rs

Co-authored-by: Lei, HUANG <6406592+v0y4g3r@users.noreply.github.com>

---------

Co-authored-by: Lei, HUANG <6406592+v0y4g3r@users.noreply.github.com>
2023-08-28 06:59:31 +00:00
LFC
d8f851bef2 fix: keep region failover state not changed upon failure (#2261) 2023-08-28 04:40:47 +00:00
JeremyHi
63b22b2403 feat: prometheus row inserter (#2263)
* feat: prometheus row inserter

* chore: add unit test

* refactor: to row_insert_requests

* chore: typo

* chore: alloc row by TableData

* chore: by review comment
2023-08-28 03:22:23 +00:00
Weny Xu
c56f5e39cd refactor: set default metasrv procedure retry times to 12 (#2242) 2023-08-26 07:41:15 +00:00
Weny Xu
7ff200c0fa fix: align region numbers to real regions (#2257) 2023-08-25 11:48:58 +00:00
dennis zhuang
5160838d04 chore: change version to 0.4.0-nightly (#2258)
* chore: change version to 0.4.0-nightly

* fix: test
2023-08-25 09:44:39 +00:00
shuiyisong
f16f58266e refactor: query_ctx from http middleware (#2253)
* chore: change userinfo to query_ctx in http handler

* chore: minor change

* chore: move prometheus http to http mod

* chore: fix unit test

* chore: add back schema check

* chore: minor change

* chore: remove clone
2023-08-25 09:36:33 +00:00
Ruihang Xia
8d446ed741 fix: quote ident on rendered SQL (#2248)
* fix: quote ident on rendered SQL

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* read quote style from query context

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* update sqlness result

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2023-08-25 07:25:21 +00:00
JeremyHi
de1daec680 feat: upgrade desc table output (#2256) 2023-08-25 06:52:22 +00:00
Zhenchi
9d87c8b6de refactor(table): cleanup dist table (#2255)
Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>
2023-08-25 06:37:39 +00:00
Lei, HUANG
6bf260a05c chore: write to mito2 (#2250)
* chore: write to mito2

* fix: clippy

* feat: bridge memtable

* chore: rebase develop
2023-08-25 06:18:42 +00:00
WU Jingdi
15912afd96 fix: the inconsistent order of input/output in range select (#2229)
* fix: the inconsistent order of input/output in range select

* chore: apply CR
2023-08-25 04:12:59 +00:00
Lei, HUANG
dbe0e95f2f feat(mito2): concat and projection (#2243)
* refactor: use arrow::compute::concat instead of push values to vector builders

* feat: support projection

* refactor: remove sequence

* refactor: concatenate

* fix: series must not be empty

* refactor: projection
2023-08-25 03:25:27 +00:00
Ruihang Xia
20b7f907b2 fix: promql planner should clear its states on each selector (#2247)
* reset planner status on selector

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* add sqlness test

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* add empty line

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* sort result

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* mask fields to keep ordering

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2023-08-25 03:07:44 +00:00
Weny Xu
b13d932e4e fix: fix RegionAliveKeeper does not find the table after restarting (#2249) 2023-08-25 03:05:17 +00:00
Bamboo1
48348aa364 fix: fix test_scheduler_continuous_stop in scheduler (#2252)
* fix: fix test_scheduler_continuous_stop in scheduler

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* chore: add document annotation

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

---------

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>
2023-08-25 02:59:48 +00:00
Zhenchi
9ce73e7ca1 refactor(frontend): TableScan instead of scan_to_stream for COPY TO (#2244)
* refactor(frontend): TableScan instead of `scan_to_stream` for `COPY TO`

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* fix: format

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

---------

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>
2023-08-24 12:46:54 +00:00
Ruihang Xia
b633a16667 feat: apply rewriter to subquery exprs (#2245)
* apply rewriter to subquery exprs

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* workaround for datafusion's check

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* clean up

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* add sqlness test

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix typo

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* change time index type

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2023-08-24 11:48:04 +00:00
Zhenchi
0a6ab2a287 refactor(script): not to call scan_to_stream on table (#2241)
* refactor(script): not to call `scan_to_stream` on table

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* refactor: build plan via LogicalPlanBuilder

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

---------

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>
2023-08-24 08:10:07 +00:00
JeremyHi
7746e5b172 feat: dist row inserter (#2231)
* feat: frontend row inserter

* feat: row splitter

chore: row splitter's unit test

* feat: RowDistInserter

* feat: make influxdb line protocol use the row-based protocol

* Update src/partition/src/row_splitter.rs

Co-authored-by: Yingwen <realevenyag@gmail.com>

* Update src/frontend/src/instance/distributed/row_inserter.rs

Co-authored-by: Yingwen <realevenyag@gmail.com>

* chore: by review comment

* Update src/frontend/src/instance/distributed/row_inserter.rs

Co-authored-by: LFC <bayinamine@gmail.com>

* chore: by comment

---------

Co-authored-by: Yingwen <realevenyag@gmail.com>
Co-authored-by: LFC <bayinamine@gmail.com>
2023-08-24 06:58:05 +00:00
Weny Xu
a7e0e2330e fix: invalidate cache after altering (#2239) 2023-08-24 03:56:17 +00:00
Lei, HUANG
19d2d77b41 fix: parse large timestamp (#2185)
* feat: support parsing large timestamp values

* chore: update sqlness tests

* fix: tests

* fix: allow larger window
2023-08-24 03:52:15 +00:00
Yingwen
4ee1034012 feat(mito): merge reader for mito2 (#2210)
* feat: Implement slice and first/last timestamp for Batch

* feat(mito): implements sort/concat for Batch

* chore: fix typo

* chore: remove comments

* feat: sort and dedup

* test: test batch operations

* chore: cast enum to test op type

* test: test filter related api

* style: fix clippy

* feat: implement Node and CompareFirst

* feat: merge reader wip

* feat: merge wip

* feat: use batch's operation to sort and dedup

* feat: implement BatchReader for MergeReader

* feat: simplify codes

* test: test merge reader

* refactor: use test util to create batch

* refactor: remove unused imports

* feat: update comment

* chore: remove metadata() from Source

* chore: update comment

* feat: source supports batch iterator

* chore: update comment
2023-08-24 03:37:51 +00:00
Ruihang Xia
e5ba3d1708 feat: rewrite the dist analyzer (#2238)
* it works!

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* clean up

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* add documents

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* remove unstable timestamp from sqlness test

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* rename rewriter struct

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2023-08-24 03:29:08 +00:00
dennis zhuang
8b1f4eb958 feat: types sqlness tests (#2073)
* feat: timestamp types sqlness tests

* feat: adds timestamp tests

* test: add string tests

* test: comment a case in timestamp

* test: add float type tests

* chore: adds TODO

* feat: set TZ=UTC for sqlness test
2023-08-24 03:26:19 +00:00
discord9
eca7e87129 chore: try from value (#2236)
* chore: try from value

* chore: add TryFromValueError variant
2023-08-24 02:44:13 +00:00
Weny Xu
beb92ba1d2 refactor: use table id instead of table ident (#2233) 2023-08-23 13:28:08 +00:00
Lei, HUANG
fdb5ad23bf refactor: use Batch::sort_and_dedup instead of Values::sort_in_place (#2235) 2023-08-23 08:56:49 +00:00
Ruihang Xia
d581688fd2 fix: dist planner has wrong behavior in table with multiple partitions (#2237)
* fix: dist planner has wrong behavior in table with multiple partitions

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* Update tests/cases/distributed/explain/multi_partitions.sql

Co-authored-by: Zhenchi <zhongzc_arch@outlook.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
Co-authored-by: Zhenchi <zhongzc_arch@outlook.com>
2023-08-23 08:32:20 +00:00
Bamboo1
4dbc32f532 refactor: remove associate type in scheduler to simplify it #2153 (#2194)
* feature: add a simple scheduler using flume

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* fix: only use a single sender rather than cloning many senders

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* fix: use select to avoid loop

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* feat: add parameters to the new function to configure the flume capacity and the number of receivers

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* test: add countdownlatch test concurrency

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* test: replace countdownlatch with a barrier to test concurrency, and wait for all tasks to finish in stop

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* chore: add some document annotation

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* chore: add license header

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* chore: code format

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* chore: add Cargo.lock

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* chore: Cargo.toml format

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* chore: delete println in test

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* chore: code format

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* chore: code format

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* feat: add error handle

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* fix: fix error handle and add test scheduler stop

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* chore: spelling mistake

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* fix: wait for all tasks to finish

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* chore: add todo which need wrap Future returned by send_async

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* chore: code format

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* test: remove unnecessary sleep in test

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* fix: resolve some conflicts

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* fix: resolve conversation

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* chore: code format

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* chore: code format

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* feat: modify the function of schedule to synchronize and drop sender after stopping scheduler

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

---------

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>
2023-08-23 06:28:00 +00:00
Zhenchi
af95e46512 refactor(table): eliminate calls to DistTable.delete (#2225)
* refactor(table): eliminate calls to DistTable.delete

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* fix: format

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* fix: clippy

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

---------

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>
2023-08-23 02:33:48 +00:00
Weny Xu
d81ddd8879 chore: fix clippy (#2232) 2023-08-23 02:24:29 +00:00
Ning Sun
88247e4284 fix!: resolve residual issues with removing prometheus port (#2227)
* fix: resolve residual issues when removing prometheus port

* fix: remove prometheus from sample config as well
2023-08-23 01:49:11 +00:00
Ruihang Xia
18250c4803 feat: implement Flight and gRPC services for RegionServer (#2226)
* extract FlightCraft trait

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* split service handler in GrpcServer

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* left grpc server implement

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* start region server if configured

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2023-08-22 13:30:09 +00:00
dennis zhuang
18fa0e01ed feat: remove checkpoint_on_startup (#2228)
feat: update flushed manifest version when it is larger
2023-08-22 13:09:34 +00:00
Yingwen
cc3e198975 feat(mito): Implement operations like concat and sort for Batch (#2203)
* feat: Implement slice and first/last timestamp for Batch

* feat(mito): implements sort/concat for Batch

* chore: fix typo

* chore: remove comments

* feat: sort and dedup

* test: test batch operations

* chore: cast enum to test op type

* test: test filter related api

* style: fix clippy

* docs: comment for slice

* chore: address CR comment

Don't return Option in get_timestamp()/get_sequence()
2023-08-22 12:03:02 +00:00
Yingwen
cd3755c615 feat(mito): Support handling RegionWriteRequest (#2218)
* feat: convert region request to worker write request

* chore: remove unused codes

* test: fix tests compiler errors

* chore: remove create/close/open request from worker requests

* chore: add comment

* chore: fix typo
2023-08-22 11:16:00 +00:00
Lei, HUANG
be1e13c713 feat(mito2): time series memtable (#2208)
* feat: time series memtable

* feat: add some test

* fix: some clippy warnings

* chore: some rustdoc

* refactor: test

* fix: remove useless functions

* feat: add config for TimeSeriesMemtable

* chore: some optimize

* refactor: remove bucketing

* refactor: avoid cloning RegionMetadataRef across all Series; make initial_builder_capacity a const; sort batch only by timestamp and sequence
2023-08-22 08:40:46 +00:00
Zhenchi
cb3561f3b3 refactor(table): eliminate calls to DistTable.insert (#2219)
Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>
2023-08-22 06:15:02 +00:00
Niwaka
b3b43fe1c3 fix: table options can't be found in distributed mode (#2209)
* fix: table options can't be found in distributed mode

* refactor: use iterator for regions_numbers

* chore: remove TODO
2023-08-22 03:53:56 +00:00
WU Jingdi
b411769de6 feat: Implement a basic range select query (#2138)
* feat: Implement a basic range select query

* chore: support any timestamp type & CR fix
2023-08-22 03:07:14 +00:00
niebayes
e5f4ca2dab feat: streaming do_get (#2171)
* feat: rewrite do_get for streaming get flight data

* feat: rewrite do_get call stack but leave the async stream adapter not modified yet

* feat: rewrite the async stream adapter to accept greptime record batch stream

* fix: resolve some PR comments

* feat: rewrite tests to adapt to the streaming do_get

* feat: add unit tests for streaming do_get

* feat: rewrite timer metric of merge scan

* remove unhelpful unit tests for streaming do_get

* add a new metric timer for merge scan and fix some test errors

* rewrite mysql writer to write query results in a streaming manner

* fix: fix fmt errors

* fix: rewrite sqlness runner to take into account the streaming do_get

* fix: fix toml format errors

* fix: resolve some PR comments

* fix: resolve some PR comments

* fix: refactor do_get to increase readability

* fix: refactor mysql try_write_one to increase readability
2023-08-22 02:54:05 +00:00
Weny Xu
5b7b2cf77d fix: fix ddl client can not update leader addr (#2205)
* fix: fix ddl client can not update leader addr

* chore: apply suggestions from CR

* feat: add message to context

* fix: only retry if unavailable or deadline exceeded

* chore: apply suggestions from CR
2023-08-21 13:57:29 +00:00
shuiyisong
9352649f22 chore: add table region key to delete in upgrade tool (#2214) 2023-08-21 08:16:10 +00:00
shuiyisong
c5f507c20e fix: add user_info extension to prom_store handler (#2212)
chore: add user_info extension to prom_store auth
2023-08-21 04:55:34 +00:00
314 changed files with 21044 additions and 3121 deletions

View File

@@ -32,6 +32,10 @@ inputs:
description: Upload to S3
required: false
default: 'true'
upload-latest-artifacts:
description: Upload the latest artifacts to S3
required: false
default: 'true'
working-dir:
description: Working directory to build the artifacts
required: false
@@ -59,4 +63,5 @@ runs:
aws-secret-access-key: ${{ inputs.aws-secret-access-key }}
aws-region: ${{ inputs.aws-region }}
upload-to-s3: ${{ inputs.upload-to-s3 }}
upload-latest-artifacts: ${{ inputs.upload-latest-artifacts }}
working-dir: ${{ inputs.working-dir }}

View File

@@ -33,6 +33,10 @@ inputs:
description: Upload to S3
required: false
default: 'true'
upload-latest-artifacts:
description: Upload the latest artifacts to S3
required: false
default: 'true'
working-dir:
description: Working directory to build the artifacts
required: false
@@ -69,6 +73,7 @@ runs:
aws-secret-access-key: ${{ inputs.aws-secret-access-key }}
aws-region: ${{ inputs.aws-region }}
upload-to-s3: ${{ inputs.upload-to-s3 }}
upload-latest-artifacts: ${{ inputs.upload-latest-artifacts }}
working-dir: ${{ inputs.working-dir }}
- name: Build greptime without pyo3
@@ -85,6 +90,7 @@ runs:
aws-secret-access-key: ${{ inputs.aws-secret-access-key }}
aws-region: ${{ inputs.aws-region }}
upload-to-s3: ${{ inputs.upload-to-s3 }}
upload-latest-artifacts: ${{ inputs.upload-latest-artifacts }}
working-dir: ${{ inputs.working-dir }}
- name: Clean up the target directory # Clean up the target directory for the centos7 base image, or it will still use the objects of last build.
@@ -106,4 +112,5 @@ runs:
aws-secret-access-key: ${{ inputs.aws-secret-access-key }}
aws-region: ${{ inputs.aws-region }}
upload-to-s3: ${{ inputs.upload-to-s3 }}
upload-latest-artifacts: ${{ inputs.upload-latest-artifacts }}
working-dir: ${{ inputs.working-dir }}

View File

@@ -26,6 +26,18 @@ inputs:
description: Upload to S3
required: false
default: 'true'
upload-latest-artifacts:
description: Upload the latest artifacts to S3
required: false
default: 'true'
upload-max-retry-times:
description: Max retry times for uploading artifacts to S3
required: false
default: "20"
upload-retry-timeout:
description: Timeout for uploading artifacts to S3
required: false
default: "10" # minutes
working-dir:
description: Working directory to upload the artifacts
required: false
@@ -74,8 +86,8 @@ runs:
AWS_SECRET_ACCESS_KEY: ${{ inputs.aws-secret-access-key }}
AWS_DEFAULT_REGION: ${{ inputs.aws-region }}
with:
max_attempts: 20
timeout_minutes: 5
max_attempts: ${{ inputs.upload-max-retry-times }}
timeout_minutes: ${{ inputs.upload-retry-timeout }}
# The bucket layout will be:
# releases/greptimedb
# ├── v0.1.0
@@ -92,3 +104,22 @@ runs:
aws s3 cp \
${{ inputs.artifacts-dir }}.sha256sum \
s3://${{ inputs.release-to-s3-bucket }}/releases/greptimedb/${{ inputs.version }}/${{ inputs.artifacts-dir }}.sha256sum
- name: Upload latest artifacts to S3
if: ${{ inputs.upload-to-s3 == 'true' && inputs.upload-latest-artifacts == 'true' }} # We'll also upload the latest artifacts to S3 in the scheduled and formal release.
uses: nick-invision/retry@v2
env:
AWS_ACCESS_KEY_ID: ${{ inputs.aws-access-key-id }}
AWS_SECRET_ACCESS_KEY: ${{ inputs.aws-secret-access-key }}
AWS_DEFAULT_REGION: ${{ inputs.aws-region }}
with:
max_attempts: ${{ inputs.upload-max-retry-times }}
timeout_minutes: ${{ inputs.upload-retry-timeout }}
command: |
cd ${{ inputs.working-dir }} && \
aws s3 cp \
${{ inputs.artifacts-dir }}.tar.gz \
s3://${{ inputs.release-to-s3-bucket }}/releases/greptimedb/latest/${{ inputs.artifacts-dir }}.tar.gz && \
aws s3 cp \
${{ inputs.artifacts-dir }}.sha256sum \
s3://${{ inputs.release-to-s3-bucket }}/releases/greptimedb/latest/${{ inputs.artifacts-dir }}.sha256sum
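
Putting this together with the bucket-layout comment earlier in this action, the layout after the change would look roughly as follows. This is a sketch inferred from the aws s3 cp destinations shown above, not copied from the repository; the artifact file names simply mirror the artifacts-dir input.

releases/greptimedb
├── v0.1.0
│   ├── <artifacts-dir>.tar.gz
│   └── <artifacts-dir>.sha256sum
└── latest
    ├── <artifacts-dir>.tar.gz
    └── <artifacts-dir>.sha256sum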

View File

@@ -151,6 +151,7 @@ jobs:
aws-access-key-id: ${{ secrets.AWS_CN_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_CN_SECRET_ACCESS_KEY }}
aws-region: ${{ vars.AWS_RELEASE_BUCKET_REGION }}
upload-latest-artifacts: false
build-linux-arm64-artifacts:
name: Build linux-arm64 artifacts
@@ -174,6 +175,7 @@ jobs:
aws-access-key-id: ${{ secrets.AWS_CN_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_CN_SECRET_ACCESS_KEY }}
aws-region: ${{ vars.AWS_RELEASE_BUCKET_REGION }}
upload-latest-artifacts: false
release-images-to-dockerhub:
name: Build and push images to DockerHub

Cargo.lock (generated, 372 changes)
File diff suppressed because it is too large

View File

@@ -46,6 +46,7 @@ members = [
"src/sql",
"src/storage",
"src/store-api",
"src/flow",
"src/table",
"src/table-procedure",
"tests-integration",
@@ -54,7 +55,7 @@ members = [
resolver = "2"
[workspace.package]
version = "0.3.2"
version = "0.4.0-nightly"
edition = "2021"
license = "Apache-2.0"
@@ -67,17 +68,18 @@ arrow-schema = { version = "43.0", features = ["serde"] }
async-stream = "0.3"
async-trait = "0.1"
chrono = { version = "0.4", features = ["serde"] }
datafusion = { git = "https://github.com/waynexia/arrow-datafusion.git", rev = "2ceb7f927c40787773fdc466d6a4b79f3a6c0001" }
datafusion-common = { git = "https://github.com/waynexia/arrow-datafusion.git", rev = "2ceb7f927c40787773fdc466d6a4b79f3a6c0001" }
datafusion-expr = { git = "https://github.com/waynexia/arrow-datafusion.git", rev = "2ceb7f927c40787773fdc466d6a4b79f3a6c0001" }
datafusion-optimizer = { git = "https://github.com/waynexia/arrow-datafusion.git", rev = "2ceb7f927c40787773fdc466d6a4b79f3a6c0001" }
datafusion-physical-expr = { git = "https://github.com/waynexia/arrow-datafusion.git", rev = "2ceb7f927c40787773fdc466d6a4b79f3a6c0001" }
datafusion-sql = { git = "https://github.com/waynexia/arrow-datafusion.git", rev = "2ceb7f927c40787773fdc466d6a4b79f3a6c0001" }
datafusion-substrait = { git = "https://github.com/waynexia/arrow-datafusion.git", rev = "2ceb7f927c40787773fdc466d6a4b79f3a6c0001" }
datafusion = { git = "https://github.com/waynexia/arrow-datafusion.git", rev = "c0b0fca548e99d020c76e1a1cd7132aab26000e1" }
datafusion-common = { git = "https://github.com/waynexia/arrow-datafusion.git", rev = "c0b0fca548e99d020c76e1a1cd7132aab26000e1" }
datafusion-expr = { git = "https://github.com/waynexia/arrow-datafusion.git", rev = "c0b0fca548e99d020c76e1a1cd7132aab26000e1" }
datafusion-optimizer = { git = "https://github.com/waynexia/arrow-datafusion.git", rev = "c0b0fca548e99d020c76e1a1cd7132aab26000e1" }
datafusion-physical-expr = { git = "https://github.com/waynexia/arrow-datafusion.git", rev = "c0b0fca548e99d020c76e1a1cd7132aab26000e1" }
datafusion-sql = { git = "https://github.com/waynexia/arrow-datafusion.git", rev = "c0b0fca548e99d020c76e1a1cd7132aab26000e1" }
datafusion-substrait = { git = "https://github.com/waynexia/arrow-datafusion.git", rev = "c0b0fca548e99d020c76e1a1cd7132aab26000e1" }
derive_builder = "0.12"
futures = "0.3"
futures-util = "0.3"
greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "c30a2607be4044502094b25c408171a666a8ff6d" }
greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "4a277f27caa035a801d5b9c020a0449777736614" }
humantime-serde = "1.1"
itertools = "0.10"
lazy_static = "1.4"
once_cell = "1.18"
@@ -90,9 +92,10 @@ regex = "1.8"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
snafu = { version = "0.7", features = ["backtraces"] }
sqlparser = { git = "https://github.com/GreptimeTeam/sqlparser-rs.git", rev = "c3814f08afa19786b13d72b1731a1e8b3cac4ab9", features = [
sqlparser = { git = "https://github.com/GreptimeTeam/sqlparser-rs.git", rev = "296a4f6c73b129d6f565a42a2e5e53c6bc2b9da4", features = [
"visitor",
] }
strum = { version = "0.25", features = ["derive"] }
tempfile = "3"
tokio = { version = "1.28", features = ["full"] }
tokio-util = { version = "0.7", features = ["io-util", "compat"] }

View File

@@ -57,8 +57,6 @@ max_purge_tasks = 32
checkpoint_margin = 10
# Region manifest logs and checkpoints gc execution duration
gc_duration = '10m'
# Whether to try creating a manifest checkpoint on region opening
checkpoint_on_startup = false
# Storage flush options
[storage.flush]

View File

@@ -53,10 +53,6 @@ enable = true
[prom_store_options]
enable = true
# Prometheus protocol options, see `standalone.example.toml`.
[prometheus_options]
addr = "127.0.0.1:4004"
# Metasrv client options, see `datanode.example.toml`.
[meta_client_options]
metasrv_addrs = ["127.0.0.1:3002"]

View File

@@ -26,7 +26,7 @@ enable_telemetry = true
# Procedure storage options.
[procedure]
# Procedure max retry time.
max_retry_times = 3
max_retry_times = 12
# Initial retry delay of procedures, increases exponentially
retry_delay = "500ms"

View File

@@ -76,11 +76,6 @@ enable = true
# Whether to enable Prometheus remote write and read in HTTP API, true by default.
enable = true
# Prometheus protocol options
[prometheus_options]
# Prometheus API server address, "127.0.0.1:4004" by default.
addr = "127.0.0.1:4004"
# WAL options.
[wal]
# WAL data directory
@@ -121,8 +116,6 @@ max_purge_tasks = 32
checkpoint_margin = 10
# Region manifest logs and checkpoints gc execution duration
gc_duration = '10m'
# Whether to try creating a manifest checkpoint on region opening
checkpoint_on_startup = false
# Storage flush options
[storage.flush]

View File

@@ -55,6 +55,10 @@ impl ColumnDataTypeWrapper {
Ok(Self(datatype))
}
pub fn new(datatype: ColumnDataType) -> Self {
Self(datatype)
}
pub fn datatype(&self) -> ColumnDataType {
self.0
}

View File

@@ -70,11 +70,9 @@ impl InformationSchemaProvider {
pub fn table(&self, name: &str) -> Option<TableRef> {
self.information_table(name).map(|table| {
let schema = table.schema();
let table_info = Self::table_info(self.catalog_name.clone(), &table);
let table_type = table.table_type();
let filter_pushdown = FilterPushDownType::Unsupported;
let thin_table = ThinTable::new(schema, table_info, table_type, filter_pushdown);
let thin_table = ThinTable::new(table_info, filter_pushdown);
let data_source = Arc::new(InformationTableDataSource::new(table));
Arc::new(ThinTableAdapter::new(thin_table, data_source)) as _

View File

@@ -136,7 +136,7 @@ impl LocalCatalogManager {
schema: INFORMATION_SCHEMA_NAME.to_string(),
table_name: SYSTEM_CATALOG_TABLE_NAME.to_string(),
table_id: SYSTEM_CATALOG_TABLE_ID,
table: self.system.information_schema.system.clone(),
table: self.system.information_schema.system.as_table_ref(),
};
self.catalogs.register_table(register_table_req).await?;

View File

@@ -97,26 +97,7 @@ impl CatalogManager for MemoryCatalogManager {
}
async fn deregister_table(&self, request: DeregisterTableRequest) -> Result<()> {
let mut catalogs = self.catalogs.write().unwrap();
let schema = catalogs
.get_mut(&request.catalog)
.with_context(|| CatalogNotFoundSnafu {
catalog_name: &request.catalog,
})?
.get_mut(&request.schema)
.with_context(|| SchemaNotFoundSnafu {
catalog: &request.catalog,
schema: &request.schema,
})?;
let result = schema.remove(&request.table_name);
if result.is_some() {
decrement_gauge!(
crate::metrics::METRIC_CATALOG_MANAGER_TABLE_COUNT,
1.0,
&[crate::metrics::db_label(&request.catalog, &request.schema)],
);
}
Ok(())
self.deregister_table_sync(request)
}
async fn register_schema(&self, request: RegisterSchemaRequest) -> Result<bool> {
@@ -157,15 +138,7 @@ impl CatalogManager for MemoryCatalogManager {
}
async fn schema_exist(&self, catalog: &str, schema: &str) -> Result<bool> {
Ok(self
.catalogs
.read()
.unwrap()
.get(catalog)
.with_context(|| CatalogNotFoundSnafu {
catalog_name: catalog,
})?
.contains_key(schema))
self.schema_exist_sync(catalog, schema)
}
async fn table(
@@ -187,7 +160,7 @@ impl CatalogManager for MemoryCatalogManager {
}
async fn catalog_exist(&self, catalog: &str) -> Result<bool> {
Ok(self.catalogs.read().unwrap().get(catalog).is_some())
self.catalog_exist_sync(catalog)
}
async fn table_exist(&self, catalog: &str, schema: &str, table: &str) -> Result<bool> {
@@ -245,7 +218,7 @@ impl CatalogManager for MemoryCatalogManager {
}
impl MemoryCatalogManager {
/// Create a manager with some default setups
/// Creates a manager with some default setups
/// (e.g. default catalog/schema and information schema)
pub fn with_default_setup() -> Arc<Self> {
let manager = Arc::new(Self {
@@ -267,19 +240,23 @@ impl MemoryCatalogManager {
manager
}
/// Registers a catalog and return the catalog already exist
pub fn register_catalog_if_absent(&self, name: String) -> bool {
let mut catalogs = self.catalogs.write().unwrap();
let entry = catalogs.entry(name);
match entry {
Entry::Occupied(_) => true,
Entry::Vacant(v) => {
let _ = v.insert(HashMap::new());
false
}
}
fn schema_exist_sync(&self, catalog: &str, schema: &str) -> Result<bool> {
Ok(self
.catalogs
.read()
.unwrap()
.get(catalog)
.with_context(|| CatalogNotFoundSnafu {
catalog_name: catalog,
})?
.contains_key(schema))
}
fn catalog_exist_sync(&self, catalog: &str) -> Result<bool> {
Ok(self.catalogs.read().unwrap().get(catalog).is_some())
}
/// Registers a catalog if it does not exist and returns false if the schema exists.
pub fn register_catalog_sync(self: &Arc<Self>, name: String) -> Result<bool> {
let mut catalogs = self.catalogs.write().unwrap();
@@ -294,6 +271,32 @@ impl MemoryCatalogManager {
}
}
pub fn deregister_table_sync(&self, request: DeregisterTableRequest) -> Result<()> {
let mut catalogs = self.catalogs.write().unwrap();
let schema = catalogs
.get_mut(&request.catalog)
.with_context(|| CatalogNotFoundSnafu {
catalog_name: &request.catalog,
})?
.get_mut(&request.schema)
.with_context(|| SchemaNotFoundSnafu {
catalog: &request.catalog,
schema: &request.schema,
})?;
let result = schema.remove(&request.table_name);
if result.is_some() {
decrement_gauge!(
crate::metrics::METRIC_CATALOG_MANAGER_TABLE_COUNT,
1.0,
&[crate::metrics::db_label(&request.catalog, &request.schema)],
);
}
Ok(())
}
/// Registers a schema if it does not exist.
/// It returns an error if the catalog does not exist,
/// and returns false if the schema exists.
pub fn register_schema_sync(&self, request: RegisterSchemaRequest) -> Result<bool> {
let mut catalogs = self.catalogs.write().unwrap();
let catalog = catalogs
@@ -312,6 +315,7 @@ impl MemoryCatalogManager {
}
}
/// Registers a schema and returns an error if the catalog or schema does not exist.
pub fn register_table_sync(&self, request: RegisterTableRequest) -> Result<bool> {
let mut catalogs = self.catalogs.write().unwrap();
let schema = catalogs
@@ -353,9 +357,25 @@ impl MemoryCatalogManager {
#[cfg(any(test, feature = "testing"))]
pub fn new_with_table(table: TableRef) -> Arc<Self> {
let manager = Self::with_default_setup();
let catalog = &table.table_info().catalog_name;
let schema = &table.table_info().schema_name;
if !manager.catalog_exist_sync(catalog).unwrap() {
manager.register_catalog_sync(catalog.to_string()).unwrap();
}
if !manager.schema_exist_sync(catalog, schema).unwrap() {
manager
.register_schema_sync(RegisterSchemaRequest {
catalog: catalog.to_string(),
schema: schema.to_string(),
})
.unwrap();
}
let request = RegisterTableRequest {
catalog: DEFAULT_CATALOG_NAME.to_string(),
schema: DEFAULT_SCHEMA_NAME.to_string(),
catalog: catalog.to_string(),
schema: schema.to_string(),
table_name: table.table_info().name.clone(),
table_id: table.table_info().ident.table_id,
table,
@@ -524,10 +544,14 @@ mod tests {
}
#[test]
pub fn test_register_if_absent() {
pub fn test_register_catalog_sync() {
let list = MemoryCatalogManager::with_default_setup();
assert!(!list.register_catalog_if_absent("test_catalog".to_string(),));
assert!(list.register_catalog_if_absent("test_catalog".to_string()));
assert!(list
.register_catalog_sync("test_catalog".to_string())
.unwrap());
assert!(!list
.register_catalog_sync("test_catalog".to_string())
.unwrap());
}
#[tokio::test]

View File

@@ -85,6 +85,7 @@ impl RemoteCatalogManager {
let engine_manager = self.engine_manager.clone();
let memory_catalog_manager = self.memory_catalog_manager.clone();
let table_metadata_manager = self.table_metadata_manager.clone();
let region_alive_keepers = self.region_alive_keepers.clone();
common_runtime::spawn_bg(async move {
let table_id = datanode_table_value.table_id;
if let Err(e) = open_and_register_table(
@@ -92,6 +93,7 @@ impl RemoteCatalogManager {
datanode_table_value,
memory_catalog_manager,
table_metadata_manager,
region_alive_keepers,
)
.await
{
@@ -116,6 +118,7 @@ async fn open_and_register_table(
datanode_table_value: DatanodeTableValue,
memory_catalog_manager: Arc<MemoryCatalogManager>,
table_metadata_manager: TableMetadataManagerRef,
region_alive_keepers: Arc<RegionAliveKeepers>,
) -> Result<()> {
let context = EngineContext {};
@@ -192,7 +195,8 @@ async fn open_and_register_table(
table_id,
table,
};
let registered = memory_catalog_manager.register_table_sync(request)?;
let registered =
register_table(&memory_catalog_manager, &region_alive_keepers, request).await?;
ensure!(
registered,
TableExistsSnafu {
@@ -203,6 +207,32 @@ async fn open_and_register_table(
Ok(())
}
async fn register_table(
memory_catalog_manager: &Arc<MemoryCatalogManager>,
region_alive_keepers: &Arc<RegionAliveKeepers>,
request: RegisterTableRequest,
) -> Result<bool> {
let table = request.table.clone();
let registered = memory_catalog_manager.register_table_sync(request)?;
if registered {
let table_info = table.table_info();
let table_ident = TableIdent {
catalog: table_info.catalog_name.clone(),
schema: table_info.schema_name.clone(),
table: table_info.name.clone(),
table_id: table_info.table_id(),
engine: table_info.meta.engine.clone(),
};
region_alive_keepers
.register_table(table_ident, table, memory_catalog_manager.clone())
.await?;
}
Ok(registered)
}
#[async_trait]
impl CatalogManager for RemoteCatalogManager {
async fn start(&self) -> Result<()> {
@@ -221,25 +251,12 @@ impl CatalogManager for RemoteCatalogManager {
}
async fn register_table(&self, request: RegisterTableRequest) -> Result<bool> {
let table = request.table.clone();
let registered = self.memory_catalog_manager.register_table_sync(request)?;
if registered {
let table_info = table.table_info();
let table_ident = TableIdent {
catalog: table_info.catalog_name.clone(),
schema: table_info.schema_name.clone(),
table: table_info.name.clone(),
table_id: table_info.table_id(),
engine: table_info.meta.engine.clone(),
};
self.region_alive_keepers
.register_table(table_ident, table)
.await?;
}
Ok(registered)
register_table(
&self.memory_catalog_manager,
&self.region_alive_keepers,
request,
)
.await
}
async fn deregister_table(&self, request: DeregisterTableRequest) -> Result<()> {

View File

@@ -29,6 +29,7 @@ use snafu::{OptionExt, ResultExt};
use store_api::storage::RegionNumber;
use table::engine::manager::TableEngineManagerRef;
use table::engine::{CloseTableResult, EngineContext, TableEngineRef};
use table::metadata::TableId;
use table::requests::CloseTableRequest;
use table::TableRef;
use tokio::sync::{mpsc, oneshot, Mutex};
@@ -36,11 +37,13 @@ use tokio::task::JoinHandle;
use tokio::time::{Duration, Instant};
use crate::error::{Result, TableEngineNotFoundSnafu};
use crate::local::MemoryCatalogManager;
use crate::DeregisterTableRequest;
/// [RegionAliveKeepers] manages all [RegionAliveKeeper] in a scope of tables.
pub struct RegionAliveKeepers {
table_engine_manager: TableEngineManagerRef,
keepers: Arc<Mutex<HashMap<TableIdent, Arc<RegionAliveKeeper>>>>,
keepers: Arc<Mutex<HashMap<TableId, Arc<RegionAliveKeeper>>>>,
heartbeat_interval_millis: u64,
started: AtomicBool,
@@ -65,12 +68,18 @@ impl RegionAliveKeepers {
}
}
pub async fn find_keeper(&self, table_ident: &TableIdent) -> Option<Arc<RegionAliveKeeper>> {
self.keepers.lock().await.get(table_ident).cloned()
pub async fn find_keeper(&self, table_id: TableId) -> Option<Arc<RegionAliveKeeper>> {
self.keepers.lock().await.get(&table_id).cloned()
}
pub async fn register_table(&self, table_ident: TableIdent, table: TableRef) -> Result<()> {
let keeper = self.find_keeper(&table_ident).await;
pub async fn register_table(
&self,
table_ident: TableIdent,
table: TableRef,
catalog_manager: Arc<MemoryCatalogManager>,
) -> Result<()> {
let table_id = table_ident.table_id;
let keeper = self.find_keeper(table_id).await;
if keeper.is_some() {
return Ok(());
}
@@ -84,6 +93,7 @@ impl RegionAliveKeepers {
let keeper = Arc::new(RegionAliveKeeper::new(
table_engine,
catalog_manager,
table_ident.clone(),
self.heartbeat_interval_millis,
));
@@ -92,7 +102,7 @@ impl RegionAliveKeepers {
}
let mut keepers = self.keepers.lock().await;
let _ = keepers.insert(table_ident.clone(), keeper.clone());
let _ = keepers.insert(table_id, keeper.clone());
if self.started.load(Ordering::Relaxed) {
keeper.start().await;
@@ -108,15 +118,16 @@ impl RegionAliveKeepers {
&self,
table_ident: &TableIdent,
) -> Option<Arc<RegionAliveKeeper>> {
self.keepers.lock().await.remove(table_ident).map(|x| {
let table_id = table_ident.table_id;
self.keepers.lock().await.remove(&table_id).map(|x| {
info!("Deregister RegionAliveKeeper for table {table_ident}");
x
})
}
pub async fn register_region(&self, region_ident: &RegionIdent) {
let table_ident = &region_ident.table_ident;
let Some(keeper) = self.find_keeper(table_ident).await else {
let table_id = region_ident.table_ident.table_id;
let Some(keeper) = self.find_keeper(table_id).await else {
// Alive keeper could be affected by lagging msg, just warn and ignore.
warn!("Alive keeper for region {region_ident} is not found!");
return;
@@ -125,8 +136,8 @@ impl RegionAliveKeepers {
}
pub async fn deregister_region(&self, region_ident: &RegionIdent) {
let table_ident = &region_ident.table_ident;
let Some(keeper) = self.find_keeper(table_ident).await else {
let table_id = region_ident.table_ident.table_id;
let Some(keeper) = self.find_keeper(table_id).await else {
// Alive keeper could be affected by lagging msg, just warn and ignore.
warn!("Alive keeper for region {region_ident} is not found!");
return;
@@ -178,7 +189,8 @@ impl HeartbeatResponseHandler for RegionAliveKeepers {
}
};
let Some(keeper) = self.keepers.lock().await.get(&table_ident).cloned() else {
let table_id = table_ident.table_id;
let Some(keeper) = self.keepers.lock().await.get(&table_id).cloned() else {
// Alive keeper could be affected by lagging msg, just warn and ignore.
warn!("Alive keeper for table {table_ident} is not found!");
continue;
@@ -199,6 +211,7 @@ impl HeartbeatResponseHandler for RegionAliveKeepers {
/// Datanode, it will "extend" the region's "lease", with a deadline for [RegionAliveKeeper] to
/// countdown.
pub struct RegionAliveKeeper {
catalog_manager: Arc<MemoryCatalogManager>,
table_engine: TableEngineRef,
table_ident: TableIdent,
countdown_task_handles: Arc<Mutex<HashMap<RegionNumber, Arc<CountdownTaskHandle>>>>,
@@ -209,10 +222,12 @@ pub struct RegionAliveKeeper {
impl RegionAliveKeeper {
fn new(
table_engine: TableEngineRef,
catalog_manager: Arc<MemoryCatalogManager>,
table_ident: TableIdent,
heartbeat_interval_millis: u64,
) -> Self {
Self {
catalog_manager,
table_engine,
table_ident,
countdown_task_handles: Arc::new(Mutex::new(HashMap::new())),
@@ -240,11 +255,29 @@ impl RegionAliveKeeper {
let _ = x.lock().await.remove(&region);
} // Else the countdown task handles map could be dropped because the keeper is dropped.
};
let catalog_manager = self.catalog_manager.clone();
let ident = self.table_ident.clone();
let handle = Arc::new(CountdownTaskHandle::new(
self.table_engine.clone(),
self.table_ident.clone(),
region,
|| on_task_finished,
move |result: Option<CloseTableResult>| {
if matches!(result, Some(CloseTableResult::Released(_))) {
let result = catalog_manager.deregister_table_sync(DeregisterTableRequest {
catalog: ident.catalog.to_string(),
schema: ident.schema.to_string(),
table_name: ident.table.to_string(),
});
info!(
"Deregister table: {} after countdown task finished, result: {result:?}",
ident.table_id
);
} else {
debug!("Countdown task returns: {result:?}");
}
on_task_finished
},
));
let mut handles = self.countdown_task_handles.lock().await;
@@ -343,7 +376,7 @@ impl CountdownTaskHandle {
table_engine: TableEngineRef,
table_ident: TableIdent,
region: RegionNumber,
on_task_finished: impl FnOnce() -> Fut + Send + 'static,
on_task_finished: impl FnOnce(Option<CloseTableResult>) -> Fut + Send + 'static,
) -> Self
where
Fut: Future<Output = ()> + Send,
@@ -357,8 +390,8 @@ impl CountdownTaskHandle {
rx,
};
let handler = common_runtime::spawn_bg(async move {
countdown_task.run().await;
on_task_finished().await;
let result = countdown_task.run().await;
on_task_finished(result).await;
});
Self {
@@ -410,7 +443,8 @@ struct CountdownTask {
}
impl CountdownTask {
async fn run(&mut self) {
// returns true if
async fn run(&mut self) -> Option<CloseTableResult> {
// 30 years. See `Instant::far_future`.
let far_future = Instant::now() + Duration::from_secs(86400 * 365 * 30);
@@ -464,10 +498,11 @@ impl CountdownTask {
"Region {region} of table {table_ident} is closed, result: {result:?}. \
RegionAliveKeeper out.",
);
break;
return Some(result);
}
}
}
None
}
async fn close_region(&self) -> CloseTableResult {
@@ -543,11 +578,16 @@ mod test {
table_options: TableOptions::default(),
engine: "MockTableEngine".to_string(),
}));
let catalog_manager = MemoryCatalogManager::new_with_table(table.clone());
keepers
.register_table(table_ident.clone(), table)
.register_table(table_ident.clone(), table, catalog_manager)
.await
.unwrap();
assert!(keepers.keepers.lock().await.contains_key(&table_ident));
assert!(keepers
.keepers
.lock()
.await
.contains_key(&table_ident.table_id));
(table_ident, keepers)
}
@@ -602,7 +642,7 @@ mod test {
.keepers
.lock()
.await
.get(&table_ident)
.get(&table_ident.table_id)
.cloned()
.unwrap();
@@ -649,7 +689,7 @@ mod test {
})
.await;
let mut regions = keepers
.find_keeper(&table_ident)
.find_keeper(table_ident.table_id)
.await
.unwrap()
.countdown_task_handles
@@ -676,7 +716,8 @@ mod test {
table_id: 1024,
engine: "mito".to_string(),
};
let keeper = RegionAliveKeeper::new(table_engine, table_ident, 1000);
let catalog_manager = MemoryCatalogManager::with_default_setup();
let keeper = RegionAliveKeeper::new(table_engine, catalog_manager, table_ident, 1000);
let region = 1;
assert!(keeper.find_handle(&region).await.is_none());
@@ -719,7 +760,7 @@ mod test {
table_engine.clone(),
table_ident.clone(),
1,
|| async move { finished_clone.store(true, Ordering::Relaxed) },
|_| async move { finished_clone.store(true, Ordering::Relaxed) },
);
let tx = handle.tx.clone();
@@ -741,7 +782,7 @@ mod test {
let finished = Arc::new(AtomicBool::new(false));
let finished_clone = finished.clone();
let handle = CountdownTaskHandle::new(table_engine, table_ident, 1, || async move {
let handle = CountdownTaskHandle::new(table_engine, table_ident, 1, |_| async move {
finished_clone.store(true, Ordering::Relaxed)
});
handle.tx.send(CountdownCommand::Start(100)).await.unwrap();
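
As a side note on the keying change in this file: the keepers map is now indexed by the numeric table id rather than the full TableIdent. A minimal, self-contained sketch of that pattern, using toy types rather than GreptimeDB's actual definitions, looks like this:

use std::collections::HashMap;

// Toy types standing in for GreptimeDB's; only the shape of the change matters:
// keepers are keyed by the small, copyable TableId instead of the whole TableIdent.
type TableId = u32;

#[derive(Clone)]
struct TableIdent {
    catalog: String,
    schema: String,
    table: String,
    table_id: TableId,
}

struct Keeper {
    table_ident: TableIdent,
}

fn main() {
    let ident = TableIdent {
        catalog: "greptime".to_string(),
        schema: "public".to_string(),
        table: "metrics".to_string(),
        table_id: 1024,
    };

    let mut keepers: HashMap<TableId, Keeper> = HashMap::new();
    keepers.insert(ident.table_id, Keeper { table_ident: ident.clone() });

    // Lookups (e.g. from a RegionIdent) only need the numeric id, no String clones.
    let keeper = keepers.get(&1024).expect("registered above");
    println!(
        "keeper for {}.{}.{} (table id {})",
        keeper.table_ident.catalog,
        keeper.table_ident.schema,
        keeper.table_ident.table,
        keeper.table_ident.table_id
    );
}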

View File

@@ -12,7 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::any::Any;
use std::collections::HashMap;
use std::sync::Arc;
@@ -21,24 +20,23 @@ use common_catalog::consts::{
SYSTEM_CATALOG_NAME, SYSTEM_CATALOG_TABLE_ID, SYSTEM_CATALOG_TABLE_NAME,
};
use common_recordbatch::SendableRecordBatchStream;
use common_telemetry::debug;
use common_telemetry::{debug, warn};
use common_time::util;
use datatypes::prelude::{ConcreteDataType, ScalarVector, VectorRef};
use datatypes::schema::{ColumnSchema, RawSchema, SchemaRef};
use datatypes::schema::{ColumnSchema, RawSchema};
use datatypes::vectors::{BinaryVector, TimestampMillisecondVector, UInt8Vector};
use serde::{Deserialize, Serialize};
use snafu::{ensure, OptionExt, ResultExt};
use store_api::storage::ScanRequest;
use table::engine::{EngineContext, TableEngineRef};
use table::metadata::{TableId, TableInfoRef, TableType};
use table::requests::{
CreateTableRequest, DeleteRequest, InsertRequest, OpenTableRequest, TableOptions,
};
use table::{Result as TableResult, Table, TableRef};
use table::metadata::TableId;
use table::requests::{CreateTableRequest, InsertRequest, OpenTableRequest, TableOptions};
use table::TableRef;
use crate::error::{
self, CreateSystemCatalogSnafu, EmptyValueSnafu, Error, InvalidEntryTypeSnafu, InvalidKeySnafu,
OpenSystemCatalogSnafu, Result, ValueDeserializeSnafu,
self, CreateSystemCatalogSnafu, DeregisterTableSnafu, EmptyValueSnafu, Error,
InsertCatalogRecordSnafu, InvalidEntryTypeSnafu, InvalidKeySnafu, OpenSystemCatalogSnafu,
Result, ValueDeserializeSnafu,
};
use crate::DeregisterTableRequest;
@@ -48,42 +46,6 @@ pub const VALUE_INDEX: usize = 3;
pub struct SystemCatalogTable(TableRef);
#[async_trait::async_trait]
impl Table for SystemCatalogTable {
fn as_any(&self) -> &dyn Any {
self
}
fn schema(&self) -> SchemaRef {
self.0.schema()
}
async fn scan_to_stream(&self, request: ScanRequest) -> TableResult<SendableRecordBatchStream> {
self.0.scan_to_stream(request).await
}
/// Insert values into table.
async fn insert(&self, request: InsertRequest) -> TableResult<usize> {
self.0.insert(request).await
}
fn table_info(&self) -> TableInfoRef {
self.0.table_info()
}
fn table_type(&self) -> TableType {
self.0.table_type()
}
async fn delete(&self, request: DeleteRequest) -> TableResult<usize> {
self.0.delete(request).await
}
fn statistics(&self) -> Option<table::stats::TableStatistics> {
self.0.statistics()
}
}
impl SystemCatalogTable {
pub async fn new(engine: TableEngineRef) -> Result<Self> {
let request = OpenTableRequest {
@@ -126,6 +88,54 @@ impl SystemCatalogTable {
}
}
pub async fn register_table(
&self,
catalog: String,
schema: String,
table_name: String,
table_id: TableId,
engine: String,
) -> Result<usize> {
let insert_request =
build_table_insert_request(catalog, schema, table_name, table_id, engine);
self.0
.insert(insert_request)
.await
.context(InsertCatalogRecordSnafu)
}
pub(crate) async fn deregister_table(
&self,
request: &DeregisterTableRequest,
table_id: TableId,
) -> Result<()> {
let deletion_request = build_table_deletion_request(request, table_id);
self.0
.insert(deletion_request)
.await
.map(|x| {
if x != 1 {
let table = common_catalog::format_full_table_name(
&request.catalog,
&request.schema,
&request.table_name
);
warn!("Failed to delete table record from information_schema, unexpected returned result: {x}, table: {table}");
}
})
.with_context(|_| DeregisterTableSnafu {
request: request.clone(),
})
}
pub async fn register_schema(&self, catalog: String, schema: String) -> Result<usize> {
let insert_request = build_schema_insert_request(catalog, schema);
self.0
.insert(insert_request)
.await
.context(InsertCatalogRecordSnafu)
}
/// Create a stream of all entries inside system catalog table
pub async fn records(&self) -> Result<SendableRecordBatchStream> {
let full_projection = None;
@@ -137,11 +147,16 @@ impl SystemCatalogTable {
limit: None,
};
let stream = self
.0
.scan_to_stream(scan_req)
.await
.context(error::SystemCatalogTableScanSnafu)?;
Ok(stream)
}
pub fn as_table_ref(&self) -> TableRef {
self.0.clone()
}
}
/// Build system catalog table schema.
@@ -541,14 +556,14 @@ mod tests {
async fn test_system_table_type() {
let (_dir, table_engine) = prepare_table_engine().await;
let system_table = SystemCatalogTable::new(table_engine).await.unwrap();
assert_eq!(Base, system_table.table_type());
assert_eq!(Base, system_table.as_table_ref().table_type());
}
#[tokio::test]
async fn test_system_table_info() {
let (_dir, table_engine) = prepare_table_engine().await;
let system_table = SystemCatalogTable::new(table_engine).await.unwrap();
let info = system_table.table_info();
let info = system_table.as_table_ref().table_info();
assert_eq!(TableType::Base, info.table_type);
assert_eq!(SYSTEM_CATALOG_TABLE_NAME, info.name);
assert_eq!(SYSTEM_CATALOG_TABLE_ID, info.ident.table_id);
@@ -561,14 +576,16 @@ mod tests {
let (_, table_engine) = prepare_table_engine().await;
let catalog_table = SystemCatalogTable::new(table_engine).await.unwrap();
let table_insertion = build_table_insert_request(
DEFAULT_CATALOG_NAME.to_string(),
DEFAULT_SCHEMA_NAME.to_string(),
"my_table".to_string(),
1,
MITO_ENGINE.to_string(),
);
let result = catalog_table.insert(table_insertion).await.unwrap();
let result = catalog_table
.register_table(
DEFAULT_CATALOG_NAME.to_string(),
DEFAULT_SCHEMA_NAME.to_string(),
"my_table".to_string(),
1,
MITO_ENGINE.to_string(),
)
.await
.unwrap();
assert_eq!(result, 1);
let records = catalog_table.records().await.unwrap();
@@ -598,16 +615,17 @@ mod tests {
});
assert_eq!(entry, expected);
let table_deletion = build_table_deletion_request(
&DeregisterTableRequest {
catalog: DEFAULT_CATALOG_NAME.to_string(),
schema: DEFAULT_SCHEMA_NAME.to_string(),
table_name: "my_table".to_string(),
},
1,
);
let result = catalog_table.insert(table_deletion).await.unwrap();
assert_eq!(result, 1);
catalog_table
.deregister_table(
&DeregisterTableRequest {
catalog: DEFAULT_CATALOG_NAME.to_string(),
schema: DEFAULT_SCHEMA_NAME.to_string(),
table_name: "my_table".to_string(),
},
1,
)
.await
.unwrap();
let records = catalog_table.records().await.unwrap();
let batches = RecordBatches::try_collect(records).await.unwrap().take();
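
The broader pattern in this diff is replacing the blanket Table impl on SystemCatalogTable with purpose-built methods plus an explicit as_table_ref() accessor. A toy sketch of that newtype-with-accessor shape, using invented types rather than GreptimeDB's, might look like:

use std::sync::Arc;

// Stand-in trait and table; not the real table crate.
trait Table {
    fn name(&self) -> &str;
}

struct MockTable;

impl Table for MockTable {
    fn name(&self) -> &str {
        "system_catalog"
    }
}

type TableRef = Arc<dyn Table + Send + Sync>;

// Newtype keeps the inner TableRef private instead of implementing Table itself.
struct SystemCatalogTable(TableRef);

impl SystemCatalogTable {
    // Purpose-built operation instead of a pass-through trait impl.
    fn register_schema(&self, catalog: &str, schema: &str) -> usize {
        println!("would insert catalog entry {catalog}.{schema} via {}", self.0.name());
        1
    }

    // Explicit escape hatch for callers that really need the underlying table.
    fn as_table_ref(&self) -> TableRef {
        self.0.clone()
    }
}

fn main() {
    let table = SystemCatalogTable(Arc::new(MockTable));
    assert_eq!(table.register_schema("greptime", "public"), 1);
    assert_eq!(table.as_table_ref().name(), "system_catalog");
}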

View File

@@ -16,16 +16,9 @@
use std::sync::Arc;
use common_telemetry::logging;
use snafu::ResultExt;
use table::metadata::TableId;
use table::Table;
use crate::error::{self, InsertCatalogRecordSnafu, Result as CatalogResult};
use crate::system::{
build_schema_insert_request, build_table_deletion_request, build_table_insert_request,
SystemCatalogTable,
};
use crate::system::SystemCatalogTable;
use crate::DeregisterTableRequest;
pub struct InformationSchema {
@@ -54,36 +47,21 @@ impl SystemCatalog {
table_id: TableId,
engine: String,
) -> crate::error::Result<usize> {
let request = build_table_insert_request(catalog, schema, table_name, table_id, engine);
self.information_schema
.system
.insert(request)
.register_table(catalog, schema, table_name, table_id, engine)
.await
.context(InsertCatalogRecordSnafu)
}
pub(crate) async fn deregister_table(
&self,
request: &DeregisterTableRequest,
table_id: TableId,
) -> CatalogResult<()> {
) -> crate::error::Result<()> {
self.information_schema
.system
.insert(build_table_deletion_request(request, table_id))
.deregister_table(request, table_id)
.await
.map(|x| {
if x != 1 {
let table = common_catalog::format_full_table_name(
&request.catalog,
&request.schema,
&request.table_name
);
logging::warn!("Failed to delete table record from information_schema, unexpected returned result: {x}, table: {table}");
}
})
.with_context(|_| error::DeregisterTableSnafu {
request: request.clone(),
})
}
pub async fn register_schema(
@@ -91,11 +69,9 @@ impl SystemCatalog {
catalog: String,
schema: String,
) -> crate::error::Result<usize> {
let request = build_schema_insert_request(catalog, schema);
self.information_schema
.system
.insert(request)
.register_schema(catalog, schema)
.await
.context(InsertCatalogRecordSnafu)
}
}
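The deregister path above now maps the table-level failure into `DeregisterTableSnafu` with the whole request attached, instead of logging a warning on an unexpected row count. A small stand-alone sketch of that snafu 0.7 `with_context` pattern, with an illustrative error and `std::io::Error` as the source (not the crate's real error types):

use snafu::{ResultExt, Snafu};

#[derive(Debug, Snafu)]
enum Error {
    #[snafu(display("Failed to deregister table {table}"))]
    DeregisterTable {
        table: String,
        source: std::io::Error,
    },
}

fn delete_row(table: &str) -> Result<(), std::io::Error> {
    Err(std::io::Error::new(
        std::io::ErrorKind::NotFound,
        format!("{table} not found"),
    ))
}

fn deregister_table(table: &str) -> Result<(), Error> {
    // `with_context` builds the error context lazily, only on the failure path.
    delete_row(table).with_context(|_| DeregisterTableSnafu { table })
}

fn main() {
    let err = deregister_table("my_table").unwrap_err();
    println!("{err}");
}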

View File

@@ -396,7 +396,7 @@ mod tests {
assert!(catalog_manager.register_table(request).await.unwrap());
let keeper = region_alive_keepers
.find_keeper(&table_before)
.find_keeper(table_before.table_id)
.await
.unwrap();
let deadline = keeper.deadline(0).await.unwrap();
@@ -435,7 +435,7 @@ mod tests {
assert!(catalog_manager.register_table(request).await.unwrap());
let keeper = region_alive_keepers
.find_keeper(&table_after)
.find_keeper(table_after.table_id)
.await
.unwrap();
let deadline = keeper.deadline(0).await.unwrap();
@@ -443,7 +443,7 @@ mod tests {
assert!(deadline <= Instant::now() + Duration::from_secs(20));
let keeper = region_alive_keepers
.find_keeper(&table_before)
.find_keeper(table_before.table_id)
.await
.unwrap();
let deadline = keeper.deadline(0).await.unwrap();

View File

@@ -22,6 +22,7 @@ common-telemetry = { workspace = true }
common-time = { workspace = true }
datafusion.workspace = true
datatypes = { workspace = true }
derive_builder.workspace = true
enum_dispatch = "0.3"
futures-util.workspace = true
moka = { version = "0.9", features = ["future"] }

View File

@@ -17,6 +17,7 @@ use std::sync::Arc;
use api::v1::greptime_database_client::GreptimeDatabaseClient;
use api::v1::health_check_client::HealthCheckClient;
use api::v1::prometheus_gateway_client::PrometheusGatewayClient;
use api::v1::region::region_client::RegionClient as PbRegionClient;
use api::v1::HealthCheckRequest;
use arrow_flight::flight_service_client::FlightServiceClient;
use common_grpc::channel_manager::ChannelManager;
@@ -82,11 +83,6 @@ impl Client {
Default::default()
}
pub fn with_manager(channel_manager: ChannelManager) -> Self {
let inner = Arc::new(Inner::with_manager(channel_manager));
Self { inner }
}
pub fn with_urls<U, A>(urls: A) -> Self
where
U: AsRef<str>,
@@ -157,6 +153,11 @@ impl Client {
})
}
pub(crate) fn raw_region_client(&self) -> Result<PbRegionClient<Channel>> {
let (_, channel) = self.find_channel()?;
Ok(PbRegionClient::new(channel))
}
pub fn make_prometheus_gateway_client(&self) -> Result<PrometheusGatewayClient<Channel>> {
let (_, channel) = self.find_channel()?;
Ok(PrometheusGatewayClient::new(channel))

View File

@@ -19,18 +19,21 @@ use api::v1::query_request::Query;
use api::v1::{
AlterExpr, AuthHeader, CompactTableExpr, CreateTableExpr, DdlRequest, DeleteRequests,
DropTableExpr, FlushTableExpr, GreptimeRequest, InsertRequests, PromRangeQuery, QueryRequest,
RequestHeader, TruncateTableExpr,
RequestHeader, RowInsertRequests, TruncateTableExpr,
};
use arrow_flight::{FlightData, Ticket};
use arrow_flight::Ticket;
use async_stream::stream;
use common_error::ext::{BoxedError, ErrorExt};
use common_grpc::flight::{flight_messages_to_recordbatches, FlightDecoder, FlightMessage};
use common_grpc::flight::{FlightDecoder, FlightMessage};
use common_query::Output;
use common_recordbatch::error::ExternalSnafu;
use common_recordbatch::RecordBatchStreamAdaptor;
use common_telemetry::{logging, timer};
use futures_util::{TryFutureExt, TryStreamExt};
use futures_util::StreamExt;
use prost::Message;
use snafu::{ensure, ResultExt};
use crate::error::{ConvertFlightDataSnafu, IllegalFlightMessagesSnafu, ServerSnafu};
use crate::error::{ConvertFlightDataSnafu, Error, IllegalFlightMessagesSnafu, ServerSnafu};
use crate::{error, from_grpc_response, metrics, Client, Result, StreamInserter};
#[derive(Clone, Debug, Default)]
@@ -112,6 +115,11 @@ impl Database {
self.handle(Request::Inserts(requests)).await
}
pub async fn row_insert(&self, requests: RowInsertRequests) -> Result<u32> {
let _timer = timer!(metrics::METRIC_GRPC_INSERT);
self.handle(Request::RowInserts(requests)).await
}
pub fn streaming_inserter(&self) -> Result<StreamInserter> {
self.streaming_inserter_with_channel_size(65536)
}
@@ -283,55 +291,81 @@ impl Database {
let mut client = self.client.make_flight_client()?;
let flight_data: Vec<FlightData> = client
.mut_inner()
.do_get(request)
.and_then(|response| response.into_inner().try_collect())
.await
.map_err(|e| {
let tonic_code = e.code();
let e: error::Error = e.into();
let code = e.status_code();
let msg = e.to_string();
ServerSnafu { code, msg }
.fail::<()>()
.map_err(BoxedError::new)
.context(error::FlightGetSnafu {
tonic_code,
addr: client.addr(),
})
.map_err(|error| {
logging::error!(
"Failed to do Flight get, addr: {}, code: {}, source: {}",
client.addr(),
tonic_code,
error
);
error
})
.unwrap_err()
})?;
let decoder = &mut FlightDecoder::default();
let flight_messages = flight_data
.into_iter()
.map(|x| decoder.try_decode(x).context(ConvertFlightDataSnafu))
.collect::<Result<Vec<_>>>()?;
let output = if let Some(FlightMessage::AffectedRows(rows)) = flight_messages.get(0) {
ensure!(
flight_messages.len() == 1,
IllegalFlightMessagesSnafu {
reason: "Expect 'AffectedRows' Flight messages to be one and only!"
}
let response = client.mut_inner().do_get(request).await.map_err(|e| {
let tonic_code = e.code();
let e: error::Error = e.into();
let code = e.status_code();
let msg = e.to_string();
let error = Error::FlightGet {
tonic_code,
addr: client.addr().to_string(),
source: BoxedError::new(ServerSnafu { code, msg }.build()),
};
logging::error!(
"Failed to do Flight get, addr: {}, code: {}, source: {}",
client.addr(),
tonic_code,
error
);
Output::AffectedRows(*rows)
} else {
let recordbatches = flight_messages_to_recordbatches(flight_messages)
.context(ConvertFlightDataSnafu)?;
Output::RecordBatches(recordbatches)
error
})?;
let flight_data_stream = response.into_inner();
let mut decoder = FlightDecoder::default();
let mut flight_message_stream = flight_data_stream.map(move |flight_data| {
flight_data
.map_err(Error::from)
.and_then(|data| decoder.try_decode(data).context(ConvertFlightDataSnafu))
});
let Some(first_flight_message) = flight_message_stream.next().await else {
return IllegalFlightMessagesSnafu {
reason: "Expect the response not to be empty",
}
.fail();
};
Ok(output)
let first_flight_message = first_flight_message?;
match first_flight_message {
FlightMessage::AffectedRows(rows) => {
ensure!(
flight_message_stream.next().await.is_none(),
IllegalFlightMessagesSnafu {
reason: "Expect 'AffectedRows' Flight messages to be the one and the only!"
}
);
Ok(Output::AffectedRows(rows))
}
FlightMessage::Recordbatch(_) => IllegalFlightMessagesSnafu {
reason: "The first flight message cannot be a RecordBatch message",
}
.fail(),
FlightMessage::Schema(schema) => {
let stream = Box::pin(stream!({
while let Some(flight_message) = flight_message_stream.next().await {
let flight_message = flight_message
.map_err(BoxedError::new)
.context(ExternalSnafu)?;
let FlightMessage::Recordbatch(record_batch) = flight_message else {
yield IllegalFlightMessagesSnafu {reason: "A Schema message must be succeeded exclusively by a set of RecordBatch messages"}
.fail()
.map_err(BoxedError::new)
.context(ExternalSnafu);
break;
};
yield Ok(record_batch);
}
}));
let record_batch_stream = RecordBatchStreamAdaptor {
schema,
stream,
output_ordering: None,
};
Ok(Output::Stream(Box::pin(record_batch_stream)))
}
}
}
}
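The rewritten `do_get` above no longer collects every `FlightData` into a `Vec`; it decodes lazily, inspects the first message, and either returns the affected-row count or turns the remaining messages into a record-batch stream with `async_stream::stream!`. A compact sketch of that "peek the first item, stream the rest" shape, using a toy `Message` enum instead of the Flight protocol (assumes `futures-util`, `async-stream` and `tokio` as dependencies):

use async_stream::stream;
use futures_util::{stream::BoxStream, StreamExt};

#[derive(Debug)]
enum Message {
    AffectedRows(usize),
    Schema(&'static str),
    Batch(i64),
}

enum Output {
    AffectedRows(usize),
    Stream(BoxStream<'static, Result<i64, String>>),
}

async fn adapt(mut input: BoxStream<'static, Message>) -> Result<Output, String> {
    let Some(first) = input.next().await else {
        return Err("empty response".to_string());
    };
    match first {
        Message::AffectedRows(rows) => Ok(Output::AffectedRows(rows)),
        Message::Schema(_schema) => {
            // Everything after the schema must be data batches.
            let batches: BoxStream<'static, Result<i64, String>> = Box::pin(stream! {
                while let Some(message) = input.next().await {
                    match message {
                        Message::Batch(batch) => yield Ok(batch),
                        other => {
                            yield Err(format!("unexpected message: {other:?}"));
                            break;
                        }
                    }
                }
            });
            Ok(Output::Stream(batches))
        }
        Message::Batch(_) => Err("first message cannot be a batch".to_string()),
    }
}

#[tokio::main]
async fn main() -> Result<(), String> {
    let input: BoxStream<'static, Message> = Box::pin(futures_util::stream::iter(vec![
        Message::Schema("ts, value"),
        Message::Batch(1),
        Message::Batch(2),
    ]));
    match adapt(input).await? {
        Output::AffectedRows(rows) => println!("affected rows: {rows}"),
        Output::Stream(mut batches) => {
            while let Some(batch) = batches.next().await {
                println!("batch: {}", batch?);
            }
        }
    }
    Ok(())
}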

View File

@@ -18,6 +18,7 @@ mod database;
pub mod error;
pub mod load_balance;
mod metrics;
pub mod region;
mod stream_insert;
pub use api;

View File

@@ -25,3 +25,4 @@ pub const METRIC_GRPC_FLUSH_TABLE: &str = "grpc.flush_table";
pub const METRIC_GRPC_COMPACT_TABLE: &str = "grpc.compact_table";
pub const METRIC_GRPC_TRUNCATE_TABLE: &str = "grpc.truncate_table";
pub const METRIC_GRPC_DO_GET: &str = "grpc.do_get";
pub(crate) const METRIC_REGION_REQUEST_GRPC: &str = "grpc.region_request";

src/client/src/region.rs (new file, 146 lines)
View File

@@ -0,0 +1,146 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use api::v1::region::{region_request, RegionRequest, RegionRequestHeader, RegionResponse};
use api::v1::ResponseHeader;
use common_error::status_code::StatusCode;
use common_telemetry::timer;
use snafu::OptionExt;
use crate::error::{IllegalDatabaseResponseSnafu, Result, ServerSnafu};
use crate::{metrics, Client};
type AffectedRows = u64;
#[derive(Debug)]
pub struct RegionRequester {
trace_id: Option<u64>,
span_id: Option<u64>,
client: Client,
}
impl RegionRequester {
pub fn new(client: Client) -> Self {
// TODO(LFC): Pass in trace_id and span_id from some context when we have it.
Self {
trace_id: None,
span_id: None,
client,
}
}
pub async fn handle(self, request: region_request::Body) -> Result<AffectedRows> {
let request_type = request.as_ref().to_string();
let request = RegionRequest {
header: Some(RegionRequestHeader {
trace_id: self.trace_id,
span_id: self.span_id,
}),
body: Some(request),
};
let _timer = timer!(
metrics::METRIC_REGION_REQUEST_GRPC,
&[("request_type", request_type)]
);
let mut client = self.client.raw_region_client()?;
let RegionResponse {
header,
affected_rows,
} = client.handle(request).await?.into_inner();
check_response_header(header)?;
Ok(affected_rows)
}
}
fn check_response_header(header: Option<ResponseHeader>) -> Result<()> {
let status = header
.and_then(|header| header.status)
.context(IllegalDatabaseResponseSnafu {
err_msg: "either response header or status is missing",
})?;
if StatusCode::is_success(status.status_code) {
Ok(())
} else {
let code =
StatusCode::from_u32(status.status_code).context(IllegalDatabaseResponseSnafu {
err_msg: format!("unknown server status: {:?}", status),
})?;
ServerSnafu {
code,
msg: status.err_msg,
}
.fail()
}
}
#[cfg(test)]
mod test {
use api::v1::Status as PbStatus;
use super::*;
use crate::Error::{IllegalDatabaseResponse, Server};
#[test]
fn test_check_response_header() {
let result = check_response_header(None);
assert!(matches!(
result.unwrap_err(),
IllegalDatabaseResponse { .. }
));
let result = check_response_header(Some(ResponseHeader { status: None }));
assert!(matches!(
result.unwrap_err(),
IllegalDatabaseResponse { .. }
));
let result = check_response_header(Some(ResponseHeader {
status: Some(PbStatus {
status_code: StatusCode::Success as u32,
err_msg: "".to_string(),
}),
}));
assert!(result.is_ok());
let result = check_response_header(Some(ResponseHeader {
status: Some(PbStatus {
status_code: u32::MAX,
err_msg: "".to_string(),
}),
}));
assert!(matches!(
result.unwrap_err(),
IllegalDatabaseResponse { .. }
));
let result = check_response_header(Some(ResponseHeader {
status: Some(PbStatus {
status_code: StatusCode::Internal as u32,
err_msg: "blabla".to_string(),
}),
}));
let Server { code, msg } = result.unwrap_err() else {
unreachable!()
};
assert_eq!(code, StatusCode::Internal);
assert_eq!(msg, "blabla");
}
}

View File

@@ -16,6 +16,7 @@ use api::v1::greptime_database_client::GreptimeDatabaseClient;
use api::v1::greptime_request::Request;
use api::v1::{
AuthHeader, GreptimeRequest, GreptimeResponse, InsertRequest, InsertRequests, RequestHeader,
RowInsertRequest, RowInsertRequests,
};
use tokio::sync::mpsc;
use tokio::task::JoinHandle;
@@ -84,6 +85,18 @@ impl StreamInserter {
})
}
pub async fn row_insert(&self, requests: Vec<RowInsertRequest>) -> Result<()> {
let inserts = RowInsertRequests { inserts: requests };
let request = self.to_rpc_request(Request::RowInserts(inserts));
self.sender.send(request).await.map_err(|e| {
error::ClientStreamingSnafu {
err_msg: e.to_string(),
}
.build()
})
}
pub async fn finish(self) -> Result<u32> {
drop(self.sender);
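`row_insert` above follows the same shape as the existing insert path: wrap the requests, push them into the bounded `mpsc` channel, and surface a send failure as a client error. A stripped-down sketch of that channel-backed inserter with a toy request type (tokio only, not the real client types):

use tokio::sync::mpsc;

#[derive(Debug)]
struct RowInsertRequest {
    rows: u32,
}

struct StreamInserter {
    sender: mpsc::Sender<Vec<RowInsertRequest>>,
}

impl StreamInserter {
    async fn row_insert(&self, requests: Vec<RowInsertRequest>) -> Result<(), String> {
        // A dropped/closed receiver surfaces as an error the caller can act on,
        // mirroring the ClientStreamingSnafu mapping above.
        self.sender
            .send(requests)
            .await
            .map_err(|e| format!("client streaming error: {e}"))
    }
}

#[tokio::main]
async fn main() {
    let (sender, mut receiver) = mpsc::channel(65536);
    let inserter = StreamInserter { sender };

    // Consumer side: drains the channel and sums up affected rows.
    let consumer = tokio::spawn(async move {
        let mut affected: u32 = 0;
        while let Some(batch) = receiver.recv().await {
            affected += batch.iter().map(|r| r.rows).sum::<u32>();
        }
        affected
    });

    inserter.row_insert(vec![RowInsertRequest { rows: 3 }]).await.unwrap();
    inserter.row_insert(vec![RowInsertRequest { rows: 4 }]).await.unwrap();

    drop(inserter); // like `finish()`: dropping the sender lets the consumer complete
    assert_eq!(consumer.await.unwrap(), 7);
}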

View File

@@ -133,17 +133,18 @@ impl MigrateTableMetadata {
);
while let Some((key, value)) = stream.try_next().await.context(error::IterStreamSnafu)? {
self.migrate_table_route_key(value).await?;
let table_id = self.migrate_table_route_key(value).await?;
keys.push(key);
keys.push(TableRegionKey::new(table_id).as_raw_key())
}
info!("Total migrated TableRouteKeys: {}", keys.len());
info!("Total migrated TableRouteKeys: {}", keys.len() / 2);
self.delete_migrated_keys(keys).await;
Ok(())
}
async fn migrate_table_route_key(&self, value: TableRouteValue) -> Result<()> {
async fn migrate_table_route_key(&self, value: TableRouteValue) -> Result<u32> {
let table_route = TableRoute::try_from_raw(
&value.peers,
value.table_route.expect("expected table_route"),
@@ -152,7 +153,8 @@ impl MigrateTableMetadata {
let new_table_value = NextTableRouteValue::new(table_route.region_routes);
let new_key = NextTableRouteKey::new(table_route.table.id as u32);
let table_id = table_route.table.id as u32;
let new_key = NextTableRouteKey::new(table_id);
info!("Creating '{new_key}'");
if self.dryrun {
@@ -168,7 +170,7 @@ impl MigrateTableMetadata {
.unwrap();
}
Ok(())
Ok(table_id)
}
async fn migrate_schema_keys(&self) -> Result<()> {
@@ -203,7 +205,7 @@ impl MigrateTableMetadata {
async fn migrate_schema_key(&self, key: &v1SchemaKey) -> Result<()> {
let new_key = SchemaNameKey::new(&key.catalog_name, &key.schema_name);
let schema_name_value = SchemaNameValue;
let schema_name_value = SchemaNameValue::default();
info!("Creating '{new_key}'");
@@ -310,7 +312,7 @@ impl MigrateTableMetadata {
async fn delete_migrated_keys(&self, keys: Vec<Vec<u8>>) {
for keys in keys.chunks(PAGE_SIZE) {
info!("Deleting {} TableGlobalKeys", keys.len());
info!("Deleting {} keys", keys.len());
let req = BatchDeleteRequest {
keys: keys.to_vec(),
prev_kv: false,

View File

@@ -229,7 +229,6 @@ mod tests {
[storage.manifest]
checkpoint_margin = 9
gc_duration = '7s'
checkpoint_on_startup = true
compress = true
[logging]
@@ -289,7 +288,6 @@ mod tests {
RegionManifestConfig {
checkpoint_margin: Some(9),
gc_duration: Some(Duration::from_secs(7)),
checkpoint_on_startup: true,
compress: true
},
options.storage.manifest,
@@ -383,9 +381,6 @@ mod tests {
max_files_in_level0 = 7
max_purge_tasks = 32
[storage.manifest]
checkpoint_on_startup = true
[logging]
level = "debug"
dir = "/tmp/greptimedb/test/logs"

View File

@@ -20,7 +20,7 @@ use common_base::Plugins;
use common_telemetry::logging;
use frontend::frontend::FrontendOptions;
use frontend::instance::{FrontendInstance, Instance as FeInstance};
use frontend::service_config::{InfluxdbOptions, PrometheusOptions};
use frontend::service_config::InfluxdbOptions;
use meta_client::MetaClientOptions;
use servers::tls::{TlsMode, TlsOption};
use servers::Mode;
@@ -99,8 +99,6 @@ pub struct StartCommand {
#[clap(long)]
mysql_addr: Option<String>,
#[clap(long)]
prom_addr: Option<String>,
#[clap(long)]
postgres_addr: Option<String>,
#[clap(long)]
opentsdb_addr: Option<String>,
@@ -171,10 +169,6 @@ impl StartCommand {
}
}
if let Some(addr) = &self.prom_addr {
opts.prometheus_options = Some(PrometheusOptions { addr: addr.clone() });
}
if let Some(addr) = &self.postgres_addr {
if let Some(postgres_opts) = &mut opts.postgres_options {
postgres_opts.addr = addr.clone();
@@ -248,7 +242,6 @@ mod tests {
fn test_try_from_start_command() {
let command = StartCommand {
http_addr: Some("127.0.0.1:1234".to_string()),
prom_addr: Some("127.0.0.1:4444".to_string()),
mysql_addr: Some("127.0.0.1:5678".to_string()),
postgres_addr: Some("127.0.0.1:5432".to_string()),
opentsdb_addr: Some("127.0.0.1:4321".to_string()),
@@ -276,10 +269,6 @@ mod tests {
opts.opentsdb_options.as_ref().unwrap().addr,
"127.0.0.1:4321"
);
assert_eq!(
opts.prometheus_options.as_ref().unwrap().addr,
"127.0.0.1:4444"
);
let default_opts = FrontendOptions::default();
assert_eq!(

View File

@@ -201,17 +201,6 @@ mod tests {
.join(ENV_VAR_SEP),
Some("42s"),
),
(
// storage.manifest.checkpoint_on_startup = true
[
env_prefix.to_string(),
"storage".to_uppercase(),
"manifest".to_uppercase(),
"checkpoint_on_startup".to_uppercase(),
]
.join(ENV_VAR_SEP),
Some("true"),
),
(
// wal.dir = /other/wal/dir
[
@@ -253,7 +242,6 @@ mod tests {
opts.storage.manifest.gc_duration,
Some(Duration::from_secs(42))
);
assert!(opts.storage.manifest.checkpoint_on_startup);
assert_eq!(
opts.meta_client_options.unwrap().metasrv_addrs,
vec![

View File

@@ -24,7 +24,6 @@ use frontend::frontend::FrontendOptions;
use frontend::instance::{FrontendInstance, Instance as FeInstance};
use frontend::service_config::{
GrpcOptions, InfluxdbOptions, MysqlOptions, OpentsdbOptions, PostgresOptions, PromStoreOptions,
PrometheusOptions,
};
use serde::{Deserialize, Serialize};
use servers::http::HttpOptions;
@@ -91,7 +90,6 @@ pub struct StandaloneOptions {
pub opentsdb_options: Option<OpentsdbOptions>,
pub influxdb_options: Option<InfluxdbOptions>,
pub prom_store_options: Option<PromStoreOptions>,
pub prometheus_options: Option<PrometheusOptions>,
pub wal: WalConfig,
pub storage: StorageConfig,
pub procedure: ProcedureConfig,
@@ -111,7 +109,6 @@ impl Default for StandaloneOptions {
opentsdb_options: Some(OpentsdbOptions::default()),
influxdb_options: Some(InfluxdbOptions::default()),
prom_store_options: Some(PromStoreOptions::default()),
prometheus_options: Some(PrometheusOptions::default()),
wal: WalConfig::default(),
storage: StorageConfig::default(),
procedure: ProcedureConfig::default(),
@@ -131,7 +128,6 @@ impl StandaloneOptions {
opentsdb_options: self.opentsdb_options,
influxdb_options: self.influxdb_options,
prom_store_options: self.prom_store_options,
prometheus_options: self.prometheus_options,
meta_client_options: None,
logging: self.logging,
..Default::default()
@@ -193,8 +189,6 @@ struct StartCommand {
#[clap(long)]
mysql_addr: Option<String>,
#[clap(long)]
prom_addr: Option<String>,
#[clap(long)]
postgres_addr: Option<String>,
#[clap(long)]
opentsdb_addr: Option<String>,
@@ -271,10 +265,6 @@ impl StartCommand {
}
}
if let Some(addr) = &self.prom_addr {
opts.prometheus_options = Some(PrometheusOptions { addr: addr.clone() })
}
if let Some(addr) = &self.postgres_addr {
if let Some(postgres_opts) = &mut opts.postgres_options {
postgres_opts.addr = addr.clone();
@@ -408,7 +398,6 @@ mod tests {
[storage.manifest]
checkpoint_margin = 9
gc_duration = '7s'
checkpoint_on_startup = true
[http_options]
addr = "127.0.0.1:4000"

View File

@@ -35,8 +35,14 @@ pub const INFORMATION_SCHEMA_TABLES_TABLE_ID: u32 = 3;
pub const INFORMATION_SCHEMA_COLUMNS_TABLE_ID: u32 = 4;
pub const MITO_ENGINE: &str = "mito";
pub const MITO2_ENGINE: &str = "mito2";
pub fn default_engine() -> &'static str {
MITO_ENGINE
}
pub const IMMUTABLE_FILE_ENGINE: &str = "file";
pub const SEMANTIC_TYPE_PRIMARY_KEY: &str = "PRIMARY KEY";
pub const SEMANTIC_TYPE_PRIMARY_KEY: &str = "TAG";
pub const SEMANTIC_TYPE_FIELD: &str = "FIELD";
pub const SEMANTIC_TYPE_TIME_INDEX: &str = "TIME INDEX";
pub const SEMANTIC_TYPE_TIME_INDEX: &str = "TIMESTAMP";

View File

@@ -27,7 +27,7 @@ orc-rust = "0.2"
paste = "1.0"
regex = "1.7"
snafu.workspace = true
strum = { version = "0.21", features = ["derive"] }
strum.workspace = true
tokio-util.workspace = true
tokio.workspace = true
url = "2.3"

View File

@@ -6,4 +6,4 @@ license.workspace = true
[dependencies]
snafu = { version = "0.7", features = ["backtraces"] }
strum = { version = "0.24", features = ["std", "derive"] }
strum.workspace = true

View File

@@ -14,6 +14,7 @@ common-error = { workspace = true }
common-recordbatch = { workspace = true }
common-runtime = { workspace = true }
common-telemetry = { workspace = true }
common-time = { workspace = true }
dashmap = "5.4"
datafusion.workspace = true
datatypes = { workspace = true }

View File

@@ -75,6 +75,9 @@ pub enum Error {
location: Location,
source: datatypes::error::Error,
},
#[snafu(display("Not supported: {}", feat))]
NotSupported { feat: String },
}
impl ErrorExt for Error {
@@ -83,7 +86,8 @@ impl ErrorExt for Error {
Error::InvalidTlsConfig { .. }
| Error::InvalidConfigFilePath { .. }
| Error::TypeMismatch { .. }
| Error::InvalidFlightData { .. } => StatusCode::InvalidArguments,
| Error::InvalidFlightData { .. }
| Error::NotSupported { .. } => StatusCode::InvalidArguments,
Error::CreateChannel { .. }
| Error::Conversion { .. }

View File

@@ -18,9 +18,11 @@ use std::fmt::Display;
use api::helper::values_with_capacity;
use api::v1::{Column, ColumnDataType, SemanticType};
use common_base::BitVec;
use common_time::timestamp::TimeUnit;
use snafu::ensure;
use crate::error::{Result, TypeMismatchSnafu};
use crate::Error;
type ColumnName = String;
@@ -259,6 +261,24 @@ impl Display for Precision {
}
}
impl TryFrom<Precision> for TimeUnit {
type Error = Error;
fn try_from(precision: Precision) -> std::result::Result<Self, Self::Error> {
Ok(match precision {
Precision::Second => TimeUnit::Second,
Precision::Millisecond => TimeUnit::Millisecond,
Precision::Microsecond => TimeUnit::Microsecond,
Precision::Nanosecond => TimeUnit::Nanosecond,
_ => {
return Err(Error::NotSupported {
feat: format!("convert {precision} into TimeUnit"),
})
}
})
}
}
#[cfg(test)]
mod tests {
use api::v1::{ColumnDataType, SemanticType};
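The conversion above maps each supported `Precision` onto a `TimeUnit` and turns every other variant into a `NotSupported` error. A self-contained sketch of that fallible-conversion pattern with toy enums (the extra `Minute` variant is invented here purely to exercise the error branch):

#[derive(Debug, Clone, Copy)]
enum Precision {
    Second,
    Millisecond,
    Microsecond,
    Nanosecond,
    Minute, // hypothetical variant with no TimeUnit counterpart
}

#[derive(Debug, Clone, Copy, PartialEq)]
enum TimeUnit {
    Second,
    Millisecond,
    Microsecond,
    Nanosecond,
}

#[derive(Debug)]
struct NotSupported {
    feat: String,
}

impl TryFrom<Precision> for TimeUnit {
    type Error = NotSupported;

    fn try_from(precision: Precision) -> Result<Self, Self::Error> {
        Ok(match precision {
            Precision::Second => TimeUnit::Second,
            Precision::Millisecond => TimeUnit::Millisecond,
            Precision::Microsecond => TimeUnit::Microsecond,
            Precision::Nanosecond => TimeUnit::Nanosecond,
            other => {
                return Err(NotSupported {
                    feat: format!("convert {other:?} into TimeUnit"),
                })
            }
        })
    }
}

fn main() {
    assert_eq!(TimeUnit::try_from(Precision::Millisecond).unwrap(), TimeUnit::Millisecond);
    println!("{}", TimeUnit::try_from(Precision::Minute).unwrap_err().feat);
}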

View File

@@ -15,6 +15,7 @@ common-telemetry = { workspace = true }
common-time = { workspace = true }
etcd-client.workspace = true
futures.workspace = true
humantime-serde.workspace = true
lazy_static.workspace = true
prost.workspace = true
regex.workspace = true

View File

@@ -54,6 +54,13 @@ pub enum Error {
location: Location,
},
#[snafu(display("Failed to parse value {} into key {}", value, key))]
ParseOption {
key: String,
value: String,
location: Location,
},
#[snafu(display("Corrupted table route data, err: {}", err_msg))]
RouteInfoCorrupted { err_msg: String, location: Location },
@@ -151,6 +158,7 @@ impl ErrorExt for Error {
IllegalServerState { .. } | EtcdTxnOpResponse { .. } => StatusCode::Internal,
SerdeJson { .. }
| ParseOption { .. }
| RouteInfoCorrupted { .. }
| InvalidProtoMsg { .. }
| InvalidTableMetadata { .. }

View File

@@ -215,9 +215,14 @@ impl TableMetadataManager {
/// The caller MUST ensure it has the exclusive access to `TableNameKey`.
pub async fn create_table_metadata(
&self,
table_info: RawTableInfo,
mut table_info: RawTableInfo,
region_routes: Vec<RegionRoute>,
) -> Result<()> {
let region_numbers = region_routes
.iter()
.map(|region| region.region.id.region_number())
.collect::<Vec<_>>();
table_info.meta.region_numbers = region_numbers;
let table_id = table_info.ident.table_id;
// Creates table name.
@@ -489,15 +494,35 @@ macro_rules! impl_table_meta_value {
}
}
#[macro_export]
macro_rules! impl_optional_meta_value {
($($val_ty: ty), *) => {
$(
impl $val_ty {
pub fn try_from_raw_value(raw_value: &[u8]) -> Result<Option<Self>> {
serde_json::from_slice(raw_value).context(SerdeJsonSnafu)
}
pub fn try_as_raw_value(&self) -> Result<Vec<u8>> {
serde_json::to_vec(self).context(SerdeJsonSnafu)
}
}
)*
}
}
impl_table_meta_value! {
CatalogNameValue,
SchemaNameValue,
TableNameValue,
TableInfoValue,
DatanodeTableValue,
TableRouteValue
}
impl_optional_meta_value! {
CatalogNameValue,
SchemaNameValue
}
#[cfg(test)]
mod tests {
use std::collections::BTreeMap;
@@ -524,7 +549,7 @@ mod tests {
assert_eq!(removed, to_removed_key(key));
}
fn new_test_table_info() -> TableInfo {
fn new_test_table_info(region_numbers: impl Iterator<Item = u32>) -> TableInfo {
let column_schemas = vec![
ColumnSchema::new("col1", ConcreteDataType::int32_datatype(), true),
ColumnSchema::new(
@@ -546,6 +571,7 @@ mod tests {
.primary_key_indices(vec![0])
.engine("engine")
.next_column_id(3)
.region_numbers(region_numbers.collect::<Vec<_>>())
.build()
.unwrap();
TableInfoBuilder::default()
@@ -578,9 +604,10 @@ mod tests {
async fn test_create_table_metadata() {
let mem_kv = Arc::new(MemoryKvBackend::default());
let table_metadata_manager = TableMetadataManager::new(mem_kv);
let table_info: RawTableInfo = new_test_table_info().into();
let region_route = new_test_region_route();
let region_routes = vec![region_route.clone()];
let table_info: RawTableInfo =
new_test_table_info(region_routes.iter().map(|r| r.region.id.region_number())).into();
// creates metadata.
table_metadata_manager
.create_table_metadata(table_info.clone(), region_routes.clone())
@@ -612,11 +639,12 @@ mod tests {
async fn test_delete_table_metadata() {
let mem_kv = Arc::new(MemoryKvBackend::default());
let table_metadata_manager = TableMetadataManager::new(mem_kv);
let table_info: RawTableInfo = new_test_table_info().into();
let table_id = table_info.ident.table_id;
let region_route = new_test_region_route();
let datanode_id = 2;
let region_routes = vec![region_route.clone()];
let table_info: RawTableInfo =
new_test_table_info(region_routes.iter().map(|r| r.region.id.region_number())).into();
let table_id = table_info.ident.table_id;
let datanode_id = 2;
let table_route_value = TableRouteValue::new(region_routes.clone());
// creates metadata.
@@ -682,10 +710,11 @@ mod tests {
async fn test_rename_table() {
let mem_kv = Arc::new(MemoryKvBackend::default());
let table_metadata_manager = TableMetadataManager::new(mem_kv);
let table_info: RawTableInfo = new_test_table_info().into();
let table_id = table_info.ident.table_id;
let region_route = new_test_region_route();
let region_routes = vec![region_route.clone()];
let table_info: RawTableInfo =
new_test_table_info(region_routes.iter().map(|r| r.region.id.region_number())).into();
let table_id = table_info.ident.table_id;
// creates metadata.
table_metadata_manager
.create_table_metadata(table_info.clone(), region_routes.clone())
@@ -746,10 +775,11 @@ mod tests {
async fn test_update_table_info() {
let mem_kv = Arc::new(MemoryKvBackend::default());
let table_metadata_manager = TableMetadataManager::new(mem_kv);
let table_info: RawTableInfo = new_test_table_info().into();
let table_id = table_info.ident.table_id;
let region_route = new_test_region_route();
let region_routes = vec![region_route.clone()];
let table_info: RawTableInfo =
new_test_table_info(region_routes.iter().map(|r| r.region.id.region_number())).into();
let table_id = table_info.ident.table_id;
// creates metadata.
table_metadata_manager
.create_table_metadata(table_info.clone(), region_routes.clone())
@@ -811,9 +841,10 @@ mod tests {
async fn test_update_table_route() {
let mem_kv = Arc::new(MemoryKvBackend::default());
let table_metadata_manager = TableMetadataManager::new(mem_kv);
let table_info: RawTableInfo = new_test_table_info().into();
let region_route = new_test_region_route();
let region_routes = vec![region_route.clone()];
let table_info: RawTableInfo =
new_test_table_info(region_routes.iter().map(|r| r.region.id.region_number())).into();
let table_id = table_info.ident.table_id;
let current_table_route_value = TableRouteValue::new(region_routes.clone());
// creates metadata.
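The `impl_optional_meta_value!` macro above differs from `impl_table_meta_value!` only in that deserialization goes through `Option<Self>`, so a stored JSON `null` (left behind by older versions) decodes to `None` instead of failing. A self-contained sketch of that macro shape using plain `serde_json` errors and an illustrative value type (assumes `serde` with the derive feature and `serde_json`):

use serde::{Deserialize, Serialize};

macro_rules! impl_optional_meta_value {
    ($($val_ty: ty),*) => {
        $(
            impl $val_ty {
                fn try_from_raw_value(raw: &[u8]) -> serde_json::Result<Option<Self>> {
                    // Deserializing into Option<Self> maps a JSON `null` to Ok(None).
                    serde_json::from_slice(raw)
                }

                fn try_as_raw_value(&self) -> serde_json::Result<Vec<u8>> {
                    serde_json::to_vec(self)
                }
            }
        )*
    };
}

#[derive(Debug, Default, PartialEq, Serialize, Deserialize)]
struct SchemaNameValue {
    ttl_secs: Option<u64>,
}

impl_optional_meta_value!(SchemaNameValue);

fn main() -> serde_json::Result<()> {
    let value = SchemaNameValue { ttl_secs: Some(10) };
    let raw = value.try_as_raw_value()?;
    assert_eq!(SchemaNameValue::try_from_raw_value(&raw)?, Some(value));
    // A legacy `null` row is tolerated rather than treated as corruption.
    assert_eq!(SchemaNameValue::try_from_raw_value(b"null")?, None);
    Ok(())
}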

View File

@@ -12,22 +12,27 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::HashMap;
use std::fmt::Display;
use std::sync::Arc;
use std::time::Duration;
use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
use futures::stream::BoxStream;
use futures::StreamExt;
use humantime_serde::re::humantime;
use serde::{Deserialize, Serialize};
use snafu::{OptionExt, ResultExt};
use crate::error::{self, Error, InvalidTableMetadataSnafu, Result};
use crate::error::{self, Error, InvalidTableMetadataSnafu, ParseOptionSnafu, Result};
use crate::key::{TableMetaKey, SCHEMA_NAME_KEY_PATTERN, SCHEMA_NAME_KEY_PREFIX};
use crate::kv_backend::KvBackendRef;
use crate::range_stream::{PaginationStream, DEFAULT_PAGE_SIZE};
use crate::rpc::store::{PutRequest, RangeRequest};
use crate::rpc::KeyValue;
const OPT_KEY_TTL: &str = "ttl";
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct SchemaNameKey<'a> {
pub catalog: &'a str,
@@ -43,8 +48,33 @@ impl<'a> Default for SchemaNameKey<'a> {
}
}
#[derive(Debug, Serialize, Deserialize)]
pub struct SchemaNameValue;
#[derive(Debug, Default, Clone, PartialEq, Serialize, Deserialize)]
pub struct SchemaNameValue {
#[serde(default)]
#[serde(with = "humantime_serde")]
pub ttl: Option<Duration>,
}
impl TryFrom<&HashMap<String, String>> for SchemaNameValue {
type Error = Error;
fn try_from(value: &HashMap<String, String>) -> std::result::Result<Self, Self::Error> {
let ttl = value
.get(OPT_KEY_TTL)
.map(|ttl_str| {
ttl_str.parse::<humantime::Duration>().map_err(|_| {
ParseOptionSnafu {
key: OPT_KEY_TTL,
value: ttl_str.clone(),
}
.build()
})
})
.transpose()?
.map(|ttl| ttl.into());
Ok(Self { ttl })
}
}
impl<'a> SchemaNameKey<'a> {
pub fn new(catalog: &'a str, schema: &'a str) -> Self {
@@ -108,11 +138,15 @@ impl SchemaManager {
}
/// Creates `SchemaNameKey`.
pub async fn create(&self, schema: SchemaNameKey<'_>) -> Result<()> {
pub async fn create(
&self,
schema: SchemaNameKey<'_>,
value: Option<SchemaNameValue>,
) -> Result<()> {
let raw_key = schema.as_raw_key();
let req = PutRequest::new()
.with_key(raw_key)
.with_value(SchemaNameValue.try_as_raw_value()?);
.with_value(value.unwrap_or_default().try_as_raw_value()?);
self.kv_backend.put(req).await?;
@@ -125,6 +159,14 @@ impl SchemaManager {
Ok(self.kv_backend.get(&raw_key).await?.is_some())
}
pub async fn get(&self, schema: SchemaNameKey<'_>) -> Result<Option<SchemaNameValue>> {
let raw_key = schema.as_raw_key();
let value = self.kv_backend.get(&raw_key).await?;
value
.and_then(|v| SchemaNameValue::try_from_raw_value(v.value.as_ref()).transpose())
.transpose()
}
/// Returns a schema stream, it lists all schemas belong to the target `catalog`.
pub async fn schema_names(&self, catalog: &str) -> BoxStream<'static, Result<String>> {
let start_key = SchemaNameKey::range_start_key(catalog);
@@ -143,25 +185,39 @@ impl SchemaManager {
#[cfg(test)]
mod tests {
use super::*;
use crate::kv_backend::memory::MemoryKvBackend;
#[test]
fn test_serialization() {
let key = SchemaNameKey::new("my-catalog", "my-schema");
assert_eq!(key.to_string(), "__schema_name/my-catalog/my-schema");
let parsed: SchemaNameKey<'_> = "__schema_name/my-catalog/my-schema".try_into().unwrap();
assert_eq!(key, parsed);
let value = SchemaNameValue {
ttl: Some(Duration::from_secs(10)),
};
let mut opts: HashMap<String, String> = HashMap::new();
opts.insert("ttl".to_string(), "10s".to_string());
let from_value = SchemaNameValue::try_from(&opts).unwrap();
assert_eq!(value, from_value);
let parsed = SchemaNameValue::try_from_raw_value("{\"ttl\":\"10s\"}".as_bytes()).unwrap();
assert_eq!(Some(value), parsed);
let none = SchemaNameValue::try_from_raw_value("null".as_bytes()).unwrap();
assert!(none.is_none());
let err_empty = SchemaNameValue::try_from_raw_value("".as_bytes());
assert!(err_empty.is_err());
}
#[tokio::test]
async fn test_key_exist() {
let manager = SchemaManager::new(Arc::new(MemoryKvBackend::default()));
let schema_key = SchemaNameKey::new("my-catalog", "my-schema");
manager.create(schema_key).await.unwrap();
manager.create(schema_key, None).await.unwrap();
assert!(manager.exist(schema_key).await.unwrap());
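The schema value above now carries an optional `ttl`, serialized through `humantime_serde` so it round-trips as a human-readable string like "10s", and built from the raw option map by parsing with `humantime`. A minimal sketch of both halves with a string error type (assumes the `serde`, `serde_json`, `humantime` and `humantime-serde` crates; not the real key/value plumbing):

use std::collections::HashMap;
use std::time::Duration;

use serde::{Deserialize, Serialize};

#[derive(Debug, Default, Clone, PartialEq, Serialize, Deserialize)]
struct SchemaNameValue {
    #[serde(default, with = "humantime_serde")]
    ttl: Option<Duration>,
}

fn from_options(opts: &HashMap<String, String>) -> Result<SchemaNameValue, String> {
    let ttl = opts
        .get("ttl")
        .map(|raw| {
            raw.parse::<humantime::Duration>()
                .map_err(|e| format!("failed to parse ttl {raw:?}: {e}"))
        })
        .transpose()?
        .map(Into::into); // humantime::Duration -> std::time::Duration
    Ok(SchemaNameValue { ttl })
}

fn main() -> Result<(), String> {
    let mut opts = HashMap::new();
    opts.insert("ttl".to_string(), "10s".to_string());
    let value = from_options(&opts)?;
    assert_eq!(value.ttl, Some(Duration::from_secs(10)));

    // The TTL serializes as a humantime string, matching the {"ttl":"10s"} fixture above.
    let json = serde_json::to_string(&value).map_err(|e| e.to_string())?;
    assert_eq!(json, r#"{"ttl":"10s"}"#);
    Ok(())
}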

View File

@@ -12,7 +12,7 @@ common-error = { workspace = true }
common-runtime = { workspace = true }
common-telemetry = { workspace = true }
futures.workspace = true
humantime-serde = "1.1"
humantime-serde.workspace = true
object-store = { workspace = true }
serde.workspace = true
serde_json = "1.0"

View File

@@ -34,13 +34,8 @@ use crate::{
SendableRecordBatchStream, Stream,
};
type FutureStream = Pin<
Box<
dyn std::future::Future<
Output = std::result::Result<DfSendableRecordBatchStream, DataFusionError>,
> + Send,
>,
>;
type FutureStream =
Pin<Box<dyn std::future::Future<Output = Result<SendableRecordBatchStream>> + Send>>;
/// ParquetRecordBatchStream -> DataFusion RecordBatchStream
pub struct ParquetRecordBatchStreamAdapter<T> {
@@ -223,7 +218,7 @@ impl Stream for RecordBatchStreamAdapter {
enum AsyncRecordBatchStreamAdapterState {
Uninit(FutureStream),
Ready(DfSendableRecordBatchStream),
Ready(SendableRecordBatchStream),
Failed,
}
@@ -261,17 +256,12 @@ impl Stream for AsyncRecordBatchStreamAdapter {
}
Err(e) => {
self.state = AsyncRecordBatchStreamAdapterState::Failed;
return Poll::Ready(Some(
Err(e).context(error::InitRecordbatchStreamSnafu),
));
return Poll::Ready(Some(Err(e)));
}
};
}
AsyncRecordBatchStreamAdapterState::Ready(stream) => {
return Poll::Ready(ready!(Pin::new(stream).poll_next(cx)).map(|x| {
let df_record_batch = x.context(error::PollStreamSnafu)?;
RecordBatch::try_from_df_record_batch(self.schema(), df_record_batch)
}))
return Poll::Ready(ready!(Pin::new(stream).poll_next(cx)))
}
AsyncRecordBatchStreamAdapterState::Failed => return Poll::Ready(None),
}
@@ -296,6 +286,7 @@ mod test {
use snafu::IntoError;
use super::*;
use crate::error::Error;
use crate::RecordBatches;
#[tokio::test]
@@ -330,12 +321,7 @@ mod test {
) -> FutureStream {
Box::pin(async move {
maybe_recordbatches
.map(|items| {
Box::pin(DfRecordBatchStreamAdapter::new(Box::pin(
MaybeErrorRecordBatchStream { items },
))) as _
})
.map_err(|e| DataFusionError::External(Box::new(e)))
.map(|items| Box::pin(MaybeErrorRecordBatchStream { items }) as _)
})
}
@@ -369,20 +355,24 @@ mod test {
.into_error(BoxedError::new(MockError::new(StatusCode::Unknown)))),
]));
let adapter = AsyncRecordBatchStreamAdapter::new(schema.clone(), poll_err_stream);
let result = RecordBatches::try_collect(Box::pin(adapter)).await;
assert_eq!(
result.unwrap_err().to_string(),
"Failed to poll stream, source: External error: External error, source: Unknown"
let err = RecordBatches::try_collect(Box::pin(adapter))
.await
.unwrap_err();
assert!(
matches!(err, Error::External { .. }),
"unexpected err {err}"
);
let failed_to_init_stream =
new_future_stream(Err(error::ExternalSnafu
.into_error(BoxedError::new(MockError::new(StatusCode::Internal)))));
let adapter = AsyncRecordBatchStreamAdapter::new(schema.clone(), failed_to_init_stream);
let result = RecordBatches::try_collect(Box::pin(adapter)).await;
assert_eq!(
result.unwrap_err().to_string(),
"Failed to init Recordbatch stream, source: External error: External error, source: Internal"
let err = RecordBatches::try_collect(Box::pin(adapter))
.await
.unwrap_err();
assert!(
matches!(err, Error::External { .. }),
"unexpected err {err}"
);
}
}
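The adapter's `FutureStream` now resolves directly to this crate's own stream type, so initialization errors and polling errors surface through the same error type. Underneath is a small state machine: poll the future until the inner stream is ready, then forward every `poll_next` to it. A generic, self-contained sketch of that pattern (futures-util and tokio only, toy item/error types):

use std::future::Future;
use std::pin::Pin;
use std::task::{ready, Context, Poll};

use futures_util::{Stream, StreamExt};

type BoxedStream<T, E> = Pin<Box<dyn Stream<Item = Result<T, E>> + Send>>;
type FutureStream<T, E> = Pin<Box<dyn Future<Output = Result<BoxedStream<T, E>, E>> + Send>>;

enum State<T, E> {
    Uninit(FutureStream<T, E>),
    Ready(BoxedStream<T, E>),
    Failed,
}

struct AsyncStreamAdapter<T, E> {
    state: State<T, E>,
}

impl<T, E> Stream for AsyncStreamAdapter<T, E> {
    type Item = Result<T, E>;

    fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
        loop {
            match &mut self.state {
                // First poll(s): drive the future that produces the inner stream.
                State::Uninit(future) => match ready!(future.as_mut().poll(cx)) {
                    Ok(stream) => self.state = State::Ready(stream),
                    Err(e) => {
                        self.state = State::Failed;
                        return Poll::Ready(Some(Err(e)));
                    }
                },
                // Afterwards: forward every poll to the inner stream.
                State::Ready(stream) => return stream.poll_next_unpin(cx),
                State::Failed => return Poll::Ready(None),
            }
        }
    }
}

#[tokio::main]
async fn main() {
    let future: FutureStream<i32, String> = Box::pin(async {
        // Pretend some async initialization (e.g. opening a scanner) happens here.
        let stream: BoxedStream<i32, String> =
            Box::pin(futures_util::stream::iter(vec![Ok(1), Ok(2)]));
        Ok::<_, String>(stream)
    });
    let adapter = AsyncStreamAdapter { state: State::Uninit(future) };
    let items: Vec<_> = adapter.collect().await;
    assert_eq!(items, vec![Ok(1), Ok(2)]);
}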

View File

@@ -37,7 +37,7 @@ pub enum Error {
source: datatypes::error::Error,
},
#[snafu(display("External error, source: {}", source))]
#[snafu(display("External error, location: {}, source: {}", location, source))]
External {
location: Location,
source: BoxedError,

View File

@@ -202,13 +202,26 @@ impl Stream for SimpleRecordBatchStream {
}
/// Adapt a [Stream] of [RecordBatch] to a [RecordBatchStream].
pub struct RecordBatchStreamAdaptor {
pub struct RecordBatchStreamAdaptor<S> {
pub schema: SchemaRef,
pub stream: Pin<Box<dyn Stream<Item = Result<RecordBatch>> + Send>>,
pub stream: S,
pub output_ordering: Option<Vec<OrderOption>>,
}
impl RecordBatchStream for RecordBatchStreamAdaptor {
impl<S> RecordBatchStreamAdaptor<S> {
/// Creates a RecordBatchStreamAdaptor without output ordering requirement.
pub fn new(schema: SchemaRef, stream: S) -> RecordBatchStreamAdaptor<S> {
RecordBatchStreamAdaptor {
schema,
stream,
output_ordering: None,
}
}
}
impl<S: Stream<Item = Result<RecordBatch>> + Unpin> RecordBatchStream
for RecordBatchStreamAdaptor<S>
{
fn schema(&self) -> SchemaRef {
self.schema.clone()
}
@@ -218,7 +231,7 @@ impl RecordBatchStream for RecordBatchStreamAdaptor {
}
}
impl Stream for RecordBatchStreamAdaptor {
impl<S: Stream<Item = Result<RecordBatch>> + Unpin> Stream for RecordBatchStreamAdaptor<S> {
type Item = Result<RecordBatch>;
fn poll_next(mut self: Pin<&mut Self>, ctx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
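Making the adaptor generic over `S` lets callers hand in any concrete stream, including the `stream!`-generated one in the client change earlier, without an extra box. The delegation itself is the usual newtype-forwarding pattern; a minimal stand-alone version (futures-util and tokio only, schema plumbing omitted):

use std::pin::Pin;
use std::task::{Context, Poll};

use futures_util::{Stream, StreamExt};

struct Adaptor<S> {
    label: &'static str,
    stream: S,
}

impl<S: Stream + Unpin> Stream for Adaptor<S> {
    type Item = S::Item;

    fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
        // `S: Unpin` lets us reach the field through Pin without projection.
        self.stream.poll_next_unpin(cx)
    }
}

#[tokio::main]
async fn main() {
    let mut adapted = Adaptor {
        label: "demo",
        stream: futures_util::stream::iter(1..=3),
    };
    let label = adapted.label;
    while let Some(x) = adapted.next().await {
        println!("{label}: {x}");
    }
}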

View File

@@ -29,6 +29,16 @@ use crate::error::{ArithmeticOverflowSnafu, Error, ParseTimestampSnafu, Timestam
use crate::timezone::TimeZone;
use crate::util::{div_ceil, format_utc_datetime, local_datetime_to_utc};
/// Timestamp represents the number of units (seconds/milliseconds/microseconds/nanoseconds) elapsed
/// since UNIX epoch. The valid value range of [Timestamp] depends on its unit (all in UTC time zone):
/// - for [TimeUnit::Second]: [-262144-01-01 00:00:00, +262143-12-31 23:59:59]
/// - for [TimeUnit::Millisecond]: [-262144-01-01 00:00:00.000, +262143-12-31 23:59:59.999]
/// - for [TimeUnit::Microsecond]: [-262144-01-01 00:00:00.000000, +262143-12-31 23:59:59.999999]
/// - for [TimeUnit::Nanosecond]: [1677-09-21 00:12:43.145225, 2262-04-11 23:47:16.854775807]
///
/// # Note:
/// Values out of range can still be stored, but arithmetic or formatting operations on them
/// may return an error or overflow.

#[derive(Debug, Clone, Default, Copy, Serialize, Deserialize)]
pub struct Timestamp {
value: i64,
@@ -169,6 +179,28 @@ impl Timestamp {
(sec_div, nsec)
}
/// Creates a new Timestamp instance from seconds and nanoseconds parts.
/// Returns None if overflow.
fn from_splits(sec: i64, nsec: u32) -> Option<Self> {
if nsec == 0 {
Some(Timestamp::new_second(sec))
} else if nsec % 1_000_000 == 0 {
let millis = nsec / 1_000_000;
sec.checked_mul(1000)
.and_then(|v| v.checked_add(millis as i64))
.map(Timestamp::new_millisecond)
} else if nsec % 1000 == 0 {
let micros = nsec / 1000;
sec.checked_mul(1_000_000)
.and_then(|v| v.checked_add(micros as i64))
.map(Timestamp::new_microsecond)
} else {
sec.checked_mul(1_000_000_000)
.and_then(|v| v.checked_add(nsec as i64))
.map(Timestamp::new_nanosecond)
}
}
/// Format timestamp to ISO8601 string. If the timestamp exceeds what chrono timestamp can
/// represent, this function simply print the timestamp unit and value in plain string.
pub fn to_iso8601_string(&self) -> String {
@@ -205,6 +237,12 @@ impl Timestamp {
let (sec, nsec) = self.split();
NaiveDateTime::from_timestamp_opt(sec, nsec)
}
pub fn from_chrono_datetime(ndt: NaiveDateTime) -> Option<Self> {
let sec = ndt.timestamp();
let nsec = ndt.timestamp_subsec_nanos();
Timestamp::from_splits(sec, nsec)
}
}
impl FromStr for Timestamp {
@@ -225,13 +263,16 @@ impl FromStr for Timestamp {
// RFC3339 timestamp (with a T)
let s = s.trim();
if let Ok(ts) = DateTime::parse_from_rfc3339(s) {
return Ok(Timestamp::new(ts.timestamp_nanos(), TimeUnit::Nanosecond));
return Timestamp::from_chrono_datetime(ts.naive_utc())
.context(ParseTimestampSnafu { raw: s });
}
if let Ok(ts) = DateTime::parse_from_str(s, "%Y-%m-%d %H:%M:%S%.f%:z") {
return Ok(Timestamp::new(ts.timestamp_nanos(), TimeUnit::Nanosecond));
return Timestamp::from_chrono_datetime(ts.naive_utc())
.context(ParseTimestampSnafu { raw: s });
}
if let Ok(ts) = Utc.datetime_from_str(s, "%Y-%m-%d %H:%M:%S%.fZ") {
return Ok(Timestamp::new(ts.timestamp_nanos(), TimeUnit::Nanosecond));
return Timestamp::from_chrono_datetime(ts.naive_utc())
.context(ParseTimestampSnafu { raw: s });
}
if let Ok(ts) = NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S") {
@@ -264,7 +305,7 @@ fn naive_datetime_to_timestamp(
match local_datetime_to_utc(&datetime) {
LocalResult::None => ParseTimestampSnafu { raw: s }.fail(),
LocalResult::Single(utc) | LocalResult::Ambiguous(utc, _) => {
Ok(Timestamp::new(utc.timestamp_nanos(), TimeUnit::Nanosecond))
Timestamp::from_chrono_datetime(utc).context(ParseTimestampSnafu { raw: s })
}
}
}
@@ -608,11 +649,7 @@ mod tests {
// but expected timestamp is in UTC timezone
fn check_from_str(s: &str, expect: &str) {
let ts = Timestamp::from_str(s).unwrap();
let time = NaiveDateTime::from_timestamp_opt(
ts.value / 1_000_000_000,
(ts.value % 1_000_000_000) as u32,
)
.unwrap();
let time = ts.to_chrono_datetime().unwrap();
assert_eq!(expect, time.to_string());
}
@@ -1049,4 +1086,70 @@ mod tests {
TimeUnit::from(ArrowTimeUnit::Nanosecond)
);
}
fn check_conversion(ts: Timestamp, valid: bool) {
let Some(t2) = ts.to_chrono_datetime() else {
if valid {
panic!("Cannot convert {:?} to Chrono NaiveDateTime", ts);
}
return;
};
let Some(t3) = Timestamp::from_chrono_datetime(t2) else {
if valid {
panic!("Cannot convert Chrono NaiveDateTime {:?} to Timestamp", t2);
}
return;
};
assert_eq!(t3, ts);
}
#[test]
fn test_from_naive_date_time() {
let min_sec = Timestamp::new_second(-8334632851200);
let max_sec = Timestamp::new_second(8210298412799);
check_conversion(min_sec, true);
check_conversion(Timestamp::new_second(min_sec.value - 1), false);
check_conversion(max_sec, true);
check_conversion(Timestamp::new_second(max_sec.value + 1), false);
let min_millis = Timestamp::new_millisecond(-8334632851200000);
let max_millis = Timestamp::new_millisecond(8210298412799999);
check_conversion(min_millis, true);
check_conversion(Timestamp::new_millisecond(min_millis.value - 1), false);
check_conversion(max_millis, true);
check_conversion(Timestamp::new_millisecond(max_millis.value + 1), false);
let min_micros = Timestamp::new_microsecond(-8334632851200000000);
let max_micros = Timestamp::new_microsecond(8210298412799999999);
check_conversion(min_micros, true);
check_conversion(Timestamp::new_microsecond(min_micros.value - 1), false);
check_conversion(max_micros, true);
check_conversion(Timestamp::new_microsecond(max_micros.value + 1), false);
let min_nanos = Timestamp::new_nanosecond(-9223372036854775000);
let max_nanos = Timestamp::new_nanosecond(i64::MAX);
check_conversion(min_nanos, true);
check_conversion(Timestamp::new_nanosecond(min_nanos.value - 1), false);
check_conversion(max_nanos, true);
}
#[test]
fn test_parse_timestamp_range() {
let valid_strings = vec![
"-262144-01-01 00:00:00Z",
"+262143-12-31 23:59:59Z",
"-262144-01-01 00:00:00Z",
"+262143-12-31 23:59:59.999Z",
"-262144-01-01 00:00:00Z",
"+262143-12-31 23:59:59.999999Z",
"1677-09-21 00:12:43.145225Z",
"2262-04-11 23:47:16.854775807Z",
"+100000-01-01 00:00:01.5Z",
];
for s in valid_strings {
Timestamp::from_str(s).unwrap();
}
}
}
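`from_splits` above picks the coarsest unit that represents the instant exactly and returns `None` when the value would overflow `i64`. A stand-alone sketch of that selection logic with a toy `Timestamp` enum (std only):

#[derive(Debug, PartialEq)]
enum Timestamp {
    Second(i64),
    Millisecond(i64),
    Microsecond(i64),
    Nanosecond(i64),
}

/// Builds a timestamp from whole seconds plus a nanosecond remainder,
/// choosing the coarsest unit that loses no precision. Returns None on i64 overflow.
fn from_splits(sec: i64, nsec: u32) -> Option<Timestamp> {
    if nsec == 0 {
        Some(Timestamp::Second(sec))
    } else if nsec % 1_000_000 == 0 {
        let millis = (nsec / 1_000_000) as i64;
        sec.checked_mul(1_000)?.checked_add(millis).map(Timestamp::Millisecond)
    } else if nsec % 1_000 == 0 {
        let micros = (nsec / 1_000) as i64;
        sec.checked_mul(1_000_000)?.checked_add(micros).map(Timestamp::Microsecond)
    } else {
        sec.checked_mul(1_000_000_000)?.checked_add(nsec as i64).map(Timestamp::Nanosecond)
    }
}

fn main() {
    assert_eq!(from_splits(10, 0), Some(Timestamp::Second(10)));
    assert_eq!(from_splits(10, 500_000_000), Some(Timestamp::Millisecond(10_500)));
    assert_eq!(from_splits(10, 2_000), Some(Timestamp::Microsecond(10_000_002)));
    assert_eq!(from_splits(10, 7), Some(Timestamp::Nanosecond(10_000_000_007)));
    assert_eq!(from_splits(i64::MAX, 1), None); // would overflow nanoseconds
}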

View File

@@ -9,6 +9,7 @@ testing = ["meta-srv/mock"]
[dependencies]
api = { workspace = true }
arrow-flight.workspace = true
async-compat = "0.2"
async-stream.workspace = true
async-trait.workspace = true
@@ -39,7 +40,7 @@ datatypes = { workspace = true }
file-table-engine = { workspace = true }
futures = "0.3"
futures-util.workspace = true
humantime-serde = "1.1"
humantime-serde.workspace = true
hyper = { version = "0.14", features = ["full"] }
key-lock = "0.1"
log-store = { workspace = true }

View File

@@ -256,8 +256,6 @@ pub struct RegionManifestConfig {
/// Region manifest logs and checkpoints gc task execution duration.
#[serde(with = "humantime_serde")]
pub gc_duration: Option<Duration>,
/// Whether to try creating a manifest checkpoint on region opening
pub checkpoint_on_startup: bool,
/// Whether to compress manifest and checkpoint file by gzip
pub compress: bool,
}
@@ -267,7 +265,6 @@ impl Default for RegionManifestConfig {
Self {
checkpoint_margin: Some(10u16),
gc_duration: Some(Duration::from_secs(600)),
checkpoint_on_startup: false,
compress: false,
}
}
@@ -341,7 +338,6 @@ impl From<&DatanodeOptions> for StorageEngineConfig {
fn from(value: &DatanodeOptions) -> Self {
Self {
compress_manifest: value.storage.manifest.compress,
manifest_checkpoint_on_startup: value.storage.manifest.checkpoint_on_startup,
manifest_checkpoint_margin: value.storage.manifest.checkpoint_margin,
manifest_gc_duration: value.storage.manifest.gc_duration,
max_files_in_l0: value.storage.compaction.max_files_in_level0,

View File

@@ -556,6 +556,16 @@ pub enum Error {
location: Location,
source: BoxedError,
},
#[snafu(display(
"Failed to build region requests, location:{}, source: {}",
location,
source
))]
BuildRegionRequests {
location: Location,
source: store_api::metadata::MetadataError,
},
}
pub type Result<T> = std::result::Result<T, Error>;
@@ -569,6 +579,7 @@ impl ErrorExt for Error {
| ExecuteStatement { source, .. }
| ExecuteLogicalPlan { source, .. } => source.status_code(),
BuildRegionRequests { source, .. } => source.status_code(),
HandleHeartbeatResponse { source, .. } => source.status_code(),
DecodeLogicalPlan { source, .. } => source.status_code(),

View File

@@ -15,9 +15,9 @@
use std::sync::Arc;
use async_trait::async_trait;
use catalog::error::Error as CatalogError;
use catalog::error::{Error as CatalogError, Result as CatalogResult};
use catalog::remote::region_alive_keeper::RegionAliveKeepers;
use catalog::{CatalogManagerRef, RegisterTableRequest};
use catalog::{CatalogManagerRef, RegisterSchemaRequest, RegisterTableRequest};
use common_catalog::format_full_table_name;
use common_meta::error::Result as MetaResult;
use common_meta::heartbeat::handler::{
@@ -30,6 +30,7 @@ use store_api::storage::RegionNumber;
use table::engine::manager::TableEngineManagerRef;
use table::engine::EngineContext;
use table::requests::OpenTableRequest;
use table::Table;
use crate::error::{self, Result};
@@ -157,6 +158,45 @@ impl OpenRegionHandler {
Ok(false)
}
async fn register_table(
&self,
request: &OpenTableRequest,
table: Arc<dyn Table>,
) -> CatalogResult<bool> {
if !self
.catalog_manager
.catalog_exist(&request.catalog_name)
.await?
{
self.catalog_manager
.clone()
.register_catalog(request.catalog_name.to_string())
.await?;
}
if !self
.catalog_manager
.schema_exist(&request.catalog_name, &request.schema_name)
.await?
{
self.catalog_manager
.register_schema(RegisterSchemaRequest {
catalog: request.catalog_name.to_string(),
schema: request.schema_name.to_string(),
})
.await?;
}
let request = RegisterTableRequest {
catalog: request.catalog_name.to_string(),
schema: request.schema_name.to_string(),
table_name: request.table_name.to_string(),
table_id: request.table_id,
table,
};
self.catalog_manager.register_table(request).await
}
async fn open_region_inner(&self, engine: String, request: OpenTableRequest) -> Result<bool> {
let OpenTableRequest {
catalog_name,
@@ -187,14 +227,8 @@ impl OpenRegionHandler {
table_name: format_full_table_name(catalog_name, schema_name, table_name),
})?
{
let request = RegisterTableRequest {
catalog: request.catalog_name.clone(),
schema: request.schema_name.clone(),
table_name: request.table_name.clone(),
table_id: request.table_id,
table,
};
let result = self.catalog_manager.register_table(request).await;
let result = self.register_table(&request, table).await;
match result {
Ok(_) | Err(CatalogError::TableExists { .. }) => Ok(true),
e => e.with_context(|_| error::RegisterTableSnafu {

View File

@@ -365,6 +365,7 @@ mod test {
expr: Some(DdlExpr::CreateDatabase(CreateDatabaseExpr {
database_name: "my_database".to_string(),
create_if_not_exists: true,
options: Default::default(),
})),
});
let output = instance.do_query(query, QueryContext::arc()).await.unwrap();
@@ -418,6 +419,7 @@ mod test {
expr: Some(DdlExpr::CreateDatabase(CreateDatabaseExpr {
database_name: "my_database".to_string(),
create_if_not_exists: true,
options: Default::default(),
})),
});
let output = instance.do_query(query, QueryContext::arc()).await.unwrap();
@@ -485,6 +487,7 @@ mod test {
expr: Some(DdlExpr::CreateDatabase(CreateDatabaseExpr {
database_name: "my_database".to_string(),
create_if_not_exists: true,
options: Default::default(),
})),
});
let output = instance.do_query(query, QueryContext::arc()).await.unwrap();
@@ -589,6 +592,7 @@ mod test {
expr: Some(DdlExpr::CreateDatabase(CreateDatabaseExpr {
database_name: "my_database".to_string(),
create_if_not_exists: true,
options: Default::default(),
})),
});
let output = instance.do_query(query, QueryContext::arc()).await.unwrap();
@@ -661,6 +665,7 @@ mod test {
expr: Some(DdlExpr::CreateDatabase(CreateDatabaseExpr {
database_name: "my_database".to_string(),
create_if_not_exists: true,
options: Default::default(),
})),
});
let output = instance.do_query(query, QueryContext::arc()).await.unwrap();

View File

@@ -141,7 +141,8 @@ impl Instance {
let table_ref = TableReference::full(&catalog, &schema, &table);
let table = self.sql_handler.get_table(&table_ref).await?;
query::sql::show_create_table(table, None).context(ExecuteStatementSnafu)
query::sql::show_create_table(table, None, query_ctx.clone())
.context(ExecuteStatementSnafu)
}
Statement::TruncateTable(truncate_table) => {
let (catalog_name, schema_name, table_name) =

View File

@@ -14,15 +14,20 @@
use std::any::Any;
use std::collections::HashMap;
use std::sync::{Arc, Mutex};
use std::sync::{Arc, Mutex, RwLock};
use api::v1::region::QueryRequest;
use api::v1::region::{region_request, QueryRequest, RegionResponse};
use api::v1::{ResponseHeader, Status};
use arrow_flight::{FlightData, Ticket};
use async_trait::async_trait;
use bytes::Bytes;
use common_error::ext::BoxedError;
use common_error::status_code::StatusCode;
use common_query::logical_plan::Expr;
use common_query::physical_plan::DfPhysicalPlanAdapter;
use common_query::{DfPhysicalPlan, Output};
use common_recordbatch::SendableRecordBatchStream;
use common_runtime::Runtime;
use common_telemetry::info;
use dashmap::DashMap;
use datafusion::catalog::schema::SchemaProvider;
@@ -33,7 +38,12 @@ use datafusion::execution::context::SessionState;
use datafusion_common::DataFusionError;
use datafusion_expr::{Expr as DfExpr, TableType};
use datatypes::arrow::datatypes::SchemaRef;
use futures_util::future::try_join_all;
use prost::Message;
use query::QueryEngineRef;
use servers::error::{self as servers_error, ExecuteGrpcRequestSnafu, Result as ServerResult};
use servers::grpc::flight::{FlightCraft, FlightRecordBatchStream, TonicStream};
use servers::grpc::region_server::RegionServerHandler;
use session::context::QueryContext;
use snafu::{OptionExt, ResultExt};
use store_api::metadata::RegionMetadataRef;
@@ -42,31 +52,129 @@ use store_api::region_request::RegionRequest;
use store_api::storage::{RegionId, ScanRequest};
use substrait::{DFLogicalSubstraitConvertor, SubstraitPlan};
use table::table::scan::StreamScanAdapter;
use tonic::{Request, Response, Result as TonicResult};
use crate::error::{
DecodeLogicalPlanSnafu, ExecuteLogicalPlanSnafu, GetRegionMetadataSnafu,
HandleRegionRequestSnafu, RegionEngineNotFoundSnafu, RegionNotFoundSnafu, Result,
UnsupportedOutputSnafu,
BuildRegionRequestsSnafu, DecodeLogicalPlanSnafu, ExecuteLogicalPlanSnafu,
GetRegionMetadataSnafu, HandleRegionRequestSnafu, RegionEngineNotFoundSnafu,
RegionNotFoundSnafu, Result, UnsupportedOutputSnafu,
};
#[derive(Clone)]
pub struct RegionServer {
engines: HashMap<String, RegionEngineRef>,
region_map: DashMap<RegionId, RegionEngineRef>,
query_engine: QueryEngineRef,
inner: Arc<RegionServerInner>,
}
impl RegionServer {
pub fn new(query_engine: QueryEngineRef) -> Self {
pub fn new(query_engine: QueryEngineRef, runtime: Arc<Runtime>) -> Self {
Self {
engines: HashMap::new(),
region_map: DashMap::new(),
query_engine,
inner: Arc::new(RegionServerInner::new(query_engine, runtime)),
}
}
pub fn register_engine(&mut self, engine: RegionEngineRef) {
self.inner.register_engine(engine);
}
pub async fn handle_request(
&self,
region_id: RegionId,
request: RegionRequest,
) -> Result<Output> {
self.inner.handle_request(region_id, request).await
}
pub async fn handle_read(&self, request: QueryRequest) -> Result<SendableRecordBatchStream> {
self.inner.handle_read(request).await
}
}
#[async_trait]
impl RegionServerHandler for RegionServer {
async fn handle(&self, request: region_request::Body) -> ServerResult<RegionResponse> {
let requests = RegionRequest::try_from_request_body(request)
.context(BuildRegionRequestsSnafu)
.map_err(BoxedError::new)
.context(ExecuteGrpcRequestSnafu)?;
let join_tasks = requests.into_iter().map(|(region_id, req)| {
let self_to_move = self.clone();
self.inner
.runtime
.spawn(async move { self_to_move.handle_request(region_id, req).await })
});
let results = try_join_all(join_tasks)
.await
.context(servers_error::JoinTaskSnafu)?;
// merge results by simply sum up affected rows.
// only insert/delete will have multiple results.
let mut affected_rows = 0;
for result in results {
match result
.map_err(BoxedError::new)
.context(servers_error::ExecuteGrpcRequestSnafu)?
{
Output::AffectedRows(rows) => affected_rows += rows,
Output::Stream(_) | Output::RecordBatches(_) => {
// TODO: change the output type to contain only `affected_rows`
unreachable!()
}
}
}
Ok(RegionResponse {
header: Some(ResponseHeader {
status: Some(Status {
status_code: StatusCode::Success as _,
..Default::default()
}),
}),
affected_rows: affected_rows as _,
})
}
}
#[async_trait]
impl FlightCraft for RegionServer {
async fn do_get(
&self,
request: Request<Ticket>,
) -> TonicResult<Response<TonicStream<FlightData>>> {
let ticket = request.into_inner().ticket;
let request = QueryRequest::decode(ticket.as_ref())
.context(servers_error::InvalidFlightTicketSnafu)?;
let result = self.handle_read(request).await?;
let stream = Box::pin(FlightRecordBatchStream::new(result));
Ok(Response::new(stream))
}
}
struct RegionServerInner {
engines: RwLock<HashMap<String, RegionEngineRef>>,
region_map: DashMap<RegionId, RegionEngineRef>,
query_engine: QueryEngineRef,
runtime: Arc<Runtime>,
}
impl RegionServerInner {
pub fn new(query_engine: QueryEngineRef, runtime: Arc<Runtime>) -> Self {
Self {
engines: RwLock::new(HashMap::new()),
region_map: DashMap::new(),
query_engine,
runtime,
}
}
pub fn register_engine(&self, engine: RegionEngineRef) {
let engine_name = engine.name();
self.engines.insert(engine_name.to_string(), engine);
self.engines
.write()
.unwrap()
.insert(engine_name.to_string(), engine);
}
pub async fn handle_request(
@@ -80,7 +188,7 @@ impl RegionServer {
RegionRequest::Create(create) => RegionChange::Register(create.engine.clone()),
RegionRequest::Open(open) => RegionChange::Register(open.engine.clone()),
RegionRequest::Close(_) | RegionRequest::Drop(_) => RegionChange::Deregisters,
RegionRequest::Write(_)
RegionRequest::Put(_)
| RegionRequest::Delete(_)
| RegionRequest::Alter(_)
| RegionRequest::Flush(_)
@@ -90,6 +198,8 @@ impl RegionServer {
let engine = match &region_change {
RegionChange::Register(engine_type) => self
.engines
.read()
.unwrap()
.get(engine_type)
.with_context(|| RegionEngineNotFoundSnafu { name: engine_type })?
.clone(),
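The gRPC handler above splits one region request body into per-region requests, runs them concurrently on the server's runtime, and folds the results into a single affected-row count via `try_join_all`. A reduced sketch of that fan-out/merge step with toy request and output types (tokio and futures-util only):

use futures_util::future::try_join_all;

#[derive(Debug)]
enum Output {
    AffectedRows(usize),
}

async fn handle_one(_region_id: u64, rows: usize) -> Result<Output, String> {
    // Stand-in for handling a single region request.
    Ok(Output::AffectedRows(rows))
}

async fn handle_body(requests: Vec<(u64, usize)>) -> Result<usize, String> {
    // One task per region; a failed spawn aborts the whole call.
    let tasks = requests
        .into_iter()
        .map(|(region_id, rows)| tokio::spawn(handle_one(region_id, rows)));
    let results = try_join_all(tasks).await.map_err(|e| format!("join error: {e}"))?;

    // Merge results by summing affected rows, as in the handler above.
    let mut affected_rows = 0;
    for result in results {
        match result? {
            Output::AffectedRows(rows) => affected_rows += rows,
        }
    }
    Ok(affected_rows)
}

#[tokio::main]
async fn main() {
    let total = handle_body(vec![(1, 2), (2, 3), (3, 5)]).await.unwrap();
    assert_eq!(total, 10);
}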

View File

@@ -31,6 +31,7 @@ use crate::error::{
WaitForGrpcServingSnafu,
};
use crate::instance::InstanceRef;
use crate::region_server::RegionServer;
pub mod grpc;
@@ -42,6 +43,9 @@ pub struct Services {
impl Services {
pub async fn try_new(instance: InstanceRef, opts: &DatanodeOptions) -> Result<Self> {
// TODO(ruihang): remove database service once region server is ready.
let enable_region_server = option_env!("ENABLE_REGION_SERVER").is_some();
let grpc_runtime = Arc::new(
RuntimeBuilder::default()
.worker_threads(opts.rpc_runtime_size)
@@ -50,10 +54,24 @@ impl Services {
.context(RuntimeResourceSnafu)?,
);
let region_server = RegionServer::new(instance.query_engine(), grpc_runtime.clone());
let flight_handler = if enable_region_server {
Some(Arc::new(region_server.clone()) as _)
} else {
None
};
let region_server_handler = if enable_region_server {
Some(Arc::new(region_server.clone()) as _)
} else {
None
};
Ok(Self {
grpc_server: GrpcServer::new(
ServerGrpcQueryHandlerAdaptor::arc(instance),
None,
flight_handler,
region_server_handler,
None,
grpc_runtime,
),

View File

@@ -20,6 +20,7 @@ use api::v1::greptime_request::Request as GrpcRequest;
use api::v1::meta::HeartbeatResponse;
use api::v1::query_request::Query;
use api::v1::QueryRequest;
use catalog::local::MemoryCatalogManager;
use catalog::remote::region_alive_keeper::RegionAliveKeepers;
use catalog::CatalogManagerRef;
use common_meta::heartbeat::handler::{
@@ -160,8 +161,10 @@ async fn test_open_region_handler() {
let table_ident = &region_ident.table_ident;
let table = prepare_table(instance.inner()).await;
let dummy_catalog_manager = MemoryCatalogManager::with_default_setup();
region_alive_keepers
.register_table(table_ident.clone(), table)
.register_table(table_ident.clone(), table, dummy_catalog_manager)
.await
.unwrap();
@@ -173,14 +176,17 @@ async fn test_open_region_handler() {
InstructionReply::OpenRegion(SimpleReply { result: true, .. })
);
let keeper = region_alive_keepers.find_keeper(table_ident).await.unwrap();
let keeper = region_alive_keepers
.find_keeper(table_ident.table_id)
.await
.unwrap();
let deadline = keeper.deadline(0).await.unwrap();
assert!(deadline <= Instant::now() + Duration::from_secs(20));
// Opens a non-exist table
let non_exist_table_ident = TableIdent {
catalog: "greptime".to_string(),
schema: "public".to_string(),
catalog: "foo".to_string(),
schema: "non-exist".to_string(),
table: "non-exist".to_string(),
table_id: 2024,
engine: "mito".to_string(),
@@ -203,7 +209,7 @@ async fn test_open_region_handler() {
);
assert!(region_alive_keepers
.find_keeper(non_exist_table_ident.table_id)
.await
.is_none());
@@ -222,7 +228,7 @@ async fn test_open_region_handler() {
assert_test_table_not_found(instance.inner()).await;
assert!(region_alive_keepers
.find_keeper(table_ident.table_id)
.await
.is_none());

View File

@@ -115,6 +115,9 @@ pub enum Error {
#[snafu(display("Column {} already exists", column))]
DuplicateColumn { column: String, location: Location },
#[snafu(display("Failed to unpack value to given type: {}", reason))]
TryFromValue { reason: String, location: Location },
}
impl ErrorExt for Error {

View File

@@ -32,7 +32,7 @@ use serde::{Deserialize, Serialize};
use snafu::{ensure, ResultExt};
use crate::error;
use crate::error::{Error, Result, TryFromValueSnafu};
use crate::prelude::*;
use crate::type_id::LogicalTypeId;
use crate::types::{IntervalType, ListType};
@@ -441,6 +441,62 @@ impl Ord for Value {
}
}
macro_rules! impl_try_from_value {
($Variant: ident, $Type: ident) => {
impl TryFrom<Value> for $Type {
type Error = Error;
#[inline]
fn try_from(from: Value) -> std::result::Result<Self, Self::Error> {
match from {
Value::$Variant(v) => Ok(v.into()),
_ => TryFromValueSnafu {
reason: format!("{:?} is not a {}", from, stringify!($Type)),
}
.fail(),
}
}
}
impl TryFrom<Value> for Option<$Type> {
type Error = Error;
#[inline]
fn try_from(from: Value) -> std::result::Result<Self, Self::Error> {
match from {
Value::$Variant(v) => Ok(Some(v.into())),
Value::Null => Ok(None),
_ => TryFromValueSnafu {
reason: format!("{:?} is not a {}", from, stringify!($Type)),
}
.fail(),
}
}
}
};
}
impl_try_from_value!(Boolean, bool);
impl_try_from_value!(UInt8, u8);
impl_try_from_value!(UInt16, u16);
impl_try_from_value!(UInt32, u32);
impl_try_from_value!(UInt64, u64);
impl_try_from_value!(Int8, i8);
impl_try_from_value!(Int16, i16);
impl_try_from_value!(Int32, i32);
impl_try_from_value!(Int64, i64);
impl_try_from_value!(Float32, f32);
impl_try_from_value!(Float64, f64);
impl_try_from_value!(Float32, OrderedF32);
impl_try_from_value!(Float64, OrderedF64);
impl_try_from_value!(String, StringBytes);
impl_try_from_value!(Binary, Bytes);
impl_try_from_value!(Date, Date);
impl_try_from_value!(Time, Time);
impl_try_from_value!(DateTime, DateTime);
impl_try_from_value!(Timestamp, Timestamp);
impl_try_from_value!(Interval, Interval);
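// Illustrative sketch (not part of this change): how the conversions generated by
// `impl_try_from_value!` above are expected to behave. The `Option<T>` impl maps
// `Value::Null` to `None`, while a mismatched variant fails with `TryFromValue`.
fn _try_from_value_examples() {
    // A matching variant converts directly.
    let forty_two = i32::try_from(Value::Int32(42)).unwrap();
    assert_eq!(forty_two, 42);
    // `Value::Null` becomes `None` for the `Option` impls.
    let none = Option::<i64>::try_from(Value::Null).unwrap();
    assert!(none.is_none());
    // A mismatched variant produces a `TryFromValue` error.
    assert!(u8::try_from(Value::Boolean(true)).is_err());
}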
macro_rules! impl_value_from {
($Variant: ident, $Type: ident) => {
impl From<$Type> for Value {
@@ -471,6 +527,8 @@ impl_value_from!(Int32, i32);
impl_value_from!(Int64, i64);
impl_value_from!(Float32, f32);
impl_value_from!(Float64, f64);
impl_value_from!(Float32, OrderedF32);
impl_value_from!(Float64, OrderedF64);
impl_value_from!(String, StringBytes);
impl_value_from!(Binary, Bytes);
impl_value_from!(Date, Date);

View File

@@ -39,7 +39,8 @@ impl BooleanVector {
&self.array
}
/// Get the inner boolean array.
pub fn as_boolean_array(&self) -> &BooleanArray {
&self.array
}

View File

@@ -230,7 +230,8 @@ impl<T: LogicalPrimitiveType> PrimitiveVector<T> {
}
}
/// Get the inner arrow array.
pub fn as_arrow(&self) -> &PrimitiveArray<T::ArrowPrimitive> {
&self.array
}
@@ -245,7 +246,11 @@ impl<T: LogicalPrimitiveType> PrimitiveVector<T> {
}
// To distinguish with `Vector::slice()`.
/// Slice the batch, returning a new batch.
///
/// # Panics
/// This function panics if `offset + length > self.len()`.
pub fn get_slice(&self, offset: usize, length: usize) -> Self {
let data = self.array.to_data().slice(offset, length);
Self::from_array_data(data)
}
@@ -295,8 +300,7 @@ impl<T: LogicalPrimitiveType> Vector for PrimitiveVector<T> {
}
fn slice(&self, offset: usize, length: usize) -> VectorRef {
Arc::new(self.get_slice(offset, length))
}
fn get(&self, index: usize) -> Value {

src/flow/Cargo.toml Normal file
View File

@@ -0,0 +1,25 @@
[package]
name = "flow"
version = "0.1.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
# use versions from crates.io for now to prevent version skew
# disable default features (they include `abomonation`, which we don't need for IPC)
# timely = {version = "0.12.0", default-features = false, features = ["bincode"]}
# differential-dataflow = "0.12.0"
# timely = "0.12.0"
# differential-dataflow = "0.12.0"
# TODO(discord9): fork later for fixed version git dependency
timely = { git = "https://github.com/TimelyDataflow/timely-dataflow", default-features = false, features = [
"bincode",
] }
differential-dataflow = { git = "https://github.com/TimelyDataflow/differential-dataflow" } #, rev = "99fa67db" }
datafusion-expr.workspace = true
datafusion-substrait.workspace = true
serde = { version = "1.0", features = ["derive"] }
datatypes = { path = "../datatypes" }
common-telemetry = { path = "../common/telemetry" }

View File

@@ -0,0 +1,3 @@
//! For getting data from sources and sending results to sinks,
//! communicating with other parts of the database,
//! and commanding the storage and computation layers.

View File

@@ -0,0 +1,22 @@
use std::collections::BTreeMap;
use crate::expr::GlobalId;
/// Worker-local state that is maintained across dataflows.
///
/// This state is restricted to the COMPUTE state, the deterministic, idempotent work
/// done between data ingress and egress.
pub struct ComputeState {
/// State kept for each installed compute collection.
///
/// Each collection has exactly one frontier.
/// How the frontier is communicated depends on the collection type:
/// * Frontiers of indexes are equal to the frontier of their corresponding traces in the
/// `TraceManager`.
/// * Persist sinks store their current frontier in `CollectionState::sink_write_frontier`.
/// * Subscribes report their frontiers through the `subscribe_response_buffer`.
pub collections: BTreeMap<GlobalId, CollectionState>,
}
/// State maintained for a compute collection.
pub struct CollectionState {}

View File

@@ -0,0 +1,743 @@
use std::collections::BTreeMap;
use differential_dataflow::lattice::Lattice;
use differential_dataflow::operators::arrange::Arranged;
use differential_dataflow::trace::wrappers::enter::TraceEnter;
use differential_dataflow::trace::wrappers::frontier::TraceFrontier;
use differential_dataflow::trace::{BatchReader, Cursor, TraceReader};
use differential_dataflow::{Collection, Data};
use timely::communication::message::RefOrMut;
use timely::dataflow::operators::generic::OutputHandle;
use timely::dataflow::operators::Capability;
use timely::dataflow::scopes::Child;
use timely::dataflow::{Scope, ScopeParent};
use timely::progress::timestamp::Refines;
use timely::progress::{Antichain, Timestamp};
use super::plan::Plan;
use super::types::DataflowDescription;
use crate::compute::render::RenderTimestamp;
use crate::compute::typedefs::{TraceErrHandle, TraceRowHandle};
use crate::expr::{GlobalId, Id, MapFilterProject, ScalarExpr};
use crate::repr;
use crate::repr::{Diff, Row};
use crate::storage::errors::DataflowError;
// Local type definition to avoid the horror in signatures.
pub(crate) type KeyArrangement<S, K, V> =
Arranged<S, TraceRowHandle<K, V, <S as ScopeParent>::Timestamp, Diff>>;
pub(crate) type Arrangement<S, V> = KeyArrangement<S, V, V>;
pub(crate) type ErrArrangement<S> =
Arranged<S, TraceErrHandle<DataflowError, <S as ScopeParent>::Timestamp, Diff>>;
pub(crate) type ArrangementImport<S, V, T> = Arranged<
S,
TraceEnter<TraceFrontier<TraceRowHandle<V, V, T, Diff>>, <S as ScopeParent>::Timestamp>,
>;
pub(crate) type ErrArrangementImport<S, T> = Arranged<
S,
TraceEnter<
TraceFrontier<TraceErrHandle<DataflowError, T, Diff>>,
<S as ScopeParent>::Timestamp,
>,
>;
/// Describes flavor of arrangement: local or imported trace.
#[derive(Clone)]
pub enum ArrangementFlavor<S: Scope, V: Data, T = repr::Timestamp>
where
T: Timestamp + Lattice,
S::Timestamp: Lattice + Refines<T>,
{
/// A dataflow-local arrangement.
Local(Arrangement<S, V>, ErrArrangement<S>),
/// An imported trace from outside the dataflow.
///
/// The `GlobalId` identifier exists so that exports of this same trace
/// can refer back to and depend on the original instance.
Trace(
GlobalId,
ArrangementImport<S, V, T>,
ErrArrangementImport<S, T>,
),
}
impl<S: Scope, T> ArrangementFlavor<S, Row, T>
where
T: Timestamp + Lattice,
S::Timestamp: Lattice + Refines<T>,
{
/// Presents `self` as a stream of updates.
///
/// This method presents the contents as they are, without further computation.
/// If you have logic that could be applied to each record, consider using the
/// `flat_map` method, which allows this and can reduce the work done.
pub fn as_collection(&self) -> (Collection<S, Row, Diff>, Collection<S, DataflowError, Diff>) {
match &self {
ArrangementFlavor::Local(oks, errs) => (
oks.as_collection(move |k: &Row, v: &Row| {
// type annotated because rust-analyzer can't infer the type of these complex closures;
// see https://github.com/rust-lang/rust-analyzer/issues/6338
let mut k = k.clone();
k.extend(v.clone().into_iter());
k
}),
errs.as_collection(|k, &()| k.clone()),
),
ArrangementFlavor::Trace(_, oks, errs) => (
oks.as_collection(move |k, v| {
let mut k = k.clone();
k.extend(v.clone().into_iter());
k
}),
errs.as_collection(|k, &()| k.clone()),
),
}
}
/// Constructs and applies logic to elements of `self` and returns the results.
///
/// `constructor` takes a permutation and produces the logic to apply on elements. The logic
/// conceptually receives `(&Row, &Row)` pairs in the form of a slice. Only after borrowing
/// the elements and applying the permutation will the datums be in the expected order.
///
/// If `key` is set, this is a promise that `logic` will produce no results on
/// records for which the key does not evaluate to the value. This is used to
/// leap directly to exactly those records.
pub fn flat_map<I, C, L>(
&self,
key: Option<Row>,
constructor: C,
) -> (
timely::dataflow::Stream<S, I::Item>,
Collection<S, DataflowError, Diff>,
)
where
I: IntoIterator,
I::Item: Data,
C: FnOnce() -> L,
L: for<'a, 'b> FnMut(&'a [&'b RefOrMut<'b, Row>], &'a S::Timestamp, &'a Diff) -> I
+ 'static,
{
// Set a number of tuples after which the operator should yield.
// This allows us to remain responsive even when enumerating a substantial
// arrangement, as well as provides time to accumulate our produced output.
let refuel = 1000000;
match &self {
ArrangementFlavor::Local(oks, errs) => {
let mut logic = constructor();
let oks = CollectionBundle::<S, Row, T>::flat_map_core(
oks,
key,
move |k, v, t, d| logic(&[&k, &v], t, d),
refuel,
);
let errs = errs.as_collection(|k, &()| k.clone());
(oks, errs)
}
ArrangementFlavor::Trace(_, oks, errs) => {
let mut logic = constructor();
let oks = CollectionBundle::<S, Row, T>::flat_map_core(
oks,
key,
move |k, v, t, d| logic(&[&k, &v], t, d),
refuel,
);
let errs = errs.as_collection(|k, &()| k.clone());
(oks, errs)
}
}
}
}
impl<S: Scope, V: Data, T> ArrangementFlavor<S, V, T>
where
T: Timestamp + Lattice,
S::Timestamp: Lattice + Refines<T>,
{
pub fn scope(&self) -> S {
match self {
ArrangementFlavor::Local(oks, _errs) => oks.stream.scope(),
ArrangementFlavor::Trace(_gid, oks, _errs) => oks.stream.scope(),
}
}
/// Brings the arrangement flavor into a region.
pub fn enter_region<'a>(
&self,
region: &Child<'a, S, S::Timestamp>,
) -> ArrangementFlavor<Child<'a, S, S::Timestamp>, V, T> {
match self {
ArrangementFlavor::Local(oks, errs) => {
ArrangementFlavor::Local(oks.enter_region(region), errs.enter_region(region))
}
ArrangementFlavor::Trace(gid, oks, errs) => {
ArrangementFlavor::Trace(*gid, oks.enter_region(region), errs.enter_region(region))
}
}
}
}
impl<'a, S: Scope, V: Data, T> ArrangementFlavor<Child<'a, S, S::Timestamp>, V, T>
where
T: Timestamp + Lattice,
S::Timestamp: Lattice + Refines<T>,
{
/// Extracts the arrangement flavor from a region.
pub fn leave_region(&self) -> ArrangementFlavor<S, V, T> {
match self {
ArrangementFlavor::Local(oks, errs) => {
ArrangementFlavor::Local(oks.leave_region(), errs.leave_region())
}
ArrangementFlavor::Trace(gid, oks, errs) => {
ArrangementFlavor::Trace(*gid, oks.leave_region(), errs.leave_region())
}
}
}
}
pub struct Context<S, V: Data, T = repr::Timestamp>
where
T: Timestamp + Lattice,
S: Scope,
S::Timestamp: Lattice + Refines<T>,
{
/// The scope within which all managed collections exist.
///
/// It is an error to add any collections not contained in this scope.
pub(crate) scope: S,
/// The debug name of the dataflow associated with this context.
pub debug_name: String,
/// The Timely ID of the dataflow associated with this context.
pub dataflow_id: usize,
/// Frontier before which updates should not be emitted.
///
/// We *must* apply it to sinks, to ensure correct outputs.
/// We *should* apply it to sources and imported traces, because it improves performance.
pub since_frontier: Antichain<T>,
/// Frontier after which updates should not be emitted.
/// Used to limit the amount of work done when appropriate.
pub until_frontier: Antichain<T>,
/// Bindings of identifiers to collections.
pub bindings: BTreeMap<Id, CollectionBundle<S, V, T>>,
}
impl<S: Scope, V: Data> Context<S, V>
where
S::Timestamp: Lattice + Refines<repr::Timestamp>,
{
/// TODO(discord9): DataflowDesc & Plan & etc.
/// Creates a new empty Context from given dataflow
pub fn for_dataflow_in<Plan>(dataflow: &DataflowDescription<Plan, ()>, scope: S) -> Self {
let dataflow_id = scope.addr()[0];
let since_frontier = dataflow
.as_of
.clone()
.unwrap_or_else(|| Antichain::from_elem(Timestamp::minimum()));
// TODO(discord9): get since_frontier and until_frontier from dataflow_desc
Self {
scope,
debug_name: dataflow.debug_name.clone(),
dataflow_id,
since_frontier,
until_frontier: dataflow.until.clone(),
bindings: BTreeMap::new(),
}
}
}
impl<S: Scope, V: Data, T: Lattice> Context<S, V, T>
where
T: Timestamp + Lattice,
S::Timestamp: Lattice + Refines<T>,
{
/// Insert a collection bundle by an identifier.
///
/// This is expected to be used to install external collections (sources, indexes, other views),
/// as well as for `Let` bindings of local collections.
pub fn insert_id(
&mut self,
id: Id,
collection: CollectionBundle<S, V, T>,
) -> Option<CollectionBundle<S, V, T>> {
self.bindings.insert(id, collection)
}
/// Remove a collection bundle by an identifier.
///
/// The primary use of this method is uninstalling `Let` bindings.
pub fn remove_id(&mut self, id: Id) -> Option<CollectionBundle<S, V, T>> {
self.bindings.remove(&id)
}
/// Melds a collection bundle to whatever exists.
#[allow(clippy::map_entry)]
pub fn update_id(&mut self, id: Id, collection: CollectionBundle<S, V, T>) {
if !self.bindings.contains_key(&id) {
self.bindings.insert(id, collection);
} else {
let binding = self
.bindings
.get_mut(&id)
.expect("Binding verified to exist");
if collection.collection.is_some() {
binding.collection = collection.collection;
}
for (key, flavor) in collection.arranged.into_iter() {
binding.arranged.insert(key, flavor);
}
}
}
/// Look up a collection bundle by an identifier.
pub fn lookup_id(&self, id: Id) -> Option<CollectionBundle<S, V, T>> {
self.bindings.get(&id).cloned()
}
}
type ResultCollection<S, V> = (Collection<S, V, Diff>, Collection<S, DataflowError, Diff>);
/// A bundle of the various ways a collection can be represented.
///
/// This type maintains the invariant that it does contain at least one valid
/// source of data, either a collection or at least one arrangement.
#[derive(Clone)]
pub struct CollectionBundle<S, V, T = repr::Timestamp>
where
T: Timestamp + Lattice,
S: Scope,
S::Timestamp: Lattice + Refines<T>,
V: Data,
{
pub(crate) collection: Option<ResultCollection<S, V>>,
/// TODO(discord9): impl: 1. ScalarExpr(Could be from substrait), 2. Arrangement
pub(crate) arranged: BTreeMap<Vec<ScalarExpr>, ArrangementFlavor<S, V, T>>,
}
impl<S: Scope, V: Data, T: Lattice> CollectionBundle<S, V, T>
where
T: Timestamp + Lattice,
S::Timestamp: Lattice + Refines<T>,
{
/// Construct a new collection bundle from update streams.
pub fn from_collections(
oks: Collection<S, V, Diff>,
errs: Collection<S, DataflowError, Diff>,
) -> Self {
Self {
collection: Some((oks, errs)),
arranged: BTreeMap::default(),
}
}
/// Inserts arrangements by the expressions on which they are keyed.
pub fn from_expressions(
exprs: Vec<ScalarExpr>,
arrangements: ArrangementFlavor<S, V, T>,
) -> Self {
let mut arranged = BTreeMap::new();
arranged.insert(exprs, arrangements);
Self {
collection: None,
arranged,
}
}
/// Inserts arrangements by the columns on which they are keyed.
pub fn from_columns<I: IntoIterator<Item = usize>>(
columns: I,
arrangements: ArrangementFlavor<S, V, T>,
) -> Self {
let mut keys = Vec::new();
for column in columns {
keys.push(ScalarExpr::Column(column));
}
Self::from_expressions(keys, arrangements)
}
/// The scope containing the collection bundle.
pub fn scope(&self) -> S {
if let Some((oks, _errs)) = &self.collection {
oks.inner.scope()
} else {
self.arranged
.values()
.next()
.expect("Must contain a valid collection")
.scope()
}
}
/// Brings the collection bundle into a region.
pub fn enter_region<'a>(
&self,
region: &Child<'a, S, S::Timestamp>,
) -> CollectionBundle<Child<'a, S, S::Timestamp>, V, T> {
CollectionBundle {
collection: self
.collection
.as_ref()
.map(|(oks, errs)| (oks.enter_region(region), errs.enter_region(region))),
arranged: self
.arranged
.iter()
.map(|(key, bundle)| (key.clone(), bundle.enter_region(region)))
.collect(),
}
}
}
impl<S, T> CollectionBundle<S, repr::Row, T>
where
T: timely::progress::Timestamp + Lattice,
S: Scope,
S::Timestamp: Refines<T> + Lattice + RenderTimestamp,
{
/// Presents `self` as a stream of updates, having been subjected to `mfp`.
///
/// This operator is able to apply the logic of `mfp` early, which can substantially
/// reduce the amount of data produced when `mfp` is non-trivial.
///
/// The `key_val` argument, when present, indicates that a specific arrangement should
/// be used, and if, in addition, the `val` component is present,
/// that we can seek to the supplied row.
pub fn as_collection_core(
&self,
mut mfp: MapFilterProject,
key_val: Option<(Vec<ScalarExpr>, Option<Row>)>,
until: Antichain<repr::Timestamp>,
) -> (Collection<S, Row, Diff>, Collection<S, DataflowError, Diff>) {
mfp.optimize();
let mfp_plan = mfp.into_plan().unwrap();
// If the MFP is trivial, we can just call `as_collection`.
// In the case that we weren't going to apply the `key_val` optimization,
// this path results in a slightly smaller and faster
// dataflow graph.
let has_key_val = matches!(&key_val, Some((_key, Some(_val))));
if mfp_plan.is_identity() && !has_key_val {
let key = key_val.map(|(k, _v)| k);
return self.as_specific_collection(key.as_deref());
}
let (stream, errors) = self.flat_map(key_val, || {
let until = std::rc::Rc::new(until);
// this logic gets executed every time a new row arrives
move |row_parts, time, diff| {
let until = std::rc::Rc::clone(&until);
let row_iters = row_parts
.iter()
.flat_map(|row| (**row).to_owned().into_iter());
let mut datums_local = Vec::new();
datums_local.extend(row_iters);
let time = time.clone();
let event_time: repr::Timestamp = *time.clone().event_time();
mfp_plan
.evaluate::<DataflowError, _>(
&mut datums_local,
event_time,
*diff,
move |time| !until.less_equal(time),
)
.map(move |x| match x {
Ok((row, event_time, diff)) => {
// Copy the whole time, and re-populate event time.
let mut time: S::Timestamp = time.clone();
*time.event_time() = event_time;
Ok((row, time, diff))
}
Err((e, event_time, diff)) => {
// Copy the whole time, and re-populate event time.
let mut time: S::Timestamp = time.clone();
*time.event_time() = event_time;
Err((e, time, diff))
}
})
}
});
use timely::dataflow::operators::ok_err::OkErr;
let (oks, errs) = stream.ok_err(|x| x);
use differential_dataflow::AsCollection;
let oks = oks.as_collection();
let errs = errs.as_collection();
(oks, errors.concat(&errs))
}
}
impl<'a, S: Scope, V: Data, T> CollectionBundle<Child<'a, S, S::Timestamp>, V, T>
where
T: Timestamp + Lattice,
S::Timestamp: Lattice + Refines<T>,
{
/// Extracts the collection bundle from a region.
pub fn leave_region(&self) -> CollectionBundle<S, V, T> {
CollectionBundle {
collection: self
.collection
.as_ref()
.map(|(oks, errs)| (oks.leave_region(), errs.leave_region())),
arranged: self
.arranged
.iter()
.map(|(key, bundle)| (key.clone(), bundle.leave_region()))
.collect(),
}
}
}
impl<S: Scope, T: Lattice> CollectionBundle<S, Row, T>
where
T: Timestamp + Lattice,
S::Timestamp: Lattice + Refines<T>,
{
/// Asserts that the arrangement for a specific key
/// (or the raw collection for no key) exists,
/// and returns the corresponding collection.
///
/// This returns the collection as-is, without
/// doing any unthinning transformation.
/// Therefore, it should be used when the appropriate transformation
/// was planned as part of a following MFP.
pub fn as_specific_collection(
&self,
key: Option<&[ScalarExpr]>,
) -> (Collection<S, Row, Diff>, Collection<S, DataflowError, Diff>) {
// Any operator that uses this method was told to use a particular
// collection during LIR planning, where we should have made
// sure that that collection exists.
//
// If it doesn't, we panic.
match key {
None => self
.collection
.clone()
.expect("The unarranged collection doesn't exist."),
Some(key) => self
.arranged
.get(key)
.unwrap_or_else(|| panic!("The collection arranged by {:?} doesn't exist.", key))
.as_collection(),
}
}
/// Constructs and applies logic to elements of a collection and returns the results.
///
/// `constructor` takes a permutation and produces the logic to apply on elements. The logic
/// conceptually receives `(&Row, &Row)` pairs in the form of a slice. Only after borrowing
/// the elements and applying the permutation will the datums be in the expected order.
///
/// If `key_val` is set, this is a promise that `logic` will produce no results on
/// records for which the key does not evaluate to the value. This is used when we
/// have an arrangement by that key to leap directly to exactly those records.
/// It is important that `logic` still guard against data that does not satisfy
/// this constraint, as this method does not statically know that it will have
/// that arrangement.
pub fn flat_map<I, C, L>(
&self,
key_val: Option<(Vec<ScalarExpr>, Option<Row>)>,
constructor: C,
) -> (
timely::dataflow::Stream<S, I::Item>,
Collection<S, DataflowError, Diff>,
)
where
I: IntoIterator,
I::Item: Data,
C: FnOnce() -> L,
L: for<'a, 'b> FnMut(&'a [&'b RefOrMut<'b, Row>], &'a S::Timestamp, &'a Diff) -> I
+ 'static,
{
// If `key_val` is set, we should use the corresponding arrangement.
// If there isn't one, that implies an error in the contract between
// key-production and available arrangements.
if let Some((key, val)) = key_val {
let flavor = self
.arrangement(&key)
.expect("Should have ensured during planning that this arrangement exists.");
flavor.flat_map(val, constructor)
} else {
use timely::dataflow::operators::Map;
let (oks, errs) = self
.collection
.clone()
.expect("Invariant violated: CollectionBundle contains no collection.");
let mut logic = constructor();
(
oks.inner
.flat_map(move |(mut v, t, d)| logic(&[&RefOrMut::Mut(&mut v)], &t, &d)),
errs,
)
}
}
/// Factored out common logic for using literal keys in general traces.
///
/// This logic is sufficiently interesting that we want to write it only
/// once, and thereby avoid any skew in the two uses of the logic.
///
/// The function presents the contents of the trace as `(key, value, time, delta)` tuples,
/// where key and value are rows.
fn flat_map_core<Tr, I, L>(
trace: &Arranged<S, Tr>,
key: Option<Row>,
mut logic: L,
refuel: usize,
) -> timely::dataflow::Stream<S, I::Item>
where
Tr: TraceReader<Key = Row, Val = Row, Time = S::Timestamp, R = repr::Diff>
+ Clone
+ 'static,
I: IntoIterator,
I::Item: Data,
L: for<'a, 'b> FnMut(
RefOrMut<'b, Row>,
RefOrMut<'b, Row>,
&'a S::Timestamp,
&'a repr::Diff,
) -> I
+ 'static,
{
let mode = if key.is_some() { "index" } else { "scan" };
let name = format!("ArrangementFlatMap({})", mode);
use timely::dataflow::channels::pact::Pipeline;
use timely::dataflow::operators::Operator;
trace.stream.unary(Pipeline, &name, move |_, info| {
// Acquire an activator to reschedule the operator when it has unfinished work.
use timely::scheduling::Activator;
let activations = trace.stream.scope().activations();
let activator = Activator::new(&info.address[..], activations);
// Maintain a list of work to do, cursor to navigate and process.
let mut todo = std::collections::VecDeque::new();
move |input, output| {
// First, dequeue all batches.
input.for_each(|time, data| {
let capability = time.retain();
for batch in data.iter() {
// enqueue a capability, cursor, and batch.
todo.push_back(PendingWork::new(
capability.clone(),
batch.cursor(),
batch.clone(),
));
}
});
// Second, make progress on `todo`.
let mut fuel = refuel;
while !todo.is_empty() && fuel > 0 {
todo.front_mut()
.unwrap()
.do_work(&key, &mut logic, &mut fuel, output);
if fuel > 0 {
todo.pop_front();
}
}
// If we have not finished all work, re-activate the operator.
if !todo.is_empty() {
activator.activate();
}
}
})
}
/// Look up an arrangement by the expressions that form the key.
///
/// The result may be `None` if no such arrangement exists, or it may be one of many
/// "arrangement flavors" that represent the types of arranged data we might have.
pub fn arrangement(&self, key: &[ScalarExpr]) -> Option<ArrangementFlavor<S, Row, T>> {
self.arranged.get(key).cloned()
}
}
struct PendingWork<C>
where
C: Cursor,
C::Time: Timestamp,
{
capability: Capability<C::Time>,
cursor: C,
batch: C::Storage,
}
/// Handle specialized to `Vec`-based container.
type PendingOutputHandle<'a, C, I> = OutputHandle<
'a,
<C as Cursor>::Time,
<I as IntoIterator>::Item,
timely::dataflow::channels::pushers::Tee<<C as Cursor>::Time, <I as IntoIterator>::Item>,
>;
impl<C: Cursor> PendingWork<C>
where
C::Key: PartialEq,
C::Time: Timestamp,
{
/// Create a new bundle of pending work, from the capability, cursor, and backing storage.
fn new(capability: Capability<C::Time>, cursor: C, batch: C::Storage) -> Self {
Self {
capability,
cursor,
batch,
}
}
/// Perform roughly `fuel` work through the cursor, applying `logic` and sending results to `output`.
fn do_work<I, L>(
&mut self,
key: &Option<C::Key>,
logic: &mut L,
fuel: &mut usize,
output: &mut PendingOutputHandle<'_, C, I>,
) where
I: IntoIterator,
I::Item: Data,
L: for<'a, 'b> FnMut(
RefOrMut<'b, C::Key>,
RefOrMut<'b, C::Val>,
&'a C::Time,
&'a C::R,
) -> I
+ 'static,
{
// Attempt to make progress on this batch.
let mut work: usize = 0;
let mut session = output.session(&self.capability);
if let Some(key) = key {
if self.cursor.get_key(&self.batch) != Some(key) {
self.cursor.seek_key(&self.batch, key);
}
if self.cursor.get_key(&self.batch) == Some(key) {
while let Some(val) = self.cursor.get_val(&self.batch) {
self.cursor.map_times(&self.batch, |time, diff| {
for datum in logic(RefOrMut::Ref(key), RefOrMut::Ref(val), time, diff) {
session.give(datum);
work += 1;
}
});
self.cursor.step_val(&self.batch);
if work >= *fuel {
*fuel = 0;
return;
}
}
}
} else {
while let Some(key) = self.cursor.get_key(&self.batch) {
while let Some(val) = self.cursor.get_val(&self.batch) {
self.cursor.map_times(&self.batch, |time, diff| {
for datum in logic(RefOrMut::Ref(key), RefOrMut::Ref(val), time, diff) {
session.give(datum);
work += 1;
}
});
self.cursor.step_val(&self.batch);
if work >= *fuel {
*fuel = 0;
return;
}
}
self.cursor.step_key(&self.batch);
}
}
*fuel -= work;
}
}

View File

@@ -0,0 +1,15 @@
//! For generating a dataflow from a logical plan and computing that dataflow.
mod compute_state;
mod context;
mod plan;
mod render;
mod typedefs;
mod types;
pub use context::Context;
// TODO(discord9): make a simplified version of source/sink
// sink: simply get rows out of sinked collection/err collection and put it somewhere
// (R, T, D) row of course with since/until frontier to limit
// source: simply insert stuff into it

View File

@@ -0,0 +1,10 @@
use serde::{Deserialize, Serialize};
/// A delta query is implemented by a set of paths, one for each input.
///
/// Each delta query path responds to its input changes by repeated lookups
/// in arrangements for other join inputs. These lookups require specific
/// instructions about which expressions to use as keys. Along the way,
/// various closures are applied to filter and project as early as possible.
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
pub struct DeltaJoinPlan {}

View File

@@ -0,0 +1,9 @@
use serde::{Deserialize, Serialize};
/// TODO(discord9): impl Join
/// A plan for the execution of a linear join.
///
/// A linear join is a sequence of stages, each of which introduces
/// a new collection. Each stage is represented by a [LinearStagePlan].
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
pub struct LinearJoinPlan {}

View File

@@ -0,0 +1,15 @@
use serde::{Deserialize, Serialize};
mod delta_join;
mod linear_join;
pub use delta_join::DeltaJoinPlan;
pub use linear_join::LinearJoinPlan;
/// TODO(discord9): impl Join
/// A complete enumeration of possible join plans to render.
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
pub enum JoinPlan {
/// A join implemented by a linear join.
Linear(LinearJoinPlan),
/// A join implemented by a delta join.
Delta(DeltaJoinPlan),
}

View File

@@ -0,0 +1,222 @@
mod join;
mod reduce;
use std::collections::BTreeMap;
use join::JoinPlan;
pub(crate) use reduce::{
convert_indexes_to_skips, AccumulablePlan, BucketedPlan, KeyValPlan, ReducePlan,
};
use serde::{Deserialize, Serialize};
use crate::expr::{Id, LocalId, MapFilterProject, ScalarExpr, TableFunc};
use crate::repr::{self, Diff, Row};
use crate::storage::errors::EvalError;
/// The forms in which an operator's output is available;
/// it can be considered the plan-time equivalent of
/// `render::context::CollectionBundle`.
///
/// These forms are either "raw", representing an unarranged collection,
/// or "arranged", representing one that has been arranged by some key.
///
/// The raw collection, if it exists, may be consumed directly.
///
/// The arranged collections are slightly more complicated:
/// Each key here is attached to a description of how the corresponding
/// arrangement is permuted to remove value columns
/// that are redundant with key columns. Thus, the first element in each
/// tuple of `arranged` is the arrangement key; the second is the map of
/// logical output columns to columns in the key or value of the deduplicated
/// representation, and the third is a "thinning expression",
/// or list of columns to include in the value
/// when arranging.
///
/// For example, assume a 5-column collection is to be arranged by the key
/// `[Column(2), Column(0) + Column(3), Column(1)]`.
/// Then `Column(1)` and `Column(2)` in the value are redundant with the key, and
/// only columns 0, 3, and 4 need to be stored separately.
/// The thinning expression will then be `[0, 3, 4]`.
///
/// The permutation represents how to recover the
/// original values (logically `[Column(0), Column(1), Column(2), Column(3), Column(4)]`)
/// from the key and value of the arrangement, logically
/// `[Column(2), Column(0) + Column(3), Column(1), Column(0), Column(3), Column(4)]`.
/// Thus, the permutation in this case should be `{0: 3, 1: 2, 2: 0, 3: 4, 4: 5}`.
///
/// Note that this description, while true at the time of writing, is merely illustrative;
/// users of this struct should not rely on the exact strategy used for generating
/// the permutations. As long as clients apply the thinning expression
/// when creating arrangements, and permute by the hashmap when reading them,
/// the contract of the function where they are generated (`expr::permutation_for_arrangement`)
/// ensures that the correct values will be read.
#[derive(Default, Clone, Debug, Deserialize, Serialize, PartialEq, Eq)]
pub struct AvailableCollections {
/// Whether the collection exists in unarranged form.
pub raw: bool,
/// The set of arrangements of the collection, along with a
/// column permutation mapping
pub arranged: Vec<KeyWithColumnPermutation>,
}
pub type KeyWithColumnPermutation = (Vec<ScalarExpr>, BTreeMap<usize, usize>, Vec<usize>);
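// Illustrative sketch (not part of this change): the 5-column example from the comment
// above expressed as a `KeyWithColumnPermutation`. `BinaryFunc::AddInt64` is assumed to
// exist for the `Column(0) + Column(3)` key expression; the permutation and thinning
// values are taken verbatim from the doc comment.
fn _example_key_with_permutation() -> KeyWithColumnPermutation {
    let key = vec![
        ScalarExpr::Column(2),
        ScalarExpr::CallBinary {
            func: crate::expr::BinaryFunc::AddInt64,
            expr1: Box::new(ScalarExpr::Column(0)),
            expr2: Box::new(ScalarExpr::Column(3)),
        },
        ScalarExpr::Column(1),
    ];
    // How to recover logical columns 0..5 from the concatenated (key, thinned value).
    let permutation: BTreeMap<usize, usize> =
        [(0, 3), (1, 2), (2, 0), (3, 4), (4, 5)].into_iter().collect();
    // Only columns 0, 3 and 4 need to be stored in the value.
    let thinning = vec![0, 3, 4];
    (key, permutation, thinning)
}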
impl AvailableCollections {
/// Represent a collection that has no arrangements.
pub fn new_raw() -> Self {
Self {
raw: true,
arranged: vec![],
}
}
/// Represent a collection that is arranged in the
/// specified ways.
pub fn new_arranged(arranged: Vec<KeyWithColumnPermutation>) -> Self {
assert!(
!arranged.is_empty(),
"Invariant violated: at least one collection must exist"
);
Self {
raw: false,
arranged,
}
}
}
/// Rendering Plan
///
/// TODO(discord9): see if we ever need to support recursive plans
#[derive(Debug, Clone)]
pub enum Plan<T = repr::Timestamp> {
/// A collection containing a pre-determined collection.
Constant {
rows: Result<Vec<(Row, T, Diff)>, EvalError>,
},
/// A reference to a bound collection.
///
/// This is commonly either an external reference to an existing source or
/// maintained arrangement, or an internal reference to a `Let` identifier.
Get {
id: Id,
keys: AvailableCollections,
plan: GetPlan,
},
/// Binds `value` to `id`, and then results in `body` with that binding.
///
/// This stage has the effect of sharing `value` across multiple possible
/// uses in `body`, and is the only mechanism we have for sharing collection
/// information across parts of a dataflow.
///
/// The binding is not available outside of `body`.
Let {
/// The local identifier to be used, available to `body` as `Id::Local(id)`.
id: LocalId,
/// The collection that should be bound to `id`.
value: Box<Plan<T>>,
/// The collection that results, which is allowed to contain `Get` stages
/// that reference `Id::Local(id)`.
body: Box<Plan<T>>,
},
/// Map, Filter, and Project operators.
///
/// This stage contains work that we would ideally like to fuse to other plan
/// stages, but for practical reasons cannot. For example: reduce, threshold,
/// and topk stages are not able to absorb this operator.
Mfp {
/// The input collection.
input: Box<Plan<T>>,
/// Linear operator to apply to each record.
mfp: MapFilterProject,
/// Whether the input is from an arrangement, and if so,
/// whether we can seek to a specific value therein
input_key_val: Option<(Vec<ScalarExpr>, Option<Row>)>,
},
/// A variable number of output records for each input record.
///
/// This stage is a bit of a catch-all for logic that does not easily fit in
/// map stages. This includes table valued functions, but also functions of
/// multiple arguments, and functions that modify the sign of updates.
///
/// This stage allows a `MapFilterProject` operator to be fused to its output,
/// and this can be very important as otherwise the output of `func` is just
/// appended to the input record, for as many outputs as it has. This has the
/// unpleasant default behavior of repeating potentially large records that
/// are being unpacked, producing quadratic output in those cases. Instead,
/// in these cases use a `mfp` member that projects away these large fields.
FlatMap {
/// The input collection.
input: Box<Plan<T>>,
/// The variable-record emitting function.
func: TableFunc,
/// Expressions that for each row prepare the arguments to `func`.
exprs: Vec<ScalarExpr>,
/// Linear operator to apply to each record produced by `func`.
mfp: MapFilterProject,
/// The particular arrangement of the input we expect to use,
/// if any
input_key: Option<Vec<ScalarExpr>>,
},
/// A multiway relational equijoin, with fused map, filter, and projection.
///
/// This stage performs a multiway join among `inputs`, using the equality
/// constraints expressed in `plan`. The plan also describes the implementation
/// strategy we will use, and any pushed down per-record work.
Join {
/// An ordered list of inputs that will be joined.
inputs: Vec<Plan<T>>,
/// Detailed information about the implementation of the join.
///
/// This includes information about the implementation strategy, but also
/// any map, filter, project work that we might follow the join with, but
/// potentially pushed down into the implementation of the join.
plan: JoinPlan,
},
/// Aggregation by key.
Reduce {
/// The input collection.
input: Box<Plan<T>>,
/// A plan for changing input records into key, value pairs.
key_val_plan: KeyValPlan,
/// A plan for performing the reduce.
///
/// The implementation of reduction has several different strategies based
/// on the properties of the reduction, and the input itself. Please check
/// out the documentation for this type for more detail.
plan: ReducePlan,
/// The particular arrangement of the input we expect to use,
/// if any
input_key: Option<Vec<ScalarExpr>>,
},
}
/// TODO(discord9): impl GetPlan
#[derive(Debug, Clone)]
pub enum GetPlan {
/// Simply pass input arrangements on to the next stage.
PassArrangements,
/// Using the supplied key, optionally seek the row, and apply the MFP.
Arrangement(Vec<ScalarExpr>, Option<Row>, MapFilterProject),
/// Scan the input collection (unarranged) and apply the MFP.
Collection(MapFilterProject),
}
/// Returns bucket sizes, descending, suitable for hierarchical decomposition of an operator, based
/// on the expected number of rows that will have the same group key.
fn bucketing_of_expected_group_size(expected_group_size: Option<u64>) -> Vec<u64> {
let mut buckets = vec![];
let mut current = 16;
// Plan for 4B records in the expected case if the user didn't specify a group size.
let limit = expected_group_size.unwrap_or(4_000_000_000);
// Distribute buckets in powers of 16, so that we can strike a balance between how many inputs
// each layer gets from the preceding layer, while also limiting the number of layers.
while current < limit {
buckets.push(current);
current = current.saturating_mul(16);
}
buckets.reverse();
buckets
}
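// A quick worked example of the bucketing above (sketch, not part of this change): with
// the default 4B expected group size the loop pushes 16^1 through 16^7 and then reverses,
// so the buckets come out in descending order.
#[test]
fn _bucketing_of_expected_group_size_examples() {
    assert_eq!(
        bucketing_of_expected_group_size(None),
        vec![
            268_435_456, // 16^7
            16_777_216,  // 16^6
            1_048_576,   // 16^5
            65_536,      // 16^4
            4_096,       // 16^3
            256,         // 16^2
            16,          // 16^1
        ]
    );
    // A small expected group size only needs a couple of layers.
    assert_eq!(bucketing_of_expected_group_size(Some(1_000)), vec![256, 16]);
}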

View File

@@ -0,0 +1,233 @@
use serde::{Deserialize, Serialize};
use crate::expr::{AggregateExpr, AggregateFunc, MapFilterProject, SafeMfpPlan};
/// This enum represents the three potential types of aggregations.
#[derive(Copy, Clone, Debug, Deserialize, Eq, Hash, Ord, PartialEq, PartialOrd, Serialize)]
pub enum ReductionType {
/// Accumulable functions can be subtracted from (are invertible), and associative.
/// We can compute these results by moving some data to the diff field under arbitrary
/// changes to inputs. Examples include sum or count.
Accumulable,
/// Hierarchical functions are associative, which means we can split up the work of
/// computing them across subsets. Note that hierarchical reductions should also
/// reduce the data in some way, as otherwise rendering them hierarchically is not
/// worth it. Examples include min or max.
Hierarchical,
/// Basic, for lack of a better word, are functions that are neither accumulable
/// nor hierarchical. Examples include jsonb_agg.
Basic,
}
/// Plan for extracting keys and values in preparation for a reduction.
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
pub struct KeyValPlan {
/// Extracts the columns used as the key.
pub key_plan: SafeMfpPlan,
/// Extracts the columns used to feed the aggregations.
pub val_plan: SafeMfpPlan,
}
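// Illustrative sketch (not part of this change), mirroring how the render tests below
// build their plans: a `KeyValPlan` that groups a 3-column input by its third column
// and feeds the first two columns to the aggregations.
fn _example_key_val_plan() -> KeyValPlan {
    KeyValPlan {
        // The group key is column 2 of the 3-column input row.
        key_plan: SafeMfpPlan {
            mfp: MapFilterProject::new(3).project([2]),
        },
        // The values fed to the aggregations are columns 0 and 1.
        val_plan: SafeMfpPlan {
            mfp: MapFilterProject::new(3).project([0, 1]),
        },
    }
}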
/// Transforms a vector containing indexes of needed columns into one containing
/// the "skips" an iterator over a Row would need to perform to see those values.
///
/// This function requires that all of the elements in `indexes` are strictly
/// increasing.
///
/// # Examples
///
/// ```
/// assert_eq!(convert_indexes_to_skips(vec![3, 6, 10, 15]), [3, 2, 3, 4])
/// ```
pub fn convert_indexes_to_skips(mut indexes: Vec<usize>) -> Vec<usize> {
for i in 1..indexes.len() {
assert!(
indexes[i - 1] < indexes[i],
"convert_indexes_to_skips needs indexes to be strictly increasing. Received: {:?}",
indexes,
);
}
for i in (1..indexes.len()).rev() {
indexes[i] -= indexes[i - 1];
indexes[i] -= 1;
}
indexes
}
/// A `ReducePlan` provides a concise description for how we will
/// execute a given reduce expression.
///
/// The provided reduce expression can have no
/// aggregations, in which case it's just a `Distinct`, and otherwise
/// it's composed of a combination of accumulable, hierarchical and
/// basic aggregations.
///
/// We want to try to centralize as much decision making about the
/// shape / general computation of the rendered dataflow graph
/// in this plan, and then make actually rendering the graph
/// be as simple (and compiler verifiable) as possible.
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
pub enum ReducePlan {
/// Plan for not computing any aggregations, just determining the set of
/// distinct keys.
Distinct,
/// Plan for computing only accumulable aggregations.
Accumulable(AccumulablePlan),
/// Plan for computing only hierarchical aggregations.
Hierarchical(HierarchicalPlan),
/// Plan for computing only basic aggregations.
Basic(BasicPlan),
/// Plan for computing a mix of different kinds of aggregations.
/// We need to do extra work here to reassemble results back in the
/// requested order.
Collation(CollationPlan),
}
/// Plan for computing a set of accumulable aggregations.
///
/// We fuse all of the accumulable aggregations together
/// and compute them with one dataflow fragment. We need to
/// be careful to separate out the aggregations that
/// apply only to the distinct set of values. We need
/// to apply a distinct operator to those before we
/// combine them with everything else.
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
pub struct AccumulablePlan {
/// All of the aggregations we were asked to compute, stored
/// in order.
pub full_aggrs: Vec<AggregateExpr>,
/// All of the non-distinct accumulable aggregates.
/// Each element represents:
/// (index of the aggregation among accumulable aggregations,
/// index of the datum among inputs, aggregation expr)
/// These will all be rendered together in one dataflow fragment.
pub simple_aggrs: Vec<(usize, usize, AggregateExpr)>,
/// Same as above but for all of the `DISTINCT` accumulable aggregations.
pub distinct_aggrs: Vec<(usize, usize, AggregateExpr)>,
}
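// Illustrative sketch (not part of this change): an `AccumulablePlan` for something like
// `SELECT SUM(col0), COUNT(col1)` with no DISTINCT aggregations. Assumes `AggregateExpr`
// implements `Clone`; the field values are purely for illustration.
fn _example_accumulable_plan() -> AccumulablePlan {
    let sum_col0 = AggregateExpr {
        func: AggregateFunc::SumInt64,
        expr: crate::expr::ScalarExpr::Column(0),
        distinct: false,
    };
    let count_col1 = AggregateExpr {
        func: AggregateFunc::Count,
        expr: crate::expr::ScalarExpr::Column(1),
        distinct: false,
    };
    AccumulablePlan {
        full_aggrs: vec![sum_col0.clone(), count_col1.clone()],
        // (index among accumulable aggregations, index of the input datum, expression)
        simple_aggrs: vec![(0, 0, sum_col0), (1, 1, count_col1)],
        // No DISTINCT accumulable aggregations in this example.
        distinct_aggrs: vec![],
    }
}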
// TODO(discord9): others
/// Plan for computing a set of hierarchical aggregations.
///
/// In the append-only setting we can render them in-place
/// with monotonic plans, but otherwise, we need to render
/// them with a reduction tree that splits the inputs into
/// small, and then progressively larger, buckets
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
pub enum HierarchicalPlan {
/// Plan hierarchical aggregations under monotonic inputs.
Monotonic(MonotonicPlan),
/// Plan for hierarchical aggregations under non-monotonic inputs.
Bucketed(BucketedPlan),
}
/// Plan for computing a set of hierarchical aggregations with a
/// monotonic input.
///
/// Here, the aggregations will be rendered in place. We don't
/// need to worry about retractions because the inputs are
/// append only, so we can change our computation to
/// only retain the "best" value in the diff field, instead
/// of holding onto all values.
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
pub struct MonotonicPlan {
/// All of the aggregations we were asked to compute.
pub aggr_funcs: Vec<AggregateFunc>,
/// Set of "skips" or calls to `nth()` an iterator needs to do over
/// the input to extract the relevant datums.
pub skips: Vec<usize>,
/// True if the input is logically but not physically monotonic,
/// and the operator must first consolidate the inputs to remove
/// potential negations.
pub must_consolidate: bool,
}
/// Plan for computing a set of hierarchical aggregations
/// with non-monotonic inputs.
///
/// To perform hierarchical aggregations with stable runtimes
/// under updates we'll subdivide the group key into buckets, compute
/// the reduction in each of those subdivided buckets and then combine
/// the results into a coarser bucket (one that represents a larger
/// fraction of the original input) and redo the reduction in another
/// layer. Effectively, we'll construct a min / max heap out of a series
/// of reduce operators (each one is a separate layer).
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
pub struct BucketedPlan {
/// All of the aggregations we were asked to compute.
pub aggr_funcs: Vec<AggregateFunc>,
/// Set of "skips" or calls to `nth()` an iterator needs to do over
/// the input to extract the relevant datums.
pub skips: Vec<usize>,
/// The number of buckets in each layer of the reduction tree. Should
/// be decreasing, and ideally, a power of two so that we can easily
/// distribute values to buckets with `value.hashed() % buckets[layer]`.
pub buckets: Vec<u64>,
}
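// Illustrative sketch (not part of this change): a `BucketedPlan` for a MIN aggregation
// whose relevant datum sits at column 2, with buckets sized for roughly one million rows
// per group (descending powers of 16 below that limit).
fn _example_bucketed_plan() -> BucketedPlan {
    BucketedPlan {
        aggr_funcs: vec![AggregateFunc::MinFloat64],
        // Skip two datums of each input row to reach the aggregated column.
        skips: vec![2],
        buckets: vec![65_536, 4_096, 256, 16],
    }
}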
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
pub enum BasicPlan {}
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
pub struct CollationPlan {}
/// Determines whether a function can be accumulated in an update's "difference" field,
/// and whether it can be subjected to recursive (hierarchical) aggregation.
///
/// Accumulable aggregations will be packed into differential dataflow's "difference" field,
/// which can be accumulated in-place using the addition operation on the type. Aggregations
/// that indicate they are accumulable will still need to provide an action that takes their
/// data and introduces it as a difference, and the post-processing when the accumulated value
/// is presented as data.
///
/// Hierarchical aggregations will be subjected to repeated aggregation on initially small but
/// increasingly large subsets of each key. This has the intended property that no invocation
/// is on a significantly large set of values (and so, no incremental update needs to reform
/// significant input data). Hierarchical aggregates can be rendered more efficiently if the
/// input stream is append-only as then we only need to retain the "currently winning" value.
/// Every hierarchical aggregate needs to supply a corresponding ReductionMonoid implementation.
fn reduction_type(func: &AggregateFunc) -> ReductionType {
match func {
AggregateFunc::SumInt16
| AggregateFunc::SumInt32
| AggregateFunc::SumInt64
| AggregateFunc::SumUInt16
| AggregateFunc::SumUInt32
| AggregateFunc::SumUInt64
| AggregateFunc::SumFloat32
| AggregateFunc::SumFloat64
| AggregateFunc::Count
| AggregateFunc::Any
| AggregateFunc::All => ReductionType::Accumulable,
AggregateFunc::MaxInt16
| AggregateFunc::MaxInt32
| AggregateFunc::MaxInt64
| AggregateFunc::MaxUInt16
| AggregateFunc::MaxUInt32
| AggregateFunc::MaxUInt64
| AggregateFunc::MaxFloat32
| AggregateFunc::MaxFloat64
| AggregateFunc::MaxBool
| AggregateFunc::MaxString
| AggregateFunc::MaxDate
| AggregateFunc::MaxTimestamp
| AggregateFunc::MaxTimestampTz
| AggregateFunc::MinInt16
| AggregateFunc::MinInt32
| AggregateFunc::MinInt64
| AggregateFunc::MinUInt16
| AggregateFunc::MinUInt32
| AggregateFunc::MinUInt64
| AggregateFunc::MinFloat32
| AggregateFunc::MinFloat64
| AggregateFunc::MinBool
| AggregateFunc::MinString
| AggregateFunc::MinDate
| AggregateFunc::MinTimestamp
| AggregateFunc::MinTimestampTz => ReductionType::Hierarchical,
_ => ReductionType::Basic,
}
}
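// Small usage sketch (not part of this change) of the classification above, relying on
// the `PartialEq`/`Debug` derives on `ReductionType`.
#[test]
fn _reduction_type_examples() {
    // Sums and counts can be accumulated directly in the diff field.
    assert_eq!(
        reduction_type(&AggregateFunc::SumFloat64),
        ReductionType::Accumulable
    );
    // Min/max are reduced hierarchically over progressively larger buckets.
    assert_eq!(
        reduction_type(&AggregateFunc::MaxInt64),
        ReductionType::Hierarchical
    );
}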

View File

@@ -0,0 +1,60 @@
use std::hash::Hash;
use differential_dataflow::ExchangeData;
use crate::repr::Row;
/// Used to make possibly-validating code generic: think of this as a kind of `MaybeResult`,
/// specialized for use in compute. Validation code will only run when the error constructor is
/// Some.
pub(super) trait MaybeValidatingRow<T, E>: ExchangeData + Hash {
fn ok(t: T) -> Self;
fn into_error() -> Option<fn(E) -> Self>;
}
impl<E> MaybeValidatingRow<Row, E> for Row {
fn ok(t: Row) -> Self {
t
}
fn into_error() -> Option<fn(E) -> Self> {
None
}
}
impl<E> MaybeValidatingRow<(), E> for () {
fn ok(t: ()) -> Self {
t
}
fn into_error() -> Option<fn(E) -> Self> {
None
}
}
impl<E, R> MaybeValidatingRow<Vec<R>, E> for Vec<R>
where
R: ExchangeData + Hash,
{
fn ok(t: Vec<R>) -> Self {
t
}
fn into_error() -> Option<fn(E) -> Self> {
None
}
}
impl<T, E> MaybeValidatingRow<T, E> for Result<T, E>
where
T: ExchangeData + Hash,
E: ExchangeData + Hash,
{
fn ok(row: T) -> Self {
Ok(row)
}
fn into_error() -> Option<fn(E) -> Self> {
Some(Err)
}
}
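// Hedged sketch (not from this change) of how rendering code could use the trait: always
// emit the row, but only materialize an error when the output type can actually carry one
// (`into_error()` is `None` for plain rows, so validation is skipped there).
fn _validate_or_pass<R, E>(row: Row, error: Option<E>) -> Option<R>
where
    R: MaybeValidatingRow<Row, E>,
{
    match error {
        None => Some(R::ok(row)),
        Some(err) => R::into_error().map(|wrap| wrap(err)),
    }
}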

View File

@@ -0,0 +1,626 @@
//! For building the flow graph from a `Plan`.
//! This is basically the last step before actually running the flow graph.
use differential_dataflow::lattice::Lattice;
use differential_dataflow::AsCollection;
use timely::communication::Allocate;
use timely::dataflow::operators::capture::Extract;
use timely::dataflow::operators::{Capture, ToStream};
use timely::dataflow::Scope;
use timely::progress::timestamp::Refines;
use timely::progress::Timestamp;
use timely::worker::Worker as TimelyWorker;
use super::types::DataflowDescription;
use crate::compute::compute_state::ComputeState;
use crate::compute::context::CollectionBundle;
use crate::compute::plan::Plan;
use crate::compute::types::BuildDesc;
use crate::compute::Context;
use crate::expr::Id;
use crate::repr::{self, Row};
use crate::storage::errors::DataflowError;
mod error;
mod reduce;
/// Assemble the "compute" side of a dataflow, i.e. all but the sources.
///
/// This method imports sources from provided assets, and then builds the remaining
/// dataflow using "compute-local" assets like shared arrangements, and producing
/// both arrangements and sinks.
pub fn build_compute_dataflow<A: Allocate>(
timely_worker: &mut TimelyWorker<A>,
compute_state: &mut ComputeState,
dataflow: DataflowDescription<Plan, ()>,
) {
todo!()
}
pub trait RenderTimestamp: Timestamp + Lattice + Refines<repr::Timestamp> {
/// The system timestamp component of the timestamp.
///
/// This is useful for manipulating the system time, as when delaying
/// updates for subsequent cancellation, as with monotonic reduction.
fn system_time(&mut self) -> &mut repr::Timestamp;
/// Effects a system delay in terms of the timestamp summary.
fn system_delay(delay: repr::Timestamp) -> <Self as Timestamp>::Summary;
/// The event timestamp component of the timestamp.
fn event_time(&mut self) -> &mut repr::Timestamp;
/// Effects an event delay in terms of the timestamp summary.
fn event_delay(delay: repr::Timestamp) -> <Self as Timestamp>::Summary;
/// Steps the timestamp back so that logical compaction to the output will
/// not conflate `self` with any historical times.
fn step_back(&self) -> Self;
}
impl RenderTimestamp for repr::Timestamp {
fn system_time(&mut self) -> &mut repr::Timestamp {
self
}
fn system_delay(delay: repr::Timestamp) -> <Self as Timestamp>::Summary {
delay
}
fn event_time(&mut self) -> &mut repr::Timestamp {
self
}
fn event_delay(delay: repr::Timestamp) -> <Self as Timestamp>::Summary {
delay
}
fn step_back(&self) -> Self {
self.saturating_sub(1)
}
}
// This implementation block allows child timestamps to vary from parent timestamps.
impl<G> Context<G, Row>
where
G: Scope,
G::Timestamp: RenderTimestamp,
{
/// render plan and insert into context with given GlobalId
pub(crate) fn build_object(&mut self, object: BuildDesc<Plan>) {
// First, transform the relation expression into a render plan.
let bundle = self.render_plan(object.plan);
self.insert_id(Id::Global(object.id), bundle);
}
}
impl<S> Context<S, Row>
where
S: Scope,
S::Timestamp: RenderTimestamp,
{
/// Renders a plan to a differential dataflow, producing the collection of results.
///
/// The return type reflects the uncertainty about the data representation, perhaps
/// as a stream of data, perhaps as an arrangement, perhaps as a stream of batches.
pub fn render_plan(&mut self, plan: Plan) -> CollectionBundle<S, Row> {
match plan {
Plan::Constant { rows } => {
let (rows, errs) = match rows {
Ok(rows) => (rows, Vec::new()),
Err(err) => (Vec::new(), vec![err]),
};
let since_frontier = self.since_frontier.clone();
let until = self.until_frontier.clone();
let ok_collection = rows
.into_iter()
.filter_map(move |(row, mut time, diff)| {
time.advance_by(since_frontier.borrow());
if !until.less_equal(&time) {
Some((
row,
<S::Timestamp as Refines<repr::Timestamp>>::to_inner(time),
diff,
))
} else {
None
}
})
.to_stream(&mut self.scope)
.as_collection();
let mut error_time: repr::Timestamp = Timestamp::minimum();
error_time.advance_by(self.since_frontier.borrow());
let err_collection = errs
.into_iter()
.map(move |e| {
(
DataflowError::from(e),
<S::Timestamp as Refines<repr::Timestamp>>::to_inner(error_time),
1,
)
})
.to_stream(&mut self.scope)
.as_collection();
CollectionBundle::from_collections(ok_collection, err_collection)
}
Plan::Get { id, keys, plan } => {
// Recover the collection from `self` and then apply `mfp` to it.
// If `mfp` happens to be trivial, we can just return the collection.
let mut collection = self
.lookup_id(id)
.unwrap_or_else(|| panic!("Get({:?}) not found at render time", id));
match plan {
crate::compute::plan::GetPlan::PassArrangements => {
// Assert that each of `keys` are present in `collection`.
if !keys
.arranged
.iter()
.all(|(key, _, _)| collection.arranged.contains_key(key))
{
let not_included: Vec<_> = keys
.arranged
.iter()
.filter(|(key, _, _)| !collection.arranged.contains_key(key))
.map(|(key, _, _)| key)
.collect();
panic!(
"Keys {:?} are not included in the collection's keys: {:?}",
not_included,
collection.arranged.keys().cloned().collect::<Vec<_>>()
);
}
assert!(keys.raw <= collection.collection.is_some());
// Retain only those keys we want to import.
collection.arranged.retain(|key, _val| {
keys.arranged.iter().any(|(key2, _, _)| key2 == key)
});
collection
}
crate::compute::plan::GetPlan::Arrangement(key, row, mfp) => {
let (oks, errs) = collection.as_collection_core(
mfp,
Some((key, row)),
self.until_frontier.clone(),
);
CollectionBundle::from_collections(oks, errs)
}
crate::compute::plan::GetPlan::Collection(mfp) => {
let (oks, errs) =
collection.as_collection_core(mfp, None, self.until_frontier.clone());
CollectionBundle::from_collections(oks, errs)
}
}
}
Plan::Let { id, value, body } => {
// Render `value` and bind it to `id`. Complain if this shadows an id.
let value = self.render_plan(*value);
let prebound = self.insert_id(Id::Local(id), value);
assert!(prebound.is_none());
let body = self.render_plan(*body);
self.remove_id(Id::Local(id));
body
}
Plan::Mfp {
input,
mfp,
input_key_val,
} => {
let input = self.render_plan(*input);
// If `mfp` is non-trivial, we should apply it and produce a collection.
if mfp.is_identity() {
input
} else {
let (oks, errs) =
input.as_collection_core(mfp, input_key_val, self.until_frontier.clone());
CollectionBundle::from_collections(oks, errs)
}
}
Plan::Reduce {
input,
key_val_plan,
plan,
input_key,
} => {
let input = self.render_plan(*input);
self.render_reduce(input, key_val_plan, plan, input_key)
}
_ => todo!("To be implemented"),
}
}
}
#[cfg(test)]
mod test {
use std::any::Any;
use std::collections::{BTreeMap, BTreeSet};
use std::rc::Rc;
use datatypes::prelude::ConcreteDataType;
use datatypes::value::Value;
use differential_dataflow::input::{Input, InputSession};
use differential_dataflow::Collection;
use timely::dataflow::scopes::Child;
use timely::dataflow::Stream;
use timely::Config;
use super::*;
use crate::compute::plan::{
AccumulablePlan, AvailableCollections, GetPlan, KeyValPlan, ReducePlan,
};
use crate::expr::{
AggregateExpr, BinaryFunc, GlobalId, LocalId, MapFilterProject, SafeMfpPlan, ScalarExpr,
UnaryFunc,
};
use crate::repr::Diff;
type OkStream<G> = Stream<G, (Row, repr::Timestamp, Diff)>;
type ErrStream<G> = Stream<G, (DataflowError, repr::Timestamp, Diff)>;
type OkCollection<G> = Collection<G, Row, Diff>;
type ErrCollection<G> = Collection<G, DataflowError, Diff>;
/// used as a token to prevent certain resources from being dropped
type AnyToken = Rc<dyn Any>;
struct MockSourceToken {
handle: InputSession<repr::Timestamp, Row, Diff>,
err_handle: InputSession<repr::Timestamp, DataflowError, Diff>,
}
fn mock_input_session(input: &mut InputSession<repr::Timestamp, Row, Diff>, cnt: i64) {
// TODO: mock a cpu usage monotonic input with timestamp
// cpu, mem, ts
// f32, f32, DateTime
let schema = [
ConcreteDataType::float32_datatype(),
ConcreteDataType::float32_datatype(),
ConcreteDataType::datetime_datatype(),
];
let arrs = (0..cnt).map(|i| (i as f32 / cnt as f32, i as f32 / cnt as f32, i));
// TODO: need a better mechanism to turn `ts` into a proper dataflow timestamp here
for (cpu, mem, ts) in arrs {
input.update(
Row::pack(vec![cpu.into(), mem.into(), Value::DateTime(ts.into())]),
1,
);
input.advance_to(ts as u64)
}
input.flush();
}
// A helper that builds the given dataflow and runs it on mocked input to check it executes.
fn exec_dataflow(
input_id: Vec<Id>,
dataflow: DataflowDescription<Plan>,
sink_ids: Vec<GlobalId>,
output_keys: Vec<Option<Vec<ScalarExpr>>>,
input_mock_length: i64,
) {
timely::execute(Config::thread(), move |worker| {
println!("worker: {:?}", worker.index());
let mut input = InputSession::<repr::Timestamp, Row, Diff>::new();
worker.dataflow_named(
"ProofOfConcept",
|scope: &mut Child<'_, _, repr::Timestamp>| {
let mut test_ctx =
Context::<_, Row, _>::for_dataflow_in(&dataflow, scope.clone());
let ok_collection = input.to_collection(scope);
let (err_handle, err_collection) = scope.new_collection();
let input_collection =
CollectionBundle::<_, _, repr::Timestamp>::from_collections(
ok_collection,
err_collection,
);
// TODO: generate `import_sources` from `dataflow.source_imports`
let import_sources: Vec<_> = input_id
.clone()
.into_iter()
.zip(vec![input_collection])
.collect();
// import sources
for (id, collection) in import_sources {
test_ctx.insert_id(id, collection);
}
for build_desc in &dataflow.objects_to_build {
test_ctx.build_object(build_desc.clone());
}
dbg!(test_ctx.bindings.keys());
// TODO: export sinks
for (sink, output_key) in sink_ids.iter().zip(output_keys.iter()) {
let sink = *sink;
println!("Inspecting sink {:?}", sink.clone());
let inspect = test_ctx.lookup_id(Id::Global(sink)).unwrap();
dbg!(inspect.collection.is_some());
dbg!(inspect.arranged.keys());
let inspect = inspect.as_specific_collection(output_key.as_deref());
inspect
.0
.inspect(move |x| println!("inspect {:?} {:?}", sink.clone(), x));
}
},
);
mock_input_session(&mut input, input_mock_length);
})
.expect("Computation terminated abnormally");
}
#[test]
fn test_simple_poc_reduce_group_by() {
// 1. build dataflow with input collection connected
// 2. give input
// type annotations are needed to prevent rust-analyzer from giving up on type deduction
// the dataflow description is given directly here for simplicity;
// later it will be built from dataflow information sent by other nodes
// the key is the third column (the bucketed ts)
let place_holder =
ScalarExpr::Literal(Ok(Value::Boolean(true)), ConcreteDataType::int64_datatype());
let count_col = |i: usize| AggregateExpr {
func: crate::expr::AggregateFunc::Count,
expr: ScalarExpr::Column(i),
distinct: false,
};
let sum_col = |i: usize| AggregateExpr {
func: crate::expr::AggregateFunc::SumFloat32,
expr: ScalarExpr::Column(i),
distinct: false,
};
// roughly equivalent to `SELECT ts/300 AS minute, AVG(cpu), AVG(mem) FROM input GROUP BY minute;`
// (the literal divisor below is 5 rather than 300, to keep the mock input small)
// cpu, mem, ts
// --map--> cpu, mem, ts/300
// --reduce--> ts/300, AVG(cpu), AVG(mem)
let cast_datetime = ScalarExpr::CallUnary {
func: UnaryFunc::CastDatetimeToInt64,
expr: Box::new(ScalarExpr::Column(2)),
};
let ts_div_5 = ScalarExpr::CallBinary {
func: BinaryFunc::DivInt64,
expr1: Box::new(cast_datetime),
expr2: Box::new(ScalarExpr::Literal(
Ok(Value::Int64(5.into())),
ConcreteDataType::int64_datatype(),
)),
};
let cast_int64_to_float32 = |i: usize| ScalarExpr::CallUnary {
func: UnaryFunc::CastInt64ToFloat32,
expr: Box::new(ScalarExpr::Column(i)),
};
let reduce_group_by_window = vec![
// cpu, mem, ts
// --reduce--> ts/300, SUM(cpu), SUM(mem), COUNT(cpu), COUNT(mem)
// -- map --> ts/300, AVG(cpu), AVG(mem)
BuildDesc {
id: GlobalId::User(0),
plan: Plan::Reduce {
input: Box::new(Plan::Get {
id: Id::Global(GlobalId::System(0)),
keys: AvailableCollections::new_raw(),
plan: GetPlan::Collection(
MapFilterProject::new(3).map([ts_div_5]).project([0, 1, 3]),
),
}),
key_val_plan: KeyValPlan {
key_plan: SafeMfpPlan {
mfp: MapFilterProject::new(3).project([2]),
},
val_plan: SafeMfpPlan {
mfp: MapFilterProject::new(3).project([0, 1]),
},
},
// --reduce--> ts/300(key), SUM(cpu), SUM(mem), COUNT(cpu), COUNT(mem)
plan: ReducePlan::Accumulable(AccumulablePlan {
full_aggrs: vec![sum_col(0), sum_col(1), count_col(0), count_col(1)],
simple_aggrs: vec![
(0, 0, sum_col(0)),
(1, 1, sum_col(1)),
(2, 0, count_col(0)),
(3, 1, count_col(1)),
],
distinct_aggrs: vec![],
}),
input_key: None,
},
},
// 0 1 2 3 4
// ts/300(key), SUM(cpu), SUM(mem), COUNT(cpu), COUNT(mem),
// -- map --> ts/300(key), AVG(cpu), AVG(mem)
BuildDesc {
id: GlobalId::User(1),
plan: Plan::Get {
id: Id::Global(GlobalId::User(0)),
// not used since plan is GetPlan::Arrangement
keys: AvailableCollections::new_raw(),
plan: GetPlan::Arrangement(
vec![ScalarExpr::Column(0)],
None,
MapFilterProject::new(5)
.map([
ScalarExpr::CallBinary {
func: BinaryFunc::DivFloat32,
expr1: Box::new(ScalarExpr::Column(1)),
expr2: Box::new(cast_int64_to_float32(3)),
},
ScalarExpr::CallBinary {
func: BinaryFunc::DivFloat32,
expr1: Box::new(ScalarExpr::Column(2)),
expr2: Box::new(cast_int64_to_float32(4)),
},
])
.project([0, 5, 6]),
),
},
},
];
let input_id = vec![Id::Global(GlobalId::System(0))];
let dataflow = {
let mut dataflow = DataflowDescription::<Plan, ()>::new("test".to_string());
dataflow.objects_to_build = reduce_group_by_window;
dataflow
};
let sink_ids = [GlobalId::User(0), GlobalId::User(1)];
exec_dataflow(
input_id.clone(),
dataflow.clone(),
sink_ids.to_vec(),
vec![Some(vec![ScalarExpr::Column(0)]), None],
10,
);
}
#[test]
fn test_simple_poc_reduce_count() {
// 1. build dataflow with input collection connected
// 2. give input
// type annotations are needed to prevent rust-analyzer from giving up on type deduction
// the dataflow description is given directly here for simplicity;
// later it will be built from dataflow information sent by other nodes
// the key is a constant placeholder, so every row falls into a single group
let place_holder =
ScalarExpr::Literal(Ok(Value::Boolean(true)), ConcreteDataType::int64_datatype());
let key_plan = SafeMfpPlan {
mfp: MapFilterProject::new(3)
.map([place_holder.clone()])
.project([3]),
};
let val_plan = SafeMfpPlan {
mfp: MapFilterProject::new(3).project([0, 1, 2]),
};
let count = AggregateExpr {
func: crate::expr::AggregateFunc::Count,
expr: place_holder,
distinct: false,
};
// equivalent to `SELECT COUNT(*) FROM input;`
let reduce_count_plan = vec![
// count(true)
BuildDesc {
id: GlobalId::User(0),
plan: Plan::Reduce {
input: Box::new(Plan::Get {
id: Id::Global(GlobalId::System(0)),
keys: AvailableCollections::new_raw(),
plan: GetPlan::Collection(MapFilterProject::new(3)),
}),
key_val_plan: KeyValPlan { key_plan, val_plan },
plan: ReducePlan::Accumulable(AccumulablePlan {
full_aggrs: vec![count.clone()],
simple_aggrs: vec![(0, 0, count)],
distinct_aggrs: vec![],
}),
input_key: None,
},
},
// project out just the count (column 1 of the arrangement: key, count)
BuildDesc {
id: GlobalId::User(1),
plan: Plan::Get {
id: Id::Global(GlobalId::User(0)),
// not used since plan is GetPlan::Arrangement
keys: AvailableCollections::new_raw(),
plan: GetPlan::Arrangement(
vec![ScalarExpr::Column(0)],
None,
MapFilterProject::new(2).project([1]),
),
},
},
];
let input_id = vec![Id::Global(GlobalId::System(0))];
let dataflow = {
let mut dataflow = DataflowDescription::<Plan, ()>::new("test".to_string());
dataflow.objects_to_build = reduce_count_plan;
dataflow
};
let sink_ids = [GlobalId::User(1)];
exec_dataflow(
input_id.clone(),
dataflow.clone(),
sink_ids.to_vec(),
vec![None],
10,
);
}
#[test]
fn test_simple_poc_reduce_distinct() {
// 1. build dataflow with input collection connected
// 2. give input
// type annotations are needed to prevent rust-analyzer from giving up on type deduction
// the dataflow description is given directly here for simplicity;
// later it will be built from dataflow information sent by other nodes
// windowing needs `date_trunc`, which is still WIP
// the key is the third column (ts)
let key_plan = SafeMfpPlan {
mfp: MapFilterProject::new(3).project([2]),
};
let val_plan = SafeMfpPlan {
mfp: MapFilterProject::new(3).project([0, 1]),
};
// roughly equivalent to `SELECT DISTINCT ts FROM input;` (ReducePlan::Distinct keeps one row per key)
let reduce_plan = vec![BuildDesc {
id: GlobalId::User(0),
plan: Plan::Reduce {
input: Box::new(Plan::Get {
id: Id::Global(GlobalId::System(0)),
keys: AvailableCollections::new_raw(),
plan: GetPlan::Collection(MapFilterProject::new(3)),
}),
key_val_plan: KeyValPlan { key_plan, val_plan },
plan: ReducePlan::Distinct,
input_key: None,
},
}];
let input_id = vec![Id::Global(GlobalId::System(0))];
let dataflow = {
let mut dataflow = DataflowDescription::<Plan, ()>::new("test".to_string());
dataflow.objects_to_build = reduce_plan;
dataflow
};
let sink_ids = [GlobalId::User(0)];
exec_dataflow(
input_id.clone(),
dataflow.clone(),
sink_ids.to_vec(),
vec![Some(vec![ScalarExpr::Column(0)])],
10,
);
}
#[test]
#[allow(clippy::print_stdout)]
fn test_constant_plan_render() {
let build_descs = vec![BuildDesc {
id: GlobalId::User(0),
plan: Plan::Constant {
rows: Ok(vec![(Row::default(), 0, 1)]),
},
}];
let dataflow = DataflowDescription::<Plan, ()>::new("test".to_string());
timely::execute_from_args(std::iter::empty::<String>(), move |worker| {
println!("worker: {:?}", worker.index());
let mut input = InputSession::<repr::Timestamp, Row, Diff>::new();
worker.dataflow(|scope: &mut Child<'_, _, repr::Timestamp>| {
let mut test_ctx = Context::<_, Row, _>::for_dataflow_in(&dataflow, scope.clone());
for build_desc in &build_descs {
test_ctx.build_object(build_desc.clone());
}
let input_collection = input.to_collection(scope);
let err_collection = InputSession::new().to_collection(scope);
let input_collection =
CollectionBundle::from_collections(input_collection, err_collection);
// insert collection
test_ctx.insert_id(Id::Local(LocalId(0)), input_collection);
let inspect = test_ctx
.lookup_id(Id::Global(GlobalId::User(0)))
.unwrap()
.as_specific_collection(None);
inspect.0.inspect(|x| println!("inspect {:?}", x));
});
// input.insert(Row::default());
input.update(Row::default(), 1);
input.advance_to(1);
})
.expect("Computation terminated abnormally");
}
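    // Illustrative sketch, not part of the original change set: a map/filter-only
    // plan driven through the same `exec_dataflow` helper as the reduce tests above,
    // to show how `Plan::Mfp` composes with `Plan::Get`. It only uses constructors
    // already exercised in this module, and assumes `Plan::Mfp` renders like the
    // collection-based `Plan::Get` cases above.
    #[test]
    fn test_simple_poc_mfp_only() {
        // roughly equivalent to `SELECT cpu, mem FROM input WHERE cpu >= mem;`
        let mfp_plan = vec![BuildDesc {
            id: GlobalId::User(0),
            plan: Plan::Mfp {
                input: Box::new(Plan::Get {
                    id: Id::Global(GlobalId::System(0)),
                    keys: AvailableCollections::new_raw(),
                    plan: GetPlan::Collection(MapFilterProject::new(3)),
                }),
                mfp: MapFilterProject::new(3)
                    .filter([ScalarExpr::CallBinary {
                        func: BinaryFunc::Gte,
                        expr1: Box::new(ScalarExpr::Column(0)),
                        expr2: Box::new(ScalarExpr::Column(1)),
                    }])
                    .project([0, 1]),
                input_key_val: None,
            },
        }];
        let input_id = vec![Id::Global(GlobalId::System(0))];
        let dataflow = {
            let mut dataflow = DataflowDescription::<Plan, ()>::new("test".to_string());
            dataflow.objects_to_build = mfp_plan;
            dataflow
        };
        let sink_ids = [GlobalId::User(0)];
        exec_dataflow(
            input_id.clone(),
            dataflow.clone(),
            sink_ids.to_vec(),
            vec![None],
            10,
        );
    }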
}

File diff suppressed because it is too large


@@ -0,0 +1,20 @@
use differential_dataflow::operators::arrange::TraceAgent;
use differential_dataflow::trace::implementations::ord::{OrdKeySpine, OrdValSpine};
use crate::repr::{Diff, Row, Timestamp};
use crate::storage::errors::DataflowError;
// TODO(discord9): consider using ColValSpine for columnation storage
/// T: Time, R: Diff, O: Offset
pub type RowSpine<K, V, T, R, O = usize> = OrdValSpine<K, V, T, R, O>;
/// T: Time, R: Diff, O: Offset
pub type RowKeySpine<K, T, R, O = usize> = OrdKeySpine<K, T, R, O>;
/// T: Time, R: Diff, O: Offset
pub type ErrSpine<K, T, R, O = usize> = OrdKeySpine<K, T, R, O>;
/// T: Time, R: Diff, O: Offset
pub type ErrValSpine<K, T, R, O = usize> = OrdValSpine<K, DataflowError, T, R, O>;
pub type TraceRowHandle<K, V, T, R> = TraceAgent<RowSpine<K, V, T, R>>;
pub type TraceErrHandle<K, T, R> = TraceAgent<ErrSpine<K, T, R>>;
pub type KeysValsHandle = TraceRowHandle<Row, Row, Timestamp, Diff>;
pub type ErrsHandle = TraceErrHandle<DataflowError, Timestamp, Diff>;


@@ -0,0 +1,75 @@
use std::collections::BTreeMap;
use serde::{Deserialize, Serialize};
use timely::progress::Antichain;
use crate::compute::plan::Plan;
use crate::compute::types::sinks::ComputeSinkDesc;
use crate::compute::types::sources::SourceInstanceDesc;
use crate::expr::{GlobalId, ScalarExpr};
use crate::repr::{self, RelationType};
/// A description of a dataflow to construct and results to surface.
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
pub struct DataflowDescription<P, S: 'static = (), T = repr::Timestamp> {
/// Source instantiations made available to the dataflow, paired with monotonicity information.
pub source_imports: BTreeMap<GlobalId, (SourceInstanceDesc<S>, bool)>,
/// Indexes made available to the dataflow.
/// (id of new index, description of index, `RelationType` of base source/view, monotonic)
pub index_imports: BTreeMap<GlobalId, (IndexDesc, RelationType, bool)>,
/// Views and indexes to be built and stored in the local context.
/// Objects must be built in the specific order, as there may be
/// dependencies of later objects on prior identifiers.
pub objects_to_build: Vec<BuildDesc<P>>,
/// Indexes to be made available to be shared with other dataflows
/// (id of new index, description of index, `RelationType` of base source/view)
pub index_exports: BTreeMap<GlobalId, (IndexDesc, RelationType)>,
/// sinks to be created
/// (id of new sink, description of sink)
pub sink_exports: BTreeMap<GlobalId, ComputeSinkDesc<S, T>>,
/// An optional frontier to which inputs should be advanced.
///
/// If this is set, it should override the default setting determined by
/// the upper bound of `since` frontiers contributing to the dataflow.
/// It is an error for this to be set to a frontier not beyond that default.
pub as_of: Option<Antichain<T>>,
/// Frontier beyond which the dataflow should not execute.
/// Specifically, updates at times greater or equal to this frontier are suppressed.
/// This is often set to `as_of + 1` to enable "batch" computations.
pub until: Antichain<T>,
/// Human readable name
pub debug_name: String,
}
impl<P, T> DataflowDescription<P, (), T> {
/// Creates a new dataflow description with a human-readable name.
pub fn new(name: String) -> Self {
Self {
source_imports: Default::default(),
index_imports: Default::default(),
objects_to_build: Vec::new(),
index_exports: Default::default(),
sink_exports: Default::default(),
as_of: Default::default(),
until: Antichain::new(),
debug_name: name,
}
}
}
/// An association of a global identifier to an expression.
#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
pub struct BuildDesc<P = Plan> {
pub id: GlobalId,
pub plan: P,
}
/// An index storing processed updates so they can be queried
/// or reused in other computations
#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize, Hash)]
pub struct IndexDesc {
/// Identity of the collection the index is on.
pub on_id: GlobalId,
/// Expressions to be arranged, in order of decreasing primacy.
pub key: Vec<ScalarExpr>,
}
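// Illustrative sketch, not part of the original change set: how a
// `DataflowDescription` is assembled from `BuildDesc` entries, mirroring the
// compute tests. `Plan::Constant` and `Row::default()` are used exactly as in
// those tests.
#[cfg(test)]
mod example {
    use super::*;
    use crate::repr::Row;

    #[test]
    fn build_minimal_dataflow_description() {
        let mut dataflow = DataflowDescription::<Plan, ()>::new("example".to_string());
        dataflow.objects_to_build.push(BuildDesc {
            id: GlobalId::User(0),
            plan: Plan::Constant {
                rows: Ok(vec![(Row::default(), 0, 1)]),
            },
        });
        assert_eq!(dataflow.objects_to_build.len(), 1);
        assert_eq!(dataflow.debug_name, "example");
        assert!(dataflow.as_of.is_none());
    }
}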


@@ -0,0 +1,8 @@
use serde::{Deserialize, Serialize};
use crate::expr::GlobalId;
mod dataflow;
mod sinks;
mod sources;
pub(crate) use dataflow::{BuildDesc, DataflowDescription, IndexDesc};


@@ -0,0 +1,28 @@
use serde::{Deserialize, Serialize};
use timely::progress::Antichain;
use crate::expr::GlobalId;
use crate::repr::{self, RelationDesc};
/// A sink for updates to a relational collection.
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
pub struct ComputeSinkDesc<S: 'static = (), T = repr::Timestamp> {
pub from: GlobalId,
pub from_desc: RelationDesc,
pub connection: ComputeSinkConnection<S>,
pub with_snapshot: bool,
pub up_to: Antichain<T>,
}
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
pub enum ComputeSinkConnection<S: 'static = ()> {
// TODO(discord9): consider if ever needed
Subscribe,
Persist(PersistSinkConnection<S>),
}
#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
pub struct PersistSinkConnection<S> {
pub value_desc: RelationDesc,
pub storage_metadata: S,
}
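// Illustrative sketch, not part of the original change set: the simplest
// possible sink description, using the `Subscribe` connection, an empty
// relation description, and the default `()` storage metadata.
#[cfg(test)]
mod example {
    use super::*;

    #[test]
    fn build_subscribe_sink_desc() {
        let desc: ComputeSinkDesc = ComputeSinkDesc {
            from: GlobalId::User(0),
            from_desc: RelationDesc::empty(),
            connection: ComputeSinkConnection::Subscribe,
            with_snapshot: false,
            up_to: Antichain::new(),
        };
        assert!(matches!(desc.connection, ComputeSinkConnection::Subscribe));
    }
}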


@@ -0,0 +1,26 @@
use serde::{Deserialize, Serialize};
use crate::expr::MapFilterProject;
use crate::repr::RelationType;
/// A description of an instantiation of a source.
///
/// This includes a description of the source, but additionally any
/// context-dependent options like the ability to apply filtering and
/// projection to the records as they emerge.
#[derive(Debug, Clone, Eq, PartialEq, Serialize, Deserialize)]
pub struct SourceInstanceDesc<M> {
/// Arguments for this instantiation of the source.
pub arguments: SourceInstanceArguments,
/// Additional metadata used by the storage client of a compute instance to read it.
pub storage_metadata: M,
/// The relation type of this source
pub typ: RelationType,
}
/// Per-source construction arguments.
#[derive(Debug, Clone, Eq, PartialEq, Serialize, Deserialize)]
pub struct SourceInstanceArguments {
/// Linear operators to be applied record-by-record.
pub operators: Option<MapFilterProject>,
}
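// Illustrative sketch, not part of the original change set: a trivial source
// instantiation with no pushed-down operators, unit storage metadata and an
// empty relation type.
#[cfg(test)]
mod example {
    use super::*;

    #[test]
    fn build_source_instance_desc() {
        let desc: SourceInstanceDesc<()> = SourceInstanceDesc {
            arguments: SourceInstanceArguments { operators: None },
            storage_metadata: (),
            typ: RelationType::empty(),
        };
        assert!(desc.arguments.operators.is_none());
        assert_eq!(desc.typ.arity(), 0);
    }
}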

src/flow/src/expr/func.rs

@@ -0,0 +1,224 @@
use datatypes::value::Value;
use serde::{Deserialize, Serialize};
use super::ScalarExpr;
// TODO(discord9): more functions & eval support
use crate::{repr::Row, storage::errors::EvalError};
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Deserialize, Serialize, Hash)]
pub enum UnaryFunc {
Not,
IsNull,
IsTrue,
IsFalse,
CastDatetimeToInt64,
CastInt64ToFloat32,
}
impl UnaryFunc {
pub fn eval(&self, values: &[Value], expr: &ScalarExpr) -> Result<Value, EvalError> {
let arg = expr.eval(values)?;
match self {
Self::CastDatetimeToInt64 => {
let datetime = if let Value::DateTime(datetime) = arg {
Ok(datetime.val())
} else {
Err(EvalError::TypeMismatch(format!(
"cannot cast {:?} to datetime",
arg
)))
}?;
Ok(Value::from(datetime))
}
Self::CastInt64ToFloat32 => {
let int64 = if let Value::Int64(int64) = arg {
Ok(int64)
} else {
Err(EvalError::TypeMismatch(format!(
"cannot cast {:?} to int64",
arg
)))
}?;
Ok(Value::from(int64 as f32))
}
_ => todo!(),
}
}
}
/// TODO: support more binary functions for more types
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Deserialize, Serialize, Hash)]
pub enum BinaryFunc {
Eq,
NotEq,
Lt,
Lte,
Gt,
Gte,
AddInt16,
AddInt32,
AddInt64,
AddUInt16,
AddUInt32,
AddUInt64,
AddFloat32,
AddFloat64,
SubInt16,
SubInt32,
SubInt64,
SubUInt16,
SubUInt32,
SubUInt64,
SubFloat32,
SubFloat64,
MulInt16,
MulInt32,
MulInt64,
MulUInt16,
MulUInt32,
MulUInt64,
MulFloat32,
MulFloat64,
DivInt16,
DivInt32,
DivInt64,
DivUInt16,
DivUInt32,
DivUInt64,
DivFloat32,
DivFloat64,
ModInt16,
ModInt32,
ModInt64,
ModUInt16,
ModUInt32,
ModUInt64,
}
impl BinaryFunc {
pub fn eval(
&self,
values: &[Value],
expr1: &ScalarExpr,
expr2: &ScalarExpr,
) -> Result<Value, EvalError> {
let left = expr1.eval(values)?;
let right = expr2.eval(values)?;
match self {
Self::Eq => Ok(Value::from(left == right)),
Self::NotEq => Ok(Value::from(left != right)),
Self::Lt => Ok(Value::from(left < right)),
Self::Lte => Ok(Value::from(left <= right)),
Self::Gt => Ok(Value::from(left > right)),
Self::Gte => Ok(Value::from(left >= right)),
Self::AddInt16 => Ok(add::<i16>(left, right)?),
Self::AddInt32 => Ok(add::<i32>(left, right)?),
Self::AddInt64 => Ok(add::<i64>(left, right)?),
Self::AddUInt16 => Ok(add::<u16>(left, right)?),
Self::AddUInt32 => Ok(add::<u32>(left, right)?),
Self::AddUInt64 => Ok(add::<u64>(left, right)?),
Self::AddFloat32 => Ok(add::<f32>(left, right)?),
Self::AddFloat64 => Ok(add::<f64>(left, right)?),
Self::SubInt16 => Ok(sub::<i16>(left, right)?),
Self::SubInt32 => Ok(sub::<i32>(left, right)?),
Self::SubInt64 => Ok(sub::<i64>(left, right)?),
Self::SubUInt16 => Ok(sub::<u16>(left, right)?),
Self::SubUInt32 => Ok(sub::<u32>(left, right)?),
Self::SubUInt64 => Ok(sub::<u64>(left, right)?),
Self::SubFloat32 => Ok(sub::<f32>(left, right)?),
Self::SubFloat64 => Ok(sub::<f64>(left, right)?),
Self::MulInt16 => Ok(mul::<i16>(left, right)?),
Self::MulInt32 => Ok(mul::<i32>(left, right)?),
Self::MulInt64 => Ok(mul::<i64>(left, right)?),
Self::MulUInt16 => Ok(mul::<u16>(left, right)?),
Self::MulUInt32 => Ok(mul::<u32>(left, right)?),
Self::MulUInt64 => Ok(mul::<u64>(left, right)?),
Self::MulFloat32 => Ok(mul::<f32>(left, right)?),
Self::MulFloat64 => Ok(mul::<f64>(left, right)?),
Self::DivInt16 => Ok(div::<i16>(left, right)?),
Self::DivInt32 => Ok(div::<i32>(left, right)?),
Self::DivInt64 => Ok(div::<i64>(left, right)?),
Self::DivUInt16 => Ok(div::<u16>(left, right)?),
Self::DivUInt32 => Ok(div::<u32>(left, right)?),
Self::DivUInt64 => Ok(div::<u64>(left, right)?),
Self::DivFloat32 => Ok(div::<f32>(left, right)?),
Self::DivFloat64 => Ok(div::<f64>(left, right)?),
Self::ModInt16 => Ok(rem::<i16>(left, right)?),
Self::ModInt32 => Ok(rem::<i32>(left, right)?),
Self::ModInt64 => Ok(rem::<i64>(left, right)?),
Self::ModUInt16 => Ok(rem::<u16>(left, right)?),
Self::ModUInt32 => Ok(rem::<u32>(left, right)?),
Self::ModUInt64 => Ok(rem::<u64>(left, right)?),
_ => todo!(),
}
}
}
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Deserialize, Serialize, Hash)]
pub enum VariadicFunc {}
impl VariadicFunc {
pub fn eval(&self, values: &[Value], exprs: &[ScalarExpr]) -> Result<Value, EvalError> {
todo!()
}
}
fn add<T>(left: Value, right: Value) -> Result<Value, EvalError>
where
T: TryFrom<Value> + std::ops::Add<Output = T>,
<T as TryFrom<Value>>::Error: std::fmt::Debug,
Value: From<T>,
{
let left = T::try_from(left).map_err(|e| EvalError::TypeMismatch(format!("{:?}", e)))?;
let right = T::try_from(right).map_err(|e| EvalError::TypeMismatch(format!("{:?}", e)))?;
Ok(Value::from(left + right))
}
fn sub<T>(left: Value, right: Value) -> Result<Value, EvalError>
where
T: TryFrom<Value> + std::ops::Sub<Output = T>,
<T as TryFrom<Value>>::Error: std::fmt::Debug,
Value: From<T>,
{
let left = T::try_from(left).map_err(|e| EvalError::TypeMismatch(format!("{:?}", e)))?;
let right = T::try_from(right).map_err(|e| EvalError::TypeMismatch(format!("{:?}", e)))?;
Ok(Value::from(left - right))
}
fn mul<T>(left: Value, right: Value) -> Result<Value, EvalError>
where
T: TryFrom<Value> + std::ops::Mul<Output = T>,
<T as TryFrom<Value>>::Error: std::fmt::Debug,
Value: From<T>,
{
let left = T::try_from(left).map_err(|e| EvalError::TypeMismatch(format!("{:?}", e)))?;
let right = T::try_from(right).map_err(|e| EvalError::TypeMismatch(format!("{:?}", e)))?;
Ok(Value::from(left * right))
}
fn div<T>(left: Value, right: Value) -> Result<Value, EvalError>
where
T: TryFrom<Value> + std::ops::Div<Output = T>,
<T as TryFrom<Value>>::Error: std::fmt::Debug,
Value: From<T>,
{
let left = T::try_from(left).map_err(|e| EvalError::TypeMismatch(format!("{:?}", e)))?;
let right = T::try_from(right).map_err(|e| EvalError::TypeMismatch(format!("{:?}", e)))?;
Ok(Value::from(left / right))
}
fn rem<T>(left: Value, right: Value) -> Result<Value, EvalError>
where
T: TryFrom<Value> + std::ops::Rem<Output = T>,
<T as TryFrom<Value>>::Error: std::fmt::Debug,
Value: From<T>,
{
let left = T::try_from(left).map_err(|e| EvalError::TypeMismatch(format!("{:?}", e)))?;
let right = T::try_from(right).map_err(|e| EvalError::TypeMismatch(format!("{:?}", e)))?;
Ok(Value::from(left % right))
}
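// Illustrative sketch, not part of the original change set: exercises the
// generic arithmetic helpers above through `BinaryFunc::eval`. It only relies
// on the `Value: From<i64>` / `From<bool>` conversions the match arms above
// already require.
#[cfg(test)]
mod example {
    use super::*;

    #[test]
    fn binary_func_arithmetic() {
        let values = vec![Value::from(10i64), Value::from(4i64)];
        let lhs = ScalarExpr::Column(0);
        let rhs = ScalarExpr::Column(1);
        assert_eq!(
            BinaryFunc::AddInt64.eval(&values, &lhs, &rhs).unwrap(),
            Value::from(14i64)
        );
        assert_eq!(
            BinaryFunc::DivInt64.eval(&values, &lhs, &rhs).unwrap(),
            Value::from(2i64)
        );
        assert_eq!(
            BinaryFunc::ModInt64.eval(&values, &lhs, &rhs).unwrap(),
            Value::from(2i64)
        );
        assert_eq!(
            BinaryFunc::Gt.eval(&values, &lhs, &rhs).unwrap(),
            Value::from(true)
        );
    }
}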

src/flow/src/expr/id.rs

@@ -0,0 +1,24 @@
use serde::{Deserialize, Serialize};
#[derive(Clone, Copy, Debug, Eq, PartialEq, Ord, PartialOrd, Hash, Serialize, Deserialize)]
pub enum GlobalId {
/// System namespace.
System(u64),
/// User namespace.
User(u64),
/// Transient namespace.
Transient(u64),
/// Dummy id for query being explained
Explain,
}
#[derive(Clone, Copy, Debug, Eq, PartialEq, Ord, PartialOrd, Hash)]
pub struct LocalId(pub(crate) u64);
#[derive(Clone, Copy, Debug, Eq, PartialEq, Ord, PartialOrd, Hash)]
pub enum Id {
/// An identifier that refers to a local component of a dataflow.
Local(LocalId),
/// An identifier that refers to a global dataflow.
Global(GlobalId),
}

src/flow/src/expr/linear.rs

@@ -0,0 +1,381 @@
use std::collections::{BTreeMap, BTreeSet};
use datatypes::value::Value;
use serde::{Deserialize, Serialize};
use crate::expr::{Id, LocalId, ScalarExpr};
use crate::repr::{self, Diff, Row};
use crate::storage::errors::EvalError;
/// A compound operator that can be applied row-by-row.
///
/// This operator integrates the map, filter, and project operators.
/// It applies a sequences of map expressions, which are allowed to
/// refer to previous expressions, interleaved with predicates which
/// must be satisfied for an output to be produced. If all predicates
/// evaluate to `Datum::True` the data at the identified columns are
/// collected and produced as output in a packed `Row`.
///
/// This operator is a "builder" and its contents may contain expressions
/// that are not yet executable. For example, it may contain temporal
/// expressions in `self.expressions`, even though this is not something
/// we can directly evaluate. The plan creation methods will defensively
/// ensure that the right thing happens.
#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
pub struct MapFilterProject {
/// A sequence of expressions that should be appended to the row.
///
/// Many of these expressions may not be produced in the output,
/// and may only be present as common subexpressions.
pub expressions: Vec<ScalarExpr>,
/// Expressions that must evaluate to `Value::Boolean(true)` for the output
/// row to be produced.
///
/// Each entry is prepended with a column identifier indicating
/// the column *before* which the predicate should first be applied.
/// Most commonly this would be one plus the largest column identifier
/// in the predicate's support, but it could be larger to implement
/// guarded evaluation of predicates.
///
/// This list should be sorted by the first field.
pub predicates: Vec<(usize, ScalarExpr)>,
/// A sequence of column identifiers whose data form the output row.
pub projection: Vec<usize>,
/// The expected number of input columns.
///
/// This is needed to ensure correct identification of newly formed
/// columns in the output.
pub input_arity: usize,
}
impl MapFilterProject {
/// Create a no-op operator for an input of a supplied arity.
pub fn new(input_arity: usize) -> Self {
Self {
expressions: Vec::new(),
predicates: Vec::new(),
projection: (0..input_arity).collect(),
input_arity,
}
}
/// Given two mfps, return an mfp that applies one
/// followed by the other.
/// Note that the arguments are in the opposite order
/// from how function composition is usually written in mathematics.
pub fn compose(before: Self, after: Self) -> Self {
let (m, f, p) = after.into_map_filter_project();
before.map(m).filter(f).project(p)
}
/// True if the operator describes the identity transformation.
pub fn is_identity(&self) -> bool {
self.expressions.is_empty()
&& self.predicates.is_empty()
&& self.projection.len() == self.input_arity
&& self.projection.iter().enumerate().all(|(i, p)| i == *p)
}
/// Retain only the indicated columns in the presented order.
pub fn project<I>(mut self, columns: I) -> Self
where
I: IntoIterator<Item = usize> + std::fmt::Debug,
{
self.projection = columns.into_iter().map(|c| self.projection[c]).collect();
self
}
/// Retain only rows satisfying these predicates.
///
/// This method introduces predicates as eagerly as they can be evaluated,
/// which may not be desired for predicates that may cause exceptions.
/// If fine manipulation is required, the predicates can be added manually.
pub fn filter<I>(mut self, predicates: I) -> Self
where
I: IntoIterator<Item = ScalarExpr>,
{
for mut predicate in predicates {
// Correct column references.
predicate.permute(&self.projection[..]);
// Validate column references.
assert!(predicate
.support()
.into_iter()
.all(|c| c < self.input_arity + self.expressions.len()));
// Insert predicate as eagerly as it can be evaluated:
// just after the largest column in its support is formed.
let max_support = predicate
.support()
.into_iter()
.max()
.map(|c| c + 1)
.unwrap_or(0);
self.predicates.push((max_support, predicate))
}
// Stable sort predicates by position at which they take effect.
// We put literal errors at the end as a stop-gap to avoid erroring
// before we are able to evaluate any predicates that might prevent it.
self.predicates
.sort_by_key(|(position, predicate)| (predicate.is_literal_err(), *position));
self
}
/// Append the result of evaluating expressions to each row.
pub fn map<I>(mut self, expressions: I) -> Self
where
I: IntoIterator<Item = ScalarExpr>,
{
for mut expression in expressions {
// Correct column references.
expression.permute(&self.projection[..]);
// Validate column references.
assert!(expression
.support()
.into_iter()
.all(|c| c < self.input_arity + self.expressions.len()));
// Introduce expression and produce as output.
self.expressions.push(expression);
self.projection
.push(self.input_arity + self.expressions.len() - 1);
}
self
}
/// Like [`MapFilterProject::as_map_filter_project`], but consumes `self` rather than cloning.
pub fn into_map_filter_project(self) -> (Vec<ScalarExpr>, Vec<ScalarExpr>, Vec<usize>) {
let predicates = self
.predicates
.into_iter()
.map(|(_pos, predicate)| predicate)
.collect();
(self.expressions, predicates, self.projection)
}
/// As the arguments to `Map`, `Filter`, and `Project` operators.
///
/// In principle, this operator can be implemented as a sequence of
/// more elemental operators, likely less efficiently.
pub fn as_map_filter_project(&self) -> (Vec<ScalarExpr>, Vec<ScalarExpr>, Vec<usize>) {
self.clone().into_map_filter_project()
}
}
impl MapFilterProject {
pub fn optimize(&mut self) {
// TODO(discord9): optimize later
}
/// Convert the `MapFilterProject` into a staged evaluation plan.
///
/// The main behavior is to extract temporal predicates, which cannot be evaluated
/// using the standard machinery.
pub fn into_plan(self) -> Result<MfpPlan, String> {
MfpPlan::create_from(self)
}
/// Lists input columns whose values are used in outputs.
///
/// It is entirely appropriate to determine the demand of an instance
/// and then both apply a projection to the subject of the instance and
/// `self.permute` this instance.
pub fn demand(&self) -> BTreeSet<usize> {
let mut demanded = BTreeSet::new();
for (_index, pred) in self.predicates.iter() {
demanded.extend(pred.support());
}
demanded.extend(self.projection.iter().cloned());
for index in (0..self.expressions.len()).rev() {
if demanded.contains(&(self.input_arity + index)) {
demanded.extend(self.expressions[index].support());
}
}
demanded.retain(|col| col < &self.input_arity);
demanded
}
/// Update input column references, due to an input projection or permutation.
///
/// The `shuffle` argument remaps expected column identifiers to new locations,
/// with the expectation that `shuffle` describes all input columns, and so the
/// intermediate results will be able to start at position `shuffle.len()`.
///
/// The supplied `shuffle` may not list columns that are not "demanded" by the
/// instance, and so we should ensure that `self` is optimized to not reference
/// columns that are not demanded.
pub fn permute(&mut self, mut shuffle: BTreeMap<usize, usize>, new_input_arity: usize) {
let (mut map, mut filter, mut project) = self.as_map_filter_project();
for index in 0..map.len() {
// Intermediate columns are just shifted.
shuffle.insert(self.input_arity + index, new_input_arity + index);
}
for expr in map.iter_mut() {
expr.permute_map(&shuffle);
}
for pred in filter.iter_mut() {
pred.permute_map(&shuffle);
}
for proj in project.iter_mut() {
assert!(shuffle[proj] < new_input_arity + map.len());
*proj = shuffle[proj];
}
*self = Self::new(new_input_arity)
.map(map)
.filter(filter)
.project(project)
}
}
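// Illustrative sketch, not part of the original change set: the builder-style
// `map`/`filter`/`project` API above, plus the `demand` and `is_identity`
// helpers, on a three-column input.
#[cfg(test)]
mod mfp_builder_example {
    use super::*;
    use crate::expr::BinaryFunc;

    #[test]
    fn build_map_filter_project() {
        // input: (a, b, c); output: (a + b) for rows where c > a
        let mfp = MapFilterProject::new(3)
            .map([ScalarExpr::CallBinary {
                func: BinaryFunc::AddInt64,
                expr1: Box::new(ScalarExpr::Column(0)),
                expr2: Box::new(ScalarExpr::Column(1)),
            }])
            .filter([ScalarExpr::CallBinary {
                func: BinaryFunc::Gt,
                expr1: Box::new(ScalarExpr::Column(2)),
                expr2: Box::new(ScalarExpr::Column(0)),
            }])
            .project([3]);
        assert!(!mfp.is_identity());
        // all three input columns are demanded: 0 and 1 by the map, 2 by the filter
        assert_eq!(mfp.demand(), [0, 1, 2].into_iter().collect());
        assert!(MapFilterProject::new(3).is_identity());
    }
}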
/// A wrapper type which indicates it is safe to simply evaluate all expressions.
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
pub struct SafeMfpPlan {
pub(crate) mfp: MapFilterProject,
}
impl SafeMfpPlan {
pub fn permute(&mut self, map: BTreeMap<usize, usize>, new_arity: usize) {
self.mfp.permute(map, new_arity);
}
/// Evaluates the linear operator on a supplied list of datums.
///
/// The arguments are the initial values associated with the row,
/// and a row buffer `row_buf` that is reused to assemble the output row.
///
/// An `Ok` result will either be `None` if any predicate did not
/// evaluate to `Value::Boolean(true)`, or the values of the columns listed
/// by `self.projection` if all predicates passed. If an error
/// occurs in the evaluation it is returned as an `Err` variant.
/// As the evaluation exits early with failed predicates, it may
/// miss some errors that would occur later in evaluation.
///
/// The `row` is not cleared first, but emptied if the function
/// returns `Ok(Some(row))`.
#[inline(always)]
pub fn evaluate_into(
&self,
values: &mut Vec<Value>,
row_buf: &mut Row,
) -> Result<Option<Row>, EvalError> {
let passed_predicates = self.evaluate_inner(values)?;
if !passed_predicates {
Ok(None)
} else {
row_buf.clear();
row_buf.extend(self.mfp.projection.iter().map(|c| values[*c].clone()));
Ok(Some(row_buf.clone()))
}
}
/// A version of `evaluate` which produces an iterator over `Value`
/// as output.
///
/// This version can be useful when one wants to capture the resulting
/// values without packing and then unpacking a row.
#[inline(always)]
pub fn evaluate_iter<'a>(
&'a self,
datums: &'a mut Vec<Value>,
) -> Result<Option<impl Iterator<Item = Value> + 'a>, EvalError> {
let passed_predicates = self.evaluate_inner(datums)?;
if !passed_predicates {
Ok(None)
} else {
Ok(Some(
self.mfp.projection.iter().map(move |i| datums[*i].clone()),
))
}
}
/// Populates `datums` with `self.expressions` and tests `self.predicates`.
///
/// This does not apply `self.projection`, which is up to the calling method.
pub fn evaluate_inner(&self, values: &mut Vec<Value>) -> Result<bool, EvalError> {
let mut expression = 0;
for (support, predicate) in self.mfp.predicates.iter() {
while self.mfp.input_arity + expression < *support {
values.push(self.mfp.expressions[expression].eval(&values[..])?);
expression += 1;
}
if predicate.eval(&values[..])? != Value::Boolean(true) {
return Ok(false);
}
}
while expression < self.mfp.expressions.len() {
values.push(self.mfp.expressions[expression].eval(&values[..])?);
expression += 1;
}
Ok(true)
}
}
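// Illustrative sketch, not part of the original change set: drives
// `SafeMfpPlan::evaluate_into` row by row, the way the rendered operators use
// it. Only constructors defined in this crate are used.
#[cfg(test)]
mod safe_mfp_example {
    use super::*;
    use crate::expr::BinaryFunc;

    #[test]
    fn filter_and_project_one_row() {
        // keep rows where col0 > col1, and output (col1, col0)
        let mfp = MapFilterProject::new(2)
            .filter([ScalarExpr::CallBinary {
                func: BinaryFunc::Gt,
                expr1: Box::new(ScalarExpr::Column(0)),
                expr2: Box::new(ScalarExpr::Column(1)),
            }])
            .project([1, 0]);
        let plan = SafeMfpPlan { mfp };
        let mut row_buf = Row::default();

        // predicate passes: the projected row is produced
        let mut values = vec![Value::from(3i64), Value::from(1i64)];
        let out = plan.evaluate_into(&mut values, &mut row_buf).unwrap();
        assert_eq!(
            out,
            Some(Row::pack(vec![Value::from(1i64), Value::from(3i64)]))
        );

        // predicate fails: no row is produced
        let mut values = vec![Value::from(1i64), Value::from(3i64)];
        let out = plan.evaluate_into(&mut values, &mut row_buf).unwrap();
        assert_eq!(out, None);
    }
}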
impl std::ops::Deref for SafeMfpPlan {
type Target = MapFilterProject;
fn deref(&self) -> &Self::Target {
&self.mfp
}
}
/// Predicates partitioned into temporal and non-temporal.
///
/// Temporal predicates require some recognition to determine their
/// structure, and it is best to do that once and re-use the results.
///
/// There are restrictions on the temporal predicates we currently support.
/// They must directly constrain `MzNow` from below or above,
/// by expressions that do not themselves contain `MzNow`.
/// Conjunctions of such constraints are also ok.
#[derive(Clone, Debug, PartialEq)]
pub struct MfpPlan {
/// Normal predicates to evaluate on `&[Value]` and expect `Value::Boolean(true)`.
pub(crate) mfp: SafeMfpPlan,
/// TODO(discord9): impl temporal filter later
/// Expressions that when evaluated lower-bound `MzNow`.
pub(crate) lower_bounds: Vec<ScalarExpr>,
/// Expressions that when evaluated upper-bound `MzNow`.
pub(crate) upper_bounds: Vec<ScalarExpr>,
}
impl MfpPlan {
pub fn create_from(mut mfp: MapFilterProject) -> Result<Self, String> {
Ok(Self {
mfp: SafeMfpPlan { mfp },
lower_bounds: Vec::new(),
upper_bounds: Vec::new(),
})
}
pub fn evaluate<E: From<EvalError>, V: Fn(&repr::Timestamp) -> bool>(
&self,
values: &mut Vec<Value>,
time: repr::Timestamp,
diff: Diff,
valid_time: V,
) -> impl Iterator<Item = Result<(Row, repr::Timestamp, Diff), (E, repr::Timestamp, Diff)>>
{
match self.mfp.evaluate_inner(values) {
Err(e) => {
return Some(Err((e.into(), time, diff)))
.into_iter()
.chain(None.into_iter());
}
Ok(true) => {}
Ok(false) => {
return None.into_iter().chain(None.into_iter());
}
}
// TODO(discord9): Temporal filter
let ret = Row::pack(self.mfp.mfp.projection.iter().map(|c| values[*c].clone()));
Some(Ok((ret, time, diff)))
.into_iter()
.chain(None.into_iter())
}
/// Indicates if the planned `MapFilterProject` emits exactly its inputs as outputs.
pub fn is_identity(&self) -> bool {
self.mfp.mfp.is_identity() && self.lower_bounds.is_empty() && self.upper_bounds.is_empty()
}
}

src/flow/src/expr/mod.rs

@@ -0,0 +1,207 @@
//! Expressions for declaring the dataflow description, which is the last step before building the dataflow
mod func;
mod id;
mod linear;
mod relation;
use std::collections::{BTreeMap, BTreeSet};
use datatypes::prelude::ConcreteDataType;
use datatypes::value::Value;
pub use id::{GlobalId, Id, LocalId};
pub use linear::{MapFilterProject, SafeMfpPlan};
pub(crate) use relation::{AggregateExpr, AggregateFunc, TableFunc};
use serde::{Deserialize, Serialize};
pub(crate) use crate::expr::func::{BinaryFunc, UnaryFunc, VariadicFunc};
use crate::storage::errors::EvalError;
#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum ScalarExpr {
/// A column of the input row
Column(usize),
/// A literal value.
Literal(Result<Value, EvalError>, ConcreteDataType),
CallUnary {
func: UnaryFunc,
expr: Box<ScalarExpr>,
},
CallBinary {
func: BinaryFunc,
expr1: Box<ScalarExpr>,
expr2: Box<ScalarExpr>,
},
CallVariadic {
func: VariadicFunc,
exprs: Vec<ScalarExpr>,
},
/// Conditionally evaluated expressions.
///
/// It is important that `then` and `els` are only evaluated if
/// `cond` is true or false, respectively. This is the only way
/// users can guard execution (other logical operators do not
/// short-circuit) and we need to preserve that.
If {
cond: Box<ScalarExpr>,
then: Box<ScalarExpr>,
els: Box<ScalarExpr>,
},
}
impl ScalarExpr {
pub fn eval(&self, values: &[Value]) -> Result<Value, EvalError> {
match self {
ScalarExpr::Column(index) => Ok(values[*index].clone()),
ScalarExpr::Literal(row_res, _ty) => row_res.clone(),
ScalarExpr::CallUnary { func, expr } => func.eval(values, expr),
ScalarExpr::CallBinary { func, expr1, expr2 } => func.eval(values, expr1, expr2),
ScalarExpr::CallVariadic { func, exprs } => func.eval(values, exprs),
ScalarExpr::If { cond, then, els } => match cond.eval(values) {
Ok(Value::Boolean(true)) => then.eval(values),
Ok(Value::Boolean(false)) => els.eval(values),
_ => Err(EvalError::InvalidArgument(
"if condition must be boolean".to_string(),
)),
},
}
}
/// Rewrites column indices with their value in `permutation`.
///
/// This method is applicable even when `permutation` is not a
/// strict permutation, and it only needs to have entries for
/// each column referenced in `self`.
pub fn permute(&mut self, permutation: &[usize]) {
#[allow(deprecated)]
self.visit_mut_post_nolimit(&mut |e| {
if let ScalarExpr::Column(old_i) = e {
*old_i = permutation[*old_i];
}
});
}
/// Rewrites column indices with their value in `permutation`.
///
/// This method is applicable even when `permutation` is not a
/// strict permutation, and it only needs to have entries for
/// each column referenced in `self`.
pub fn permute_map(&mut self, permutation: &BTreeMap<usize, usize>) {
#[allow(deprecated)]
self.visit_mut_post_nolimit(&mut |e| {
if let ScalarExpr::Column(old_i) = e {
*old_i = permutation[old_i];
}
});
}
pub fn support(&self) -> BTreeSet<usize> {
let mut support = BTreeSet::new();
#[allow(deprecated)]
self.visit_post_nolimit(&mut |e| {
if let ScalarExpr::Column(i) = e {
support.insert(*i);
}
});
support
}
pub fn as_literal(&self) -> Option<Result<Value, &EvalError>> {
if let ScalarExpr::Literal(lit, _column_type) = self {
Some(lit.as_ref().map(|row| row.clone()))
} else {
None
}
}
pub fn is_literal(&self) -> bool {
matches!(self, ScalarExpr::Literal(_, _))
}
pub fn is_literal_true(&self) -> bool {
Some(Ok(Value::Boolean(true))) == self.as_literal()
}
pub fn is_literal_false(&self) -> bool {
Some(Ok(Value::Boolean(false))) == self.as_literal()
}
pub fn is_literal_null(&self) -> bool {
Some(Ok(Value::Null)) == self.as_literal()
}
pub fn is_literal_ok(&self) -> bool {
matches!(self, ScalarExpr::Literal(Ok(_), _typ))
}
pub fn is_literal_err(&self) -> bool {
matches!(self, ScalarExpr::Literal(Err(_), _typ))
}
}
impl ScalarExpr {
/// Visit in post-order without a recursion depth limit; may overflow the stack on deep expressions.
fn visit_post_nolimit<F>(&self, f: &mut F)
where
F: FnMut(&Self),
{
self.visit_children(|e| e.visit_post_nolimit(f));
f(self);
}
fn visit_children<F>(&self, mut f: F)
where
F: FnMut(&Self),
{
match self {
ScalarExpr::Column(_) | ScalarExpr::Literal(_, _) => (),
ScalarExpr::CallUnary { func, expr } => f(expr),
ScalarExpr::CallBinary { func, expr1, expr2 } => {
f(expr1);
f(expr2);
}
ScalarExpr::CallVariadic { func, exprs } => {
for expr in exprs {
f(expr);
}
}
ScalarExpr::If { cond, then, els } => {
f(cond);
f(then);
f(els);
}
}
}
fn visit_mut_post_nolimit<F>(&mut self, f: &mut F)
where
F: FnMut(&mut Self),
{
self.visit_mut_children(|e: &mut Self| e.visit_mut_post_nolimit(f));
f(self);
}
fn visit_mut_children<F>(&mut self, mut f: F)
where
F: FnMut(&mut Self),
{
match self {
ScalarExpr::Column(_) | ScalarExpr::Literal(_, _) => (),
ScalarExpr::CallUnary { func, expr } => f(expr),
ScalarExpr::CallBinary { func, expr1, expr2 } => {
f(expr1);
f(expr2);
}
ScalarExpr::CallVariadic { func, exprs } => {
for expr in exprs {
f(expr);
}
}
ScalarExpr::If { cond, then, els } => {
f(cond);
f(then);
f(els);
}
}
}
}
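// Illustrative sketch, not part of the original change set: direct use of
// `ScalarExpr::eval` over a value slice, covering column references, the
// short-circuiting `If` form described above, and `permute`.
#[cfg(test)]
mod scalar_expr_example {
    use super::*;

    #[test]
    fn eval_if_expression_and_permute() {
        let values = vec![
            Value::Boolean(true),
            Value::from(1i64),
            Value::from(2i64),
        ];
        let expr = ScalarExpr::If {
            cond: Box::new(ScalarExpr::Column(0)),
            then: Box::new(ScalarExpr::Column(1)),
            els: Box::new(ScalarExpr::Column(2)),
        };
        // `cond` is true, so only `then` is evaluated
        assert_eq!(expr.eval(&values).unwrap(), Value::from(1i64));

        // column references are rewritten through the permutation slice
        let mut col = ScalarExpr::Column(0);
        col.permute(&[2, 0, 1]);
        assert_eq!(col, ScalarExpr::Column(2));
    }
}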


@@ -0,0 +1,206 @@
use datatypes::prelude::ConcreteDataType;
use datatypes::value::{OrderedF32, OrderedF64, Value};
use serde::{Deserialize, Serialize};
#[derive(Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Serialize, Deserialize, Hash)]
pub enum AggregateFunc {
MaxInt16,
MaxInt32,
MaxInt64,
MaxUInt16,
MaxUInt32,
MaxUInt64,
MaxFloat32,
MaxFloat64,
MaxBool,
MaxString,
MaxDate,
MaxTimestamp,
MaxTimestampTz,
MinInt16,
MinInt32,
MinInt64,
MinUInt16,
MinUInt32,
MinUInt64,
MinFloat32,
MinFloat64,
MinBool,
MinString,
MinDate,
MinTimestamp,
MinTimestampTz,
SumInt16,
SumInt32,
SumInt64,
SumUInt16,
SumUInt32,
SumUInt64,
SumFloat32,
SumFloat64,
Count,
Any,
All,
}
impl AggregateFunc {
pub fn eval<I>(&self, values: I) -> Value
where
I: IntoIterator<Item = Value>,
{
// TODO: impl more functions like min/max/sum for Timestamp etc.
match self {
AggregateFunc::MaxInt16 => max_value::<I, i16>(values),
AggregateFunc::MaxInt32 => max_value::<I, i32>(values),
AggregateFunc::MaxInt64 => max_value::<I, i64>(values),
AggregateFunc::MaxUInt16 => max_value::<I, u16>(values),
AggregateFunc::MaxUInt32 => max_value::<I, u32>(values),
AggregateFunc::MaxUInt64 => max_value::<I, u64>(values),
AggregateFunc::MaxFloat32 => max_value::<I, OrderedF32>(values),
AggregateFunc::MaxFloat64 => max_value::<I, OrderedF64>(values),
AggregateFunc::MaxBool => max_value::<I, bool>(values),
AggregateFunc::MaxString => max_string(values),
AggregateFunc::MinInt16 => min_value::<I, i16>(values),
AggregateFunc::MinInt32 => min_value::<I, i32>(values),
AggregateFunc::MinInt64 => min_value::<I, i64>(values),
AggregateFunc::MinUInt16 => min_value::<I, u16>(values),
AggregateFunc::MinUInt32 => min_value::<I, u32>(values),
AggregateFunc::MinUInt64 => min_value::<I, u64>(values),
AggregateFunc::MinFloat32 => min_value::<I, OrderedF32>(values),
AggregateFunc::MinFloat64 => min_value::<I, OrderedF64>(values),
AggregateFunc::MinBool => min_value::<I, bool>(values),
AggregateFunc::MinString => min_string(values),
AggregateFunc::SumInt16 => sum_value::<I, i16, i64>(values),
AggregateFunc::SumInt32 => sum_value::<I, i32, i64>(values),
AggregateFunc::SumInt64 => sum_value::<I, i64, i64>(values),
AggregateFunc::SumUInt16 => sum_value::<I, u16, u64>(values),
AggregateFunc::SumUInt32 => sum_value::<I, u32, u64>(values),
AggregateFunc::SumUInt64 => sum_value::<I, u64, u64>(values),
AggregateFunc::SumFloat32 => sum_value::<I, f32, f32>(values),
AggregateFunc::SumFloat64 => sum_value::<I, f64, f64>(values),
AggregateFunc::Count => count(values),
AggregateFunc::All => all(values),
AggregateFunc::Any => any(values),
_ => todo!(),
}
}
}
fn max_string<I>(values: I) -> Value
where
I: IntoIterator<Item = Value>,
{
match values.into_iter().filter(|d| !d.is_null()).max_by(|a, b| {
let a = a.as_value_ref();
let a = a.as_string().expect("unexpected type").unwrap();
let b = b.as_value_ref();
let b = b.as_string().expect("unexpected type").unwrap();
a.cmp(b)
}) {
Some(v) => v,
None => Value::Null,
}
}
fn max_value<I, TypedValue>(values: I) -> Value
where
I: IntoIterator<Item = Value>,
TypedValue: TryFrom<Value> + Ord,
<TypedValue as TryFrom<Value>>::Error: std::fmt::Debug,
Value: From<Option<TypedValue>>,
{
let x: Option<TypedValue> = values
.into_iter()
.filter(|v| !v.is_null())
.map(|v| TypedValue::try_from(v).expect("unexpected type"))
.max();
x.into()
}
fn min_string<I>(values: I) -> Value
where
I: IntoIterator<Item = Value>,
{
match values.into_iter().filter(|d| !d.is_null()).min_by(|a, b| {
let a = a.as_value_ref();
let a = a.as_string().expect("unexpected type").unwrap();
let b = b.as_value_ref();
let b = b.as_string().expect("unexpected type").unwrap();
a.cmp(b)
}) {
Some(v) => v,
None => Value::Null,
}
}
fn min_value<I, TypedValue>(values: I) -> Value
where
I: IntoIterator<Item = Value>,
TypedValue: TryFrom<Value> + Ord,
<TypedValue as TryFrom<Value>>::Error: std::fmt::Debug,
Value: From<Option<TypedValue>>,
{
let x: Option<TypedValue> = values
.into_iter()
.filter(|v| !v.is_null())
.map(|v| TypedValue::try_from(v).expect("unexpected type"))
.min();
x.into()
}
fn sum_value<I, ValueType, ResultType>(values: I) -> Value
where
I: IntoIterator<Item = Value>,
ValueType: TryFrom<Value>,
<ValueType as TryFrom<Value>>::Error: std::fmt::Debug,
Value: From<Option<ValueType>>,
ResultType: From<ValueType> + std::iter::Sum + Into<Value>,
{
// If no row qualifies, then the result of COUNT is 0 (zero), and the result of any other aggregate function is the null value.
let mut values = values.into_iter().filter(|v| !v.is_null()).peekable();
if values.peek().is_none() {
Value::Null
} else {
let x = values
.map(|v| ResultType::from(ValueType::try_from(v).expect("unexpected type")))
.sum::<ResultType>();
x.into()
}
}
fn count<I>(values: I) -> Value
where
I: IntoIterator<Item = Value>,
{
let x = values.into_iter().filter(|v| !v.is_null()).count() as i64;
Value::from(x)
}
fn any<I>(datums: I) -> Value
where
I: IntoIterator<Item = Value>,
{
datums
.into_iter()
.fold(Value::Boolean(false), |state, next| match (state, next) {
(Value::Boolean(true), _) | (_, Value::Boolean(true)) => Value::Boolean(true),
(Value::Null, _) | (_, Value::Null) => Value::Null,
_ => Value::Boolean(false),
})
}
fn all<I>(datums: I) -> Value
where
I: IntoIterator<Item = Value>,
{
datums
.into_iter()
.fold(Value::Boolean(true), |state, next| match (state, next) {
(Value::Boolean(false), _) | (_, Value::Boolean(false)) => Value::Boolean(false),
(Value::Null, _) | (_, Value::Null) => Value::Null,
_ => Value::Boolean(true),
})
}
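// Illustrative sketch, not part of the original change set: exercises a few of
// the aggregate evaluators above over plain `Value` iterators. It assumes the
// `Value: From<i64>` / `From<Option<i64>>` conversions already required by the
// helpers in this file (i.e. `Value::from(Some(v))` equals `Value::from(v)` and
// `None` maps to `Value::Null`).
#[cfg(test)]
mod example {
    use super::*;

    #[test]
    fn eval_simple_aggregates() {
        let values = vec![Value::from(1i64), Value::Null, Value::from(3i64)];
        // NULLs are ignored by SUM/MAX, and COUNT counts only non-null values
        assert_eq!(
            AggregateFunc::SumInt64.eval(values.clone()),
            Value::from(4i64)
        );
        assert_eq!(
            AggregateFunc::MaxInt64.eval(values.clone()),
            Value::from(3i64)
        );
        assert_eq!(AggregateFunc::Count.eval(values), Value::from(2i64));
        // an all-NULL input yields NULL for MIN
        assert_eq!(AggregateFunc::MinInt64.eval(vec![Value::Null]), Value::Null);
    }
}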


@@ -0,0 +1,22 @@
pub(crate) use func::AggregateFunc;
use serde::{Deserialize, Serialize};
use crate::expr::ScalarExpr;
mod func;
/// A function that might emit multiple output records for one input row
#[derive(Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Serialize, Deserialize, Hash)]
pub enum TableFunc {}
/// Describes an aggregation expression.
#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
pub struct AggregateExpr {
/// Names the aggregation function.
pub func: AggregateFunc,
/// An expression which extracts from each row the input to `func`.
pub expr: ScalarExpr,
/// Should the aggregation be applied only to distinct results in each group.
#[serde(default)]
pub distinct: bool,
}

src/flow/src/lib.rs

@@ -0,0 +1,9 @@
#![allow(unused)]
#![allow(clippy::mutable_key_type)]
mod adapter;
mod compute;
mod expr;
mod repr;
mod storage;
mod util;

src/flow/src/repr/mod.rs

@@ -0,0 +1,62 @@
//! Basically a wrapper around the `datatypes` crate
//! for basic data representation
use std::borrow::Borrow;
use std::slice::SliceIndex;
use datatypes::value::Value;
pub(crate) use relation::{RelationDesc, RelationType};
use serde::{Deserialize, Serialize};
/// System-wide Record count difference type.
pub type Diff = i64;
mod relation;
mod timestamp;
/// A row is a vector of values.
///
/// TODO(discord9): use a more efficient representation
/// i.e. something more compact, like raw u8 bytes of \[tag0, value0, tag1, value1, ...\]
#[derive(Clone, Debug, Hash, PartialEq, Eq, PartialOrd, Ord, Default, Serialize, Deserialize)]
pub struct Row {
inner: Vec<Value>,
}
impl Row {
pub fn get(&self, idx: usize) -> Option<&Value> {
self.inner.get(idx)
}
pub fn clear(&mut self) {
self.inner.clear();
}
pub fn packer(&mut self) -> &mut Vec<Value> {
self.inner.clear();
&mut self.inner
}
pub fn pack<I>(iter: I) -> Row
where
I: IntoIterator<Item = Value>,
{
Self {
inner: iter.into_iter().collect(),
}
}
pub fn unpack(&self) -> Vec<Value> {
self.inner.clone()
}
pub fn extend<I>(&mut self, iter: I)
where
I: IntoIterator<Item = Value>,
{
self.inner.extend(iter);
}
pub fn into_iter(self) -> impl Iterator<Item = Value> {
self.inner.into_iter()
}
pub fn iter(&self) -> impl Iterator<Item = &Value> {
self.inner.iter()
}
}
/// System-wide default timestamp type
pub type Timestamp = u64;
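// Illustrative sketch, not part of the original change set: basic `Row`
// construction and unpacking as used throughout the compute layer.
#[cfg(test)]
mod row_example {
    use super::*;

    #[test]
    fn pack_and_unpack() {
        let row = Row::pack(vec![Value::from(1i64), Value::from(2i64)]);
        assert_eq!(row.get(0), Some(&Value::from(1i64)));
        assert_eq!(row.unpack(), vec![Value::from(1i64), Value::from(2i64)]);
        assert_eq!(row.iter().count(), 2);

        // `packer` clears the row and hands back the inner buffer for reuse
        let mut row = row;
        row.packer().push(Value::from(3i64));
        assert_eq!(row.unpack(), vec![Value::from(3i64)]);
    }
}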


@@ -0,0 +1,342 @@
use datatypes::prelude::ConcreteDataType;
use serde::{Deserialize, Serialize};
/// The type of a relation.
#[derive(Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Serialize, Deserialize, Hash)]
pub struct RelationType {
/// The type for each column, in order.
pub column_types: Vec<ColumnType>,
/// Sets of indices that are "keys" for the collection.
///
/// Each element in this list is a set of column indices, each with the
/// property that the collection contains at most one record with each
/// distinct set of values for each column. Alternately, for a specific set
/// of values assigned to these columns there is at most one record.
///
/// A collection can contain multiple sets of keys, although it is common to
/// have either zero or one sets of key indices.
#[serde(default)]
pub keys: Vec<Vec<usize>>,
}
impl RelationType {
/// Constructs a `RelationType` representing the relation with no columns and
/// no keys.
pub fn empty() -> Self {
RelationType::new(vec![])
}
/// Constructs a new `RelationType` from specified column types.
///
/// The `RelationType` will have no keys.
pub fn new(column_types: Vec<ColumnType>) -> Self {
RelationType {
column_types,
keys: Vec::new(),
}
}
/// Adds a new key for the relation.
pub fn with_key(mut self, mut indices: Vec<usize>) -> Self {
indices.sort_unstable();
if !self.keys.contains(&indices) {
self.keys.push(indices);
}
self
}
pub fn with_keys(mut self, keys: Vec<Vec<usize>>) -> Self {
for key in keys {
self = self.with_key(key)
}
self
}
/// Computes the number of columns in the relation.
pub fn arity(&self) -> usize {
self.column_types.len()
}
/// Gets the index of the columns used when creating a default index.
pub fn default_key(&self) -> Vec<usize> {
if let Some(key) = self.keys.first() {
if key.is_empty() {
(0..self.column_types.len()).collect()
} else {
key.clone()
}
} else {
(0..self.column_types.len()).collect()
}
}
/// True if any collection described by `self` could safely be described by `other`.
///
/// In practice this means checking that the scalar types match exactly, and that the
/// nullability of `self` is at least as strict as `other`, and that all keys of `other`
/// contain some key of `self` (as a set of key columns is less strict than any subset).
pub fn subtypes(&self, other: &RelationType) -> bool {
let all_keys = other.keys.iter().all(|key1| {
self.keys
.iter()
.any(|key2| key1.iter().all(|k| key2.contains(k)))
});
if !all_keys {
return false;
}
if self.column_types.len() != other.column_types.len() {
return false;
}
for (col1, col2) in self.column_types.iter().zip(other.column_types.iter()) {
if col1.nullable && !col2.nullable {
return false;
}
if col1.scalar_type != col2.scalar_type {
return false;
}
}
true
}
}
/// The type of a `Value`
///
/// [`ColumnType`] bundles information about the scalar type of a datum (e.g.,
/// Int32 or String) with its nullability.
///
/// To construct a column type, either initialize the struct directly, or
/// use the [`ScalarType::nullable`] method.
#[derive(Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Serialize, Deserialize, Hash)]
pub struct ColumnType {
/// The underlying scalar type (e.g., Int32 or String) of this column.
pub scalar_type: ConcreteDataType,
/// Whether this datum can be null.
#[serde(default = "return_true")]
pub nullable: bool,
}
/// This method exists solely for the purpose of making ColumnType nullable by
/// default in unit tests. The default value of a bool is false, and the only
/// way to make an object take on any other value by default is to pass it a
/// function that returns the desired default value. See
/// <https://github.com/serde-rs/serde/issues/1030>
#[inline(always)]
fn return_true() -> bool {
true
}
/// A description of the shape of a relation.
///
/// It bundles a [`RelationType`] with the name of each column in the relation.
/// Individual column names are optional.
///
/// # Examples
///
/// A `RelationDesc` is typically constructed via its builder API:
///
/// ```
/// use mz_repr::{ColumnType, RelationDesc, ScalarType};
///
/// let desc = RelationDesc::empty()
/// .with_column("id", ScalarType::Int64.nullable(false))
/// .with_column("price", ScalarType::Float64.nullable(true));
/// ```
///
/// In more complicated cases, like when constructing a `RelationDesc` in
/// response to user input, it may be more convenient to construct a relation
/// type first, and imbue it with column names to form a `RelationDesc` later:
///
/// ```
/// use mz_repr::RelationDesc;
///
/// # fn plan_query(_: &str) -> mz_repr::RelationType { mz_repr::RelationType::new(vec![]) }
/// let relation_type = plan_query("SELECT * FROM table");
/// let names = (0..relation_type.arity()).map(|i| match i {
/// 0 => "first",
/// 1 => "second",
/// _ => "unknown",
/// });
/// let desc = RelationDesc::new(relation_type, names);
/// ```
#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize, Hash)]
pub struct RelationDesc {
typ: RelationType,
names: Vec<ColumnName>,
}
impl RelationDesc {
/// Constructs a new `RelationDesc` that represents the empty relation
/// with no columns and no keys.
pub fn empty() -> Self {
RelationDesc {
typ: RelationType::empty(),
names: vec![],
}
}
/// Constructs a new `RelationDesc` from a `RelationType` and an iterator
/// over column names.
///
/// # Panics
///
/// Panics if the arity of the `RelationType` is not equal to the number of
/// items in `names`.
pub fn new<I, N>(typ: RelationType, names: I) -> Self
where
I: IntoIterator<Item = N>,
N: Into<ColumnName>,
{
let names: Vec<_> = names.into_iter().map(|name| name.into()).collect();
assert_eq!(typ.column_types.len(), names.len());
RelationDesc { typ, names }
}
pub fn from_names_and_types<I, T, N>(iter: I) -> Self
where
I: IntoIterator<Item = (N, T)>,
T: Into<ColumnType>,
N: Into<ColumnName>,
{
let (names, types): (Vec<_>, Vec<_>) = iter.into_iter().unzip();
let types = types.into_iter().map(Into::into).collect();
let typ = RelationType::new(types);
Self::new(typ, names)
}
/// Concatenates a `RelationDesc` onto the end of this `RelationDesc`.
pub fn concat(mut self, other: Self) -> Self {
let self_len = self.typ.column_types.len();
self.names.extend(other.names);
self.typ.column_types.extend(other.typ.column_types);
for k in other.typ.keys {
let k = k.into_iter().map(|idx| idx + self_len).collect();
self = self.with_key(k);
}
self
}
/// Appends a column with the specified name and type.
pub fn with_column<N>(mut self, name: N, column_type: ColumnType) -> Self
where
N: Into<ColumnName>,
{
self.typ.column_types.push(column_type);
self.names.push(name.into());
self
}
/// Adds a new key for the relation.
pub fn with_key(mut self, indices: Vec<usize>) -> Self {
self.typ = self.typ.with_key(indices);
self
}
/// Drops all existing keys.
pub fn without_keys(mut self) -> Self {
self.typ.keys.clear();
self
}
/// Builds a new relation description with the column names replaced with
/// new names.
///
/// # Panics
///
/// Panics if the arity of the relation type does not match the number of
/// items in `names`.
pub fn with_names<I, N>(self, names: I) -> Self
where
I: IntoIterator<Item = N>,
N: Into<ColumnName>,
{
Self::new(self.typ, names)
}
/// Computes the number of columns in the relation.
pub fn arity(&self) -> usize {
self.typ.arity()
}
/// Returns the relation type underlying this relation description.
pub fn typ(&self) -> &RelationType {
&self.typ
}
/// Returns an iterator over the columns in this relation.
pub fn iter(&self) -> impl Iterator<Item = (&ColumnName, &ColumnType)> {
self.iter_names().zip(self.iter_types())
}
/// Returns an iterator over the types of the columns in this relation.
pub fn iter_types(&self) -> impl Iterator<Item = &ColumnType> {
self.typ.column_types.iter()
}
/// Returns an iterator over the names of the columns in this relation.
pub fn iter_names(&self) -> impl Iterator<Item = &ColumnName> {
self.names.iter()
}
/// Finds a column by name.
///
/// Returns the index and type of the column named `name`. If no column with
/// the specified name exists, returns `None`. If multiple columns have the
/// specified name, the leftmost column is returned.
pub fn get_by_name(&self, name: &ColumnName) -> Option<(usize, &ColumnType)> {
self.iter_names()
.position(|n| n == name)
.map(|i| (i, &self.typ.column_types[i]))
}
/// Gets the name of the `i`th column.
///
/// # Panics
///
/// Panics if `i` is not a valid column index.
pub fn get_name(&self, i: usize) -> &ColumnName {
&self.names[i]
}
/// Mutably gets the name of the `i`th column.
///
/// # Panics
///
/// Panics if `i` is not a valid column index.
pub fn get_name_mut(&mut self, i: usize) -> &mut ColumnName {
&mut self.names[i]
}
/// Gets the name of the `i`th column if that column name is unambiguous.
///
/// If at least one other column has the same name as the `i`th column,
/// returns `None`. If the `i`th column has no name, returns `None`.
///
/// # Panics
///
/// Panics if `i` is not a valid column index.
pub fn get_unambiguous_name(&self, i: usize) -> Option<&ColumnName> {
let name = &self.names[i];
if self.iter_names().filter(|n| *n == name).count() == 1 {
Some(name)
} else {
None
}
}
}
/// The name of a column in a [`RelationDesc`].
#[derive(Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Serialize, Deserialize, Hash)]
pub struct ColumnName(pub(crate) String);
impl ColumnName {
/// Returns this column name as a `str`.
pub fn as_str(&self) -> &str {
&self.0
}
/// Returns a mutable reference to the string underlying this column name.
pub fn as_mut_str(&mut self) -> &mut String {
&mut self.0
}
}
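A minimal sketch of how the read-side API above composes, assuming it lives in the same module as `RelationDesc`; the `describe_column` helper is hypothetical and only for illustration.
fn describe_column(desc: &RelationDesc, name: &ColumnName) -> Option<String> {
    // Leftmost match wins, per `get_by_name`.
    let (idx, _ty) = desc.get_by_name(name)?;
    // `None` here means another column shares the same name.
    let unambiguous = desc.get_unambiguous_name(idx).is_some();
    Some(format!(
        "column {:?} at index {idx} of {} (unambiguous: {unambiguous})",
        name.as_str(),
        desc.arity(),
    ))
}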


@@ -0,0 +1 @@


@@ -0,0 +1,28 @@
use serde::{Deserialize, Serialize};
// TODO(discord9): more error types
#[derive(Ord, PartialOrd, Clone, Debug, Eq, Deserialize, Serialize, PartialEq, Hash)]
pub enum DataflowError {
EvalError(Box<EvalError>),
}
impl From<EvalError> for DataflowError {
fn from(e: EvalError) -> Self {
DataflowError::EvalError(Box::new(e))
}
}
#[derive(Ord, PartialOrd, Clone, Debug, Eq, Deserialize, Serialize, PartialEq, Hash)]
pub enum EvalError {
DivisionByZero,
TypeMismatch(String),
InvalidArgument(String),
Internal(String),
}
// Checks that `DataflowError` satisfies `ExchangeData`, so it can be exchanged
// between timely workers.
#[test]
fn tell_goal() {
use differential_dataflow::ExchangeData;
fn a<T: ExchangeData>(_: T) {}
a(DataflowError::from(EvalError::DivisionByZero));
}
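A minimal sketch (an assumed helper, not part of the diff) of how the `From<EvalError>` impl above lets evaluation code surface failures as `DataflowError` via `.into()`:
fn checked_div(a: i64, b: i64) -> Result<i64, DataflowError> {
    if b == 0 {
        // Converted into `DataflowError::EvalError(Box::new(..))` by the From impl.
        return Err(EvalError::DivisionByZero.into());
    }
    Ok(a / b)
}
#[test]
fn division_by_zero_becomes_dataflow_error() {
    assert!(matches!(
        checked_div(1, 0),
        Err(DataflowError::EvalError(e)) if *e == EvalError::DivisionByZero
    ));
}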


@@ -0,0 +1,4 @@
//! TODO: Storage layer: wrap gRPC write requests to provide a definite collection for stream processing, send read requests when random access is needed,
//! and store the results of stream processing.
pub(crate) mod errors;


src/flow/src/util/buffer.rs (new file, 150 lines)

@@ -0,0 +1,150 @@
use differential_dataflow::consolidation::consolidate_updates;
use differential_dataflow::difference::Semigroup;
use differential_dataflow::Data;
use timely::communication::Push;
use timely::dataflow::channels::Bundle;
use timely::dataflow::operators::generic::OutputHandle;
use timely::dataflow::operators::{Capability, InputCapability};
use timely::progress::Timestamp;
/// A buffer that consolidates updates
///
/// The buffer wraps an [OutputHandle] and consolidates the elements pushed to it. It is
/// backed by a capacity-limited buffer, so compaction is only effective while the number of
/// unique (data, time) pairs stays below half of the buffer's capacity.
///
/// A capability is retained whenever the current time changes, so the buffer can flush on drop
/// or when the time changes again.
///
/// The buffer is filled with updates until it reaches its capacity. At this point, the updates are
/// consolidated to free up space. This process repeats until consolidation recovers less than
/// half of the buffer's capacity, at which point the buffer is shipped.
///
/// The buffer retains a capability to send data on flush. It will flush all data once dropped, if
/// time changes, or if the buffer capacity is reached.
pub struct ConsolidateBuffer<'a, 'b, T, D: Data, R: Semigroup, P>
where
P: Push<Bundle<T, (D, T, R)>> + 'a,
T: Data + Timestamp + 'a,
D: 'a,
{
// a buffer for records, to send at self.cap
// Invariant: Buffer only contains data if cap is Some.
buffer: Vec<(D, T, R)>,
output_handle: &'b mut OutputHandle<'a, T, (D, T, R), P>,
cap: Option<Capability<T>>,
port: usize,
previous_len: usize,
}
impl<'a, 'b, T, D: Data, R: Semigroup, P> ConsolidateBuffer<'a, 'b, T, D, R, P>
where
T: Data + Timestamp + 'a,
P: Push<Bundle<T, (D, T, R)>> + 'a,
{
/// Creates a new [ConsolidateBuffer] wrapping the provided output handle.
///
/// * `output_handle`: The output to send data to.
/// * `port`: The output port to retain capabilities for.
pub fn new(output_handle: &'b mut OutputHandle<'a, T, (D, T, R), P>, port: usize) -> Self {
Self {
output_handle,
port,
cap: None,
buffer: Vec::with_capacity(::timely::container::buffer::default_capacity::<(D, T, R)>()),
previous_len: 0,
}
}
#[inline]
/// Provides an iterator of elements to the buffer
pub fn give_iterator<I: Iterator<Item = (D, T, R)>>(
&mut self,
cap: &InputCapability<T>,
iter: I,
) {
for item in iter {
self.give(cap, item);
}
}
/// Give an element to the buffer
pub fn give(&mut self, cap: &InputCapability<T>, data: (D, T, R)) {
// Retain a cap for the current time, which will be used on flush.
if self.cap.as_ref().map_or(true, |t| t.time() != cap.time()) {
// Flush on capability change
self.flush();
// Retain capability for the specified output port.
self.cap = Some(cap.delayed_for_output(cap.time(), self.port));
}
self.give_internal(data);
}
/// Give an element to the buffer, using a pre-fabricated capability. Note that the capability
/// must be valid for the associated output.
pub fn give_at(&mut self, cap: &Capability<T>, data: (D, T, R)) {
// Retain a cap for the current time, which will be used on flush.
if self.cap.as_ref().map_or(true, |t| t.time() != cap.time()) {
// Flush on capability change
self.flush();
// Retain capability.
self.cap = Some(cap.clone());
}
self.give_internal(data);
}
/// Give an element and possibly flush the buffer. Note that this needs to have access
/// to a capability, which the public functions ensure.
fn give_internal(&mut self, data: (D, T, R)) {
self.buffer.push(data);
// Limit, if possible, the lifetime of the allocations for data
// and consolidate smaller buffers if we're in the lucky case
// of a small domain for D
if self.buffer.len() >= 2 * self.previous_len {
// Consolidate while the consolidation frees at least half the buffer
consolidate_updates(&mut self.buffer);
if self.buffer.len() > self.buffer.capacity() / 2 {
self.flush();
} else {
self.previous_len = self.buffer.len();
}
// At this point, it is an invariant across give calls that self.previous_len
// will be in the interval [0, self.buffer.capacity() / 2]. So, we will enter
// this if-statement block again when self.buffer.len() == self.buffer.capacity()
// or earlier. If consolidation is not effective to keep self.buffer.len()
// below half capacity, then flushing when more than half-full will
// maintain the invariant.
}
}
/// Flush the internal buffer to the underlying session
pub fn flush(&mut self) {
if let Some(cap) = &self.cap {
self.output_handle.session(cap).give_vec(&mut self.buffer);
// Ensure that the capacity is at least equal to the default in case
// it was reduced by give_vec. Note that we cannot rely here on give_vec
// returning us a buffer with zero capacity.
if self.buffer.capacity() < ::timely::container::buffer::default_capacity::<(D, T, R)>()
{
let to_reserve = ::timely::container::buffer::default_capacity::<(D, T, R)>()
- self.buffer.capacity();
self.buffer.reserve_exact(to_reserve);
}
self.previous_len = 0;
}
}
}
impl<'a, 'b, T, D: Data, R: Semigroup, P> Drop for ConsolidateBuffer<'a, 'b, T, D, R, P>
where
P: Push<Bundle<T, (D, T, R)>> + 'a,
T: Data + Timestamp + 'a,
D: 'a,
{
fn drop(&mut self) {
self.flush();
}
}
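For intuition about what the consolidation step does to the buffered `(data, time, diff)` triples before they are shipped, a small self-contained check; plain `&str`/`u64`/`i64` stand in for `D`/`T`/`R` here:
#[test]
fn consolidation_merges_and_cancels_updates() {
    use differential_dataflow::consolidation::consolidate_updates;

    let mut updates = vec![("a", 0u64, 1i64), ("a", 0, 1), ("b", 0, 1), ("b", 0, -1)];
    consolidate_updates(&mut updates);
    // Equal (data, time) pairs have their diffs summed; entries that cancel to zero are dropped.
    assert_eq!(updates, vec![("a", 0, 2)]);
}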

src/flow/src/util/mod.rs (new file, 7 lines)

@@ -0,0 +1,7 @@
//! Utilities, including extensions to differential dataflow for error handling, etc.
mod buffer;
mod operator;
mod reduce;
pub use operator::CollectionExt;
pub use reduce::ReduceExt;


@@ -0,0 +1,257 @@
use differential_dataflow::difference::{Multiply, Semigroup};
use differential_dataflow::lattice::Lattice;
use differential_dataflow::operators::arrange::Arrange;
use differential_dataflow::trace::{Batch, Trace, TraceReader};
use differential_dataflow::{AsCollection, Collection};
use timely::dataflow::channels::pact::{Exchange, ParallelizationContract, Pipeline};
use timely::dataflow::channels::pushers::Tee;
use timely::dataflow::operators::generic::builder_rc::OperatorBuilder as OperatorBuilderRc;
use timely::dataflow::operators::generic::operator::{self, Operator};
use timely::dataflow::operators::generic::{InputHandle, OperatorInfo, OutputHandle};
use timely::dataflow::operators::Capability;
use timely::dataflow::{Scope, Stream};
use timely::{Data, ExchangeData};
use crate::util::buffer::ConsolidateBuffer;
pub trait StreamExt<G, D1>
where
D1: Data,
G: Scope,
{
/// Like `timely::dataflow::operators::generic::operator::Operator::unary`,
/// but the logic function can handle failures.
///
/// Creates a new dataflow operator that partitions its input stream by a
/// parallelization strategy `pact` and repeatedly invokes `logic`, the
/// function returned by the function passed as `constructor`. The `logic`
/// function can read from the input stream and write to either of two output
/// streams, where the first output stream represents successful
/// computations and the second output stream represents failed
/// computations.
fn unary_fallible<D2, E, B, P>(
&self,
pact: P,
name: &str,
constructor: B,
) -> (Stream<G, D2>, Stream<G, E>)
where
D2: Data,
E: Data,
B: FnOnce(
Capability<G::Timestamp>,
OperatorInfo,
) -> Box<
dyn FnMut(
&mut InputHandle<G::Timestamp, D1, P::Puller>,
&mut OutputHandle<G::Timestamp, D2, Tee<G::Timestamp, D2>>,
&mut OutputHandle<G::Timestamp, E, Tee<G::Timestamp, E>>,
) + 'static,
>,
P: ParallelizationContract<G::Timestamp, D1>;
/// Like [`timely::dataflow::operators::map::Map::flat_map`], but `logic`
/// is allowed to fail. The first returned stream will contain the
/// successful applications of `logic`, while the second returned stream
/// will contain the failed applications.
fn flat_map_fallible<D2, E, I, L>(&self, name: &str, logic: L) -> (Stream<G, D2>, Stream<G, E>)
where
D2: Data,
E: Data,
I: IntoIterator<Item = Result<D2, E>>,
L: FnMut(D1) -> I + 'static;
}
/// Extension methods for differential [`Collection`]s.
pub trait CollectionExt<G, D1, R>
where
G: Scope,
R: Semigroup,
{
/// Creates a new empty collection in `scope`.
fn empty(scope: &G) -> Collection<G, D1, R>;
/// Like [`Collection::map`], but `logic` is allowed to fail. The first
/// returned collection will contain successful applications of `logic`,
/// while the second returned collection will contain the failed
/// applications.
fn map_fallible<D2, E, L>(
&self,
name: &str,
mut logic: L,
) -> (Collection<G, D2, R>, Collection<G, E, R>)
where
D2: Data,
E: Data,
L: FnMut(D1) -> Result<D2, E> + 'static,
{
self.flat_map_fallible(name, move |record| Some(logic(record)))
}
/// Like [`Collection::flat_map`], but `logic` is allowed to fail. The first
/// returned collection will contain the successful applications of `logic`,
/// while the second returned collection will contain the failed
/// applications.
fn flat_map_fallible<D2, E, I, L>(
&self,
name: &str,
logic: L,
) -> (Collection<G, D2, R>, Collection<G, E, R>)
where
D2: Data,
E: Data,
I: IntoIterator<Item = Result<D2, E>>,
L: FnMut(D1) -> I + 'static;
/// Replaces each record with another, with a new difference type.
///
/// This method is most commonly used to take records containing aggregatable data (e.g. numbers to be summed)
/// and move the data into the difference component. This will allow differential dataflow to update in-place.
fn explode_one<D2, R2, L>(&self, logic: L) -> Collection<G, D2, <R2 as Multiply<R>>::Output>
where
D2: differential_dataflow::Data,
R2: Semigroup + Multiply<R>,
<R2 as Multiply<R>>::Output: Data + Semigroup,
L: FnMut(D1) -> (D2, R2) + 'static,
G::Timestamp: Lattice;
}
impl<G, D1> StreamExt<G, D1> for Stream<G, D1>
where
D1: Data,
G: Scope,
{
fn unary_fallible<D2, E, B, P>(
&self,
pact: P,
name: &str,
constructor: B,
) -> (Stream<G, D2>, Stream<G, E>)
where
D2: Data,
E: Data,
B: FnOnce(
Capability<G::Timestamp>,
OperatorInfo,
) -> Box<
dyn FnMut(
&mut InputHandle<G::Timestamp, D1, P::Puller>,
&mut OutputHandle<G::Timestamp, D2, Tee<G::Timestamp, D2>>,
&mut OutputHandle<G::Timestamp, E, Tee<G::Timestamp, E>>,
) + 'static,
>,
P: ParallelizationContract<G::Timestamp, D1>,
{
let mut builder = OperatorBuilderRc::new(name.into(), self.scope());
builder.set_notify(false);
let operator_info = builder.operator_info();
let mut input = builder.new_input(self, pact);
let (mut ok_output, ok_stream) = builder.new_output();
let (mut err_output, err_stream) = builder.new_output();
builder.build(move |mut capabilities| {
// `capabilities` should be a single-element vector.
let capability = capabilities.pop().unwrap();
let mut logic = constructor(capability, operator_info);
move |_frontiers| {
let mut ok_output_handle = ok_output.activate();
let mut err_output_handle = err_output.activate();
logic(&mut input, &mut ok_output_handle, &mut err_output_handle);
}
});
(ok_stream, err_stream)
}
#[allow(clippy::redundant_closure)]
fn flat_map_fallible<D2, E, I, L>(
&self,
name: &str,
mut logic: L,
) -> (Stream<G, D2>, Stream<G, E>)
where
D2: Data,
E: Data,
I: IntoIterator<Item = Result<D2, E>>,
L: FnMut(D1) -> I + 'static,
{
let mut storage = Vec::new();
self.unary_fallible(Pipeline, name, move |_, _| {
Box::new(move |input, ok_output, err_output| {
input.for_each(|time, data| {
let mut ok_session = ok_output.session(&time);
let mut err_session = err_output.session(&time);
data.swap(&mut storage);
for r in storage.drain(..).flat_map(|d1| logic(d1)) {
match r {
Ok(d2) => ok_session.give(d2),
Err(e) => err_session.give(e),
}
}
})
})
})
}
}
impl<G, D1, R> CollectionExt<G, D1, R> for Collection<G, D1, R>
where
G: Scope,
G::Timestamp: Data,
D1: Data,
R: Semigroup,
{
fn empty(scope: &G) -> Collection<G, D1, R> {
operator::empty(scope).as_collection()
}
fn flat_map_fallible<D2, E, I, L>(
&self,
name: &str,
mut logic: L,
) -> (Collection<G, D2, R>, Collection<G, E, R>)
where
D2: Data,
E: Data,
I: IntoIterator<Item = Result<D2, E>>,
L: FnMut(D1) -> I + 'static,
{
let (ok_stream, err_stream) = self.inner.flat_map_fallible(name, move |(d1, t, r)| {
logic(d1).into_iter().map(move |res| match res {
Ok(d2) => Ok((d2, t.clone(), r.clone())),
Err(e) => Err((e, t.clone(), r.clone())),
})
});
(ok_stream.as_collection(), err_stream.as_collection())
}
fn explode_one<D2, R2, L>(&self, mut logic: L) -> Collection<G, D2, <R2 as Multiply<R>>::Output>
where
D2: differential_dataflow::Data,
R2: Semigroup + Multiply<R>,
<R2 as Multiply<R>>::Output: Data + Semigroup,
L: FnMut(D1) -> (D2, R2) + 'static,
G::Timestamp: Lattice,
{
self.inner
.unary(Pipeline, "ExplodeOne", move |_, _| {
let mut buffer = Vec::new();
move |input, output| {
let mut out = ConsolidateBuffer::new(output, 0);
input.for_each(|time, data| {
data.swap(&mut buffer);
out.give_iterator(
&time,
buffer.drain(..).map(|(x, t, d)| {
let (x, d2) = logic(x);
(x, t, d2.multiply(&d))
}),
);
});
}
})
.as_collection()
}
}
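A hedged usage sketch of `map_fallible`: split a collection of strings into parsed integers and parse-error messages. It assumes the `CollectionExt` trait above is in scope via the `util` module re-export; the `parse_ints` function itself is hypothetical.
use differential_dataflow::Collection;
use timely::dataflow::Scope;

use crate::util::CollectionExt;

fn parse_ints<G: Scope>(
    input: &Collection<G, String, isize>,
) -> (Collection<G, i64, isize>, Collection<G, String, isize>) {
    // Successful parses flow into the first collection, error strings into the second.
    input.map_fallible("ParseInts", |s| {
        s.parse::<i64>()
            .map_err(|err| format!("failed to parse {s:?}: {err}"))
    })
}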


@@ -0,0 +1,68 @@
use differential_dataflow::difference::{Abelian, Semigroup};
use differential_dataflow::lattice::Lattice;
use differential_dataflow::operators::arrange::{Arranged, TraceAgent};
use differential_dataflow::operators::reduce::ReduceCore;
use differential_dataflow::trace::{Batch, Trace, TraceReader};
use differential_dataflow::Data;
use timely::dataflow::Scope;
/// Extension trait for `ReduceCore`, currently providing a reduction based
/// on an operator-pair approach.
pub trait ReduceExt<G: Scope, K: Data, V: Data, R: Semigroup>
where
G::Timestamp: Lattice + Ord,
{
/// This method produces a reduction pair based on the same input arrangement. Each reduction
/// in the pair operates with its own logic and the two output arrangements from the reductions
/// are produced as a result. The method is useful for reductions that need to present different
/// output views on the same input data. An example is producing an error-free reduction output
/// along with a separate error output indicating when the error-free output is valid.
fn reduce_pair<L1, T1, L2, T2>(
&self,
name1: &str,
name2: &str,
logic1: L1,
logic2: L2,
) -> (Arranged<G, TraceAgent<T1>>, Arranged<G, TraceAgent<T2>>)
where
T1: Trace + TraceReader<Key = K, Time = G::Timestamp> + 'static,
T1::Val: Data,
T1::R: Abelian,
T1::Batch: Batch,
L1: FnMut(&K, &[(&V, R)], &mut Vec<(T1::Val, T1::R)>) + 'static,
T2: Trace + TraceReader<Key = K, Time = G::Timestamp> + 'static,
T2::Val: Data,
T2::R: Abelian,
T2::Batch: Batch,
L2: FnMut(&K, &[(&V, R)], &mut Vec<(T2::Val, T2::R)>) + 'static;
}
impl<G: Scope, K: Data, V: Data, Tr, R: Semigroup> ReduceExt<G, K, V, R> for Arranged<G, Tr>
where
G::Timestamp: Lattice + Ord,
Tr: TraceReader<Key = K, Val = V, Time = G::Timestamp, R = R> + Clone + 'static,
{
fn reduce_pair<L1, T1, L2, T2>(
&self,
name1: &str,
name2: &str,
logic1: L1,
logic2: L2,
) -> (Arranged<G, TraceAgent<T1>>, Arranged<G, TraceAgent<T2>>)
where
T1: Trace + TraceReader<Key = K, Time = G::Timestamp> + 'static,
T1::Val: Data,
T1::R: Abelian,
T1::Batch: Batch,
L1: FnMut(&K, &[(&V, R)], &mut Vec<(T1::Val, T1::R)>) + 'static,
T2: Trace + TraceReader<Key = K, Time = G::Timestamp> + 'static,
T2::Val: Data,
T2::R: Abelian,
T2::Batch: Batch,
L2: FnMut(&K, &[(&V, R)], &mut Vec<(T2::Val, T2::R)>) + 'static,
{
let arranged1 = self.reduce_abelian::<L1, T1>(name1, logic1);
let arranged2 = self.reduce_abelian::<L2, T2>(name2, logic2);
(arranged1, arranged2)
}
}


@@ -39,7 +39,7 @@ datatypes = { workspace = true }
file-table-engine = { workspace = true }
futures = "0.3"
futures-util.workspace = true
humantime-serde = "1.1"
humantime-serde.workspace = true
itertools.workspace = true
meta-client = { workspace = true }
# Although it is not used, please do not delete it.


@@ -38,7 +38,6 @@ use common_meta::key::table_info::TableInfoKey;
use common_meta::key::table_name::TableNameKey;
use common_meta::key::{TableMetaKey, TableMetadataManagerRef};
use common_meta::kv_backend::KvBackendRef;
use common_meta::table_name::TableName;
use common_telemetry::debug;
use futures_util::TryStreamExt;
use partition::manager::PartitionRuleManagerRef;
@@ -417,12 +416,7 @@ impl CatalogManager for FrontendCatalogManager {
.try_into()
.context(catalog_err::InvalidTableInfoInCatalogSnafu)?,
);
let table = Arc::new(DistTable::new(
TableName::new(catalog, schema, table_name),
table_info,
Arc::new(self.clone()),
));
Ok(Some(table))
Ok(Some(DistTable::table(table_info)))
}
fn as_any(&self) -> &dyn Any {


@@ -513,7 +513,7 @@ pub enum Error {
},
#[snafu(display("Failed to read record batch, source: {}", source))]
ReadRecordBatch {
ReadDfRecordBatch {
source: datafusion::error::DataFusionError,
location: Location,
},
@@ -600,6 +600,18 @@ pub enum Error {
#[snafu(display("Empty data: {}", msg))]
EmptyData { msg: String, location: Location },
#[snafu(display("Failed to read record batch, source: {}", source))]
ReadRecordBatch {
source: common_recordbatch::error::Error,
location: Location,
},
#[snafu(display("Failed to build column vectors, source: {}", source))]
BuildColumnVectors {
source: common_recordbatch::error::Error,
location: Location,
},
}
pub type Result<T> = std::result::Result<T, Error>;
@@ -678,7 +690,7 @@ impl ErrorExt for Error {
Error::JoinTask { .. }
| Error::BuildParquetRecordBatchStream { .. }
| Error::ReadRecordBatch { .. }
| Error::ReadDfRecordBatch { .. }
| Error::BuildFileStream { .. }
| Error::WriteStreamToFile { .. }
| Error::Unexpected { .. } => StatusCode::Unexpected,
@@ -731,6 +743,10 @@ impl ErrorExt for Error {
Error::WriteParquet { source, .. } => source.status_code(),
Error::InvalidCopyParameter { .. } => StatusCode::InvalidArguments,
Error::ReadRecordBatch { source, .. } | Error::BuildColumnVectors { source, .. } => {
source.status_code()
}
}
}


@@ -21,7 +21,7 @@ use servers::Mode;
use crate::service_config::{
DatanodeOptions, GrpcOptions, InfluxdbOptions, MysqlOptions, OpentsdbOptions, OtlpOptions,
PostgresOptions, PromStoreOptions, PrometheusOptions,
PostgresOptions, PromStoreOptions,
};
#[derive(Clone, Debug, Serialize, Deserialize)]
@@ -37,7 +37,6 @@ pub struct FrontendOptions {
pub opentsdb_options: Option<OpentsdbOptions>,
pub influxdb_options: Option<InfluxdbOptions>,
pub prom_store_options: Option<PromStoreOptions>,
pub prometheus_options: Option<PrometheusOptions>,
pub otlp_options: Option<OtlpOptions>,
pub meta_client_options: Option<MetaClientOptions>,
pub logging: LoggingOptions,
@@ -57,7 +56,6 @@ impl Default for FrontendOptions {
opentsdb_options: Some(OpentsdbOptions::default()),
influxdb_options: Some(InfluxdbOptions::default()),
prom_store_options: Some(PromStoreOptions::default()),
prometheus_options: Some(PrometheusOptions::default()),
otlp_options: Some(OtlpOptions::default()),
meta_client_options: None,
logging: LoggingOptions::default(),


@@ -38,7 +38,7 @@ use catalog::remote::CachedMetaKvBackend;
use catalog::CatalogManagerRef;
use client::client_manager::DatanodeClients;
use common_base::Plugins;
use common_catalog::consts::MITO_ENGINE;
use common_catalog::consts::default_engine;
use common_error::ext::BoxedError;
use common_grpc::channel_manager::{ChannelConfig, ChannelManager};
use common_meta::heartbeat::handler::parse_mailbox_message::ParseMailboxMessageHandler;
@@ -64,7 +64,7 @@ use servers::error::{AuthSnafu, ExecuteQuerySnafu, ParsePromQLSnafu};
use servers::interceptor::{
PromQueryInterceptor, PromQueryInterceptorRef, SqlQueryInterceptor, SqlQueryInterceptorRef,
};
use servers::prometheus::PrometheusHandler;
use servers::prometheus_handler::PrometheusHandler;
use servers::query_handler::grpc::{GrpcQueryHandler, GrpcQueryHandlerRef};
use servers::query_handler::sql::SqlQueryHandler;
use servers::query_handler::{
@@ -213,7 +213,6 @@ impl Instance {
let create_expr_factory = CreateExprFactory;
let row_inserter = Arc::new(RowInserter::new(
MITO_ENGINE.to_string(),
catalog_manager.clone(),
create_expr_factory,
dist_instance.clone(),
@@ -286,7 +285,6 @@ impl Instance {
let grpc_query_handler = StandaloneGrpcQueryHandler::arc(dn_instance.clone());
let row_inserter = Arc::new(RowInserter::new(
MITO_ENGINE.to_string(),
catalog_manager.clone(),
create_expr_factory,
grpc_query_handler.clone(),
@@ -366,7 +364,7 @@ impl Instance {
catalog_name, schema_name, table_name,
);
let _ = self
.create_table_by_columns(ctx, table_name, columns, MITO_ENGINE)
.create_table_by_columns(ctx, table_name, columns, default_engine())
.await?;
info!(
"Successfully created table on insertion: {}.{}.{}",


@@ -14,6 +14,7 @@
pub mod deleter;
pub(crate) mod inserter;
pub(crate) mod row_inserter;
use std::collections::HashMap;
use std::sync::Arc;
@@ -23,7 +24,7 @@ use api::v1::ddl_request::Expr as DdlExpr;
use api::v1::greptime_request::Request;
use api::v1::{
column_def, AlterExpr, CompactTableExpr, CreateDatabaseExpr, CreateTableExpr, DeleteRequests,
FlushTableExpr, InsertRequests, TruncateTableExpr,
FlushTableExpr, InsertRequests, RowInsertRequests, TruncateTableExpr,
};
use async_trait::async_trait;
use catalog::{CatalogManager, DeregisterTableRequest, RegisterTableRequest};
@@ -33,7 +34,7 @@ use client::Database;
use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
use common_catalog::format_full_table_name;
use common_error::ext::BoxedError;
use common_meta::key::schema_name::SchemaNameKey;
use common_meta::key::schema_name::{SchemaNameKey, SchemaNameValue};
use common_meta::peer::Peer;
use common_meta::rpc::ddl::{DdlTask, SubmitDdlTaskRequest, SubmitDdlTaskResponse};
use common_meta::rpc::router::{Partition, Partition as MetaPartition, RouteRequest};
@@ -57,6 +58,7 @@ use sql::statements::create::{PartitionEntry, Partitions};
use sql::statements::statement::Statement;
use sql::statements::{self, sql_value_to_value};
use store_api::storage::RegionNumber;
use table::error::TableOperationSnafu;
use table::metadata::{RawTableInfo, RawTableMeta, TableId, TableIdent, TableInfo, TableType};
use table::requests::{AlterTableRequest, TableOptions};
use table::TableRef;
@@ -66,11 +68,12 @@ use crate::error::{
self, AlterExprToRequestSnafu, CatalogSnafu, ColumnDataTypeSnafu, ColumnNotFoundSnafu,
DeserializePartitionSnafu, InvokeDatanodeSnafu, NotSupportedSnafu, ParseSqlSnafu,
RequestDatanodeSnafu, RequestMetaSnafu, Result, SchemaExistsSnafu, TableAlreadyExistSnafu,
TableNotFoundSnafu, TableSnafu, UnrecognizedTableOptionSnafu,
TableMetadataManagerSnafu, TableNotFoundSnafu, TableSnafu, UnrecognizedTableOptionSnafu,
};
use crate::expr_factory;
use crate::instance::distributed::deleter::DistDeleter;
use crate::instance::distributed::inserter::DistInserter;
use crate::instance::distributed::row_inserter::RowDistInserter;
use crate::table::DistTable;
const MAX_VALUE: &str = "MAXVALUE";
@@ -101,6 +104,18 @@ impl DistInstance {
partitions: Option<Partitions>,
) -> Result<TableRef> {
let _timer = common_telemetry::timer!(crate::metrics::DIST_CREATE_TABLE);
// 1. get schema info
let schema_value = self
.catalog_manager
.table_metadata_manager_ref()
.schema_manager()
.get(SchemaNameKey::new(
&create_table.catalog_name,
&create_table.schema_name,
))
.await
.context(TableMetadataManagerSnafu)?;
let table_name = TableName::new(
&create_table.catalog_name,
&create_table.schema_name,
@@ -109,7 +124,7 @@ impl DistInstance {
let (partitions, partition_cols) = parse_partitions(create_table, partitions)?;
let mut table_info = create_table_info(create_table, partition_cols)?;
let mut table_info = create_table_info(create_table, partition_cols, schema_value)?;
let resp = self
.create_table_procedure(create_table, partitions, table_info.clone())
@@ -121,15 +136,12 @@ impl DistInstance {
info!("Successfully created distributed table '{table_name}' with table id {table_id}");
table_info.ident.table_id = table_id;
let table_info = Arc::new(table_info.try_into().context(error::CreateTableInfoSnafu)?);
create_table.table_id = Some(api::v1::TableId { id: table_id });
let table = Arc::new(DistTable::new(
table_name.clone(),
table_info,
self.catalog_manager.clone(),
));
let table = DistTable::table(table_info);
let request = RegisterTableRequest {
catalog: table_name.catalog_name.clone(),
@@ -148,10 +160,7 @@ impl DistInstance {
}
);
// Since the table information created on meta does not go through KvBackend, so we
// manually invalidate the cache here.
//
// TODO(fys): when the meta invalidation cache mechanism is established, remove it.
// Invalidates local cache ASAP.
self.catalog_manager
.invalidate_table(
&table_name.catalog_name,
@@ -191,10 +200,7 @@ impl DistInstance {
.await
.context(CatalogSnafu)?;
// Since the table information dropped on meta does not go through KvBackend, so we
// manually invalidate the cache here.
//
// TODO(fys): when the meta invalidation cache mechanism is established, remove it.
// Invalidates local cache ASAP.
self.catalog_manager()
.invalidate_table(
&table_name.catalog_name,
@@ -346,6 +352,7 @@ impl DistInstance {
let expr = CreateDatabaseExpr {
database_name: stmt.name.to_string(),
create_if_not_exists: stmt.if_not_exists,
options: Default::default(),
};
self.handle_create_database(expr, query_ctx).await
}
@@ -372,26 +379,24 @@ impl DistInstance {
self.drop_table(table_name).await
}
Statement::Insert(insert) => {
let (catalog, schema, table) =
let (catalog, schema, _) =
table_idents_to_full_name(insert.table_name(), query_ctx.clone())
.map_err(BoxedError::new)
.context(error::ExternalSnafu)?;
let table = self
.catalog_manager
.table(&catalog, &schema, &table)
.await
.context(CatalogSnafu)?
.context(TableNotFoundSnafu { table_name: table })?;
let insert_request =
SqlHandler::insert_to_request(self.catalog_manager.clone(), &insert, query_ctx)
.await
.context(InvokeDatanodeSnafu)?;
Ok(Output::AffectedRows(
table.insert(insert_request).await.context(TableSnafu)?,
))
let inserter = DistInserter::new(catalog, schema, self.catalog_manager.clone());
let affected_rows = inserter
.insert(vec![insert_request])
.await
.map_err(BoxedError::new)
.context(TableOperationSnafu)
.context(TableSnafu)?;
Ok(Output::AffectedRows(affected_rows as usize))
}
Statement::ShowCreateTable(show) => {
let (catalog, schema, table) =
@@ -407,7 +412,8 @@ impl DistInstance {
.context(TableNotFoundSnafu { table_name: &table })?;
let table_name = TableName::new(catalog, schema, table);
self.show_create_table(table_name, table_ref).await
self.show_create_table(table_name, table_ref, query_ctx.clone())
.await
}
Statement::TruncateTable(stmt) => {
let (catalog, schema, table) =
@@ -424,7 +430,12 @@ impl DistInstance {
}
}
async fn show_create_table(&self, table_name: TableName, table: TableRef) -> Result<Output> {
async fn show_create_table(
&self,
table_name: TableName,
table: TableRef,
query_ctx: QueryContextRef,
) -> Result<Output> {
let partitions = self
.catalog_manager
.partition_manager()
@@ -436,7 +447,8 @@ impl DistInstance {
let partitions = create_partitions_stmt(partitions)?;
query::sql::show_create_table(table, partitions).context(error::ExecuteStatementSnafu)
query::sql::show_create_table(table, partitions, query_ctx)
.context(error::ExecuteStatementSnafu)
}
/// Handles distributed database creation
@@ -478,10 +490,12 @@ impl DistInstance {
}
);
let schema_value =
SchemaNameValue::try_from(&expr.options).context(error::TableMetadataManagerSnafu)?;
self.catalog_manager
.table_metadata_manager_ref()
.schema_manager()
.create(schema)
.create(schema, Some(schema_value))
.await
.context(error::TableMetadataManagerSnafu)?;
@@ -556,6 +570,11 @@ impl DistInstance {
.await
.context(error::RequestMetaSnafu)?;
// Invalidates local cache ASAP.
self.catalog_manager()
.invalidate_table(catalog_name, schema_name, table_name, table_id)
.await;
Ok(Output::AffectedRows(0))
}
@@ -625,6 +644,20 @@ impl DistInstance {
Ok(Output::AffectedRows(affected_rows as usize))
}
async fn handle_row_dist_insert(
&self,
requests: RowInsertRequests,
ctx: QueryContextRef,
) -> Result<Output> {
let inserter = RowDistInserter::new(
ctx.current_catalog().to_owned(),
ctx.current_schema().to_owned(),
self.catalog_manager.clone(),
);
let affected_rows = inserter.insert(requests).await?;
Ok(Output::AffectedRows(affected_rows as usize))
}
async fn handle_dist_delete(
&self,
request: DeleteRequests,
@@ -665,8 +698,9 @@ impl GrpcQueryHandler for DistInstance {
async fn do_query(&self, request: Request, ctx: QueryContextRef) -> Result<Output> {
match request {
Request::Inserts(requests) => self.handle_dist_insert(requests, ctx).await,
Request::RowInserts(_) | Request::RowDeletes(_) => NotSupportedSnafu {
feat: "row inserts/deletes",
Request::RowInserts(requests) => self.handle_row_dist_insert(requests, ctx).await,
Request::RowDeletes(_) => NotSupportedSnafu {
feat: "row deletes",
}
.fail(),
Request::Deletes(requests) => self.handle_dist_delete(requests, ctx).await,
@@ -726,7 +760,7 @@ fn create_partitions_stmt(partitions: Vec<PartitionInfo>) -> Result<Option<Parti
.into_iter()
.map(|info| {
// Generated the partition name from id
let name = &format!("r{}", info.id.as_u64());
let name = &format!("r{}", info.id.region_number());
let bounds = info.partition.partition_bounds();
let value_list = bounds
.iter()
@@ -753,6 +787,7 @@ fn create_partitions_stmt(partitions: Vec<PartitionInfo>) -> Result<Option<Parti
fn create_table_info(
create_table: &CreateTableExpr,
partition_columns: Vec<String>,
schema_opts: Option<SchemaNameValue>,
) -> Result<RawTableInfo> {
let mut column_schemas = Vec::with_capacity(create_table.column_defs.len());
let mut column_name_to_index_map = HashMap::new();
@@ -799,6 +834,10 @@ fn create_table_info(
})
.collect::<Result<Vec<_>>>()?;
let table_options = TableOptions::try_from(&create_table.table_options)
.context(UnrecognizedTableOptionSnafu)?;
let table_options = merge_options(table_options, schema_opts);
let meta = RawTableMeta {
schema: raw_schema,
primary_key_indices,
@@ -807,8 +846,7 @@ fn create_table_info(
next_column_id: column_schemas.len() as u32,
region_numbers: vec![],
engine_options: HashMap::new(),
options: TableOptions::try_from(&create_table.table_options)
.context(UnrecognizedTableOptionSnafu)?,
options: table_options,
created_on: DateTime::default(),
partition_key_indices,
};
@@ -835,6 +873,14 @@ fn create_table_info(
Ok(table_info)
}
fn merge_options(
mut table_opts: TableOptions,
schema_opts: Option<SchemaNameValue>,
) -> TableOptions {
table_opts.ttl = table_opts.ttl.or(schema_opts.and_then(|s| s.ttl));
table_opts
}
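A hypothetical stand-in (plain `Option`s instead of `TableOptions`/`SchemaNameValue`) illustrating the precedence `merge_options` implements: a TTL set on the table wins, otherwise the schema-level TTL is inherited.
fn merged_ttl(table_ttl: Option<u64>, schema_ttl: Option<u64>) -> Option<u64> {
    // Mirrors `table_opts.ttl.or(schema_opts.and_then(|s| s.ttl))` above.
    table_ttl.or(schema_ttl)
}
#[test]
fn ttl_precedence() {
    assert_eq!(merged_ttl(Some(60), Some(3600)), Some(60)); // table-level TTL wins
    assert_eq!(merged_ttl(None, Some(3600)), Some(3600)); // falls back to the schema TTL
}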
fn parse_partitions(
create_table: &CreateTableExpr,
partitions: Option<Partitions>,

Some files were not shown because too many files have changed in this diff.