refactor: Remove usages of the old storage crate (#2777)

* chore: remove storage from some crate
* feat: remove storage config
* feat: remove storage from cmd
* feat: impl stream_to_parquet
* feat: remove storage from operator
* feat: remove stream writer from mito2
* feat: remove storage from project toml
* test: fix config api test
* docs: remove outdated configs
* refactor: remove storage directory
2026-05-27 02:10:38 +00:00 · 2023-11-20 20:29:41 +08:00
parent 9558b3c201
commit b9146c88ff
118 changed files with 160 additions and 27554 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -641,12 +641,6 @@ version = "0.5.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c59bdb34bc650a32731b31bd8f0829cc15d24a708ee31559e0bb34f2bc320cba"

-[[package]]
-name = "atomic_float"
-version = "0.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "62af46d040ba9df09edc6528dae9d8e49f5f3e82f55b7d2ec31a733c38dbc49d"
-
 [[package]]
 name = "atty"
 version = "0.2.14"
@@ -1205,7 +1199,6 @@ dependencies = [
 "serde_json",
 "session",
 "snafu",
- "storage",
 "store-api",
 "table",
 "tokio",
@@ -1628,11 +1621,13 @@ dependencies = [
 "common-runtime",
 "common-test-util",
 "datafusion",
+ "datatypes",
 "derive_builder 0.12.0",
 "futures",
 "lazy_static",
 "object-store",
 "orc-rust",
+ "parquet",
 "paste",
 "regex",
 "serde",
@@ -1722,7 +1717,7 @@ dependencies = [
 "common-runtime",
 "common-telemetry",
 "common-time",
- "criterion 0.4.0",
+ "criterion",
 "dashmap",
 "datafusion",
 "datatypes",
@@ -2142,32 +2137,6 @@ dependencies = [
 "cfg-if 1.0.0",
 ]

-[[package]]
-name = "criterion"
-version = "0.3.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b01d6de93b2b6c65e17c634a26653a29d107b3c98c607c765bf38d041531cd8f"
-dependencies = [
- "atty",
- "cast",
- "clap 2.34.0",
- "criterion-plot 0.4.5",
- "csv",
- "itertools 0.10.5",
- "lazy_static",
- "num-traits",
- "oorandom",
- "plotters",
- "rayon",
- "regex",
- "serde",
- "serde_cbor",
- "serde_derive",
- "serde_json",
- "tinytemplate",
- "walkdir",
-]
-
 [[package]]
 name = "criterion"
 version = "0.4.0"
@@ -2179,7 +2148,7 @@ dependencies = [
 "cast",
 "ciborium",
 "clap 3.2.25",
- "criterion-plot 0.5.0",
+ "criterion-plot",
 "futures",
 "itertools 0.10.5",
 "lazy_static",
@@ -2196,16 +2165,6 @@ dependencies = [
 "walkdir",
 ]

-[[package]]
-name = "criterion-plot"
-version = "0.4.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2673cc8207403546f45f5fd319a974b1e6983ad1a3ee7e6041650013be041876"
-dependencies = [
- "cast",
- "itertools 0.10.5",
-]
-
 [[package]]
 name = "criterion-plot"
 version = "0.5.0"
@@ -2681,7 +2640,6 @@ dependencies = [
 "session",
 "snafu",
 "sql",
- "storage",
 "store-api",
 "substrait 0.4.3",
 "table",
@@ -3313,7 +3271,6 @@ dependencies = [
 "snafu",
 "sql",
 "sqlparser 0.38.0 (git+https://github.com/GreptimeTeam/sqlparser-rs.git?rev=0fbae07d0c46dc18e3381c406d8b9b8abef6b1fd)",
- "storage",
 "store-api",
 "strfmt",
 "substrait 0.4.3",
@@ -5569,7 +5526,6 @@ dependencies = [
 "snafu",
 "sql",
 "sqlparser 0.38.0 (git+https://github.com/GreptimeTeam/sqlparser-rs.git?rev=0fbae07d0c46dc18e3381c406d8b9b8abef6b1fd)",
- "storage",
 "store-api",
 "substrait 0.4.3",
 "table",
@@ -7966,7 +7922,7 @@ dependencies = [
 "common-test-util",
 "common-time",
 "console",
- "criterion 0.4.0",
+ "criterion",
 "crossbeam-utils",
 "datafusion",
 "datafusion-common",
@@ -7998,7 +7954,6 @@ dependencies = [
 "session",
 "snafu",
 "sql",
- "storage",
 "store-api",
 "table",
 "tokio",
@@ -8078,16 +8033,6 @@ dependencies = [
 "serde_derive",
 ]

-[[package]]
-name = "serde_cbor"
-version = "0.11.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2bef2ebfde456fb76bbcf9f59315333decc4fda0b2b44b420243c11e0f5ec1f5"
-dependencies = [
- "half 1.8.2",
- "serde",
-]
-
 [[package]]
 name = "serde_derive"
 version = "1.0.190"
@@ -8829,60 +8774,6 @@ dependencies = [
 "num-traits",
 ]

-[[package]]
-name = "storage"
-version = "0.4.3"
-dependencies = [
- "api",
- "arc-swap",
- "arrow",
- "arrow-array",
- "async-compat",
- "async-stream",
- "async-trait",
- "atomic_float",
- "bytes",
- "common-base",
- "common-config",
- "common-datasource",
- "common-error",
- "common-macro",
- "common-query",
- "common-recordbatch",
- "common-runtime",
- "common-telemetry",
- "common-test-util",
- "common-time",
- "criterion 0.3.6",
- "datafusion",
- "datafusion-common",
- "datafusion-expr",
- "datafusion-physical-expr",
- "datatypes",
- "futures",
- "futures-util",
- "itertools 0.10.5",
- "lazy_static",
- "log-store",
- "object-store",
- "parquet",
- "paste",
- "prometheus",
- "prost 0.12.1",
- "rand",
- "regex",
- "serde",
- "serde_json",
- "snafu",
- "store-api",
- "table",
- "tokio",
- "tokio-util",
- "tonic 0.10.2",
- "tonic-build 0.9.2",
- "uuid",
-]
-
 [[package]]
 name = "store-api"
 version = "0.4.3"
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -49,7 +49,6 @@ members = [
    "src/servers",
    "src/session",
    "src/sql",
-    "src/storage",
    "src/store-api",
    "src/table",
    "tests-integration",
@@ -176,7 +175,6 @@ script = { path = "src/script" }
 servers = { path = "src/servers" }
 session = { path = "src/session" }
 sql = { path = "src/sql" }
-storage = { path = "src/storage" }
 store-api = { path = "src/store-api" }
 substrait = { path = "src/common/substrait" }
 table = { path = "src/table" }
--- a/config/datanode.example.toml
+++ b/config/datanode.example.toml
@@ -53,33 +53,6 @@ type = "File"
 # The local file cache capacity in bytes.
 # cache_capacity = "256MB"

-# Compaction options, see `standalone.example.toml`.
-[storage.compaction]
-max_inflight_tasks = 4
-max_files_in_level0 = 8
-max_purge_tasks = 32
-
-# Storage manifest options
-[storage.manifest]
-# Region checkpoint actions margin.
-# Create a checkpoint every <checkpoint_margin> actions.
-checkpoint_margin = 10
-# Region manifest logs and checkpoints gc execution duration
-gc_duration = '10m'
-
-# Storage flush options
-[storage.flush]
-# Max inflight flush tasks.
-max_flush_tasks = 8
-# Default write buffer size for a region.
-region_write_buffer_size = "32MB"
-# Interval to check whether a region needs flush.
-picker_schedule_interval = "5m"
-# Interval to auto flush a region if it has not flushed yet.
-auto_flush_interval = "1h"
-# Global write buffer size for all regions.
-global_write_buffer_size = "1GB"
-
 # Mito engine options
 [[region_engine]]
 [region_engine.mito]
--- a/config/standalone.example.toml
+++ b/config/standalone.example.toml
@@ -122,36 +122,6 @@ type = "File"
 # The local file cache capacity in bytes.
 # cache_capacity = "256MB"

-# Compaction options.
-[storage.compaction]
-# Max task number that can concurrently run.
-max_inflight_tasks = 4
-# Max files in level 0 to trigger compaction.
-max_files_in_level0 = 8
-# Max task number for SST purge task after compaction.
-max_purge_tasks = 32
-
-# Storage manifest options
-[storage.manifest]
-# Region checkpoint actions margin.
-# Create a checkpoint every <checkpoint_margin> actions.
-checkpoint_margin = 10
-# Region manifest logs and checkpoints gc execution duration
-gc_duration = '10m'
-
-# Storage flush options
-[storage.flush]
-# Max inflight flush tasks.
-max_flush_tasks = 8
-# Default write buffer size for a region.
-region_write_buffer_size = "32MB"
-# Interval to check whether a region needs flush.
-picker_schedule_interval = "5m"
-# Interval to auto flush a region if it has not flushed yet.
-auto_flush_interval = "1h"
-# Global write buffer size for all regions.
-global_write_buffer_size = "1GB"
-
 # Mito engine options
 [[region_engine]]
 [region_engine.mito]
--- a/src/catalog/Cargo.toml
+++ b/src/catalog/Cargo.toml
@@ -49,5 +49,4 @@ chrono.workspace = true
 common-test-util.workspace = true
 log-store.workspace = true
 object-store.workspace = true
-storage.workspace = true
 tokio.workspace = true
--- a/src/cmd/src/datanode.rs
+++ b/src/cmd/src/datanode.rs
@@ -192,7 +192,7 @@ mod tests {
    use std::time::Duration;

    use common_test_util::temp_dir::create_named_temp_file;
-    use datanode::config::{CompactionConfig, FileConfig, ObjectStoreConfig, RegionManifestConfig};
+    use datanode::config::{FileConfig, ObjectStoreConfig};
    use servers::heartbeat_options::HeartbeatOptions;
    use servers::Mode;

@@ -232,16 +232,6 @@ mod tests {
            type = "File"
            data_home = "/tmp/greptimedb/"

-            [storage.compaction]
-            max_inflight_tasks = 3
-            max_files_in_level0 = 7
-            max_purge_tasks = 32
-
-            [storage.manifest]
-            checkpoint_margin = 9
-            gc_duration = '7s'
-            compress = true
-
            [logging]
            level = "debug"
            dir = "/tmp/greptimedb/test/logs"
@@ -294,23 +284,6 @@ mod tests {
            ObjectStoreConfig::File(FileConfig { .. })
        ));

-        assert_eq!(
-            CompactionConfig {
-                max_inflight_tasks: 3,
-                max_files_in_level0: 7,
-                max_purge_tasks: 32,
-            },
-            options.storage.compaction,
-        );
-        assert_eq!(
-            RegionManifestConfig {
-                checkpoint_margin: Some(9),
-                gc_duration: Some(Duration::from_secs(7)),
-                compress: true
-            },
-            options.storage.manifest,
-        );
-
        assert_eq!("debug", options.logging.level.unwrap());
        assert_eq!("/tmp/greptimedb/test/logs".to_string(), options.logging.dir);
    }
@@ -387,18 +360,12 @@ mod tests {
            file_size = "1GB"
            purge_threshold = "50GB"
            purge_interval = "10m"
-            read_batch_size = 128
            sync_write = false

            [storage]
            type = "File"
            data_home = "/tmp/greptimedb/"

-            [storage.compaction]
-            max_inflight_tasks = 3
-            max_files_in_level0 = 7
-            max_purge_tasks = 32
-
            [logging]
            level = "debug"
            dir = "/tmp/greptimedb/test/logs"
@@ -409,26 +376,24 @@ mod tests {
        temp_env::with_vars(
            [
                (
-                    // storage.manifest.gc_duration = 9s
+                    // wal.purge_interval = 1m
                    [
                        env_prefix.to_string(),
-                        "storage".to_uppercase(),
-                        "manifest".to_uppercase(),
-                        "gc_duration".to_uppercase(),
+                        "wal".to_uppercase(),
+                        "purge_interval".to_uppercase(),
                    ]
                    .join(ENV_VAR_SEP),
-                    Some("9s"),
+                    Some("1m"),
                ),
                (
-                    // storage.compaction.max_purge_tasks = 99
+                    // wal.read_batch_size = 100
                    [
                        env_prefix.to_string(),
-                        "storage".to_uppercase(),
-                        "compaction".to_uppercase(),
-                        "max_purge_tasks".to_uppercase(),
+                        "wal".to_uppercase(),
+                        "read_batch_size".to_uppercase(),
                    ]
                    .join(ENV_VAR_SEP),
-                    Some("99"),
+                    Some("100"),
                ),
                (
                    // meta_client.metasrv_addrs = 127.0.0.1:3001,127.0.0.1:3002,127.0.0.1:3003
@@ -456,10 +421,7 @@ mod tests {
                };

                // Should be read from env, env > default values.
-                assert_eq!(
-                    opts.storage.manifest.gc_duration,
-                    Some(Duration::from_secs(9))
-                );
+                assert_eq!(opts.wal.read_batch_size, 100,);
                assert_eq!(
                    opts.meta_client.unwrap().metasrv_addrs,
                    vec![
@@ -470,19 +432,13 @@ mod tests {
                );

                // Should be read from config file, config file > env > default values.
-                assert_eq!(opts.storage.compaction.max_purge_tasks, 32);
+                assert_eq!(opts.wal.purge_interval, Duration::from_secs(60 * 10));

                // Should be read from cli, cli > config file > env > default values.
                assert_eq!(opts.wal.dir.unwrap(), "/other/wal/dir");

                // Should be default value.
-                assert_eq!(
-                    opts.storage.manifest.checkpoint_margin,
-                    DatanodeOptions::default()
-                        .storage
-                        .manifest
-                        .checkpoint_margin
-                );
+                assert_eq!(opts.http.addr, DatanodeOptions::default().http.addr);
            },
        );
    }
--- a/src/cmd/src/options.rs
+++ b/src/cmd/src/options.rs
@@ -147,7 +147,6 @@ impl Options {
 #[cfg(test)]
 mod tests {
    use std::io::Write;
-    use std::time::Duration;

    use common_test_util::temp_dir::create_named_temp_file;
    use datanode::config::{DatanodeOptions, ObjectStoreConfig};
@@ -179,11 +178,6 @@ mod tests {
            read_batch_size = 128
            sync_write = false

-            [storage.compaction]
-            max_inflight_tasks = 3
-            max_files_in_level0 = 7
-            max_purge_tasks = 32
-
            [logging]
            level = "debug"
            dir = "/tmp/greptimedb/test/logs"
@@ -194,17 +188,6 @@ mod tests {
        temp_env::with_vars(
            // The following environment variables will be used to override the values in the config file.
            [
-                (
-                    // storage.manifest.checkpoint_margin = 99
-                    [
-                        env_prefix.to_string(),
-                        "storage".to_uppercase(),
-                        "manifest".to_uppercase(),
-                        "checkpoint_margin".to_uppercase(),
-                    ]
-                    .join(ENV_VAR_SEP),
-                    Some("99"),
-                ),
                (
                    // storage.type = S3
                    [
@@ -225,17 +208,6 @@ mod tests {
                    .join(ENV_VAR_SEP),
                    Some("mybucket"),
                ),
-                (
-                    // storage.manifest.gc_duration = 42s
-                    [
-                        env_prefix.to_string(),
-                        "storage".to_uppercase(),
-                        "manifest".to_uppercase(),
-                        "gc_duration".to_uppercase(),
-                    ]
-                    .join(ENV_VAR_SEP),
-                    Some("42s"),
-                ),
                (
                    // wal.dir = /other/wal/dir
                    [
@@ -266,17 +238,12 @@ mod tests {
                .unwrap();

                // Check the configs from environment variables.
-                assert_eq!(opts.storage.manifest.checkpoint_margin, Some(99));
                match opts.storage.store {
                    ObjectStoreConfig::S3(s3_config) => {
                        assert_eq!(s3_config.bucket, "mybucket".to_string());
                    }
                    _ => panic!("unexpected store type"),
                }
-                assert_eq!(
-                    opts.storage.manifest.gc_duration,
-                    Some(Duration::from_secs(42))
-                );
                assert_eq!(
                    opts.meta_client.unwrap().metasrv_addrs,
                    vec![
--- a/src/common/datasource/Cargo.toml
+++ b/src/common/datasource/Cargo.toml
@@ -21,11 +21,13 @@ common-error.workspace = true
 common-macro.workspace = true
 common-runtime.workspace = true
 datafusion.workspace = true
+datatypes.workspace = true
 derive_builder.workspace = true
 futures.workspace = true
 lazy_static.workspace = true
 object-store.workspace = true
 orc-rust = "0.2"
+parquet.workspace = true
 paste = "1.0"
 regex = "1.7"
 serde.workspace = true
--- a/src/common/datasource/src/error.rs
+++ b/src/common/datasource/src/error.rs
@@ -166,6 +166,14 @@ pub enum Error {

    #[snafu(display("Buffered writer closed"))]
    BufferedWriterClosed { location: Location },
+
+    #[snafu(display("Failed to write parquet file, path: {}", path))]
+    WriteParquet {
+        path: String,
+        location: Location,
+        #[snafu(source)]
+        error: parquet::errors::ParquetError,
+    },
 }

 pub type Result<T> = std::result::Result<T, Error>;
@@ -178,7 +186,8 @@ impl ErrorExt for Error {
            | ListObjects { .. }
            | ReadObject { .. }
            | WriteObject { .. }
-            | AsyncWrite { .. } => StatusCode::StorageUnavailable,
+            | AsyncWrite { .. }
+            | WriteParquet { .. } => StatusCode::StorageUnavailable,

            UnsupportedBackendProtocol { .. }
            | UnsupportedCompressionType { .. }
@@ -231,6 +240,7 @@ impl ErrorExt for Error {
            InvalidConnection { location, .. } => Some(*location),
            UnsupportedCompressionType { location, .. } => Some(*location),
            UnsupportedFormat { location, .. } => Some(*location),
+            WriteParquet { location, .. } => Some(*location),
        }
    }
 }
--- a/src/common/datasource/src/file_format/parquet.rs
+++ b/src/common/datasource/src/file_format/parquet.rs
@@ -12,11 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+use std::future::Future;
+use std::pin::Pin;
 use std::result;
 use std::sync::Arc;

 use arrow::record_batch::RecordBatch;
-use arrow_schema::Schema;
+use arrow_schema::{Schema, SchemaRef};
 use async_trait::async_trait;
 use datafusion::datasource::physical_plan::{FileMeta, ParquetFileReaderFactory};
 use datafusion::error::Result as DatafusionResult;
@@ -26,11 +28,15 @@ use datafusion::parquet::errors::{ParquetError, Result as ParquetResult};
 use datafusion::parquet::file::metadata::ParquetMetaData;
 use datafusion::parquet::format::FileMetaData;
 use datafusion::physical_plan::metrics::ExecutionPlanMetricsSet;
+use datafusion::physical_plan::SendableRecordBatchStream;
 use futures::future::BoxFuture;
+use futures::StreamExt;
 use object_store::{ObjectStore, Reader};
+use parquet::basic::{Compression, ZstdLevel};
+use parquet::file::properties::WriterProperties;
 use snafu::ResultExt;

-use crate::buffered_writer::{ArrowWriterCloser, DfRecordBatchEncoder};
+use crate::buffered_writer::{ArrowWriterCloser, DfRecordBatchEncoder, LazyBufferedWriter};
 use crate::error::{self, Result};
 use crate::file_format::FileFormat;
 use crate::share_buffer::SharedBuffer;
@@ -156,6 +162,103 @@ impl ArrowWriterCloser for ArrowWriter<SharedBuffer> {
    }
 }

+/// Parquet writer that buffers row groups in memory and writes buffered data to an underlying
+/// storage by chunks to reduce memory consumption.
+pub struct BufferedWriter {
+    inner: InnerBufferedWriter,
+}
+
+type InnerBufferedWriter = LazyBufferedWriter<
+    object_store::Writer,
+    ArrowWriter<SharedBuffer>,
+    Box<
+        dyn FnMut(
+                String,
+            )
+                -> Pin<Box<dyn Future<Output = error::Result<object_store::Writer>> + Send>>
+            + Send,
+    >,
+>;
+
+impl BufferedWriter {
+    pub async fn try_new(
+        path: String,
+        store: ObjectStore,
+        arrow_schema: SchemaRef,
+        props: Option<WriterProperties>,
+        buffer_threshold: usize,
+    ) -> error::Result<Self> {
+        let buffer = SharedBuffer::with_capacity(buffer_threshold);
+
+        let arrow_writer = ArrowWriter::try_new(buffer.clone(), arrow_schema.clone(), props)
+            .context(error::WriteParquetSnafu { path: &path })?;
+
+        Ok(Self {
+            inner: LazyBufferedWriter::new(
+                buffer_threshold,
+                buffer,
+                arrow_writer,
+                &path,
+                Box::new(move |path| {
+                    let store = store.clone();
+                    Box::pin(async move {
+                        store
+                            .writer(&path)
+                            .await
+                            .context(error::WriteObjectSnafu { path })
+                    })
+                }),
+            ),
+        })
+    }
+
+    /// Write a record batch to stream writer.
+    pub async fn write(&mut self, arrow_batch: &RecordBatch) -> error::Result<()> {
+        self.inner.write(arrow_batch).await?;
+        self.inner.try_flush(false).await?;
+
+        Ok(())
+    }
+
+    /// Close parquet writer.
+    ///
+    /// Return file metadata and bytes written.
+    pub async fn close(self) -> error::Result<(FileMetaData, u64)> {
+        self.inner.close_with_arrow_writer().await
+    }
+}
+
+/// Output the stream to a parquet file.
+///
+/// Returns number of rows written.
+pub async fn stream_to_parquet(
+    mut stream: SendableRecordBatchStream,
+    store: ObjectStore,
+    path: &str,
+    threshold: usize,
+) -> Result<usize> {
+    let write_props = WriterProperties::builder()
+        .set_compression(Compression::ZSTD(ZstdLevel::default()))
+        .build();
+    let schema = stream.schema();
+    let mut buffered_writer = BufferedWriter::try_new(
+        path.to_string(),
+        store,
+        schema,
+        Some(write_props),
+        threshold,
+    )
+    .await?;
+    let mut rows_written = 0;
+    while let Some(batch) = stream.next().await {
+        let batch = batch.context(error::ReadRecordBatchSnafu)?;
+        buffered_writer.write(&batch).await?;
+        rows_written += batch.num_rows();
+    }
+    buffered_writer.close().await?;
+    Ok(rows_written)
+}
+
 #[cfg(test)]
 mod tests {
    use common_test_util::find_workspace_path;
--- a/src/datanode/Cargo.toml
+++ b/src/datanode/Cargo.toml
@@ -61,7 +61,6 @@ servers.workspace = true
 session.workspace = true
 snafu.workspace = true
 sql.workspace = true
-storage.workspace = true
 store-api.workspace = true
 substrait.workspace = true
 table.workspace = true
--- a/src/datanode/src/config.rs
+++ b/src/datanode/src/config.rs
@@ -31,11 +31,6 @@ use serde::{Deserialize, Serialize};
 use servers::heartbeat_options::HeartbeatOptions;
 use servers::http::HttpOptions;
 use servers::Mode;
-use storage::config::{
-    EngineConfig as StorageEngineConfig, DEFAULT_AUTO_FLUSH_INTERVAL, DEFAULT_MAX_FLUSH_TASKS,
-    DEFAULT_PICKER_SCHEDULE_INTERVAL, DEFAULT_REGION_WRITE_BUFFER_SIZE,
-};
-use storage::scheduler::SchedulerConfig;

 pub const DEFAULT_OBJECT_STORE_CACHE_SIZE: ReadableSize = ReadableSize::mb(256);

@@ -68,9 +63,6 @@ pub struct StorageConfig {
    pub data_home: String,
    #[serde(flatten)]
    pub store: ObjectStoreConfig,
-    pub compaction: CompactionConfig,
-    pub manifest: RegionManifestConfig,
-    pub flush: FlushConfig,
 }

 impl Default for StorageConfig {
@@ -79,9 +71,6 @@ impl Default for StorageConfig {
            global_ttl: None,
            data_home: DEFAULT_DATA_HOME.to_string(),
            store: ObjectStoreConfig::default(),
-            compaction: CompactionConfig::default(),
-            manifest: RegionManifestConfig::default(),
-            flush: FlushConfig::default(),
        }
    }
 }
@@ -216,109 +205,6 @@ impl Default for ObjectStoreConfig {
    }
 }

-/// Options for region manifest
-#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
-#[serde(default)]
-pub struct RegionManifestConfig {
-    /// Region manifest checkpoint actions margin.
-    /// Manifest service create a checkpoint every `checkpoint_margin` actions.
-    pub checkpoint_margin: Option<u16>,
-    /// Region manifest logs and checkpoints gc task execution duration.
-    #[serde(with = "humantime_serde")]
-    pub gc_duration: Option<Duration>,
-    /// Whether to compress manifest and checkpoint file by gzip
-    pub compress: bool,
-}
-
-impl Default for RegionManifestConfig {
-    fn default() -> Self {
-        Self {
-            checkpoint_margin: Some(10u16),
-            gc_duration: Some(Duration::from_secs(600)),
-            compress: false,
-        }
-    }
-}
-
-/// Options for table compaction
-#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
-#[serde(default)]
-pub struct CompactionConfig {
-    /// Max task number that can concurrently run.
-    pub max_inflight_tasks: usize,
-    /// Max files in level 0 to trigger compaction.
-    pub max_files_in_level0: usize,
-    /// Max task number for SST purge task after compaction.
-    pub max_purge_tasks: usize,
-}
-
-impl Default for CompactionConfig {
-    fn default() -> Self {
-        Self {
-            max_inflight_tasks: 4,
-            max_files_in_level0: 8,
-            max_purge_tasks: 32,
-        }
-    }
-}
-
-#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
-#[serde(default)]
-pub struct FlushConfig {
-    /// Max inflight flush tasks.
-    pub max_flush_tasks: usize,
-    /// Default write buffer size for a region.
-    pub region_write_buffer_size: ReadableSize,
-    /// Interval to schedule auto flush picker to find region to flush.
-    #[serde(with = "humantime_serde")]
-    pub picker_schedule_interval: Duration,
-    /// Interval to auto flush a region if it has not flushed yet.
-    #[serde(with = "humantime_serde")]
-    pub auto_flush_interval: Duration,
-    /// Global write buffer size for all regions.
-    pub global_write_buffer_size: Option<ReadableSize>,
-}
-
-impl Default for FlushConfig {
-    fn default() -> Self {
-        Self {
-            max_flush_tasks: DEFAULT_MAX_FLUSH_TASKS,
-            region_write_buffer_size: DEFAULT_REGION_WRITE_BUFFER_SIZE,
-            picker_schedule_interval: Duration::from_millis(
-                DEFAULT_PICKER_SCHEDULE_INTERVAL.into(),
-            ),
-            auto_flush_interval: Duration::from_millis(DEFAULT_AUTO_FLUSH_INTERVAL.into()),
-            global_write_buffer_size: None,
-        }
-    }
-}
-
-impl From<&DatanodeOptions> for SchedulerConfig {
-    fn from(value: &DatanodeOptions) -> Self {
-        Self {
-            max_inflight_tasks: value.storage.compaction.max_inflight_tasks,
-        }
-    }
-}
-
-impl From<&DatanodeOptions> for StorageEngineConfig {
-    fn from(value: &DatanodeOptions) -> Self {
-        Self {
-            compress_manifest: value.storage.manifest.compress,
-            manifest_checkpoint_margin: value.storage.manifest.checkpoint_margin,
-            manifest_gc_duration: value.storage.manifest.gc_duration,
-            max_files_in_l0: value.storage.compaction.max_files_in_level0,
-            max_purge_tasks: value.storage.compaction.max_purge_tasks,
-            max_flush_tasks: value.storage.flush.max_flush_tasks,
-            region_write_buffer_size: value.storage.flush.region_write_buffer_size,
-            picker_schedule_interval: value.storage.flush.picker_schedule_interval,
-            auto_flush_interval: value.storage.flush.auto_flush_interval,
-            global_write_buffer_size: value.storage.flush.global_write_buffer_size,
-            global_ttl: value.storage.global_ttl,
-        }
-    }
-}
-
 #[derive(Clone, Debug, Serialize, Deserialize)]
 #[serde(default)]
 pub struct DatanodeOptions {
--- a/src/frontend/Cargo.toml
+++ b/src/frontend/Cargo.toml
@@ -68,7 +68,6 @@ session.workspace = true
 snafu.workspace = true
 sql.workspace = true
 sqlparser.workspace = true
-storage.workspace = true
 store-api.workspace = true
 substrait.workspace = true
 table.workspace = true
--- a/src/mito2/src/error.rs
+++ b/src/mito2/src/error.rs
@@ -121,14 +121,6 @@ pub enum Error {
        source: common_datasource::error::Error,
    },

-    #[snafu(display("Failed to write parquet file, path: {}", path))]
-    WriteParquet {
-        path: String,
-        location: Location,
-        #[snafu(source)]
-        error: parquet::errors::ParquetError,
-    },
-
    #[snafu(display("Failed to read parquet file, path: {}", path))]
    ReadParquet {
        path: String,
@@ -428,7 +420,6 @@ impl ErrorExt for Error {

        match self {
            OpenDal { .. }
-            | WriteParquet { .. }
            | ReadParquet { .. }
            | WriteWal { .. }
            | ReadWal { .. }
--- a/src/mito2/src/sst.rs
+++ b/src/mito2/src/sst.rs
@@ -17,5 +17,4 @@
 pub mod file;
 pub mod file_purger;
 pub mod parquet;
-mod stream_writer;
 pub(crate) mod version;
--- a/src/mito2/src/sst/parquet/writer.rs
+++ b/src/mito2/src/sst/parquet/writer.rs
@@ -14,6 +14,7 @@

 //! Parquet writer.

+use common_datasource::file_format::parquet::BufferedWriter;
 use common_telemetry::debug;
 use common_time::Timestamp;
 use object_store::ObjectStore;
@@ -25,11 +26,10 @@ use snafu::ResultExt;
 use store_api::metadata::RegionMetadataRef;
 use store_api::storage::consts::SEQUENCE_COLUMN_NAME;

-use crate::error::{InvalidMetadataSnafu, Result};
+use crate::error::{InvalidMetadataSnafu, Result, WriteBufferSnafu};
 use crate::read::{Batch, Source};
 use crate::sst::parquet::format::WriteFormat;
 use crate::sst::parquet::{SstInfo, WriteOptions, PARQUET_METADATA_KEY};
-use crate::sst::stream_writer::BufferedWriter;

 /// Parquet SST writer.
 pub struct ParquetWriter {
@@ -83,14 +83,18 @@ impl ParquetWriter {
            Some(writer_props),
            opts.write_buffer_size.as_bytes() as usize,
        )
-        .await?;
+        .await
+        .context(WriteBufferSnafu)?;

        let mut stats = SourceStats::default();
        while let Some(batch) = self.source.next_batch().await? {
            stats.update(&batch);
            let arrow_batch = write_format.convert_batch(&batch)?;

-            buffered_writer.write(&arrow_batch).await?;
+            buffered_writer
+                .write(&arrow_batch)
+                .await
+                .context(WriteBufferSnafu)?;
        }

        if stats.num_rows == 0 {
@@ -99,11 +103,11 @@ impl ParquetWriter {
                self.file_path
            );

-            buffered_writer.close().await?;
+            buffered_writer.close().await.context(WriteBufferSnafu)?;
            return Ok(None);
        }

-        let (_file_meta, file_size) = buffered_writer.close().await?;
+        let (_file_meta, file_size) = buffered_writer.close().await.context(WriteBufferSnafu)?;
        // Safety: num rows > 0 so we must have min/max.
        let time_range = stats.time_range.unwrap();

--- a/src/mito2/src/sst/stream_writer.rs
+++ b/src/mito2/src/sst/stream_writer.rs
@@ -1,105 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-use std::future::Future;
-use std::pin::Pin;
-
-use common_datasource::buffered_writer::LazyBufferedWriter;
-use common_datasource::share_buffer::SharedBuffer;
-use datatypes::arrow::datatypes::SchemaRef;
-use datatypes::arrow::record_batch::RecordBatch;
-use object_store::ObjectStore;
-use parquet::arrow::ArrowWriter;
-use parquet::file::properties::WriterProperties;
-use parquet::format::FileMetaData;
-use snafu::ResultExt;
-
-use crate::error;
-use crate::error::WriteParquetSnafu;
-
-/// Parquet writer that buffers row groups in memory and writes buffered data to an underlying
-/// storage by chunks to reduce memory consumption.
-pub struct BufferedWriter {
-    inner: InnerBufferedWriter,
-}
-
-type InnerBufferedWriter = LazyBufferedWriter<
-    object_store::Writer,
-    ArrowWriter<SharedBuffer>,
-    Box<
-        dyn FnMut(
-                String,
-            ) -> Pin<
-                Box<
-                    dyn Future<Output = common_datasource::error::Result<object_store::Writer>>
-                        + Send,
-                >,
-            > + Send,
-    >,
->;
-
-impl BufferedWriter {
-    pub async fn try_new(
-        path: String,
-        store: ObjectStore,
-        arrow_schema: SchemaRef,
-        props: Option<WriterProperties>,
-        buffer_threshold: usize,
-    ) -> error::Result<Self> {
-        let buffer = SharedBuffer::with_capacity(buffer_threshold);
-
-        let arrow_writer = ArrowWriter::try_new(buffer.clone(), arrow_schema.clone(), props)
-            .context(WriteParquetSnafu { path: &path })?;
-
-        Ok(Self {
-            inner: LazyBufferedWriter::new(
-                buffer_threshold,
-                buffer,
-                arrow_writer,
-                &path,
-                Box::new(move |path| {
-                    let store = store.clone();
-                    Box::pin(async move {
-                        store
-                            .writer(&path)
-                            .await
-                            .context(common_datasource::error::WriteObjectSnafu { path })
-                    })
-                }),
-            ),
-        })
-    }
-
-    /// Write a record batch to stream writer.
-    pub async fn write(&mut self, arrow_batch: &RecordBatch) -> error::Result<()> {
-        self.inner
-            .write(arrow_batch)
-            .await
-            .context(error::WriteBufferSnafu)?;
-        self.inner
-            .try_flush(false)
-            .await
-            .context(error::WriteBufferSnafu)?;
-
-        Ok(())
-    }
-
-    /// Close parquet writer.
-    pub async fn close(self) -> error::Result<(FileMetaData, u64)> {
-        self.inner
-            .close_with_arrow_writer()
-            .await
-            .context(error::WriteBufferSnafu)
-    }
-}
--- a/src/operator/Cargo.toml
+++ b/src/operator/Cargo.toml
@@ -50,7 +50,6 @@ session.workspace = true
 snafu.workspace = true
 sql.workspace = true
 sqlparser.workspace = true
-storage.workspace = true
 store-api.workspace = true
 substrait.workspace = true
 table.workspace = true
--- a/src/operator/src/error.rs
+++ b/src/operator/src/error.rs
@@ -378,12 +378,6 @@ pub enum Error {
        error: datafusion::error::DataFusionError,
    },

-    #[snafu(display("Failed to write parquet file"))]
-    WriteParquet {
-        location: Location,
-        source: storage::error::Error,
-    },
-
    #[snafu(display(
        "Schema datatypes not match at index {}, expected table schema: {}, actual file schema: {}",
        index,
@@ -594,7 +588,6 @@ impl ErrorExt for Error {
            | Error::ParseUrl { source, .. }
            | Error::BuildBackend { source, .. } => source.status_code(),

-            Error::WriteParquet { source, .. } => source.status_code(),
            Error::ExecuteDdl { source, .. } => source.status_code(),
            Error::InvalidCopyParameter { .. } => StatusCode::InvalidArguments,

--- a/src/operator/src/statement/copy_table_to.rs
+++ b/src/operator/src/statement/copy_table_to.rs
@@ -17,6 +17,7 @@ use std::sync::Arc;
 use common_base::readable_size::ReadableSize;
 use common_datasource::file_format::csv::stream_to_csv;
 use common_datasource::file_format::json::stream_to_json;
+use common_datasource::file_format::parquet::stream_to_parquet;
 use common_datasource::file_format::Format;
 use common_datasource::object_store::{build_backend, parse_url};
 use common_datasource::util::find_dir_and_filename;
@@ -31,17 +32,17 @@ use object_store::ObjectStore;
 use query::plan::LogicalPlan;
 use session::context::QueryContextRef;
 use snafu::{OptionExt, ResultExt};
-use storage::sst::SstInfo;
-use storage::{ParquetWriter, Source};
 use table::engine::TableReference;
 use table::requests::CopyTableRequest;
 use table::table::adapter::DfTableProviderAdapter;

-use crate::error::{
-    self, BuildDfLogicalPlanSnafu, ExecLogicalPlanSnafu, Result, WriteParquetSnafu,
-};
+use crate::error::{self, BuildDfLogicalPlanSnafu, ExecLogicalPlanSnafu, Result};
 use crate::statement::StatementExecutor;

+/// Buffer size threshold for flushing data to object stores.
+/// It should be greater than 5MB (the minimum multipart upload size).
+const WRITE_BUFFER_THRESHOLD: ReadableSize = ReadableSize::mb(8);
+
 impl StatementExecutor {
    async fn stream_to_file(
        &self,
@@ -50,7 +51,7 @@ impl StatementExecutor {
        object_store: ObjectStore,
        path: &str,
    ) -> Result<usize> {
-        let threshold = ReadableSize::mb(4).as_bytes() as usize;
+        let threshold = WRITE_BUFFER_THRESHOLD.as_bytes() as usize;

        match format {
            Format::Csv(_) => stream_to_csv(
@@ -69,17 +70,14 @@ impl StatementExecutor {
            )
            .await
            .context(error::WriteStreamToFileSnafu { path }),
-            Format::Parquet(_) => {
-                let writer = ParquetWriter::new(path, Source::Stream(stream), object_store);
-                let rows_copied = writer
-                    .write_sst(&storage::sst::WriteOptions::default())
-                    .await
-                    .context(WriteParquetSnafu)?
-                    .map(|SstInfo { num_rows, .. }| num_rows)
-                    .unwrap_or(0);
-
-                Ok(rows_copied)
-            }
+            Format::Parquet(_) => stream_to_parquet(
+                Box::pin(DfRecordBatchStreamAdapter::new(stream)),
+                object_store,
+                path,
+                threshold,
+            )
+            .await
+            .context(error::WriteStreamToFileSnafu { path }),
            _ => error::UnsupportedFormatSnafu { format: *format }.fail(),
        }
    }
--- a/src/script/Cargo.toml
+++ b/src/script/Cargo.toml
@@ -85,7 +85,6 @@ rayon = "1.0"
 ron = "0.7"
 serde = { version = "1.0", features = ["derive"] }
 session = { workspace = true, features = ["testing"] }
-storage.workspace = true
 tokio-test = "0.4"

 [[bench]]
--- a/src/storage/Cargo.toml
+++ b/src/storage/Cargo.toml
@@ -1,64 +0,0 @@
-[package]
-name = "storage"
-version.workspace = true
-edition.workspace = true
-license.workspace = true
-
-[dependencies]
-api.workspace = true
-arc-swap = "1.0"
-arrow-array.workspace = true
-arrow.workspace = true
-async-compat = "0.2"
-async-stream.workspace = true
-async-trait = "0.1"
-bytes = "1.1"
-common-base.workspace = true
-common-datasource.workspace = true
-common-error.workspace = true
-common-macro.workspace = true
-common-query.workspace = true
-common-recordbatch.workspace = true
-common-runtime.workspace = true
-common-telemetry.workspace = true
-common-time.workspace = true
-datafusion-common.workspace = true
-datafusion-expr.workspace = true
-datafusion-physical-expr.workspace = true
-datafusion.workspace = true
-datatypes.workspace = true
-futures-util.workspace = true
-futures.workspace = true
-itertools.workspace = true
-lazy_static.workspace = true
-object-store.workspace = true
-parquet = { workspace = true, features = ["async"] }
-paste.workspace = true
-prometheus.workspace = true
-prost.workspace = true
-regex = "1.5"
-serde.workspace = true
-serde_json = "1.0"
-snafu.workspace = true
-store-api.workspace = true
-table.workspace = true
-tokio-util.workspace = true
-tokio.workspace = true
-tonic.workspace = true
-uuid.workspace = true
-
-[dev-dependencies]
-atomic_float = "0.1"
-common-config.workspace = true
-common-test-util.workspace = true
-criterion = "0.3"
-datatypes = { workspace = true, features = ["test"] }
-log-store.workspace = true
-rand.workspace = true
-
-[build-dependencies]
-tonic-build = "0.9"
-
-[[bench]]
-name = "bench_main"
-harness = false
--- a/src/storage/benches/bench_main.rs
+++ b/src/storage/benches/bench_main.rs
@@ -1,27 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-use criterion::criterion_main;
-
-mod memtable;
-mod wal;
-
-criterion_main! {
-    memtable::bench_memtable_read::benches,
-    memtable::bench_memtable_write::benches,
-    memtable::bench_memtable_read_write_ratio::benches,
-    wal::bench_wal::benches,
-    wal::bench_decode::benches,
-    wal::bench_encode::benches,
-}
--- a/src/storage/benches/memtable/bench_memtable_read.rs
+++ b/src/storage/benches/memtable/bench_memtable_read.rs
@@ -1,33 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-use criterion::{criterion_group, criterion_main, Criterion, Throughput};
-
-use crate::memtable::generate_kvs;
-use crate::memtable::util::bench_context::BenchContext;
-
-fn bench_memtable_read(c: &mut Criterion) {
-    // the length of string in value is 20
-    let kvs = generate_kvs(10, 10000, 20);
-    let ctx = BenchContext::new();
-    kvs.iter().for_each(|kv| ctx.write(kv));
-    let mut group = c.benchmark_group("memtable_read");
-    let _ = group
-        .throughput(Throughput::Elements(10 * 10000))
-        .bench_function("read", |b| b.iter(|| ctx.read(100)));
-    group.finish();
-}
-
-criterion_group!(benches, bench_memtable_read);
-criterion_main!(benches);
--- a/src/storage/benches/memtable/bench_memtable_read_write_ratio.rs
+++ b/src/storage/benches/memtable/bench_memtable_read_write_ratio.rs
@@ -1,151 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
-use std::sync::Arc;
-use std::thread;
-use std::time::Instant;
-
-use atomic_float::AtomicF64;
-use criterion::{
-    criterion_group, criterion_main, BatchSize, Bencher, BenchmarkId, Criterion, Throughput,
-};
-use rand::Rng;
-
-use crate::memtable::generate_kvs;
-use crate::memtable::util::bench_context::BenchContext;
-
-static READ_NUM: AtomicUsize = AtomicUsize::new(0);
-static WRITE_NUM: AtomicUsize = AtomicUsize::new(0);
-static READ_SECS: AtomicF64 = AtomicF64::new(0.0);
-static WRITE_SECS: AtomicF64 = AtomicF64::new(0.0);
-
-struct Input {
-    ratio: bool,
-    kv_size: usize,
-    batch_size: usize,
-}
-
-fn memtable_round(ctx: &BenchContext, input: &Input) {
-    if input.ratio {
-        let now = Instant::now();
-        let read_count = ctx.read(input.batch_size);
-        let d = now.elapsed();
-        let _ = READ_SECS.fetch_add(
-            d.as_secs() as f64 + d.subsec_nanos() as f64 * 1e-9,
-            Ordering::Relaxed,
-        );
-        let _ = READ_NUM.fetch_add(read_count, Ordering::Relaxed);
-    } else {
-        generate_kvs(input.kv_size, input.batch_size, 20)
-            .iter()
-            .for_each(|kv| {
-                let now = Instant::now();
-                ctx.write(kv);
-                let d = now.elapsed();
-                let _ = WRITE_SECS.fetch_add(
-                    d.as_secs() as f64 + d.subsec_nanos() as f64 * 1e-9,
-                    Ordering::Relaxed,
-                );
-                let _ = WRITE_NUM.fetch_add(kv.len(), Ordering::Relaxed);
-            });
-    }
-}
-
-fn bench_read_write_ctx_frac(b: &mut Bencher<'_>, frac: &usize) {
-    let frac = *frac;
-    let ctx = Arc::new(BenchContext::default());
-    let thread_ctx = ctx.clone();
-    let stop = Arc::new(AtomicBool::new(false));
-    let thread_stop = stop.clone();
-
-    let handle = thread::spawn(move || {
-        let mut rng = rand::thread_rng();
-        while !thread_stop.load(Ordering::Relaxed) {
-            let f = rng.gen_range(0..=10);
-            let input = Input {
-                ratio: f < frac,
-                kv_size: 100,
-                batch_size: 1000,
-            };
-            memtable_round(&thread_ctx, &input);
-        }
-    });
-
-    let mut rng = rand::thread_rng();
-    b.iter_batched_ref(
-        || {
-            let f = rng.gen_range(0..=10);
-            Input {
-                ratio: f < frac,
-                kv_size: 100,
-                batch_size: 1000,
-            }
-        },
-        |input| {
-            memtable_round(&ctx, input);
-        },
-        BatchSize::SmallInput,
-    );
-    stop.store(true, Ordering::Relaxed);
-    handle.join().unwrap();
-}
-
-#[allow(clippy::print_stdout)]
-fn bench_memtable_read_write_ratio(c: &mut Criterion) {
-    let mut group = c.benchmark_group("memtable_read_write_ratio");
-    for i in 0..=10 {
-        READ_NUM.store(0, Ordering::Relaxed);
-        WRITE_NUM.store(0, Ordering::Relaxed);
-        READ_SECS.store(0.0, Ordering::Relaxed);
-        WRITE_SECS.store(0.0, Ordering::Relaxed);
-
-        let _ = group
-            .bench_with_input(
-                BenchmarkId::from_parameter(format!(
-                    "read ratio: {:.2}% , write ratio: {:.2}%",
-                    i as f64 / 10_f64 * 100.0,
-                    (10 - i) as f64 / 10_f64 * 100.0,
-                )),
-                &i,
-                bench_read_write_ctx_frac,
-            )
-            .throughput(Throughput::Elements(100 * 1000));
-
-        // the time is a little different the real time
-        let read_num = READ_NUM.load(Ordering::Relaxed);
-        let read_time = READ_SECS.load(Ordering::Relaxed);
-        let read_tps = if read_time != 0.0 {
-            read_num as f64 / read_time
-        } else {
-            0.0
-        };
-        let write_num = WRITE_NUM.load(Ordering::Relaxed);
-        let write_time = WRITE_SECS.load(Ordering::Relaxed);
-        let write_tps = if write_time != 0.0 {
-            write_num as f64 / write_time
-        } else {
-            0.0
-        };
-        if read_num != 0 || write_num != 0 {
-            println!(
-                "\nread numbers: {read_num}, read thrpt: {read_tps}\nwrite numbers: {write_num}, write thrpt {write_tps}\n",
-            );
-        }
-    }
-    group.finish();
-}
-
-criterion_group!(benches, bench_memtable_read_write_ratio);
-criterion_main!(benches);
--- a/src/storage/benches/memtable/bench_memtable_write.rs
+++ b/src/storage/benches/memtable/bench_memtable_write.rs
@@ -1,34 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-use criterion::{criterion_group, criterion_main, Criterion, Throughput};
-
-use crate::memtable::generate_kvs;
-use crate::memtable::util::bench_context::BenchContext;
-
-pub fn bench_memtable_write(c: &mut Criterion) {
-    // the length of string in value is 20
-    let kvs = generate_kvs(10, 1000, 20);
-    let mut group = c.benchmark_group("memtable_write");
-    let _ = group
-        .throughput(Throughput::Elements(10 * 1000))
-        .bench_function("write", |b| {
-            let ctx = BenchContext::new();
-            b.iter(|| kvs.iter().for_each(|kv| ctx.write(kv)))
-        });
-    group.finish();
-}
-
-criterion_group!(benches, bench_memtable_write);
-criterion_main!(benches);
--- a/src/storage/benches/memtable/mod.rs
+++ b/src/storage/benches/memtable/mod.rs
@@ -1,121 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-pub mod bench_memtable_read;
-pub mod bench_memtable_read_write_ratio;
-pub mod bench_memtable_write;
-pub mod util;
-
-use std::sync::atomic::{AtomicU64, Ordering};
-use std::sync::Arc;
-
-use api::v1::OpType;
-use datatypes::prelude::ScalarVectorBuilder;
-use datatypes::timestamp::TimestampMillisecond;
-use datatypes::vectors::{
-    StringVectorBuilder, TimestampMillisecondVectorBuilder, UInt64VectorBuilder,
-};
-use rand::distributions::Alphanumeric;
-use rand::prelude::ThreadRng;
-use rand::Rng;
-use storage::memtable::KeyValues;
-use store_api::storage::SequenceNumber;
-
-static NEXT_SEQUENCE: AtomicU64 = AtomicU64::new(0);
-
-fn get_sequence() -> SequenceNumber {
-    NEXT_SEQUENCE.fetch_add(1, Ordering::Relaxed)
-}
-
-fn random_kv(rng: &mut ThreadRng, value_size: usize) -> ((i64, u64), (Option<u64>, String)) {
-    let key0 = rng.gen_range(0..10000);
-    let key1 = rng.gen::<u64>();
-    let value1 = Some(rng.gen::<u64>());
-    let value2 = rand::thread_rng()
-        .sample_iter(&Alphanumeric)
-        .take(value_size)
-        .map(char::from)
-        .collect();
-    ((key0, key1), (value1, value2))
-}
-type KeyTuple = (i64, u64);
-type ValueTuple = (Option<u64>, String);
-
-fn random_kvs(len: usize, value_size: usize) -> (Vec<KeyTuple>, Vec<ValueTuple>) {
-    let mut keys = Vec::with_capacity(len);
-    let mut values = Vec::with_capacity(len);
-    for _ in 0..len {
-        let mut rng = rand::thread_rng();
-        let (key, value) = random_kv(&mut rng, value_size);
-        keys.push(key);
-        values.push(value);
-    }
-    (keys, values)
-}
-
-fn kvs_with_index(
-    sequence: SequenceNumber,
-    op_type: OpType,
-    start_index_in_batch: usize,
-    keys: &[(i64, u64)],
-    values: &[(Option<u64>, String)],
-) -> KeyValues {
-    let mut key_builders = (
-        TimestampMillisecondVectorBuilder::with_capacity(keys.len()),
-        UInt64VectorBuilder::with_capacity(keys.len()),
-    );
-    for key in keys {
-        key_builders.0.push(Some(TimestampMillisecond::from(key.0)));
-        key_builders.1.push(Some(key.1));
-    }
-    let row_keys = vec![Arc::new(key_builders.1.finish()) as _];
-
-    let mut value_builders = (
-        UInt64VectorBuilder::with_capacity(values.len()),
-        StringVectorBuilder::with_capacity(values.len()),
-    );
-    for value in values {
-        value_builders.0.push(value.0);
-        value_builders.1.push(Some(&value.1));
-    }
-    let row_values = vec![
-        Arc::new(value_builders.0.finish()) as _,
-        Arc::new(value_builders.1.finish()) as _,
-    ];
-    KeyValues {
-        sequence,
-        op_type,
-        start_index_in_batch,
-        keys: row_keys,
-        values: row_values,
-        timestamp: Some(Arc::new(key_builders.0.finish()) as _),
-    }
-}
-
-fn generate_kv(kv_size: usize, start_index_in_batch: usize, value_size: usize) -> KeyValues {
-    let (keys, values) = random_kvs(kv_size, value_size);
-    kvs_with_index(
-        get_sequence(),
-        OpType::Put,
-        start_index_in_batch,
-        &keys,
-        &values,
-    )
-}
-
-fn generate_kvs(kv_size: usize, size: usize, value_size: usize) -> Vec<KeyValues> {
-    (0..size)
-        .map(|i| generate_kv(kv_size, i, value_size))
-        .collect()
-}
--- a/src/storage/benches/memtable/util/bench_context.rs
+++ b/src/storage/benches/memtable/util/bench_context.rs
@@ -1,51 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-use storage::memtable::{IterContext, KeyValues, MemtableRef};
-
-use crate::memtable::util::new_memtable;
-
-pub struct BenchContext {
-    memtable: MemtableRef,
-}
-impl Default for BenchContext {
-    fn default() -> Self {
-        BenchContext::new()
-    }
-}
-impl BenchContext {
-    pub fn new() -> BenchContext {
-        BenchContext {
-            memtable: new_memtable(),
-        }
-    }
-
-    pub fn write(&self, kvs: &KeyValues) {
-        self.memtable.write(kvs).unwrap();
-    }
-
-    pub fn read(&self, batch_size: usize) -> usize {
-        let mut read_count = 0;
-        let iter_ctx = IterContext {
-            batch_size,
-            ..Default::default()
-        };
-        let iter = self.memtable.iter(iter_ctx).unwrap();
-        for batch in iter {
-            let _ = batch.unwrap();
-            read_count += batch_size;
-        }
-        read_count
-    }
-}
--- a/src/storage/benches/memtable/util/mod.rs
+++ b/src/storage/benches/memtable/util/mod.rs
@@ -1,40 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-pub mod bench_context;
-pub mod regiondesc_util;
-pub mod schema_util;
-
-use datatypes::type_id::LogicalTypeId;
-use storage::memtable::{DefaultMemtableBuilder, MemtableBuilder, MemtableRef};
-use storage::metadata::RegionMetadata;
-use storage::schema::RegionSchemaRef;
-
-use crate::memtable::util::regiondesc_util::RegionDescBuilder;
-
-pub const TIMESTAMP_NAME: &str = "timestamp";
-
-pub fn schema_for_test() -> RegionSchemaRef {
-    let desc = RegionDescBuilder::new("bench")
-        .push_field_column(("v1", LogicalTypeId::UInt64, true))
-        .push_field_column(("v2", LogicalTypeId::String, true))
-        .build();
-    let metadata: RegionMetadata = desc.try_into().unwrap();
-
-    metadata.schema().clone()
-}
-
-pub fn new_memtable() -> MemtableRef {
-    DefaultMemtableBuilder::default().build(schema_for_test())
-}
--- a/src/storage/benches/memtable/util/regiondesc_util.rs
+++ b/src/storage/benches/memtable/util/regiondesc_util.rs
@@ -1,80 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-use datatypes::prelude::ConcreteDataType;
-use store_api::storage::{
-    ColumnDescriptor, ColumnDescriptorBuilder, ColumnFamilyDescriptorBuilder, ColumnId,
-    RegionDescriptor, RowKeyDescriptorBuilder,
-};
-
-use super::schema_util::ColumnDef;
-use super::TIMESTAMP_NAME;
-
-pub struct RegionDescBuilder {
-    name: String,
-    last_column_id: ColumnId,
-    key_builder: RowKeyDescriptorBuilder,
-    default_cf_builder: ColumnFamilyDescriptorBuilder,
-}
-
-impl RegionDescBuilder {
-    pub fn new<T: Into<String>>(name: T) -> Self {
-        let key_builder = RowKeyDescriptorBuilder::new(
-            ColumnDescriptorBuilder::new(
-                1,
-                TIMESTAMP_NAME,
-                ConcreteDataType::timestamp_millisecond_datatype(),
-            )
-            .is_nullable(false)
-            .build()
-            .unwrap(),
-        );
-
-        Self {
-            name: name.into(),
-            last_column_id: 1,
-            key_builder,
-            default_cf_builder: ColumnFamilyDescriptorBuilder::default(),
-        }
-    }
-
-    pub fn push_field_column(mut self, column_def: ColumnDef) -> Self {
-        let column = self.new_column(column_def);
-        self.default_cf_builder = self.default_cf_builder.push_column(column);
-        self
-    }
-
-    pub fn build(self) -> RegionDescriptor {
-        RegionDescriptor {
-            id: 0.into(),
-            name: self.name,
-            row_key: self.key_builder.build().unwrap(),
-            default_cf: self.default_cf_builder.build().unwrap(),
-            extra_cfs: Vec::new(),
-        }
-    }
-
-    fn alloc_column_id(&mut self) -> ColumnId {
-        self.last_column_id += 1;
-        self.last_column_id
-    }
-
-    fn new_column(&mut self, column_def: ColumnDef) -> ColumnDescriptor {
-        let datatype = column_def.1.data_type();
-        ColumnDescriptorBuilder::new(self.alloc_column_id(), column_def.0, datatype)
-            .is_nullable(column_def.2)
-            .build()
-            .unwrap()
-    }
-}
--- a/src/storage/benches/memtable/util/schema_util.rs
+++ b/src/storage/benches/memtable/util/schema_util.rs
@@ -1,46 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-use std::sync::Arc;
-
-use datatypes::prelude::*;
-use datatypes::schema::{ColumnSchema, Schema, SchemaBuilder, SchemaRef};
-
-/// Column definition: (name, datatype, is_nullable)
-pub type ColumnDef<'a> = (&'a str, LogicalTypeId, bool);
-
-pub fn new_schema(column_defs: &[ColumnDef], timestamp_index: Option<usize>) -> Schema {
-    let column_schemas: Vec<_> = column_defs
-        .iter()
-        .enumerate()
-        .map(|(index, column_def)| {
-            let datatype = column_def.1.data_type();
-            if let Some(timestamp_index) = timestamp_index {
-                ColumnSchema::new(column_def.0, datatype, column_def.2)
-                    .with_time_index(index == timestamp_index)
-            } else {
-                ColumnSchema::new(column_def.0, datatype, column_def.2)
-            }
-        })
-        .collect();
-
-    SchemaBuilder::try_from(column_schemas)
-        .unwrap()
-        .build()
-        .unwrap()
-}
-
-pub fn new_schema_ref(column_defs: &[ColumnDef], timestamp_index: Option<usize>) -> SchemaRef {
-    Arc::new(new_schema(column_defs, timestamp_index))
-}
--- a/src/storage/benches/wal/bench_decode.rs
+++ b/src/storage/benches/wal/bench_decode.rs
@@ -1,73 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-use criterion::{criterion_group, criterion_main, Criterion};
-use storage::codec::{Decoder, Encoder};
-use storage::write_batch::{codec, WriteBatch};
-
-use crate::wal::util::gen_new_batch_and_types;
-
-/*
-------------------------------------
-                decode               |
-------------------------------------
-rows |  protobuf    |    arrow       |
------------------------------------
-10   |  8.6485 us    |  8.8028 us    |
------------------------------------
-100  |  63.850 us    |  46.174 us   |
------------------------------------
-10000|  654.46 us    |  433.58 us    |
------------------------------------
-*/
-
-fn encode_arrow(batch: &WriteBatch, dst: &mut Vec<u8>) {
-    let encoder = codec::PayloadEncoder::new();
-    encoder.encode(batch.payload(), dst).unwrap();
-}
-
-fn decode_arrow(dst: &[u8], mutation_types: &[i32]) {
-    let decoder = codec::PayloadDecoder::new(mutation_types);
-    let _ = decoder.decode(dst).unwrap();
-}
-
-fn bench_wal_decode(c: &mut Criterion) {
-    let (batch_10, types_10) = gen_new_batch_and_types(1);
-    let (batch_100, types_100) = gen_new_batch_and_types(10);
-    let (batch_10000, types_10000) = gen_new_batch_and_types(100);
-
-    let mut dst_arrow_10 = vec![];
-    let mut dst_arrow_100 = vec![];
-    let mut dst_arrow_10000 = vec![];
-
-    encode_arrow(&batch_10, &mut dst_arrow_10);
-    encode_arrow(&batch_100, &mut dst_arrow_100);
-    encode_arrow(&batch_10000, &mut dst_arrow_10000);
-
-    let mut group = c.benchmark_group("wal_decode");
-    let _ = group
-        .bench_function("arrow_decode_with_10_num_rows", |b| {
-            b.iter(|| decode_arrow(&dst_arrow_10, &types_10))
-        })
-        .bench_function("arrow_decode_with_100_num_rows", |b| {
-            b.iter(|| decode_arrow(&dst_arrow_100, &types_100))
-        })
-        .bench_function("arrow_decode_with_10000_num_rows", |b| {
-            b.iter(|| decode_arrow(&dst_arrow_10000, &types_10000))
-        });
-    group.finish();
-}
-
-criterion_group!(benches, bench_wal_decode);
-criterion_main!(benches);
--- a/src/storage/benches/wal/bench_encode.rs
+++ b/src/storage/benches/wal/bench_encode.rs
@@ -1,61 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-use criterion::{criterion_group, criterion_main, Criterion};
-use storage::codec::Encoder;
-use storage::write_batch::{codec, WriteBatch};
-
-use crate::wal::util::gen_new_batch_and_types;
-
-/*
-------------------------------------
-                encode               |
-------------------------------------
-rows |  protobuf    |    arrow       |
------------------------------------
-10   |  4.8732 us    |  5.7388 us    |
------------------------------------
-100  |  40.928 us    |  24.988 us    |
------------------------------------
-10000|  425.69 us    |  229.74 us    |
------------------------------------
-*/
-
-fn encode_arrow(batch: &WriteBatch) {
-    let encoder = codec::PayloadEncoder::new();
-    let mut dst = vec![];
-    encoder.encode(batch.payload(), &mut dst).unwrap();
-}
-
-fn bench_wal_encode(c: &mut Criterion) {
-    let (batch_10, _) = gen_new_batch_and_types(1);
-    let (batch_100, _) = gen_new_batch_and_types(10);
-    let (batch_10000, _) = gen_new_batch_and_types(100);
-
-    let mut group = c.benchmark_group("wal_encode");
-    let _ = group
-        .bench_function("arrow_encode_with_10_num_rows", |b| {
-            b.iter(|| encode_arrow(&batch_10))
-        })
-        .bench_function("arrow_encode_with_100_num_rows", |b| {
-            b.iter(|| encode_arrow(&batch_100))
-        })
-        .bench_function("arrow_encode_with_10000_num_rows", |b| {
-            b.iter(|| encode_arrow(&batch_10000))
-        });
-    group.finish();
-}
-
-criterion_group!(benches, bench_wal_encode);
-criterion_main!(benches);
--- a/src/storage/benches/wal/bench_wal.rs
+++ b/src/storage/benches/wal/bench_wal.rs
@@ -1,64 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-use criterion::{criterion_group, criterion_main, Criterion};
-use storage::codec::{Decoder, Encoder};
-use storage::write_batch::{codec, WriteBatch};
-
-use crate::wal::util::gen_new_batch_and_types;
-
-/*
-------------------------------------
-            encode  &  decode        |
-------------------------------------
-rows |  protobuf    |    arrow       |
------------------------------------
-10   |  13.845 us    |  15.093 us    |
------------------------------------
-100  |  106.70 us    |  73.895 us    |
------------------------------------
-10000|  1.0860 ms    |  680.12 us    |
------------------------------------
-*/
-
-fn codec_arrow(batch: &WriteBatch, mutation_types: &[i32]) {
-    let encoder = codec::PayloadEncoder::new();
-    let mut dst = vec![];
-    encoder.encode(batch.payload(), &mut dst).unwrap();
-
-    let decoder = codec::PayloadDecoder::new(mutation_types);
-    let _ = decoder.decode(&dst).unwrap();
-}
-
-fn bench_wal_encode_decode(c: &mut Criterion) {
-    let (batch_10, types_10) = gen_new_batch_and_types(1);
-    let (batch_100, types_100) = gen_new_batch_and_types(10);
-    let (batch_10000, types_10000) = gen_new_batch_and_types(100);
-
-    let mut group = c.benchmark_group("wal_encode_decode");
-    let _ = group
-        .bench_function("arrow_encode_decode_with_10_num_rows", |b| {
-            b.iter(|| codec_arrow(&batch_10, &types_10))
-        })
-        .bench_function("arrow_encode_decode_with_100_num_rows", |b| {
-            b.iter(|| codec_arrow(&batch_100, &types_100))
-        })
-        .bench_function("arrow_encode_decode_with_10000_num_rows", |b| {
-            b.iter(|| codec_arrow(&batch_10000, &types_10000))
-        });
-    group.finish();
-}
-
-criterion_group!(benches, bench_wal_encode_decode);
-criterion_main!(benches);
--- a/src/storage/benches/wal/mod.rs
+++ b/src/storage/benches/wal/mod.rs
@@ -1,18 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-pub mod bench_decode;
-pub mod bench_encode;
-pub mod bench_wal;
-pub mod util;
--- a/src/storage/benches/wal/util/mod.rs
+++ b/src/storage/benches/wal/util/mod.rs
@@ -1,94 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-pub mod write_batch_util;
-
-use std::collections::HashMap;
-use std::sync::Arc;
-
-use datatypes::prelude::ScalarVector;
-use datatypes::type_id::LogicalTypeId;
-use datatypes::vectors::{
-    BooleanVector, Float64Vector, StringVector, TimestampMillisecondVector, UInt64Vector, VectorRef,
-};
-use rand::Rng;
-use storage::proto;
-use storage::write_batch::WriteBatch;
-use store_api::storage::WriteRequest;
-
-pub fn new_test_batch() -> WriteBatch {
-    write_batch_util::new_write_batch(
-        &[
-            ("k1", LogicalTypeId::UInt64, false),
-            ("ts", LogicalTypeId::TimestampMillisecond, false),
-            ("v1", LogicalTypeId::Boolean, true),
-            ("4", LogicalTypeId::Float64, false),
-            ("5", LogicalTypeId::Float64, false),
-            ("6", LogicalTypeId::Float64, false),
-            ("7", LogicalTypeId::Float64, false),
-            ("8", LogicalTypeId::Float64, false),
-            ("9", LogicalTypeId::Float64, false),
-            ("10", LogicalTypeId::String, false),
-        ],
-        Some(2),
-        3,
-    )
-}
-
-pub fn gen_new_batch_and_types(putdate_nums: usize) -> (WriteBatch, Vec<i32>) {
-    let mut batch = new_test_batch();
-    let mut rng = rand::thread_rng();
-    for _ in 0..putdate_nums {
-        let mut intvs = [0u64; 10];
-        let mut boolvs = [true; 10];
-        let mut tsvs = [0i64; 10];
-        let mut fvs = [0.0_f64; 10];
-        let svs = [
-            "value1_string",
-            "value2_string",
-            "value3_string",
-            "value4_string",
-            "value5_string",
-            "value6_string",
-            "value7_string",
-            "value8_string",
-            "value9_string",
-            "value10_string",
-        ];
-        rng.fill(&mut intvs[..]);
-        rng.fill(&mut boolvs[..]);
-        rng.fill(&mut tsvs[..]);
-        rng.fill(&mut fvs[..]);
-        let intv = Arc::new(UInt64Vector::from_slice(intvs)) as VectorRef;
-        let boolv = Arc::new(BooleanVector::from(boolvs.to_vec())) as VectorRef;
-        let tsv = Arc::new(TimestampMillisecondVector::from_values(tsvs)) as VectorRef;
-        let fvs = Arc::new(Float64Vector::from_slice(fvs)) as VectorRef;
-        let svs = Arc::new(StringVector::from_slice(&svs)) as VectorRef;
-        let put_data = HashMap::from([
-            ("k1".to_string(), intv.clone()),
-            ("v1".to_string(), boolv),
-            ("ts".to_string(), tsv.clone()),
-            ("4".to_string(), fvs.clone()),
-            ("5".to_string(), fvs.clone()),
-            ("6".to_string(), fvs.clone()),
-            ("7".to_string(), fvs.clone()),
-            ("8".to_string(), fvs.clone()),
-            ("9".to_string(), fvs),
-            ("10".to_string(), svs),
-        ]);
-        batch.put(put_data).unwrap();
-    }
-    let types = proto::wal::gen_mutation_types(batch.payload());
-    (batch, types)
-}
--- a/src/storage/benches/wal/util/write_batch_util.rs
+++ b/src/storage/benches/wal/util/write_batch_util.rs
@@ -1,27 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-use storage::write_batch::WriteBatch;
-
-use crate::memtable::util::schema_util::{self, ColumnDef};
-
-pub fn new_write_batch(
-    column_defs: &[ColumnDef],
-    timestamp_index: Option<usize>,
-    row_key_end: usize,
-) -> WriteBatch {
-    let schema = schema_util::new_schema_ref(column_defs, timestamp_index);
-
-    WriteBatch::new(schema, row_key_end)
-}
--- a/src/storage/build.rs
+++ b/src/storage/build.rs
@@ -1,19 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-fn main() {
-    tonic_build::configure()
-        .compile(&["proto/wal.proto"], &["."])
-        .expect("compile proto");
-}
--- a/src/storage/proto/wal.proto
+++ b/src/storage/proto/wal.proto
@@ -1,14 +0,0 @@
-syntax = "proto3";
-
-package greptime.storage.wal.v1;
-
-message WalHeader {
-  uint64 last_manifest_version = 1;
-  // Type of each mutation in payload, now only arrow payload uses this field.
-  repeated MutationType mutation_types = 2;
-}
-
-enum MutationType {
-  DELETE = 0;
-  PUT = 1;
-}
--- a/src/storage/src/chunk.rs
+++ b/src/storage/src/chunk.rs
@@ -1,451 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-use std::sync::Arc;
-
-use async_trait::async_trait;
-use common_query::logical_plan::Expr;
-use common_recordbatch::OrderOption;
-use common_telemetry::logging;
-use common_time::range::TimestampRange;
-use snafu::ResultExt;
-use store_api::storage::{Chunk, ChunkReader, RegionId, SchemaRef, SequenceNumber};
-use table::predicate::{Predicate, TimeRangePredicateBuilder};
-
-use crate::error::{self, Error, Result};
-use crate::memtable::{IterContext, MemtableRef};
-use crate::read::{
-    Batch, BoxedBatchReader, ChainReader, DedupReader, MergeReaderBuilder, WindowedReader,
-};
-use crate::schema::{ProjectedSchema, ProjectedSchemaRef, RegionSchemaRef};
-use crate::sst::{AccessLayerRef, FileHandle, LevelMetas, ReadOptions};
-use crate::window_infer::{PlainWindowInference, WindowInfer};
-
-/// Chunk reader implementation.
-// Now we use async-trait to implement the chunk reader, which is easier to implement than
-// using `Stream`, maybe change to `Stream` if we find out it is more efficient and have
-// necessary to do so.
-pub struct ChunkReaderImpl {
-    schema: ProjectedSchemaRef,
-    batch_reader: BoxedBatchReader,
-    output_ordering: Option<Vec<OrderOption>>,
-}
-
-#[async_trait]
-impl ChunkReader for ChunkReaderImpl {
-    type Error = Error;
-
-    fn user_schema(&self) -> &SchemaRef {
-        self.schema.projected_user_schema()
-    }
-
-    async fn next_chunk(&mut self) -> Result<Option<Chunk>> {
-        let batch = match self.batch_reader.next_batch().await? {
-            Some(b) => b,
-            None => return Ok(None),
-        };
-        Ok(Some(Chunk::new(batch.columns)))
-    }
-
-    fn project_chunk(&self, chunk: Chunk) -> Chunk {
-        let batch = Batch {
-            columns: chunk.columns,
-        };
-        self.schema.batch_to_chunk(&batch)
-    }
-
-    fn output_ordering(&self) -> Option<Vec<OrderOption>> {
-        self.output_ordering.clone()
-    }
-}
-
-impl ChunkReaderImpl {
-    pub fn new(
-        schema: ProjectedSchemaRef,
-        batch_reader: BoxedBatchReader,
-        output_ordering: Option<Vec<OrderOption>>,
-    ) -> ChunkReaderImpl {
-        ChunkReaderImpl {
-            schema,
-            batch_reader,
-            output_ordering,
-        }
-    }
-
-    #[inline]
-    pub fn projected_schema(&self) -> &ProjectedSchemaRef {
-        &self.schema
-    }
-}
-
-/// Builder to create a new [ChunkReaderImpl] from scan request.
-pub struct ChunkReaderBuilder {
-    region_id: RegionId,
-    schema: RegionSchemaRef,
-    projection: Option<Vec<usize>>,
-    filters: Vec<Expr>,
-    sst_layer: AccessLayerRef,
-    iter_ctx: IterContext,
-    memtables: Vec<MemtableRef>,
-    files_to_read: Vec<FileHandle>,
-    output_ordering: Option<Vec<OrderOption>>,
-    use_chain_reader: bool,
-}
-
-impl ChunkReaderBuilder {
-    pub fn new(region_id: RegionId, schema: RegionSchemaRef, sst_layer: AccessLayerRef) -> Self {
-        ChunkReaderBuilder {
-            region_id,
-            schema,
-            projection: None,
-            filters: vec![],
-            sst_layer,
-            iter_ctx: IterContext::default(),
-            memtables: Vec::new(),
-            files_to_read: Vec::new(),
-            output_ordering: None,
-            use_chain_reader: false,
-        }
-    }
-
-    /// Reserve space for iterating `num` memtables.
-    pub fn reserve_num_memtables(mut self, num: usize) -> Self {
-        self.memtables.reserve(num);
-        self
-    }
-
-    pub fn projection(mut self, projection: Option<Vec<usize>>) -> Self {
-        self.projection = projection;
-        self
-    }
-
-    pub fn filters(mut self, filters: Vec<Expr>) -> Self {
-        self.filters = filters;
-        self
-    }
-
-    pub fn output_ordering(mut self, ordering: Option<Vec<OrderOption>>) -> Self {
-        self.output_ordering = ordering;
-        self
-    }
-
-    pub fn batch_size(mut self, batch_size: usize) -> Self {
-        self.iter_ctx.batch_size = batch_size;
-        self
-    }
-
-    pub fn visible_sequence(mut self, sequence: SequenceNumber) -> Self {
-        self.iter_ctx.visible_sequence = sequence;
-        self
-    }
-
-    pub fn pick_memtables(mut self, memtables: MemtableRef) -> Self {
-        self.memtables.push(memtables);
-        self
-    }
-
-    /// Partition files and memtables according to their time windows and scan time windows
-    /// one by one.
-    ///
-    /// Note that compaction should not enable this.
-    pub fn use_chain_reader(mut self, use_chain_reader: bool) -> Self {
-        self.use_chain_reader = use_chain_reader;
-        self
-    }
-
-    /// Picks all SSTs in all levels
-    pub fn pick_all_ssts(mut self, ssts: &LevelMetas) -> Result<Self> {
-        let files = ssts.levels().iter().flat_map(|level| level.files());
-        // Now we read all files, so just reserve enough space to hold all files.
-        self.files_to_read.reserve(files.size_hint().0);
-        for file in files {
-            // We can't invoke async functions here, so we collects all files first, and
-            // create the batch reader later in `ChunkReaderBuilder`.
-            self.files_to_read.push(file.clone());
-        }
-        Ok(self)
-    }
-
-    /// Picks given SSTs to read.
-    pub fn pick_ssts(mut self, ssts: &[FileHandle]) -> Self {
-        for file in ssts {
-            self.files_to_read.push(file.clone());
-        }
-        self
-    }
-
-    /// Try to infer time window from output ordering. If the result
-    /// is `None` means the output ordering is not obeyed, otherwise
-    /// means the output ordering is obeyed and is same with request.
-    fn infer_time_windows(&self, output_ordering: &[OrderOption]) -> Option<Vec<TimestampRange>> {
-        if output_ordering.is_empty() {
-            return None;
-        }
-        let OrderOption { name, options } = &output_ordering[0];
-
-        if name != self.schema.timestamp_column_name() {
-            return None;
-        }
-        let memtable_stats = self
-            .memtables
-            .iter()
-            .filter(|m| m.num_rows() > 0) // Skip empty memtables.
-            .map(|m| m.stats())
-            .collect::<Vec<_>>();
-        let files = self
-            .files_to_read
-            .iter()
-            .map(FileHandle::meta)
-            .collect::<Vec<_>>();
-
-        Some(PlainWindowInference {}.infer_window(&files, &memtable_stats, options.descending))
-    }
-
-    async fn build_windowed(
-        self,
-        schema: &ProjectedSchemaRef,
-        time_range_predicate: &TimestampRange,
-        windows: Vec<TimestampRange>,
-        order_options: Vec<OrderOption>,
-    ) -> Result<BoxedBatchReader> {
-        let mut readers = Vec::with_capacity(windows.len());
-        for window in windows {
-            let time_range_predicate = time_range_predicate.and(&window);
-            let reader = self.build_reader(schema, &time_range_predicate).await?;
-            readers.push(reader);
-        }
-        let windowed_reader = WindowedReader::new(schema.clone(), readers, order_options);
-        Ok(Box::new(windowed_reader) as Box<_>)
-    }
-
-    async fn build_reader(
-        &self,
-        schema: &ProjectedSchemaRef,
-        time_range: &TimestampRange,
-    ) -> Result<BoxedBatchReader> {
-        let num_sources = self.memtables.len() + self.files_to_read.len();
-        let mut reader_builder = MergeReaderBuilder::with_capacity(schema.clone(), num_sources)
-            .batch_size(self.iter_ctx.batch_size);
-
-        for mem in &self.memtables {
-            let mut iter_ctx = self.iter_ctx.clone();
-            iter_ctx.time_range = Some(*time_range);
-            let iter = mem.iter(iter_ctx)?;
-            reader_builder = reader_builder.push_batch_iter(iter);
-        }
-
-        let predicate = Predicate::new(self.filters.clone());
-
-        let read_opts = ReadOptions {
-            batch_size: self.iter_ctx.batch_size,
-            projected_schema: schema.clone(),
-            predicate,
-            time_range: *time_range,
-        };
-
-        let mut num_read_files = 0;
-        for file in &self.files_to_read {
-            if !Self::file_in_range(file, time_range) {
-                logging::debug!(
-                    "Skip region {} file {:?}, predicate: {:?}",
-                    self.region_id,
-                    file,
-                    time_range
-                );
-                continue;
-            }
-
-            let reader = self.sst_layer.read_sst(file.clone(), &read_opts).await?;
-            reader_builder = reader_builder.push_batch_reader(reader);
-            num_read_files += 1;
-        }
-
-        logging::debug!(
-            "build reader done, region_id: {}, time_range: {:?}, total_files: {}, num_read_files: {}",
-            self.region_id,
-            time_range,
-            self.files_to_read.len(),
-            num_read_files,
-        );
-
-        let reader = reader_builder.build();
-        let reader = DedupReader::new(schema.clone(), reader);
-        Ok(Box::new(reader) as Box<_>)
-    }
-
-    pub async fn build(mut self) -> Result<ChunkReaderImpl> {
-        let time_range_predicate = self.build_time_range_predicate();
-        let schema = Arc::new(
-            ProjectedSchema::new(self.schema.clone(), self.projection.clone())
-                .context(error::InvalidProjectionSnafu)?,
-        );
-        self.iter_ctx.projected_schema = Some(schema.clone());
-
-        let mut output_ordering = None;
-        let reader = if let Some(ordering) = self.output_ordering.take() &&
-            let Some(windows) = self.infer_time_windows(&ordering) {
-                output_ordering = Some(ordering.clone());
-                self.build_windowed(&schema, &time_range_predicate, windows, ordering)
-                    .await?
-        } else if self.use_chain_reader {
-            self.build_chained(&schema, &time_range_predicate).await?
-        } else {
-            self.build_reader(&schema, &time_range_predicate).await?
-        };
-
-        Ok(ChunkReaderImpl::new(schema, reader, output_ordering))
-    }
-
-    async fn build_chained(
-        &self,
-        schema: &ProjectedSchemaRef,
-        time_range: &TimestampRange,
-    ) -> Result<BoxedBatchReader> {
-        let windows = self.infer_window_for_chain_reader(time_range);
-
-        logging::debug!(
-            "Infer window for chain reader, region_id: {}, memtables: {}, files: {}, num_windows: {}",
-            self.region_id,
-            self.memtables.len(),
-            self.files_to_read.len(),
-            windows.len(),
-        );
-
-        let mut readers = Vec::with_capacity(windows.len());
-        for window in &windows {
-            let time_range = time_range.and(window);
-            let reader = self.build_reader(schema, &time_range).await?;
-            readers.push(reader);
-        }
-
-        logging::debug!(
-            "Build chain reader, region_id: {}, time_range: {:?}, num_readers: {}",
-            self.region_id,
-            time_range,
-            readers.len(),
-        );
-
-        let chain_reader = ChainReader::new(schema.clone(), readers);
-        Ok(Box::new(chain_reader) as Box<_>)
-    }
-
-    /// Build time range predicate from schema and filters.
-    fn build_time_range_predicate(&self) -> TimestampRange {
-        let Some(ts_col) = self.schema.user_schema().timestamp_column() else {
-            return TimestampRange::min_to_max();
-        };
-        let unit = ts_col
-            .data_type
-            .as_timestamp()
-            .expect("Timestamp column must have timestamp-compatible type")
-            .unit();
-        TimeRangePredicateBuilder::new(&ts_col.name, unit, &self.filters).build()
-    }
-
-    /// Check if SST file's time range matches predicate.
-    fn file_in_range(file: &FileHandle, predicate: &TimestampRange) -> bool {
-        if predicate == &TimestampRange::min_to_max() {
-            return true;
-        }
-        // end_timestamp of sst file is inclusive.
-        let Some((start, end)) = *file.time_range() else {
-            return true;
-        };
-        let file_ts_range = TimestampRange::new_inclusive(Some(start), Some(end));
-        file_ts_range.intersects(predicate)
-    }
-
-    /// Returns the time range of memtables to read.
-    fn compute_memtable_range(&self) -> Option<TimestampRange> {
-        let (min_timestamp, max_timestamp) = self
-            .memtables
-            .iter()
-            .filter(|m| m.num_rows() > 0) // Skip empty memtables.
-            .map(|m| {
-                let stats = m.stats();
-                (stats.min_timestamp, stats.max_timestamp)
-            })
-            .reduce(|acc, e| (acc.0.min(e.0), acc.1.max(e.1)))?;
-
-        logging::debug!(
-            "Compute memtable range, region_id: {}, min: {:?}, max: {:?}",
-            self.region_id,
-            min_timestamp,
-            max_timestamp,
-        );
-
-        Some(TimestampRange::new_inclusive(
-            Some(min_timestamp),
-            Some(max_timestamp),
-        ))
-    }
-
-    /// Infer time window for chain reader according to the time range of memtables and files.
-    fn infer_window_for_chain_reader(&self, time_range: &TimestampRange) -> Vec<TimestampRange> {
-        let mut memtable_range = self.compute_memtable_range();
-        // file ranges: (start, end)
-        let mut file_ranges = Vec::with_capacity(self.files_to_read.len());
-        for file in &self.files_to_read {
-            if !Self::file_in_range(file, time_range) || file.time_range().is_none() {
-                continue;
-            }
-            // Safety: we have skip files whose range is `None`.
-            let range = file.time_range().unwrap();
-
-            // Filter by memtable's time range.
-            if let Some(mem_range) = &mut memtable_range {
-                let file_range = TimestampRange::new_inclusive(Some(range.0), Some(range.1));
-                if mem_range.intersects(&file_range) {
-                    // If the range of the SST intersects with the range of the
-                    // memtable, we merge it into the memtable's range.
-                    *mem_range = mem_range.or(&file_range);
-                    continue;
-                }
-            }
-
-            file_ranges.push((range.0, range.1));
-        }
-
-        if file_ranges.is_empty() {
-            return memtable_range.map(|range| vec![range]).unwrap_or_default();
-        }
-
-        // Sort by start times.
-        file_ranges.sort_unstable_by(|left, right| left.0.cmp(&right.0));
-
-        // Compute ranges for all SSTs.
-        let mut time_ranges = Vec::with_capacity(file_ranges.len() + 1);
-        // Safety: file_ranges is not empty.
-        let mut prev =
-            TimestampRange::new_inclusive(Some(file_ranges[0].0), Some(file_ranges[0].1));
-        for file_range in &file_ranges[1..] {
-            let current = TimestampRange::new_inclusive(Some(file_range.0), Some(file_range.1));
-            if prev.intersects(&current) {
-                prev = prev.or(&current);
-            } else {
-                time_ranges.push(prev);
-                prev = current;
-            }
-        }
-        time_ranges.push(prev);
-
-        if let Some(mem_range) = memtable_range {
-            time_ranges.push(mem_range);
-            // We have pushed the memtable range, resort the array.
-            time_ranges.sort_unstable_by(|left, right| left.start().cmp(right.start()));
-        }
-
-        time_ranges
-    }
-}
--- a/src/storage/src/codec.rs
+++ b/src/storage/src/codec.rs
@@ -1,33 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-use common_error::ext::ErrorExt;
-
-pub trait Encoder {
-    /// The type that is decoded.
-    type Item;
-    type Error: ErrorExt;
-
-    /// Encodes a message into the bytes buffer.
-    fn encode(&self, item: &Self::Item, dst: &mut Vec<u8>) -> Result<(), Self::Error>;
-}
-
-pub trait Decoder {
-    /// The type that is decoded.
-    type Item;
-    type Error: ErrorExt;
-
-    /// Decodes a message from the bytes buffer.
-    fn decode(&self, src: &[u8]) -> Result<Self::Item, Self::Error>;
-}
--- a/src/storage/src/compaction.rs
+++ b/src/storage/src/compaction.rs
@@ -1,193 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-pub mod noop;
-mod picker;
-mod scheduler;
-mod task;
-mod twcs;
-mod writer;
-
-use std::sync::Arc;
-
-use common_telemetry::warn;
-use common_time::timestamp::TimeUnit;
-use common_time::Timestamp;
-pub use picker::{LeveledTimeWindowPicker, Picker, PickerContext};
-pub use scheduler::{CompactionHandler, CompactionRequestImpl};
-use store_api::logstore::LogStore;
-use store_api::storage::CompactionStrategy;
-pub use task::{CompactionTask, CompactionTaskImpl};
-pub use twcs::TwcsPicker;
-
-use crate::scheduler::Scheduler;
-use crate::sst::FileHandle;
-
-pub type CompactionPickerRef<S> =
-    Arc<dyn Picker<Request = CompactionRequestImpl<S>, Task = CompactionTaskImpl<S>> + Send + Sync>;
-
-pub type CompactionSchedulerRef<S> =
-    Arc<dyn Scheduler<Request = CompactionRequestImpl<S>> + Send + Sync>;
-
-/// Infers the suitable time bucket duration.
-/// Now it simply find the max and min timestamp across all SSTs in level and fit the time span
-/// into time bucket.
-pub(crate) fn infer_time_bucket<'a>(files: impl Iterator<Item = &'a FileHandle>) -> i64 {
-    let mut max_ts = Timestamp::new(i64::MIN, TimeUnit::Second);
-    let mut min_ts = Timestamp::new(i64::MAX, TimeUnit::Second);
-
-    for f in files {
-        if let Some((start, end)) = f.time_range() {
-            min_ts = min_ts.min(*start);
-            max_ts = max_ts.max(*end);
-        } else {
-            // we don't expect an SST file without time range,
-            // it's either a bug or data corruption.
-            warn!("Found SST file without time range metadata: {f:?}");
-        }
-    }
-
-    // safety: Convert whatever timestamp into seconds will not cause overflow.
-    let min_sec = min_ts.convert_to(TimeUnit::Second).unwrap().value();
-    let max_sec = max_ts.convert_to(TimeUnit::Second).unwrap().value();
-
-    max_sec
-        .checked_sub(min_sec)
-        .map(|span| TIME_BUCKETS.fit_time_bucket(span)) // return the max bucket on subtraction overflow.
-        .unwrap_or_else(|| TIME_BUCKETS.max()) // safety: TIME_BUCKETS cannot be empty.
-}
-
-pub(crate) struct TimeBuckets([i64; 7]);
-
-impl TimeBuckets {
-    /// Fits a given time span into time bucket by find the minimum bucket that can cover the span.
-    /// Returns the max bucket if no such bucket can be found.
-    fn fit_time_bucket(&self, span_sec: i64) -> i64 {
-        assert!(span_sec >= 0);
-        match self.0.binary_search(&span_sec) {
-            Ok(idx) => self.0[idx],
-            Err(idx) => {
-                if idx < self.0.len() {
-                    self.0[idx]
-                } else {
-                    self.0.last().copied().unwrap()
-                }
-            }
-        }
-    }
-
-    #[cfg(test)]
-    fn get(&self, idx: usize) -> i64 {
-        self.0[idx]
-    }
-
-    fn max(&self) -> i64 {
-        self.0.last().copied().unwrap()
-    }
-}
-
-/// A set of predefined time buckets.
-pub(crate) const TIME_BUCKETS: TimeBuckets = TimeBuckets([
-    60 * 60,                 // one hour
-    2 * 60 * 60,             // two hours
-    12 * 60 * 60,            // twelve hours
-    24 * 60 * 60,            // one day
-    7 * 24 * 60 * 60,        // one week
-    365 * 24 * 60 * 60,      // one year
-    10 * 365 * 24 * 60 * 60, // ten years
-]);
-
-pub fn compaction_strategy_to_picker<S: LogStore>(
-    strategy: &CompactionStrategy,
-) -> CompactionPickerRef<S> {
-    match strategy {
-        CompactionStrategy::Twcs(twcs_opts) => Arc::new(TwcsPicker::new(
-            twcs_opts.max_active_window_files,
-            twcs_opts.max_inactive_window_files,
-            twcs_opts.time_window_seconds,
-        )) as Arc<_>,
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use common_time::Timestamp;
-
-    use super::*;
-    use crate::file_purger::noop::new_noop_file_purger;
-    use crate::sst::{FileHandle, FileId, FileMeta, Level};
-
-    /// Test util to create file handles.
-    pub fn new_file_handle(
-        file_id: FileId,
-        start_ts_millis: i64,
-        end_ts_millis: i64,
-        level: Level,
-    ) -> FileHandle {
-        let file_purger = new_noop_file_purger();
-        let layer = Arc::new(crate::test_util::access_layer_util::MockAccessLayer {});
-        FileHandle::new(
-            FileMeta {
-                region_id: 0.into(),
-                file_id,
-                time_range: Some((
-                    Timestamp::new_millisecond(start_ts_millis),
-                    Timestamp::new_millisecond(end_ts_millis),
-                )),
-                level,
-                file_size: 0,
-            },
-            layer,
-            file_purger,
-        )
-    }
-
-    #[test]
-    fn test_time_bucket() {
-        assert_eq!(TIME_BUCKETS.get(0), TIME_BUCKETS.fit_time_bucket(1));
-        assert_eq!(TIME_BUCKETS.get(0), TIME_BUCKETS.fit_time_bucket(60 * 60));
-        assert_eq!(
-            TIME_BUCKETS.get(1),
-            TIME_BUCKETS.fit_time_bucket(60 * 60 + 1)
-        );
-
-        assert_eq!(
-            TIME_BUCKETS.get(2),
-            TIME_BUCKETS.fit_time_bucket(TIME_BUCKETS.get(2) - 1)
-        );
-        assert_eq!(
-            TIME_BUCKETS.get(2),
-            TIME_BUCKETS.fit_time_bucket(TIME_BUCKETS.get(2))
-        );
-        assert_eq!(
-            TIME_BUCKETS.get(3),
-            TIME_BUCKETS.fit_time_bucket(TIME_BUCKETS.get(3) - 1)
-        );
-        assert_eq!(TIME_BUCKETS.get(6), TIME_BUCKETS.fit_time_bucket(i64::MAX));
-    }
-
-    #[test]
-    fn test_infer_time_buckets() {
-        assert_eq!(
-            TIME_BUCKETS.get(0),
-            infer_time_bucket(
-                [
-                    new_file_handle(FileId::random(), 0, TIME_BUCKETS.get(0) * 1000 - 1, 0),
-                    new_file_handle(FileId::random(), 1, 10_000, 0)
-                ]
-                .iter()
-            )
-        );
-    }
-}
--- a/src/storage/src/compaction/noop.rs
+++ b/src/storage/src/compaction/noop.rs
@@ -1,91 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-use std::fmt::{Debug, Formatter};
-use std::marker::PhantomData;
-
-use store_api::storage::RegionId;
-
-use crate::compaction::{CompactionTask, Picker};
-use crate::error::Result;
-use crate::scheduler::{Request, Scheduler};
-
-pub struct NoopCompactionScheduler<R> {
-    _phantom_data: PhantomData<R>,
-}
-
-impl<R> Default for NoopCompactionScheduler<R> {
-    fn default() -> Self {
-        Self {
-            _phantom_data: Default::default(),
-        }
-    }
-}
-
-impl<R> Debug for NoopCompactionScheduler<R> {
-    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("NoopCompactionScheduler<...>").finish()
-    }
-}
-
-#[derive(Default, Debug)]
-pub struct NoopCompactionRequest;
-
-#[derive(Default, Debug)]
-pub struct NoopCompactionPicker;
-
-impl Picker for NoopCompactionPicker {
-    type Request = NoopCompactionRequest;
-    type Task = NoopCompactionTask;
-
-    fn pick(&self, _req: &Self::Request) -> Result<Option<Self::Task>> {
-        Ok(None)
-    }
-}
-
-#[derive(Debug)]
-pub struct NoopCompactionTask;
-
-#[async_trait::async_trait]
-impl CompactionTask for NoopCompactionTask {
-    async fn run(self) -> Result<()> {
-        Ok(())
-    }
-}
-
-impl Request for NoopCompactionRequest {
-    type Key = RegionId;
-
-    fn key(&self) -> Self::Key {
-        RegionId::from(0)
-    }
-
-    fn complete(self, _result: Result<()>) {}
-}
-
-#[async_trait::async_trait]
-impl<R> Scheduler for NoopCompactionScheduler<R>
-where
-    R: Request<Key = RegionId>,
-{
-    type Request = R;
-
-    fn schedule(&self, _request: Self::Request) -> Result<bool> {
-        Ok(true)
-    }
-
-    async fn stop(&self, _await_termination: bool) -> Result<()> {
-        Ok(())
-    }
-}
--- a/src/storage/src/compaction/picker.rs
+++ b/src/storage/src/compaction/picker.rs
@@ -1,432 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-use std::collections::HashMap;
-use std::fmt::{Debug, Formatter};
-use std::marker::PhantomData;
-use std::time::Duration;
-
-use common_telemetry::{debug, error, info, warn};
-use common_time::timestamp::TimeUnit;
-use common_time::timestamp_millis::BucketAligned;
-use common_time::Timestamp;
-use snafu::ResultExt;
-use store_api::logstore::LogStore;
-
-use crate::compaction::infer_time_bucket;
-use crate::compaction::scheduler::CompactionRequestImpl;
-use crate::compaction::task::{CompactionOutput, CompactionTask, CompactionTaskImpl};
-use crate::error::{Result, TtlCalculationSnafu};
-use crate::scheduler::Request;
-use crate::sst::{FileHandle, FileId, LevelMeta};
-
-/// Picker picks input SST files and builds the compaction task.
-/// Different compaction strategy may implement different pickers.
-pub trait Picker: Debug + Send + 'static {
-    type Request: Request;
-    type Task: CompactionTask;
-
-    fn pick(&self, req: &Self::Request) -> Result<Option<Self::Task>>;
-}
-
-pub(crate) fn get_expired_ssts(
-    levels: &[LevelMeta],
-    ttl: Option<Duration>,
-    now: Timestamp,
-) -> Result<Vec<FileHandle>> {
-    let Some(ttl) = ttl else {
-        return Ok(vec![]);
-    };
-
-    let expire_time = now.sub_duration(ttl).context(TtlCalculationSnafu)?;
-
-    let expired_ssts = levels
-        .iter()
-        .flat_map(|l| l.get_expired_files(&expire_time).into_iter())
-        .collect();
-    Ok(expired_ssts)
-}
-
-pub struct PickerContext {
-    compaction_time_window: Option<i64>,
-}
-
-impl PickerContext {
-    pub fn with(compaction_time_window: Option<i64>) -> Self {
-        Self {
-            compaction_time_window,
-        }
-    }
-
-    pub fn compaction_time_window(&self) -> Option<i64> {
-        self.compaction_time_window
-    }
-}
-
-/// `LeveledTimeWindowPicker` only handles level 0 to level 1 compaction in a time-window tiered
-/// manner. It picks all SSTs in level 0 and writes rows in these SSTs to a new file partitioned
-/// by a inferred time bucket in level 1.
-pub struct LeveledTimeWindowPicker<S> {
-    _phantom_data: PhantomData<S>,
-}
-
-impl<S> Debug for LeveledTimeWindowPicker<S> {
-    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
-        write!(f, "LeveledTimeWindowPicker{{..}}")
-    }
-}
-
-impl<S> Default for LeveledTimeWindowPicker<S> {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
-impl<S> LeveledTimeWindowPicker<S> {
-    pub fn new() -> Self {
-        Self {
-            _phantom_data: Default::default(),
-        }
-    }
-}
-
-impl<S: LogStore> Picker for LeveledTimeWindowPicker<S> {
-    type Request = CompactionRequestImpl<S>;
-    type Task = CompactionTaskImpl<S>;
-
-    fn pick(&self, req: &CompactionRequestImpl<S>) -> Result<Option<CompactionTaskImpl<S>>> {
-        let levels = &req.levels();
-        let expired_ssts = get_expired_ssts(levels.levels(), req.ttl, Timestamp::current_millis())
-            .map_err(|e| {
-                error!(e;"Failed to get region expired SST files, region: {}, ttl: {:?}", req.region_id, req.ttl);
-                e
-            })
-            .unwrap_or_default();
-
-        if !expired_ssts.is_empty() {
-            info!(
-                "Expired SSTs in region {}: {:?}",
-                req.region_id, expired_ssts
-            );
-            // here we mark expired SSTs as compacting to avoid them being picked.
-            expired_ssts.iter().for_each(|f| f.mark_compacting(true));
-        }
-
-        let ctx = &PickerContext::with(req.compaction_time_window);
-
-        let mut outputs = vec![];
-        for level_num in 0..levels.level_num() {
-            let level = levels.level(level_num as u8);
-            let compaction_time_window = Self::pick_level(ctx, level, &mut outputs);
-
-            if outputs.is_empty() {
-                debug!(
-                    "No SST file can be compacted at level {}, path: {:?}",
-                    level_num, req.sst_layer
-                );
-                continue;
-            }
-
-            debug!(
-                "Found SST files to compact {:?} on level: {}, compaction window: {:?}",
-                outputs, level_num, compaction_time_window,
-            );
-            return Ok(Some(CompactionTaskImpl {
-                schema: req.schema(),
-                sst_layer: req.sst_layer.clone(),
-                outputs,
-                writer: req.writer.clone(),
-                shared_data: req.shared.clone(),
-                wal: req.wal.clone(),
-                manifest: req.manifest.clone(),
-                expired_ssts,
-                sst_write_buffer_size: req.sst_write_buffer_size,
-                compaction_time_window,
-                reschedule_on_finish: req.reschedule_on_finish,
-            }));
-        }
-
-        Ok(None)
-    }
-}
-
-impl<S> LeveledTimeWindowPicker<S> {
-    fn pick_level(
-        ctx: &PickerContext,
-        level: &LevelMeta,
-        results: &mut Vec<CompactionOutput>,
-    ) -> Option<i64> {
-        // SimpleTimeWindowStrategy only handles level 0 to level 1 compaction.
-        if level.level() != 0 {
-            return None;
-        }
-        let files = find_compactable_files(level);
-        debug!("Compactable files found: {:?}", files);
-        if files.is_empty() {
-            return None;
-        }
-        let time_window = ctx.compaction_time_window().unwrap_or_else(|| {
-            let inferred = infer_time_bucket(files.iter());
-            debug!(
-                "Compaction window is not present, inferring from files: {:?}",
-                inferred
-            );
-            inferred
-        });
-        let buckets = calculate_time_buckets(time_window, &files);
-        debug!("File bucket:{}, file groups: {:?}", time_window, buckets);
-
-        results.extend(buckets.into_iter().map(|(bound, files)| CompactionOutput {
-            output_file_id: FileId::random(),
-            output_level: 1,
-            time_window_bound: bound,
-            time_window_sec: time_window,
-            inputs: files,
-            // strict window is used in simple time window strategy in that rows in one file
-            // may get compacted to multiple destinations.
-            strict_window: true,
-        }));
-        Some(time_window)
-    }
-}
-
-/// Finds files that can be compacted in given level.
-/// Currently they're files that is not currently under compaction.
-#[inline]
-fn find_compactable_files(level: &LevelMeta) -> Vec<FileHandle> {
-    level.files().filter(|f| !f.compacting()).cloned().collect()
-}
-
-/// Calculates buckets for files. If file does not contain a time range in metadata, it will be
-/// assigned to a special bucket `i64::MAX` (normally no timestamp can be aligned to this bucket)
-/// so that all files without timestamp can be compacted together.
-fn calculate_time_buckets(bucket_sec: i64, files: &[FileHandle]) -> HashMap<i64, Vec<FileHandle>> {
-    let mut buckets = HashMap::new();
-
-    for file in files {
-        if let Some((start, end)) = file.time_range() {
-            let bounds = file_time_bucket_span(
-                start.convert_to(TimeUnit::Second).unwrap().value(),
-                end.convert_to(TimeUnit::Second).unwrap().value(),
-                bucket_sec,
-            );
-            for bound in bounds {
-                buckets
-                    .entry(bound)
-                    .or_insert_with(Vec::new)
-                    .push(file.clone());
-            }
-        } else {
-            warn!("Found corrupted SST without timestamp bounds: {:?}", file);
-        }
-    }
-    buckets
-}
-
-/// Calculates timestamp span between start and end timestamp.
-fn file_time_bucket_span(start_sec: i64, end_sec: i64, bucket_sec: i64) -> Vec<i64> {
-    assert!(start_sec <= end_sec);
-
-    // if timestamp is between `[i64::MIN, i64::MIN.align_by_bucket(bucket)]`, which cannot
-    // be aligned to a valid i64 bound, simply return `i64::MIN` rather than just underflow.
-    let mut start_aligned = start_sec.align_by_bucket(bucket_sec).unwrap_or(i64::MIN);
-    let end_aligned = end_sec.align_by_bucket(bucket_sec).unwrap_or(i64::MIN);
-
-    let mut res = Vec::with_capacity(((end_aligned - start_aligned) / bucket_sec + 1) as usize);
-    while start_aligned < end_aligned {
-        res.push(start_aligned);
-        start_aligned += bucket_sec;
-    }
-    res.push(end_aligned);
-    res
-}
-
-#[cfg(test)]
-mod tests {
-    use std::collections::{HashMap, HashSet};
-    use std::sync::Arc;
-
-    use super::*;
-    use crate::compaction::tests::new_file_handle;
-    use crate::compaction::TIME_BUCKETS;
-    use crate::file_purger::noop::new_noop_file_purger;
-    use crate::sst::{FileId, Level, LevelMetas};
-
-    #[test]
-    fn test_time_bucket_span() {
-        assert_eq!(vec![0], file_time_bucket_span(1, 9, 10));
-
-        assert_eq!(vec![0, 10], file_time_bucket_span(1, 10, 10));
-
-        assert_eq!(vec![-10], file_time_bucket_span(-10, -1, 10));
-
-        assert_eq!(vec![-10, 0], file_time_bucket_span(-10, 0, 10));
-    }
-
-    #[test]
-    fn test_time_bucket_span_large() {
-        assert_eq!(
-            vec![
-                (i64::MAX - 10).align_by_bucket(10).unwrap(),
-                i64::MAX.align_by_bucket(10).unwrap(),
-            ],
-            file_time_bucket_span(i64::MAX - 10, i64::MAX, 10)
-        );
-
-        // magic hmmm?
-        for bucket in 1..100 {
-            assert_eq!(
-                vec![
-                    i64::MIN,
-                    (i64::MIN + bucket).align_by_bucket(bucket).unwrap()
-                ],
-                file_time_bucket_span(i64::MIN, i64::MIN + bucket, bucket)
-            );
-        }
-    }
-
-    fn new_file_handles(input: &[(FileId, i64, i64)]) -> Vec<FileHandle> {
-        input
-            .iter()
-            .map(|(file_id, start, end)| new_file_handle(*file_id, *start, *end, 0))
-            .collect()
-    }
-
-    fn check_bucket_calculation(
-        bucket_sec: i64,
-        files: Vec<FileHandle>,
-        expected: &[(i64, &[FileId])],
-    ) {
-        let res = calculate_time_buckets(bucket_sec, &files);
-
-        let expected = expected
-            .iter()
-            .map(|(bucket, file_ids)| (*bucket, file_ids.iter().copied().collect::<HashSet<_>>()))
-            .collect::<HashMap<_, _>>();
-
-        for (bucket, file_ids) in expected {
-            let actual = res
-                .get(&bucket)
-                .unwrap()
-                .iter()
-                .map(|f| f.file_id())
-                .collect();
-            assert_eq!(
-                file_ids, actual,
-                "bucket: {bucket}, expected: {file_ids:?}, actual: {actual:?}",
-            );
-        }
-    }
-
-    #[test]
-    fn test_calculate_time_buckets() {
-        let file_id_a = FileId::random();
-        let file_id_b = FileId::random();
-        // simple case, files with disjoint
-        check_bucket_calculation(
-            10,
-            new_file_handles(&[(file_id_a, 0, 9000), (file_id_b, 10000, 19000)]),
-            &[(0, &[file_id_a]), (10, &[file_id_b])],
-        );
-
-        // files across buckets
-        check_bucket_calculation(
-            10,
-            new_file_handles(&[(file_id_a, 0, 10001), (file_id_b, 10000, 19000)]),
-            &[(0, &[file_id_a]), (10, &[file_id_a, file_id_b])],
-        );
-        check_bucket_calculation(
-            10,
-            new_file_handles(&[(file_id_a, 0, 10000)]),
-            &[(0, &[file_id_a]), (10, &[file_id_a])],
-        );
-
-        // file with an large time range
-        let file_id_array = &[file_id_a];
-        let expected = (0..(TIME_BUCKETS.get(4) / TIME_BUCKETS.get(0)))
-            .map(|b| (b * TIME_BUCKETS.get(0), file_id_array as _))
-            .collect::<Vec<_>>();
-        check_bucket_calculation(
-            TIME_BUCKETS.get(0),
-            new_file_handles(&[(file_id_a, 0, TIME_BUCKETS.get(4) * 1000)]),
-            &expected,
-        );
-    }
-
-    struct TtlTester {
-        files: Vec<(FileId, i64, i64, Level)>,
-        ttl: Option<Duration>,
-        expired: Vec<usize>,
-        now: Timestamp,
-    }
-
-    impl TtlTester {
-        fn check(&self) {
-            let expected_expired = self
-                .expired
-                .iter()
-                .map(|idx| self.files[*idx].0)
-                .collect::<HashSet<_>>();
-            let file_purger = new_noop_file_purger();
-            let layer = Arc::new(crate::test_util::access_layer_util::MockAccessLayer {});
-            let file_handles = self
-                .files
-                .iter()
-                .map(|(file_id, start_ts, end_ts, level)| {
-                    new_file_handle(*file_id, *start_ts, *end_ts, *level).meta()
-                })
-                .collect::<Vec<_>>();
-            let levels = LevelMetas::new(layer, file_purger).merge(
-                file_handles.into_iter(),
-                vec![].into_iter(),
-                None,
-            );
-            let expired = get_expired_ssts(levels.levels(), self.ttl, self.now)
-                .unwrap()
-                .into_iter()
-                .map(|f| f.file_id())
-                .collect::<HashSet<_>>();
-            assert_eq!(expected_expired, expired);
-        }
-    }
-
-    #[test]
-    fn test_find_expired_ssts() {
-        TtlTester {
-            files: vec![
-                (FileId::random(), 8000, 9000, 0),
-                (FileId::random(), 10000, 11000, 0),
-                (FileId::random(), 8000, 11000, 1),
-                (FileId::random(), 2000, 3000, 1),
-            ],
-            ttl: Some(Duration::from_secs(1)),
-            expired: vec![3],
-            now: Timestamp::new_second(10),
-        }
-        .check();
-
-        TtlTester {
-            files: vec![
-                (FileId::random(), 8000, 8999, 0),
-                (FileId::random(), 10000, 11000, 0),
-                (FileId::random(), 8000, 11000, 1),
-                (FileId::random(), 2000, 3000, 1),
-            ],
-            ttl: Some(Duration::from_secs(1)),
-            expired: vec![0, 3],
-            now: Timestamp::new_second(10),
-        }
-        .check();
-    }
-}
--- a/src/storage/src/compaction/scheduler.rs
+++ b/src/storage/src/compaction/scheduler.rs
@@ -1,157 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-use std::marker::PhantomData;
-use std::sync::Arc;
-use std::time::Duration;
-
-use common_base::readable_size::ReadableSize;
-use common_telemetry::{debug, error, info};
-use store_api::logstore::LogStore;
-use store_api::storage::RegionId;
-use tokio::sync::oneshot::Sender;
-use tokio::sync::Notify;
-
-use crate::compaction::task::CompactionTask;
-use crate::compaction::CompactionPickerRef;
-use crate::error::Result;
-use crate::manifest::region::RegionManifest;
-use crate::region::{RegionWriterRef, SharedDataRef};
-use crate::scheduler::rate_limit::BoxedRateLimitToken;
-use crate::scheduler::{Handler, Request};
-use crate::schema::RegionSchemaRef;
-use crate::sst::AccessLayerRef;
-use crate::version::LevelMetasRef;
-use crate::wal::Wal;
-
-impl<S: LogStore> Request for CompactionRequestImpl<S> {
-    type Key = RegionId;
-
-    #[inline]
-    fn key(&self) -> RegionId {
-        self.region_id
-    }
-
-    fn complete(self, result: Result<()>) {
-        if let Some(sender) = self.sender {
-            // We don't care the send result as callers might not
-            // wait the result.
-            let _ = sender.send(result);
-        }
-    }
-}
-
-/// Region compaction request.
-pub struct CompactionRequestImpl<S: LogStore> {
-    pub region_id: RegionId,
-    pub sst_layer: AccessLayerRef,
-    pub writer: RegionWriterRef<S>,
-    pub shared: SharedDataRef,
-    pub manifest: RegionManifest,
-    pub wal: Wal<S>,
-    pub ttl: Option<Duration>,
-    pub compaction_time_window: Option<i64>,
-    /// Compaction result sender.
-    pub sender: Option<Sender<Result<()>>>,
-    pub picker: CompactionPickerRef<S>,
-    pub sst_write_buffer_size: ReadableSize,
-    /// Whether to immediately reschedule another compaction when finished.
-    pub reschedule_on_finish: bool,
-}
-
-impl<S: LogStore> CompactionRequestImpl<S> {
-    #[inline]
-    pub(crate) fn schema(&self) -> RegionSchemaRef {
-        self.shared.version_control.current().schema().clone()
-    }
-
-    #[inline]
-    pub(crate) fn levels(&self) -> LevelMetasRef {
-        self.shared.version_control.current().ssts().clone()
-    }
-}
-
-pub struct CompactionHandler<S: LogStore> {
-    _phantom_data: PhantomData<S>,
-    #[cfg(test)]
-    pub pending_tasks: Arc<tokio::sync::RwLock<Vec<tokio::task::JoinHandle<()>>>>,
-}
-
-impl<S: LogStore> Default for CompactionHandler<S> {
-    fn default() -> Self {
-        Self {
-            _phantom_data: Default::default(),
-            #[cfg(test)]
-            pending_tasks: Arc::new(Default::default()),
-        }
-    }
-}
-
-impl<S: LogStore> CompactionHandler<S> {
-    #[cfg(test)]
-    pub fn new_with_pending_tasks(
-        tasks: Arc<tokio::sync::RwLock<Vec<tokio::task::JoinHandle<()>>>>,
-    ) -> Self {
-        Self {
-            _phantom_data: Default::default(),
-            pending_tasks: tasks,
-        }
-    }
-}
-
-#[async_trait::async_trait]
-impl<S> Handler for CompactionHandler<S>
-where
-    S: LogStore,
-{
-    type Request = CompactionRequestImpl<S>;
-
-    async fn handle_request(
-        &self,
-        req: Self::Request,
-        token: BoxedRateLimitToken,
-        finish_notifier: Arc<Notify>,
-    ) -> Result<()> {
-        let region_id = req.key();
-        let Some(task) = req.picker.pick(&req)? else {
-            info!("No file needs compaction in region: {:?}", region_id);
-            req.complete(Ok(()));
-            return Ok(());
-        };
-
-        debug!("Compaction task, region: {:?}, task: {:?}", region_id, task);
-        // TODO(hl): we need to keep a track of task handle here to allow task cancellation.
-        let _handle = common_runtime::spawn_bg(async move {
-            if let Err(e) = task.run().await {
-                // TODO(hl): maybe resubmit compaction task on failure?
-                error!(e; "Failed to compact region: {:?}", region_id);
-
-                req.complete(Err(e));
-            } else {
-                info!("Successfully compacted region: {:?}", region_id);
-
-                req.complete(Ok(()));
-            }
-            // releases rate limit token
-            token.try_release();
-            // notify scheduler to schedule next task when current task finishes.
-            finish_notifier.notify_one();
-        });
-
-        #[cfg(test)]
-        self.pending_tasks.write().await.push(_handle);
-
-        Ok(())
-    }
-}
--- a/src/storage/src/compaction/task.rs
+++ b/src/storage/src/compaction/task.rs
@@ -1,309 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-use std::collections::HashSet;
-use std::fmt::{Debug, Formatter};
-
-use common_base::readable_size::ReadableSize;
-use common_telemetry::{debug, error, info};
-use itertools::Itertools;
-use snafu::ResultExt;
-use store_api::logstore::LogStore;
-use store_api::storage::{CompactContext, RegionId};
-
-use crate::compaction::writer::build_sst_reader;
-use crate::error;
-use crate::error::Result;
-use crate::manifest::action::RegionEdit;
-use crate::manifest::region::RegionManifest;
-use crate::region::{RegionWriterRef, SharedDataRef, WriterCompactRequest};
-use crate::schema::RegionSchemaRef;
-use crate::sst::{
-    AccessLayerRef, FileHandle, FileId, FileMeta, Level, Source, SstInfo, WriteOptions,
-};
-use crate::wal::Wal;
-
-const MAX_PARALLEL_COMPACTION: usize = 8;
-
-#[async_trait::async_trait]
-pub trait CompactionTask: Debug + Send + Sync + 'static {
-    async fn run(self) -> Result<()>;
-}
-
-pub struct CompactionTaskImpl<S: LogStore> {
-    pub schema: RegionSchemaRef,
-    pub sst_layer: AccessLayerRef,
-    pub outputs: Vec<CompactionOutput>,
-    pub writer: RegionWriterRef<S>,
-    pub shared_data: SharedDataRef,
-    pub wal: Wal<S>,
-    pub manifest: RegionManifest,
-    pub expired_ssts: Vec<FileHandle>,
-    pub sst_write_buffer_size: ReadableSize,
-    pub compaction_time_window: Option<i64>,
-    pub reschedule_on_finish: bool,
-}
-
-impl<S: LogStore> Debug for CompactionTaskImpl<S> {
-    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("CompactionTaskImpl")
-            .field("region_name", &self.shared_data.name())
-            .finish()
-    }
-}
-
-impl<S: LogStore> Drop for CompactionTaskImpl<S> {
-    fn drop(&mut self) {
-        self.mark_files_compacting(false);
-    }
-}
-
-impl<S: LogStore> CompactionTaskImpl<S> {
-    /// Compacts inputs SSTs, returns `(output file, compacted input file)`.
-    async fn merge_ssts(&mut self) -> Result<(HashSet<FileMeta>, HashSet<FileMeta>)> {
-        let mut futs = Vec::with_capacity(self.outputs.len());
-        let mut compacted_inputs = HashSet::new();
-        let region_id = self.shared_data.id();
-        for output in self.outputs.drain(..) {
-            let schema = self.schema.clone();
-            let sst_layer = self.sst_layer.clone();
-            let sst_write_buffer_size = self.sst_write_buffer_size;
-            compacted_inputs.extend(output.inputs.iter().map(FileHandle::meta));
-
-            info!(
-                "Compaction output [{}]-> {}",
-                output
-                    .inputs
-                    .iter()
-                    .map(|f| f.file_id().to_string())
-                    .join(","),
-                output.output_file_id
-            );
-
-            // TODO(hl): Maybe spawn to runtime to exploit in-job parallelism.
-            futs.push(async move {
-                output
-                    .build(region_id, schema, sst_layer, sst_write_buffer_size)
-                    .await
-            });
-        }
-
-        let mut outputs = HashSet::with_capacity(futs.len());
-        while !futs.is_empty() {
-            let mut task_chunk = Vec::with_capacity(MAX_PARALLEL_COMPACTION);
-            for _ in 0..MAX_PARALLEL_COMPACTION {
-                if let Some(task) = futs.pop() {
-                    task_chunk.push(common_runtime::spawn_bg(task));
-                }
-            }
-            let metas = futures::future::try_join_all(task_chunk)
-                .await
-                .context(error::JoinSnafu)?
-                .into_iter()
-                .collect::<Result<Vec<_>>>()?;
-            outputs.extend(metas.into_iter().flatten());
-        }
-
-        let inputs = compacted_inputs.into_iter().collect();
-        Ok((outputs, inputs))
-    }
-
-    /// Writes updated SST info into manifest.
-    async fn write_manifest_and_apply(
-        &self,
-        output: HashSet<FileMeta>,
-        input: HashSet<FileMeta>,
-    ) -> Result<()> {
-        let version = &self.shared_data.version_control;
-        let region_version = version.metadata().version();
-
-        let edit = RegionEdit {
-            region_version,
-            flushed_sequence: None,
-            files_to_add: Vec::from_iter(output),
-            files_to_remove: Vec::from_iter(input),
-            compaction_time_window: self.compaction_time_window,
-        };
-        debug!(
-            "Compacted region: {}, region edit: {:?}",
-            version.metadata().name(),
-            edit
-        );
-        self.writer
-            .write_edit_and_apply(&self.wal, &self.shared_data, &self.manifest, edit, None)
-            .await
-    }
-
-    /// Mark files are under compaction.
-    fn mark_files_compacting(&self, compacting: bool) {
-        for o in &self.outputs {
-            for input in &o.inputs {
-                input.mark_compacting(compacting);
-            }
-        }
-    }
-}
-
-#[async_trait::async_trait]
-impl<S: LogStore> CompactionTask for CompactionTaskImpl<S> {
-    async fn run(mut self) -> Result<()> {
-        let _timer = crate::metrics::COMPACT_ELAPSED.start_timer();
-        self.mark_files_compacting(true);
-
-        let (output, mut compacted) = self.merge_ssts().await.map_err(|e| {
-            error!(e; "Failed to compact region: {}", self.shared_data.name());
-            e
-        })?;
-        compacted.extend(self.expired_ssts.iter().map(FileHandle::meta));
-
-        let input_ids = compacted.iter().map(|f| f.file_id).collect::<Vec<_>>();
-        let output_ids = output.iter().map(|f| f.file_id).collect::<Vec<_>>();
-        info!(
-            "Compacting SST files, input: {:?}, output: {:?}, window: {:?}",
-            input_ids, output_ids, self.compaction_time_window
-        );
-
-        let no_output = output.is_empty();
-        let write_result = self
-            .write_manifest_and_apply(output, compacted)
-            .await
-            .map_err(|e| {
-                error!(e; "Failed to update region manifest: {}", self.shared_data.name());
-                e
-            });
-
-        if !no_output && self.reschedule_on_finish {
-            // only reschedule another compaction if current compaction has output and it's
-            // triggered by flush.
-            if let Err(e) = self
-                .writer
-                .compact(WriterCompactRequest {
-                    shared_data: self.shared_data.clone(),
-                    sst_layer: self.sst_layer.clone(),
-                    manifest: self.manifest.clone(),
-                    wal: self.wal.clone(),
-                    region_writer: self.writer.clone(),
-                    compact_ctx: CompactContext { wait: false },
-                })
-                .await
-            {
-                error!(e; "Failed to schedule a compaction after compaction, region id: {}", self.shared_data.id());
-            } else {
-                info!(
-                    "Immediately schedule another compaction for region: {}",
-                    self.shared_data.id()
-                );
-            }
-        }
-        write_result
-    }
-}
-
-/// Many-to-many compaction can be decomposed to a many-to-one compaction from level n to level n+1
-/// and a many-to-one compaction from level n+1 to level n+1.
-#[derive(Debug)]
-pub struct CompactionOutput {
-    pub output_file_id: FileId,
-    /// Compaction output file level.
-    pub output_level: Level,
-    /// The left bound of time window.
-    pub time_window_bound: i64,
-    /// Time window size in seconds.
-    pub time_window_sec: i64,
-    /// Compaction input files.
-    pub inputs: Vec<FileHandle>,
-    /// If the compaction output is strictly windowed.
-    pub strict_window: bool,
-}
-
-impl CompactionOutput {
-    async fn build(
-        &self,
-        region_id: RegionId,
-        schema: RegionSchemaRef,
-        sst_layer: AccessLayerRef,
-        sst_write_buffer_size: ReadableSize,
-    ) -> Result<Option<FileMeta>> {
-        let time_range = if self.strict_window {
-            (
-                Some(self.time_window_bound),
-                Some(self.time_window_bound + self.time_window_sec),
-            )
-        } else {
-            (None, None)
-        };
-
-        let reader = build_sst_reader(
-            region_id,
-            schema,
-            sst_layer.clone(),
-            &self.inputs,
-            time_range,
-        )
-        .await?;
-
-        let opts = WriteOptions {
-            sst_write_buffer_size,
-        };
-        let _timer = crate::metrics::MERGE_ELAPSED.start_timer();
-        let meta = sst_layer
-            .write_sst(self.output_file_id, Source::Reader(reader), &opts)
-            .await?
-            .map(
-                |SstInfo {
-                     time_range,
-                     file_size,
-                     ..
-                 }| FileMeta {
-                    region_id,
-                    file_id: self.output_file_id,
-                    time_range,
-                    level: self.output_level,
-                    file_size,
-                },
-            );
-        Ok(meta)
-    }
-}
-
-#[cfg(test)]
-pub mod tests {
-    use std::sync::Arc;
-
-    use super::*;
-    use crate::compaction::task::CompactionTask;
-
-    pub type CallbackRef = Arc<dyn Fn() + Send + Sync>;
-
-    pub struct NoopCompactionTask {
-        pub cbs: Vec<CallbackRef>,
-    }
-
-    impl Debug for NoopCompactionTask {
-        fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
-            f.debug_struct("storage::compaction::task::tests::NoopCompactionTask")
-                .finish()
-        }
-    }
-
-    #[async_trait::async_trait]
-    impl CompactionTask for NoopCompactionTask {
-        async fn run(self) -> Result<()> {
-            for cb in &self.cbs {
-                cb()
-            }
-            Ok(())
-        }
-    }
-}
--- a/src/storage/src/compaction/twcs.rs
+++ b/src/storage/src/compaction/twcs.rs
@@ -1,406 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-//! Time-window compaction strategy
-
-use std::collections::BTreeMap;
-use std::fmt::{Debug, Formatter};
-use std::marker::PhantomData;
-
-use common_telemetry::{debug, info, warn};
-use common_time::timestamp::TimeUnit;
-use common_time::timestamp_millis::BucketAligned;
-use common_time::Timestamp;
-use store_api::logstore::LogStore;
-
-use crate::compaction::picker::get_expired_ssts;
-use crate::compaction::task::CompactionOutput;
-use crate::compaction::{infer_time_bucket, CompactionRequestImpl, CompactionTaskImpl, Picker};
-use crate::sst::{FileHandle, FileId, LevelMeta};
-
-/// `TwcsPicker` picks files of which the max timestamp are in the same time window as compaction
-/// candidates.
-pub struct TwcsPicker<S> {
-    max_active_window_files: usize,
-    max_inactive_window_files: usize,
-    time_window_seconds: Option<i64>,
-    _phantom_data: PhantomData<S>,
-}
-
-impl<S> Debug for TwcsPicker<S> {
-    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("TwcsPicker")
-            .field("max_active_window_files", &self.max_active_window_files)
-            .field("max_inactive_window_files", &self.max_inactive_window_files)
-            .finish()
-    }
-}
-
-impl<S> TwcsPicker<S> {
-    pub fn new(
-        max_active_window_files: usize,
-        max_inactive_window_files: usize,
-        time_window_seconds: Option<i64>,
-    ) -> Self {
-        Self {
-            max_inactive_window_files,
-            max_active_window_files,
-            _phantom_data: Default::default(),
-            time_window_seconds,
-        }
-    }
-
-    /// Builds compaction output from files.
-    /// For active writing window, we allow for at most `max_active_window_files` files to alleviate
-    /// fragmentation. For other windows, we allow at most 1 file at each window.
-    fn build_output(
-        &self,
-        time_windows: &BTreeMap<i64, Vec<FileHandle>>,
-        active_window: Option<i64>,
-        window_size: i64,
-    ) -> Vec<CompactionOutput> {
-        let mut output = vec![];
-        for (window, files) in time_windows {
-            if let Some(active_window) = active_window && *window == active_window {
-                if files.len() > self.max_active_window_files {
-                    output.push(CompactionOutput {
-                        output_file_id: FileId::random(),
-                        output_level: 1, // we only have two levels and always compact to l1 
-                        time_window_bound: *window,
-                        time_window_sec: window_size,
-                        inputs: files.clone(),
-                        // Strict window is not needed since we always compact many files to one 
-                        // single file in TWCS.
-                        strict_window: false,
-                    });
-                } else {
-                    debug!("Active window not present or no enough files in active window {:?}, window: {}", active_window, *window);
-                }
-            } else {
-                // not active writing window
-                if files.len() > self.max_inactive_window_files {
-                    output.push(CompactionOutput {
-                        output_file_id: FileId::random(),
-                        output_level: 1,
-                        time_window_bound: *window,
-                        time_window_sec: window_size,
-                        inputs: files.clone(),
-                        strict_window: false,
-                    });
-                } else {
-                    debug!("No enough files, current: {}, max_inactive_window_files: {}", files.len(), self.max_inactive_window_files)
-                }
-            }
-        }
-        output
-    }
-}
-
-impl<S: LogStore> Picker for TwcsPicker<S> {
-    type Request = CompactionRequestImpl<S>;
-    type Task = CompactionTaskImpl<S>;
-
-    fn pick(&self, req: &Self::Request) -> crate::error::Result<Option<Self::Task>> {
-        let levels = req.levels();
-        let expired_ssts = get_expired_ssts(levels.levels(), req.ttl, Timestamp::current_millis())?;
-        if !expired_ssts.is_empty() {
-            info!(
-                "Expired SSTs in region {}: {:?}",
-                req.region_id, expired_ssts
-            );
-            // here we mark expired SSTs as compacting to avoid them being picked.
-            expired_ssts.iter().for_each(|f| f.mark_compacting(true));
-        }
-
-        let time_window_size = req
-            .compaction_time_window
-            .or(self.time_window_seconds)
-            .unwrap_or_else(|| {
-                let inferred = infer_time_bucket(req.levels().level(0).files());
-                info!(
-                    "Compaction window for region {} is not present, inferring from files: {:?}",
-                    req.region_id, inferred
-                );
-                inferred
-            });
-
-        // Find active window from files in level 0.
-        let active_window =
-            find_latest_window_in_seconds(levels.level(0).files(), time_window_size);
-
-        let windows = assign_to_windows(
-            levels.levels().iter().flat_map(LevelMeta::files),
-            time_window_size,
-        );
-
-        let outputs = self.build_output(&windows, active_window, time_window_size);
-
-        if outputs.is_empty() && expired_ssts.is_empty() {
-            return Ok(None);
-        }
-        let task = CompactionTaskImpl {
-            schema: req.schema(),
-            sst_layer: req.sst_layer.clone(),
-            outputs,
-            writer: req.writer.clone(),
-            shared_data: req.shared.clone(),
-            wal: req.wal.clone(),
-            manifest: req.manifest.clone(),
-            expired_ssts,
-            sst_write_buffer_size: req.sst_write_buffer_size,
-            compaction_time_window: Some(time_window_size),
-            reschedule_on_finish: req.reschedule_on_finish,
-        };
-        Ok(Some(task))
-    }
-}
-
-/// Assigns files to windows with predefined window size (in seconds) by their max timestamps.
-fn assign_to_windows<'a>(
-    files: impl Iterator<Item = &'a FileHandle>,
-    time_window_size: i64,
-) -> BTreeMap<i64, Vec<FileHandle>> {
-    let mut windows: BTreeMap<i64, Vec<FileHandle>> = BTreeMap::new();
-    // Iterates all files and assign to time windows according to max timestamp
-    for file in files {
-        if let Some((_, end)) = file.time_range() {
-            let time_window = end
-                .convert_to(TimeUnit::Second)
-                .unwrap()
-                .value()
-                .align_to_ceil_by_bucket(time_window_size)
-                .unwrap_or(i64::MIN);
-            windows.entry(time_window).or_default().push(file.clone());
-        } else {
-            warn!("Unexpected file w/o timestamp: {:?}", file.file_id());
-        }
-    }
-    windows
-}
-
-/// Finds the latest active writing window among all files.
-/// Returns `None` when there are no files or all files are corrupted.
-fn find_latest_window_in_seconds<'a>(
-    files: impl Iterator<Item = &'a FileHandle>,
-    time_window_size: i64,
-) -> Option<i64> {
-    let mut latest_timestamp = None;
-    for f in files {
-        if let Some((_, end)) = f.time_range() {
-            if let Some(latest) = latest_timestamp && end > latest {
-                latest_timestamp = Some(end);
-            } else {
-                latest_timestamp = Some(end);
-            }
-        } else {
-            warn!("Cannot find timestamp range of file: {}", f.file_id());
-        }
-    }
-    latest_timestamp
-        .and_then(|ts| ts.convert_to_ceil(TimeUnit::Second))
-        .and_then(|ts| ts.value().align_to_ceil_by_bucket(time_window_size))
-}
-
-#[cfg(test)]
-mod tests {
-    use std::collections::HashSet;
-
-    use log_store::NoopLogStore;
-
-    use super::*;
-    use crate::compaction::tests::new_file_handle;
-    use crate::sst::{FileId, Level};
-
-    #[test]
-    fn test_get_latest_window_in_seconds() {
-        assert_eq!(
-            Some(1),
-            find_latest_window_in_seconds([new_file_handle(FileId::random(), 0, 999, 0)].iter(), 1)
-        );
-        assert_eq!(
-            Some(1),
-            find_latest_window_in_seconds(
-                [new_file_handle(FileId::random(), 0, 1000, 0)].iter(),
-                1
-            )
-        );
-
-        assert_eq!(
-            Some(-9223372036854000),
-            find_latest_window_in_seconds(
-                [new_file_handle(FileId::random(), i64::MIN, i64::MIN + 1, 0)].iter(),
-                3600,
-            )
-        );
-
-        assert_eq!(
-            (i64::MAX / 10000000 + 1) * 10000,
-            find_latest_window_in_seconds(
-                [new_file_handle(FileId::random(), i64::MIN, i64::MAX, 0)].iter(),
-                10000,
-            )
-            .unwrap()
-        );
-    }
-
-    #[test]
-    fn test_assign_to_windows() {
-        let windows = assign_to_windows(
-            [
-                new_file_handle(FileId::random(), 0, 999, 0),
-                new_file_handle(FileId::random(), 0, 999, 0),
-                new_file_handle(FileId::random(), 0, 999, 0),
-                new_file_handle(FileId::random(), 0, 999, 0),
-                new_file_handle(FileId::random(), 0, 999, 0),
-            ]
-            .iter(),
-            3,
-        );
-        assert_eq!(5, windows.get(&0).unwrap().len());
-
-        let files = [FileId::random(); 3];
-        let windows = assign_to_windows(
-            [
-                new_file_handle(files[0], -2000, -3, 0),
-                new_file_handle(files[1], 0, 2999, 0),
-                new_file_handle(files[2], 50, 10001, 0),
-            ]
-            .iter(),
-            3,
-        );
-        assert_eq!(files[0], windows.get(&0).unwrap().get(0).unwrap().file_id());
-        assert_eq!(files[1], windows.get(&3).unwrap().get(0).unwrap().file_id());
-        assert_eq!(
-            files[2],
-            windows.get(&12).unwrap().get(0).unwrap().file_id()
-        );
-    }
-
-    struct CompactionPickerTestCase {
-        window_size: i64,
-        input_files: Vec<FileHandle>,
-        expected_outputs: Vec<ExpectedOutput>,
-    }
-
-    impl CompactionPickerTestCase {
-        fn check(&self) {
-            let windows = assign_to_windows(self.input_files.iter(), self.window_size);
-            let active_window =
-                find_latest_window_in_seconds(self.input_files.iter(), self.window_size);
-            let output = TwcsPicker::<NoopLogStore>::new(4, 1, None).build_output(
-                &windows,
-                active_window,
-                self.window_size,
-            );
-
-            let output = output
-                .iter()
-                .map(|o| {
-                    let input_file_ids =
-                        o.inputs.iter().map(|f| f.file_id()).collect::<HashSet<_>>();
-                    (
-                        input_file_ids,
-                        o.output_level,
-                        o.time_window_sec,
-                        o.time_window_bound,
-                        o.strict_window,
-                    )
-                })
-                .collect::<Vec<_>>();
-
-            let expected = self
-                .expected_outputs
-                .iter()
-                .map(|o| {
-                    let input_file_ids = o
-                        .input_files
-                        .iter()
-                        .map(|idx| self.input_files[*idx].file_id())
-                        .collect::<HashSet<_>>();
-                    (
-                        input_file_ids,
-                        o.output_level,
-                        o.time_window_sec,
-                        o.time_window_bound,
-                        o.strict_window,
-                    )
-                })
-                .collect::<Vec<_>>();
-            assert_eq!(expected, output);
-        }
-    }
-
-    struct ExpectedOutput {
-        input_files: Vec<usize>,
-        output_level: Level,
-        time_window_sec: i64,
-        time_window_bound: i64,
-        strict_window: bool,
-    }
-
-    #[test]
-    fn test_build_twcs_output() {
-        let file_ids = (0..4).map(|_| FileId::random()).collect::<Vec<_>>();
-
-        CompactionPickerTestCase {
-            window_size: 3,
-            input_files: [
-                new_file_handle(file_ids[0], -2000, -3, 0),
-                new_file_handle(file_ids[1], -3000, -100, 0),
-                new_file_handle(file_ids[2], 0, 2999, 0), //active windows
-                new_file_handle(file_ids[3], 50, 2998, 0), //active windows
-            ]
-            .to_vec(),
-            expected_outputs: vec![ExpectedOutput {
-                input_files: vec![0, 1],
-                output_level: 1,
-                time_window_sec: 3,
-                time_window_bound: 0,
-                strict_window: false,
-            }],
-        }
-        .check();
-
-        let file_ids = (0..6).map(|_| FileId::random()).collect::<Vec<_>>();
-        CompactionPickerTestCase {
-            window_size: 3,
-            input_files: [
-                new_file_handle(file_ids[0], -2000, -3, 0),
-                new_file_handle(file_ids[1], -3000, -100, 0),
-                new_file_handle(file_ids[2], 0, 2999, 0),
-                new_file_handle(file_ids[3], 50, 2998, 0),
-                new_file_handle(file_ids[4], 11, 2990, 0),
-                new_file_handle(file_ids[5], 50, 4998, 0),
-            ]
-            .to_vec(),
-            expected_outputs: vec![
-                ExpectedOutput {
-                    input_files: vec![0, 1],
-                    output_level: 1,
-                    time_window_sec: 3,
-                    time_window_bound: 0,
-                    strict_window: false,
-                },
-                ExpectedOutput {
-                    input_files: vec![2, 3, 4],
-                    output_level: 1,
-                    time_window_sec: 3,
-                    time_window_bound: 3,
-                    strict_window: false,
-                },
-            ],
-        }
-        .check();
-    }
-}
--- a/src/storage/src/compaction/writer.rs
+++ b/src/storage/src/compaction/writer.rs
@@ -1,588 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-use common_query::logical_plan::{DfExpr, Expr};
-use common_time::timestamp::TimeUnit;
-use datafusion_expr::Operator;
-use datatypes::value::timestamp_to_scalar_value;
-use store_api::storage::RegionId;
-
-use crate::chunk::{ChunkReaderBuilder, ChunkReaderImpl};
-use crate::error;
-use crate::schema::RegionSchemaRef;
-use crate::sst::{AccessLayerRef, FileHandle};
-
-/// Builds an SST reader that only reads rows within given time range.
-pub(crate) async fn build_sst_reader(
-    region_id: RegionId,
-    schema: RegionSchemaRef,
-    sst_layer: AccessLayerRef,
-    files: &[FileHandle],
-    time_range: (Option<i64>, Option<i64>),
-) -> error::Result<ChunkReaderImpl> {
-    // TODO(hl): Schemas in different SSTs may differ, thus we should infer
-    // timestamp column name from Parquet metadata.
-
-    // safety: Region schema's timestamp column must present
-    let ts_col = schema.user_schema().timestamp_column().unwrap();
-    let ts_col_unit = ts_col.data_type.as_timestamp().unwrap().unit();
-    let ts_col_name = ts_col.name.clone();
-
-    ChunkReaderBuilder::new(region_id, schema, sst_layer)
-        .pick_ssts(files)
-        .filters(
-            build_time_range_filter(time_range, &ts_col_name, ts_col_unit)
-                .into_iter()
-                .collect(),
-        )
-        .build()
-        .await
-}
-
-/// Build time range filter expr from lower (inclusive) and upper bound(exclusive).
-/// Returns `None` if time range overflows.
-fn build_time_range_filter(
-    time_range: (Option<i64>, Option<i64>),
-    ts_col_name: &str,
-    ts_col_unit: TimeUnit,
-) -> Option<Expr> {
-    let (low_ts_inclusive, high_ts_exclusive) = time_range;
-    let ts_col = DfExpr::Column(datafusion_common::Column::from_name(ts_col_name));
-
-    // Converting seconds to whatever unit won't lose precision.
-    // Here only handles overflow.
-    let low_ts = low_ts_inclusive
-        .map(common_time::Timestamp::new_second)
-        .and_then(|ts| ts.convert_to(ts_col_unit))
-        .map(|ts| ts.value());
-    let high_ts = high_ts_exclusive
-        .map(common_time::Timestamp::new_second)
-        .and_then(|ts| ts.convert_to(ts_col_unit))
-        .map(|ts| ts.value());
-
-    let expr = match (low_ts, high_ts) {
-        (Some(low), Some(high)) => {
-            let lower_bound_expr =
-                DfExpr::Literal(timestamp_to_scalar_value(ts_col_unit, Some(low)));
-            let upper_bound_expr =
-                DfExpr::Literal(timestamp_to_scalar_value(ts_col_unit, Some(high)));
-            Some(datafusion_expr::and(
-                datafusion_expr::binary_expr(ts_col.clone(), Operator::GtEq, lower_bound_expr),
-                datafusion_expr::binary_expr(ts_col, Operator::Lt, upper_bound_expr),
-            ))
-        }
-
-        (Some(low), None) => {
-            let lower_bound_expr =
-                datafusion_expr::lit(timestamp_to_scalar_value(ts_col_unit, Some(low)));
-            Some(datafusion_expr::binary_expr(
-                ts_col,
-                Operator::GtEq,
-                lower_bound_expr,
-            ))
-        }
-
-        (None, Some(high)) => {
-            let upper_bound_expr =
-                datafusion_expr::lit(timestamp_to_scalar_value(ts_col_unit, Some(high)));
-            Some(datafusion_expr::binary_expr(
-                ts_col,
-                Operator::Lt,
-                upper_bound_expr,
-            ))
-        }
-
-        (None, None) => None,
-    };
-
-    expr.map(Expr::from)
-}
-
-#[cfg(test)]
-mod tests {
-    use std::sync::atomic::{AtomicU64, Ordering};
-    use std::sync::Arc;
-
-    use api::v1::OpType;
-    use common_base::readable_size::ReadableSize;
-    use common_test_util::temp_dir::create_temp_dir;
-    use common_time::Timestamp;
-    use datatypes::prelude::{LogicalTypeId, ScalarVector, ScalarVectorBuilder};
-    use datatypes::timestamp::TimestampMillisecond;
-    use datatypes::vectors::{
-        TimestampMillisecondVector, TimestampMillisecondVectorBuilder, UInt64VectorBuilder,
-    };
-    use object_store::services::Fs;
-    use object_store::ObjectStore;
-    use store_api::storage::{ChunkReader, SequenceNumber};
-
-    use super::*;
-    use crate::file_purger::noop::new_noop_file_purger;
-    use crate::memtable::{
-        DefaultMemtableBuilder, IterContext, KeyValues, Memtable, MemtableBuilder,
-    };
-    use crate::metadata::RegionMetadata;
-    use crate::sst::parquet::ParquetWriter;
-    use crate::sst::{self, FileId, FileMeta, FsAccessLayer, Source, SstInfo, WriteOptions};
-    use crate::test_util::descriptor_util::RegionDescBuilder;
-
-    const REGION_ID: RegionId = RegionId::from_u64(1);
-
-    fn schema_for_test() -> RegionSchemaRef {
-        // Just build a region desc and use its columns metadata.
-        let desc = RegionDescBuilder::new("test")
-            .push_field_column(("v", LogicalTypeId::UInt64, true))
-            .build();
-        let metadata: RegionMetadata = desc.try_into().unwrap();
-        metadata.schema().clone()
-    }
-
-    pub fn write_kvs(
-        memtable: &dyn Memtable,
-        sequence: SequenceNumber,
-        op_type: OpType,
-        ts: &[i64], // timestamp
-        values: &[Option<u64>],
-    ) {
-        let keys: Vec<TimestampMillisecond> = ts.iter().map(|ts| (*ts).into()).collect();
-        let kvs = kvs_for_test(sequence, op_type, &keys, values);
-        memtable.write(&kvs).unwrap();
-    }
-
-    fn kvs_for_test(
-        sequence: SequenceNumber,
-        op_type: OpType,
-        ts: &[TimestampMillisecond],
-        values: &[Option<u64>],
-    ) -> KeyValues {
-        let start_index_in_batch = 0;
-        assert_eq!(ts.len(), values.len());
-        let mut key_builders = TimestampMillisecondVectorBuilder::with_capacity(ts.len());
-        for key in ts {
-            key_builders.push(Some(*key));
-        }
-        let ts_col = Arc::new(key_builders.finish()) as _;
-        let mut value_builders = UInt64VectorBuilder::with_capacity(values.len());
-
-        for value in values {
-            value_builders.push(*value);
-        }
-        let row_values = vec![Arc::new(value_builders.finish()) as _];
-
-        let kvs = KeyValues {
-            sequence,
-            op_type,
-            start_index_in_batch,
-            keys: vec![],
-            values: row_values,
-            timestamp: Some(ts_col),
-        };
-
-        assert_eq!(ts.len(), kvs.len());
-        assert_eq!(ts.is_empty(), kvs.is_empty());
-
-        kvs
-    }
-
-    async fn write_sst(
-        sst_file_id: FileId,
-        schema: RegionSchemaRef,
-        seq: &AtomicU64,
-        object_store: ObjectStore,
-        ts: &[i64],
-        ops: &[OpType],
-    ) -> FileHandle {
-        let memtable = DefaultMemtableBuilder::default().build(schema.clone());
-        let mut breaks = ops
-            .iter()
-            .zip(ops.iter().skip(1))
-            .enumerate()
-            .filter_map(
-                |(idx, (prev, next))| {
-                    if prev != next {
-                        Some(idx + 1)
-                    } else {
-                        None
-                    }
-                },
-            )
-            .collect::<Vec<_>>();
-
-        breaks.insert(0, 0);
-        breaks.push(ts.len());
-
-        for i in 0..breaks.len() - 1 {
-            let op = ops[i];
-            let seg_len = breaks[i + 1] - breaks[i];
-            let ts_seg = ts
-                .iter()
-                .skip(breaks[i])
-                .take(seg_len)
-                .copied()
-                .collect::<Vec<_>>();
-            let value_seg = ts
-                .iter()
-                .skip(breaks[i])
-                .take(seg_len)
-                .map(|i| (*i) as u64)
-                .map(Some)
-                .collect::<Vec<_>>();
-
-            write_kvs(
-                &*memtable,
-                seq.load(Ordering::Relaxed), // sequence
-                op,
-                &ts_seg,    // keys
-                &value_seg, // values
-            );
-            let _ = seq.fetch_add(1, Ordering::Relaxed);
-        }
-
-        let iter = memtable.iter(IterContext::default()).unwrap();
-        let file_path = sst_file_id.as_parquet();
-        let writer = ParquetWriter::new(&file_path, Source::Iter(iter), object_store.clone());
-
-        let SstInfo {
-            time_range,
-            file_size,
-            ..
-        } = writer
-            .write_sst(&sst::WriteOptions::default())
-            .await
-            .unwrap()
-            .unwrap();
-        let handle = FileHandle::new(
-            FileMeta {
-                region_id: 0.into(),
-                file_id: sst_file_id,
-                time_range,
-                level: 0,
-                file_size,
-            },
-            Arc::new(crate::test_util::access_layer_util::MockAccessLayer {}),
-            new_noop_file_purger(),
-        );
-        let _ = seq.fetch_add(1, Ordering::Relaxed);
-        handle
-    }
-
-    // The region id is only used to build the reader, we don't check its content.
-    async fn check_reads(
-        region_id: RegionId,
-        schema: RegionSchemaRef,
-        sst_layer: AccessLayerRef,
-        files: &[FileHandle],
-        lower_sec_inclusive: i64,
-        upper_sec_exclusive: i64,
-        expect: &[i64],
-    ) {
-        let mut reader = build_sst_reader(
-            region_id,
-            schema,
-            sst_layer,
-            files,
-            (Some(lower_sec_inclusive), Some(upper_sec_exclusive)),
-        )
-        .await
-        .unwrap();
-
-        let mut res = vec![];
-        while let Some(f) = reader.next_chunk().await.unwrap() {
-            let ts_col = f.columns[0]
-                .as_any()
-                .downcast_ref::<TimestampMillisecondVector>()
-                .unwrap();
-            res.extend(ts_col.iter_data().map(|t| t.unwrap().0.value()));
-        }
-        assert_eq!(expect, &res);
-    }
-
-    #[tokio::test]
-    async fn test_sst_reader() {
-        let dir = create_temp_dir("write_parquet");
-        let path = dir.path().to_str().unwrap();
-        let mut builder = Fs::default();
-        let _ = builder.root(path);
-
-        let object_store = ObjectStore::new(builder).unwrap().finish();
-
-        let seq = AtomicU64::new(0);
-        let schema = schema_for_test();
-        let file1 = write_sst(
-            FileId::random(),
-            schema.clone(),
-            &seq,
-            object_store.clone(),
-            &[1000, 2000, 3000, 4001, 5001],
-            &[
-                OpType::Put,
-                OpType::Put,
-                OpType::Put,
-                OpType::Put,
-                OpType::Put,
-            ],
-        )
-        .await;
-        let file2 = write_sst(
-            FileId::random(),
-            schema.clone(),
-            &seq,
-            object_store.clone(),
-            &[4002, 5002, 6000, 7000, 8000],
-            &[
-                OpType::Put,
-                OpType::Put,
-                OpType::Put,
-                OpType::Put,
-                OpType::Put,
-            ],
-        )
-        .await;
-        let sst_layer = Arc::new(FsAccessLayer::new("./", object_store));
-
-        let files = vec![file1, file2];
-        // read from two sst files with time range filter,
-        check_reads(
-            REGION_ID,
-            schema.clone(),
-            sst_layer.clone(),
-            &files,
-            3,
-            6,
-            &[3000, 4001, 4002, 5001, 5002],
-        )
-        .await;
-
-        check_reads(REGION_ID, schema, sst_layer, &files, 1, 2, &[1000]).await;
-    }
-
-    async fn read_file(
-        files: &[FileHandle],
-        schema: RegionSchemaRef,
-        sst_layer: AccessLayerRef,
-    ) -> Vec<i64> {
-        let mut timestamps = vec![];
-        let mut reader = build_sst_reader(
-            REGION_ID,
-            schema,
-            sst_layer,
-            files,
-            (Some(i64::MIN), Some(i64::MAX)),
-        )
-        .await
-        .unwrap();
-        while let Some(chunk) = reader.next_chunk().await.unwrap() {
-            let ts = chunk.columns[0]
-                .as_any()
-                .downcast_ref::<TimestampMillisecondVector>()
-                .unwrap();
-            timestamps.extend(ts.iter_data().map(|t| t.unwrap().0.value()));
-        }
-        timestamps
-    }
-
-    /// Writes rows into file i1/i2 and splits these rows into sst file o1/o2/o3,
-    /// and check the output contains the same data as input files.
-    #[tokio::test]
-    async fn test_sst_split() {
-        let dir = create_temp_dir("write_parquet");
-        let path = dir.path().to_str().unwrap();
-        let mut builder = Fs::default();
-        let _ = builder.root(path);
-        let object_store = ObjectStore::new(builder).unwrap().finish();
-
-        let schema = schema_for_test();
-        let seq = AtomicU64::new(0);
-
-        let input_file_ids = [FileId::random(), FileId::random()];
-        let output_file_ids = [FileId::random(), FileId::random(), FileId::random()];
-
-        let file1 = write_sst(
-            input_file_ids[0],
-            schema.clone(),
-            &seq,
-            object_store.clone(),
-            &[1000, 2000, 3000, 4001, 5001],
-            &[
-                OpType::Put,
-                OpType::Put,
-                OpType::Put,
-                OpType::Put,
-                OpType::Put,
-            ],
-        )
-        .await;
-
-        // in file2 we delete the row with timestamp 1000.
-        let file2 = write_sst(
-            input_file_ids[1],
-            schema.clone(),
-            &seq,
-            object_store.clone(),
-            &[1000, 5002, 6000, 7000, 8000],
-            &[
-                OpType::Delete, // a deletion
-                OpType::Put,
-                OpType::Put,
-                OpType::Put,
-                OpType::Put,
-            ],
-        )
-        .await;
-        let sst_layer = Arc::new(FsAccessLayer::new("./", object_store.clone()));
-        let input_files = vec![file2, file1];
-
-        let reader1 = build_sst_reader(
-            REGION_ID,
-            schema.clone(),
-            sst_layer.clone(),
-            &input_files,
-            (Some(0), Some(3)),
-        )
-        .await
-        .unwrap();
-        let reader2 = build_sst_reader(
-            REGION_ID,
-            schema.clone(),
-            sst_layer.clone(),
-            &input_files,
-            (Some(3), Some(6)),
-        )
-        .await
-        .unwrap();
-        let reader3 = build_sst_reader(
-            REGION_ID,
-            schema.clone(),
-            sst_layer.clone(),
-            &input_files,
-            (Some(6), Some(10)),
-        )
-        .await
-        .unwrap();
-
-        let opts = WriteOptions {
-            sst_write_buffer_size: ReadableSize::mb(8),
-        };
-        let s1 = ParquetWriter::new(
-            &output_file_ids[0].as_parquet(),
-            Source::Reader(reader1),
-            object_store.clone(),
-        )
-        .write_sst(&opts)
-        .await
-        .unwrap()
-        .unwrap();
-        assert_eq!(
-            Some((
-                Timestamp::new_millisecond(2000),
-                Timestamp::new_millisecond(2000)
-            )),
-            s1.time_range,
-        );
-
-        let s2 = ParquetWriter::new(
-            &output_file_ids[1].as_parquet(),
-            Source::Reader(reader2),
-            object_store.clone(),
-        )
-        .write_sst(&opts)
-        .await
-        .unwrap()
-        .unwrap();
-        assert_eq!(
-            Some((
-                Timestamp::new_millisecond(3000),
-                Timestamp::new_millisecond(5002)
-            )),
-            s2.time_range,
-        );
-
-        let s3 = ParquetWriter::new(
-            &output_file_ids[2].as_parquet(),
-            Source::Reader(reader3),
-            object_store.clone(),
-        )
-        .write_sst(&opts)
-        .await
-        .unwrap()
-        .unwrap();
-
-        assert_eq!(
-            Some((
-                Timestamp::new_millisecond(6000),
-                Timestamp::new_millisecond(8000)
-            )),
-            s3.time_range
-        );
-
-        let output_files = output_file_ids
-            .into_iter()
-            .map(|f| {
-                FileHandle::new(
-                    FileMeta {
-                        region_id: 0.into(),
-                        file_id: f,
-                        level: 1,
-                        time_range: None,
-                        file_size: 0,
-                    },
-                    Arc::new(crate::test_util::access_layer_util::MockAccessLayer {}),
-                    new_noop_file_purger(),
-                )
-            })
-            .collect::<Vec<_>>();
-
-        let timestamps_in_inputs = read_file(&input_files, schema.clone(), sst_layer.clone()).await;
-        let timestamps_in_outputs =
-            read_file(&output_files, schema.clone(), sst_layer.clone()).await;
-
-        assert_eq!(timestamps_in_outputs, timestamps_in_inputs);
-    }
-
-    #[test]
-    fn test_build_time_range_filter() {
-        assert!(build_time_range_filter(
-            (Some(i64::MIN), Some(i64::MAX)),
-            "ts",
-            TimeUnit::Nanosecond
-        )
-        .is_none());
-
-        assert_eq!(
-            Expr::from(datafusion_expr::binary_expr(
-                datafusion_expr::col("ts"),
-                Operator::Lt,
-                datafusion_expr::lit(timestamp_to_scalar_value(
-                    TimeUnit::Nanosecond,
-                    Some(TimeUnit::Second.factor() as i64 / TimeUnit::Nanosecond.factor() as i64),
-                )),
-            )),
-            build_time_range_filter((Some(i64::MIN), Some(1)), "ts", TimeUnit::Nanosecond).unwrap()
-        );
-
-        assert_eq!(
-            Expr::from(datafusion_expr::binary_expr(
-                datafusion_expr::col("ts"),
-                Operator::GtEq,
-                datafusion_expr::lit(timestamp_to_scalar_value(
-                    TimeUnit::Nanosecond,
-                    Some(
-                        2 * TimeUnit::Second.factor() as i64 / TimeUnit::Nanosecond.factor() as i64
-                    ),
-                )),
-            )),
-            build_time_range_filter((Some(2), Some(i64::MAX)), "ts", TimeUnit::Nanosecond).unwrap()
-        );
-    }
-}
--- a/src/storage/src/config.rs
+++ b/src/storage/src/config.rs
@@ -1,71 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-//! storage engine config
-
-use std::time::Duration;
-
-use common_base::readable_size::ReadableSize;
-
-/// Default max flush tasks.
-pub const DEFAULT_MAX_FLUSH_TASKS: usize = 8;
-/// Default region write buffer size.
-pub const DEFAULT_REGION_WRITE_BUFFER_SIZE: ReadableSize = ReadableSize::mb(32);
-/// Default interval to trigger auto flush in millis.
-pub const DEFAULT_AUTO_FLUSH_INTERVAL: u32 = 60 * 60 * 1000;
-/// Default interval to schedule the picker to flush automatically in millis.
-pub const DEFAULT_PICKER_SCHEDULE_INTERVAL: u32 = 5 * 60 * 1000;
-
-#[derive(Debug, Clone)]
-pub struct EngineConfig {
-    pub compress_manifest: bool,
-    pub manifest_checkpoint_margin: Option<u16>,
-    pub manifest_gc_duration: Option<Duration>,
-    pub max_files_in_l0: usize,
-    pub max_purge_tasks: usize,
-    /// Max inflight flush tasks.
-    pub max_flush_tasks: usize,
-    /// Default write buffer size for a region.
-    pub region_write_buffer_size: ReadableSize,
-    /// Interval to schedule the auto flush picker.
-    pub picker_schedule_interval: Duration,
-    /// Interval to auto flush a region if it has not flushed yet.
-    pub auto_flush_interval: Duration,
-    /// Limit for global write buffer size. Disabled by default.
-    pub global_write_buffer_size: Option<ReadableSize>,
-    /// Global retention period for all regions.
-    ///
-    /// The precedence order is: region ttl > global ttl.
-    pub global_ttl: Option<Duration>,
-}
-
-impl Default for EngineConfig {
-    fn default() -> Self {
-        Self {
-            compress_manifest: false,
-            manifest_checkpoint_margin: Some(10),
-            manifest_gc_duration: Some(Duration::from_secs(30)),
-            max_files_in_l0: 8,
-            max_purge_tasks: 32,
-            max_flush_tasks: DEFAULT_MAX_FLUSH_TASKS,
-            region_write_buffer_size: DEFAULT_REGION_WRITE_BUFFER_SIZE,
-            picker_schedule_interval: Duration::from_millis(
-                DEFAULT_PICKER_SCHEDULE_INTERVAL.into(),
-            ),
-            auto_flush_interval: Duration::from_millis(DEFAULT_AUTO_FLUSH_INTERVAL.into()),
-            global_write_buffer_size: None,
-            global_ttl: None,
-        }
-    }
-}
--- a/src/storage/src/engine.rs
+++ b/src/storage/src/engine.rs
@@ -1,750 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-use std::collections::HashMap;
-use std::sync::{Arc, RwLock};
-use std::time::Duration;
-
-use async_trait::async_trait;
-use common_telemetry::logging::{self, debug};
-use object_store::{util, ObjectStore};
-use snafu::ResultExt;
-use store_api::logstore::LogStore;
-use store_api::manifest::Manifest;
-use store_api::storage::{
-    CloseContext, CloseOptions, CompactionStrategy, CreateOptions, EngineContext, OpenOptions,
-    Region, RegionDescriptor, StorageEngine,
-};
-
-use crate::compaction::CompactionSchedulerRef;
-use crate::config::EngineConfig;
-use crate::error::{self, Error, Result};
-use crate::file_purger::{FilePurgeHandler, FilePurgerRef};
-use crate::flush::{
-    FlushScheduler, FlushSchedulerRef, FlushStrategyRef, PickerConfig, SizeBasedStrategy,
-};
-use crate::manifest::region::RegionManifest;
-use crate::manifest::storage::manifest_compress_type;
-use crate::memtable::{DefaultMemtableBuilder, MemtableBuilderRef};
-use crate::metadata::RegionMetadata;
-use crate::region::{RegionImpl, StoreConfig};
-use crate::scheduler::{LocalScheduler, Scheduler, SchedulerConfig};
-use crate::sst::FsAccessLayer;
-
-/// [StorageEngine] implementation.
-pub struct EngineImpl<S: LogStore> {
-    inner: Arc<EngineInner<S>>,
-}
-
-impl<S: LogStore> Clone for EngineImpl<S> {
-    fn clone(&self) -> Self {
-        Self {
-            inner: self.inner.clone(),
-        }
-    }
-}
-
-#[async_trait]
-impl<S: LogStore> StorageEngine for EngineImpl<S> {
-    type Error = Error;
-    type Region = RegionImpl<S>;
-
-    async fn open_region(
-        &self,
-        _ctx: &EngineContext,
-        name: &str,
-        opts: &OpenOptions,
-    ) -> Result<Option<Self::Region>> {
-        self.inner.open_region(name, opts).await
-    }
-
-    async fn close_region(
-        &self,
-        _ctx: &EngineContext,
-        name: &str,
-        opts: &CloseOptions,
-    ) -> Result<()> {
-        self.inner.close_region(name, opts).await
-    }
-
-    async fn create_region(
-        &self,
-        _ctx: &EngineContext,
-        descriptor: RegionDescriptor,
-        opts: &CreateOptions,
-    ) -> Result<Self::Region> {
-        self.inner.create_region(descriptor, opts).await
-    }
-
-    async fn drop_region(&self, _ctx: &EngineContext, region: Self::Region) -> Result<()> {
-        region.drop_region().await?;
-        self.inner.remove_region(region.name());
-        Ok(())
-    }
-
-    fn get_region(&self, _ctx: &EngineContext, name: &str) -> Result<Option<Self::Region>> {
-        Ok(self.inner.get_region(name))
-    }
-
-    async fn close(&self, _ctx: &EngineContext) -> Result<()> {
-        logging::info!("Stopping storage engine");
-
-        self.inner.close().await?;
-
-        logging::info!("Storage engine stopped");
-
-        Ok(())
-    }
-}
-
-impl<S: LogStore> EngineImpl<S> {
-    pub fn new(
-        config: EngineConfig,
-        log_store: Arc<S>,
-        object_store: ObjectStore,
-        compaction_scheduler: CompactionSchedulerRef<S>,
-    ) -> Result<Self> {
-        Ok(Self {
-            inner: Arc::new(EngineInner::new(
-                config,
-                log_store,
-                object_store,
-                compaction_scheduler,
-            )?),
-        })
-    }
-}
-
-/// Generate region sst path,
-/// parent_dir is resolved in function `region_store_config` to ensure it's ended with '/'.
-#[inline]
-pub fn region_sst_dir(parent_dir: &str, region_name: &str) -> String {
-    format!("{parent_dir}{region_name}/")
-}
-
-/// Generate region manifest path,
-/// parent_dir is resolved in function `region_store_config` to ensure it's ended with '/'.
-#[inline]
-pub fn region_manifest_dir(parent_dir: &str, region_name: &str) -> String {
-    format!("{parent_dir}{region_name}/manifest/")
-}
-
-/// A slot for region in the engine.
-///
-/// Also used as a placeholder in the region map when the region isn't ready, e.g. during
-/// creating/opening.
-#[derive(Debug)]
-pub(crate) enum RegionSlot<S: LogStore> {
-    /// The region is during creation.
-    Creating,
-    /// The region is during opening.
-    Opening,
-    /// The region is ready for access.
-    Ready(RegionImpl<S>),
-}
-
-impl<S: LogStore> RegionSlot<S> {
-    /// Try to get a ready region.
-    fn try_get_ready_region(&self) -> Result<RegionImpl<S>> {
-        if let RegionSlot::Ready(region) = self {
-            Ok(region.clone())
-        } else {
-            error::InvalidRegionStateSnafu {
-                state: self.state_name(),
-            }
-            .fail()
-        }
-    }
-
-    /// Returns the ready region or `None`.
-    fn get_ready_region(&self) -> Option<RegionImpl<S>> {
-        if let RegionSlot::Ready(region) = self {
-            Some(region.clone())
-        } else {
-            None
-        }
-    }
-
-    fn state_name(&self) -> &'static str {
-        match self {
-            RegionSlot::Creating => "creating",
-            RegionSlot::Opening => "opening",
-            RegionSlot::Ready(_) => "ready",
-        }
-    }
-}
-
-impl<S: LogStore> Clone for RegionSlot<S> {
-    // Manually implement Clone due to [rust#26925](https://github.com/rust-lang/rust/issues/26925).
-    // Maybe we should require `LogStore` to be clonable to work around this.
-    fn clone(&self) -> RegionSlot<S> {
-        match self {
-            RegionSlot::Creating => RegionSlot::Creating,
-            RegionSlot::Opening => RegionSlot::Opening,
-            RegionSlot::Ready(region) => RegionSlot::Ready(region.clone()),
-        }
-    }
-}
-
-/// Used to update slot or clean the slot on failure.
-struct SlotGuard<'a, S: LogStore> {
-    name: &'a str,
-    regions: &'a RegionMap<S>,
-    skip_clean: bool,
-}
-
-impl<'a, S: LogStore> SlotGuard<'a, S> {
-    fn new(name: &'a str, regions: &'a RegionMap<S>) -> SlotGuard<'a, S> {
-        SlotGuard {
-            name,
-            regions,
-            skip_clean: false,
-        }
-    }
-
-    /// Update the slot and skip cleaning on drop.
-    fn update(&mut self, slot: RegionSlot<S>) {
-        self.regions.update(self.name, slot);
-        self.skip_clean = true;
-    }
-}
-
-impl<'a, S: LogStore> Drop for SlotGuard<'a, S> {
-    fn drop(&mut self) {
-        if !self.skip_clean {
-            self.regions.remove(self.name)
-        }
-    }
-}
-
-/// Region slot map.
-pub struct RegionMap<S: LogStore>(RwLock<HashMap<String, RegionSlot<S>>>);
-
-impl<S: LogStore> RegionMap<S> {
-    /// Returns a new region map.
-    pub fn new() -> RegionMap<S> {
-        RegionMap(RwLock::new(HashMap::new()))
-    }
-
-    /// Returns the `Some(slot)` if there is existing slot with given `name`, or insert
-    /// given `slot` and returns `None`.
-    pub(crate) fn get_or_occupy_slot(
-        &self,
-        name: &str,
-        slot: RegionSlot<S>,
-    ) -> Option<RegionSlot<S>> {
-        {
-            // Try to get the region under read lock.
-            let regions = self.0.read().unwrap();
-            if let Some(slot) = regions.get(name) {
-                return Some(slot.clone());
-            }
-        }
-
-        // Get the region under write lock.
-        let mut regions = self.0.write().unwrap();
-        if let Some(slot) = regions.get(name) {
-            return Some(slot.clone());
-        }
-
-        // No slot in map, we can insert the slot now.
-        let _ = regions.insert(name.to_string(), slot);
-
-        None
-    }
-
-    /// Gets the region by the specific name.
-    fn get_region(&self, name: &str) -> Option<RegionImpl<S>> {
-        let slot = self.0.read().unwrap().get(name).cloned()?;
-        slot.get_ready_region()
-    }
-
-    /// Update the slot by name.
-    fn update(&self, name: &str, slot: RegionSlot<S>) {
-        let mut regions = self.0.write().unwrap();
-        if let Some(old) = regions.get_mut(name) {
-            *old = slot;
-        }
-    }
-
-    /// Remove region by name.
-    fn remove(&self, name: &str) {
-        let mut regions = self.0.write().unwrap();
-        let _ = regions.remove(name);
-    }
-
-    /// Collects regions.
-    pub(crate) fn list_regions(&self) -> Vec<RegionImpl<S>> {
-        let regions = self.0.read().unwrap();
-        regions
-            .values()
-            .filter_map(|slot| slot.get_ready_region())
-            .collect()
-    }
-
-    /// Clear the region map.
-    pub(crate) fn clear(&self) {
-        self.0.write().unwrap().clear();
-    }
-}
-
-impl<S: LogStore> Default for RegionMap<S> {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
-struct EngineInner<S: LogStore> {
-    object_store: ObjectStore,
-    log_store: Arc<S>,
-    regions: Arc<RegionMap<S>>,
-    memtable_builder: MemtableBuilderRef,
-    flush_scheduler: FlushSchedulerRef<S>,
-    flush_strategy: FlushStrategyRef,
-    compaction_scheduler: CompactionSchedulerRef<S>,
-    file_purger: FilePurgerRef,
-    config: Arc<EngineConfig>,
-}
-
-impl<S: LogStore> EngineInner<S> {
-    pub fn new(
-        config: EngineConfig,
-        log_store: Arc<S>,
-        object_store: ObjectStore,
-        compaction_scheduler: CompactionSchedulerRef<S>,
-    ) -> Result<Self> {
-        let regions = Arc::new(RegionMap::new());
-        let flush_scheduler = Arc::new(FlushScheduler::new(
-            SchedulerConfig {
-                max_inflight_tasks: config.max_flush_tasks,
-            },
-            compaction_scheduler.clone(),
-            regions.clone(),
-            PickerConfig {
-                schedule_interval: config.picker_schedule_interval,
-                auto_flush_interval: config.auto_flush_interval,
-            },
-        )?);
-
-        let file_purger = Arc::new(LocalScheduler::new(
-            SchedulerConfig {
-                max_inflight_tasks: config.max_purge_tasks,
-            },
-            FilePurgeHandler,
-        ));
-        let flush_strategy = Arc::new(SizeBasedStrategy::new(
-            config
-                .global_write_buffer_size
-                .map(|size| size.as_bytes() as usize),
-        ));
-        let memtable_builder = if config.global_write_buffer_size.is_some() {
-            // If global write buffer size is provided, we set the flush strategy
-            // to the memtable to track global memtable usage.
-            DefaultMemtableBuilder::with_flush_strategy(Some(flush_strategy.clone()))
-        } else {
-            DefaultMemtableBuilder::default()
-        };
-        Ok(Self {
-            object_store,
-            log_store,
-            regions,
-            memtable_builder: Arc::new(memtable_builder),
-            flush_scheduler,
-            flush_strategy,
-            compaction_scheduler,
-            file_purger,
-            config: Arc::new(config),
-        })
-    }
-
-    async fn close_region(&self, name: &str, opts: &CloseOptions) -> Result<()> {
-        if let Some(region) = self.get_region(name) {
-            let ctx = CloseContext { flush: opts.flush };
-            region.close(&ctx).await?;
-        }
-
-        self.regions.remove(name);
-
-        Ok(())
-    }
-
-    async fn open_region(&self, name: &str, opts: &OpenOptions) -> Result<Option<RegionImpl<S>>> {
-        // We can wait until the state of the slot has been changed to ready, but this will
-        // make the code more complicate, so we just return the error here.
-        if let Some(slot) = self.regions.get_or_occupy_slot(name, RegionSlot::Opening) {
-            return slot.try_get_ready_region().map(Some);
-        }
-
-        let mut guard = SlotGuard::new(name, &self.regions);
-
-        let store_config = self
-            .region_store_config(
-                &opts.parent_dir,
-                opts.write_buffer_size,
-                name,
-                &self.config,
-                opts.ttl,
-                opts.compaction_strategy.clone(),
-            )
-            .await?;
-
-        let region = match RegionImpl::open(name.to_string(), store_config, opts).await? {
-            None => return Ok(None),
-            Some(v) => v,
-        };
-        guard.update(RegionSlot::Ready(region.clone()));
-        debug!(
-            "Storage engine open region {}, id: {}",
-            region.name(),
-            region.id()
-        );
-        Ok(Some(region))
-    }
-
-    async fn create_region(
-        &self,
-        descriptor: RegionDescriptor,
-        opts: &CreateOptions,
-    ) -> Result<RegionImpl<S>> {
-        if let Some(slot) = self
-            .regions
-            .get_or_occupy_slot(&descriptor.name, RegionSlot::Creating)
-        {
-            return slot.try_get_ready_region();
-        }
-
-        // Now the region in under `Creating` state.
-        let region_name = descriptor.name.clone();
-        let mut guard = SlotGuard::new(&region_name, &self.regions);
-
-        let metadata: RegionMetadata =
-            descriptor
-                .try_into()
-                .context(error::InvalidRegionDescSnafu {
-                    region: &region_name,
-                })?;
-        let store_config = self
-            .region_store_config(
-                &opts.parent_dir,
-                opts.write_buffer_size,
-                &region_name,
-                &self.config,
-                opts.ttl,
-                opts.compaction_strategy.clone(),
-            )
-            .await?;
-
-        let region = RegionImpl::create(metadata, store_config).await?;
-
-        guard.update(RegionSlot::Ready(region.clone()));
-
-        debug!(
-            "Storage engine create region {}, id: {}",
-            region.name(),
-            region.id()
-        );
-
-        Ok(region)
-    }
-
-    fn get_region(&self, name: &str) -> Option<RegionImpl<S>> {
-        self.regions.get_region(name)
-    }
-
-    fn remove_region(&self, name: &str) {
-        self.regions.remove(name)
-    }
-
-    async fn region_store_config(
-        &self,
-        parent_dir: &str,
-        write_buffer_size: Option<usize>,
-        region_name: &str,
-        config: &EngineConfig,
-        region_ttl: Option<Duration>,
-        compaction_strategy: CompactionStrategy,
-    ) -> Result<StoreConfig<S>> {
-        let parent_dir = util::normalize_dir(parent_dir);
-
-        let sst_dir = &region_sst_dir(&parent_dir, region_name);
-        let sst_layer = Arc::new(FsAccessLayer::new(sst_dir, self.object_store.clone()));
-        let manifest_dir = region_manifest_dir(&parent_dir, region_name);
-        let manifest = RegionManifest::with_checkpointer(
-            &manifest_dir,
-            self.object_store.clone(),
-            manifest_compress_type(config.compress_manifest),
-            config.manifest_checkpoint_margin,
-            config.manifest_gc_duration,
-        );
-        manifest.start().await?;
-        let flush_strategy = self.flush_strategy.clone();
-
-        // If region_ttl is `None`, the global ttl takes effect.
-        let ttl = region_ttl.or(self.config.global_ttl);
-
-        Ok(StoreConfig {
-            log_store: self.log_store.clone(),
-            sst_layer,
-            manifest,
-            memtable_builder: self.memtable_builder.clone(),
-            flush_scheduler: self.flush_scheduler.clone(),
-            flush_strategy,
-            compaction_scheduler: self.compaction_scheduler.clone(),
-            engine_config: self.config.clone(),
-            file_purger: self.file_purger.clone(),
-            ttl,
-            write_buffer_size: write_buffer_size
-                .unwrap_or(self.config.region_write_buffer_size.as_bytes() as usize),
-            compaction_strategy,
-        })
-    }
-
-    async fn close(&self) -> Result<()> {
-        let regions = self.regions.list_regions();
-        let ctx = CloseContext::default();
-        for region in regions {
-            // Tolerate failure during closing regions.
-            if let Err(e) = region.close(&ctx).await {
-                logging::error!(e; "Failed to close region {}", region.id());
-            }
-        }
-        // Clear regions to release references to regions in the region map.
-        self.regions.clear();
-
-        self.compaction_scheduler.stop(true).await?;
-        self.flush_scheduler.stop().await?;
-        self.file_purger.stop(true).await
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use std::ffi::OsStr;
-    use std::path::Path;
-
-    use common_test_util::temp_dir::{create_temp_dir, TempDir};
-    use datatypes::type_id::LogicalTypeId;
-    use datatypes::vectors::{Float32Vector, Int32Vector, TimestampMillisecondVector, VectorRef};
-    use log_store::raft_engine::log_store::RaftEngineLogStore;
-    use log_store::test_util::log_store_util;
-    use object_store::services::Fs;
-    use store_api::storage::{
-        ChunkReader, FlushContext, ReadContext, Region, ScanRequest, Snapshot, WriteContext,
-        WriteRequest,
-    };
-
-    use super::*;
-    use crate::compaction::noop::NoopCompactionScheduler;
-    use crate::test_util::descriptor_util::RegionDescBuilder;
-
-    type TestEngine = EngineImpl<RaftEngineLogStore>;
-    type TestRegion = RegionImpl<RaftEngineLogStore>;
-
-    async fn create_engine_and_region(
-        tmp_dir: &TempDir,
-        log_file_dir: &TempDir,
-        region_name: &str,
-        region_id: u64,
-        config: EngineConfig,
-    ) -> (TestEngine, TestRegion) {
-        let log_file_dir_path = log_file_dir.path().to_str().unwrap();
-        let log_store = log_store_util::create_tmp_local_file_log_store(log_file_dir_path).await;
-
-        let store_dir = tmp_dir.path().to_string_lossy();
-
-        let mut builder = Fs::default();
-        let _ = builder.root(&store_dir);
-        let object_store = ObjectStore::new(builder).unwrap().finish();
-
-        let compaction_scheduler = Arc::new(NoopCompactionScheduler::default());
-
-        let engine = EngineImpl::new(
-            config,
-            Arc::new(log_store),
-            object_store,
-            compaction_scheduler,
-        )
-        .unwrap();
-
-        let desc = RegionDescBuilder::new(region_name)
-            .id(region_id)
-            .push_key_column(("k1", LogicalTypeId::Int32, false))
-            .push_field_column(("v1", LogicalTypeId::Float32, true))
-            .timestamp(("ts", LogicalTypeId::TimestampMillisecond, false))
-            .build();
-
-        let region = engine
-            .create_region(&EngineContext::default(), desc, &CreateOptions::default())
-            .await
-            .unwrap();
-
-        (engine, region)
-    }
-
-    fn parquet_file_num(path: &Path) -> usize {
-        path.read_dir()
-            .unwrap()
-            .filter_map(|entry| entry.ok())
-            .filter(|entry| entry.path().extension() == Some(OsStr::new("parquet")))
-            .count()
-    }
-
-    #[tokio::test]
-    async fn test_create_new_region() {
-        let dir = create_temp_dir("test_create_region");
-        let log_file_dir = create_temp_dir("test_engine_wal");
-
-        let region_name = "region-0";
-        let region_id = 123456;
-        let config = EngineConfig::default();
-
-        let (engine, region) =
-            create_engine_and_region(&dir, &log_file_dir, region_name, region_id, config).await;
-        assert_eq!(region_name, region.name());
-
-        let ctx = EngineContext::default();
-        let region2 = engine.get_region(&ctx, region_name).unwrap().unwrap();
-        assert_eq!(region_name, region2.name());
-
-        assert!(engine.get_region(&ctx, "no such region").unwrap().is_none());
-    }
-
-    #[tokio::test]
-    async fn test_create_region_with_buffer_size() {
-        let dir = create_temp_dir("test_buffer_size");
-        let log_file_dir = create_temp_dir("test_buffer_wal");
-
-        let region_name = "region-0";
-        let region_id = 123456;
-        let mut config = EngineConfig::default();
-        let expect_buffer_size = config.region_write_buffer_size / 2;
-        config.region_write_buffer_size = expect_buffer_size;
-
-        let (_engine, region) =
-            create_engine_and_region(&dir, &log_file_dir, region_name, region_id, config).await;
-        assert_eq!(
-            expect_buffer_size.as_bytes() as usize,
-            region.write_buffer_size().await
-        );
-    }
-
-    #[tokio::test]
-    async fn test_drop_region() {
-        common_telemetry::init_default_ut_logging();
-        let dir = create_temp_dir("test_drop_region");
-        let log_file_dir = create_temp_dir("test_engine_wal");
-
-        let region_name = "test_region";
-        let region_id = 123456;
-        let config = EngineConfig::default();
-
-        let (engine, region) =
-            create_engine_and_region(&dir, &log_file_dir, region_name, region_id, config).await;
-
-        assert_eq!(region_name, region.name());
-
-        let mut wb = region.write_request();
-        let k1 = Arc::new(Int32Vector::from_slice([1, 2, 3])) as VectorRef;
-        let v1 = Arc::new(Float32Vector::from_slice([0.1, 0.2, 0.3])) as VectorRef;
-        let tsv = Arc::new(TimestampMillisecondVector::from_slice([0, 0, 0])) as VectorRef;
-
-        let put_data = HashMap::from([
-            ("k1".to_string(), k1),
-            ("v1".to_string(), v1),
-            ("ts".to_string(), tsv),
-        ]);
-        wb.put(put_data).unwrap();
-        let _ = region.write(&WriteContext::default(), wb).await.unwrap();
-
-        // Flush memtable to sst.
-        region.flush(&FlushContext::default()).await.unwrap();
-        let ctx = EngineContext::default();
-        engine
-            .close_region(&ctx, region.name(), &CloseOptions::default())
-            .await
-            .unwrap();
-
-        let dir_path = dir.path().join(region_name);
-
-        assert_eq!(1, parquet_file_num(&dir_path));
-
-        {
-            let region = engine
-                .open_region(&ctx, region_name, &OpenOptions::default())
-                .await
-                .unwrap()
-                .unwrap();
-
-            engine.drop_region(&ctx, region).await.unwrap();
-
-            assert!(engine.get_region(&ctx, region_name).unwrap().is_none());
-            assert!(!engine
-                .inner
-                .object_store
-                .is_exist(dir_path.join("manifest").to_str().unwrap())
-                .await
-                .unwrap());
-        }
-
-        // Wait for gc
-        tokio::time::sleep(Duration::from_millis(60)).await;
-        assert_eq!(0, parquet_file_num(&dir_path));
-    }
-
-    #[tokio::test]
-    async fn test_truncate_region() {
-        common_telemetry::init_default_ut_logging();
-        let dir = create_temp_dir("test_truncate_region");
-        let log_file_dir = create_temp_dir("test_engine_wal");
-
-        let region_name = "test_region";
-        let region_id = 123456;
-        let config = EngineConfig::default();
-
-        let (engine, region) =
-            create_engine_and_region(&dir, &log_file_dir, region_name, region_id, config).await;
-
-        assert_eq!(region_name, region.name());
-
-        let mut wb = region.write_request();
-        let k1 = Arc::new(Int32Vector::from_slice([1, 2, 3])) as VectorRef;
-        let v1 = Arc::new(Float32Vector::from_slice([0.1, 0.2, 0.3])) as VectorRef;
-        let tsv = Arc::new(TimestampMillisecondVector::from_slice([0, 0, 0])) as VectorRef;
-
-        let put_data = HashMap::from([
-            ("k1".to_string(), k1),
-            ("v1".to_string(), v1),
-            ("ts".to_string(), tsv),
-        ]);
-        wb.put(put_data).unwrap();
-
-        // Insert data.
-        region.write(&WriteContext::default(), wb).await.unwrap();
-        let ctx = EngineContext::default();
-
-        // Truncate region.
-        region.truncate().await.unwrap();
-        assert!(engine.get_region(&ctx, region.name()).unwrap().is_some());
-
-        // Scan to verify the region is empty.
-        let read_ctx = ReadContext::default();
-        let snapshot = region.snapshot(&read_ctx).unwrap();
-        let resp = snapshot
-            .scan(&read_ctx, ScanRequest::default())
-            .await
-            .unwrap();
-        let mut reader = resp.reader;
-        assert!(reader.next_chunk().await.unwrap().is_none());
-    }
-}
--- a/src/storage/src/error.rs
+++ b/src/storage/src/error.rs
@@ -1,635 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-use std::any::Any;
-use std::io::Error as IoError;
-use std::str::Utf8Error;
-
-use common_datasource::compression::CompressionType;
-use common_error::ext::{BoxedError, ErrorExt};
-use common_error::status_code::StatusCode;
-use common_macro::stack_trace_debug;
-use common_runtime::error::Error as RuntimeError;
-use datatypes::arrow::error::ArrowError;
-use datatypes::prelude::ConcreteDataType;
-use object_store::ErrorKind;
-use serde_json::error::Error as JsonError;
-use snafu::{Location, Snafu};
-use store_api::manifest::action::ProtocolVersion;
-use store_api::manifest::ManifestVersion;
-use store_api::storage::{RegionId, SequenceNumber};
-use tokio::task::JoinError;
-
-use crate::metadata::Error as MetadataError;
-use crate::write_batch;
-
-#[derive(Snafu)]
-#[snafu(visibility(pub))]
-#[stack_trace_debug]
-pub enum Error {
-    #[snafu(display("Invalid region descriptor, region: {}", region))]
-    InvalidRegionDesc {
-        region: String,
-        location: Location,
-        source: MetadataError,
-    },
-
-    #[snafu(display("Missing column {} in write batch", column))]
-    BatchMissingColumn { column: String, location: Location },
-
-    #[snafu(display("Failed to write parquet file"))]
-    WriteParquet {
-        #[snafu(source)]
-        error: parquet::errors::ParquetError,
-        location: Location,
-    },
-
-    #[snafu(display("Failed to write to buffer"))]
-    WriteBuffer {
-        location: Location,
-        source: common_datasource::error::Error,
-    },
-
-    #[snafu(display("Failed to create RecordBatch from vectors"))]
-    NewRecordBatch {
-        location: Location,
-        #[snafu(source)]
-        error: ArrowError,
-    },
-
-    #[snafu(display("Fail to read object from path: {}", path))]
-    ReadObject {
-        path: String,
-        location: Location,
-        #[snafu(source)]
-        error: object_store::Error,
-    },
-
-    #[snafu(display("Fail to write object into path: {}", path))]
-    WriteObject {
-        path: String,
-        location: Location,
-        #[snafu(source)]
-        error: object_store::Error,
-    },
-
-    #[snafu(display("Fail to delete object from path: {}", path))]
-    DeleteObject {
-        path: String,
-        location: Location,
-        #[snafu(source)]
-        error: object_store::Error,
-    },
-
-    #[snafu(display("Fail to compress object by {}, path: {}", compress_type, path))]
-    CompressObject {
-        compress_type: CompressionType,
-        path: String,
-        #[snafu(source)]
-        error: std::io::Error,
-    },
-
-    #[snafu(display("Fail to decompress object by {}, path: {}", compress_type, path))]
-    DecompressObject {
-        compress_type: CompressionType,
-        path: String,
-        #[snafu(source)]
-        error: std::io::Error,
-    },
-
-    #[snafu(display("Fail to list objects in path: {}", path))]
-    ListObjects {
-        path: String,
-        location: Location,
-        #[snafu(source)]
-        error: object_store::Error,
-    },
-
-    #[snafu(display("Fail to create str from bytes"))]
-    Utf8 {
-        location: Location,
-        #[snafu(source)]
-        error: Utf8Error,
-    },
-
-    #[snafu(display("Fail to encode object into json "))]
-    EncodeJson {
-        location: Location,
-        #[snafu(source)]
-        error: JsonError,
-    },
-
-    #[snafu(display("Fail to decode object from json "))]
-    DecodeJson {
-        location: Location,
-        #[snafu(source)]
-        error: JsonError,
-    },
-
-    #[snafu(display("Invalid scan index, start: {}, end: {}", start, end))]
-    InvalidScanIndex {
-        start: ManifestVersion,
-        end: ManifestVersion,
-        location: Location,
-    },
-
-    #[snafu(display("Failed to write WAL, WAL region_id: {}", region_id))]
-    WriteWal {
-        region_id: RegionId,
-        location: Location,
-        source: BoxedError,
-    },
-
-    #[snafu(display("Failed to encode WAL header"))]
-    EncodeWalHeader {
-        location: Location,
-        #[snafu(source)]
-        error: std::io::Error,
-    },
-
-    #[snafu(display("Failed to decode WAL header"))]
-    DecodeWalHeader {
-        location: Location,
-        #[snafu(source)]
-        error: std::io::Error,
-    },
-
-    #[snafu(display("Failed to wait flushing, region_id: {}", region_id))]
-    WaitFlush {
-        region_id: RegionId,
-        #[snafu(source)]
-        error: tokio::sync::oneshot::error::RecvError,
-        location: Location,
-    },
-
-    #[snafu(display(
-        "Manifest protocol forbid to read, min_version: {}, supported_version: {}",
-        min_version,
-        supported_version
-    ))]
-    ManifestProtocolForbidRead {
-        min_version: ProtocolVersion,
-        supported_version: ProtocolVersion,
-        location: Location,
-    },
-
-    #[snafu(display(
-        "Manifest protocol forbid to write, min_version: {}, supported_version: {}",
-        min_version,
-        supported_version
-    ))]
-    ManifestProtocolForbidWrite {
-        min_version: ProtocolVersion,
-        supported_version: ProtocolVersion,
-        location: Location,
-    },
-
-    #[snafu(display("Failed to decode action list, {}", msg))]
-    DecodeMetaActionList { msg: String, location: Location },
-
-    #[snafu(display("Failed to read line, err"))]
-    Readline {
-        #[snafu(source)]
-        error: IoError,
-    },
-
-    #[snafu(display("Failed to read Parquet file: {}", file))]
-    ReadParquet {
-        file: String,
-        #[snafu(source)]
-        error: parquet::errors::ParquetError,
-        location: Location,
-    },
-
-    #[snafu(display("Region is under {} state, cannot proceed operation", state))]
-    InvalidRegionState {
-        state: &'static str,
-        location: Location,
-    },
-
-    #[snafu(display("Failed to read WAL, region_id: {}", region_id))]
-    ReadWal {
-        region_id: RegionId,
-        location: Location,
-        source: BoxedError,
-    },
-
-    #[snafu(display("Failed to mark WAL as obsolete, region id: {}", region_id))]
-    MarkWalObsolete {
-        region_id: u64,
-        location: Location,
-        source: BoxedError,
-    },
-
-    #[snafu(display("WAL data corrupted, region_id: {}, message: {}", region_id, message))]
-    WalDataCorrupted {
-        region_id: RegionId,
-        message: String,
-        location: Location,
-    },
-
-    #[snafu(display("Failed to delete WAL namespace, region id: {}", region_id))]
-    DeleteWalNamespace {
-        region_id: RegionId,
-        location: Location,
-        source: BoxedError,
-    },
-
-    #[snafu(display(
-        "Sequence of region should increase monotonically (should be {} < {})",
-        prev,
-        given
-    ))]
-    SequenceNotMonotonic {
-        prev: SequenceNumber,
-        given: SequenceNumber,
-        location: Location,
-    },
-
-    #[snafu(display("Failed to convert store schema, file: {}", file))]
-    ConvertStoreSchema {
-        file: String,
-        location: Location,
-        source: MetadataError,
-    },
-
-    #[snafu(display("Invalid raw region metadata, region: {}", region))]
-    InvalidRawRegion {
-        region: String,
-        location: Location,
-        source: MetadataError,
-    },
-
-    #[snafu(display("Try to write the closed region"))]
-    ClosedRegion { location: Location },
-
-    #[snafu(display("Invalid projection"))]
-    InvalidProjection {
-        location: Location,
-        source: MetadataError,
-    },
-
-    #[snafu(display("Failed to push data to batch builder"))]
-    PushBatch {
-        location: Location,
-        source: datatypes::error::Error,
-    },
-
-    #[snafu(display("Failed to build batch, {}", msg))]
-    BuildBatch { msg: String, location: Location },
-
-    #[snafu(display("Failed to filter column {}", name))]
-    FilterColumn {
-        name: String,
-        location: Location,
-        source: datatypes::error::Error,
-    },
-
-    #[snafu(display("Invalid alter request"))]
-    InvalidAlterRequest {
-        location: Location,
-        source: MetadataError,
-    },
-
-    #[snafu(display("Failed to alter metadata"))]
-    AlterMetadata {
-        location: Location,
-        source: MetadataError,
-    },
-
-    #[snafu(display("Failed to create default value for column {}", name))]
-    CreateDefault {
-        name: String,
-        location: Location,
-        source: datatypes::error::Error,
-    },
-
-    #[snafu(display(
-        "Not allowed to write data with version {} to schema with version {}",
-        data_version,
-        schema_version
-    ))]
-    WriteToOldVersion {
-        /// Schema version of data to write.
-        data_version: u32,
-        schema_version: u32,
-        location: Location,
-    },
-
-    #[snafu(display("Column {} not in schema with version {}", column, version))]
-    NotInSchemaToCompat {
-        column: String,
-        version: u32,
-        location: Location,
-    },
-
-    #[snafu(display("Incompatible schema to read, reason: {}", reason))]
-    CompatRead { reason: String, location: Location },
-
-    #[snafu(display("Failed to read column {}, could not create default value", column))]
-    CreateDefaultToRead {
-        column: String,
-        location: Location,
-        source: datatypes::error::Error,
-    },
-
-    #[snafu(display("Failed to read column {}, no proper default value for it", column))]
-    NoDefaultToRead { column: String, location: Location },
-
-    #[snafu(display("Failed to convert arrow chunk to batch, name: {}", name))]
-    ConvertChunk {
-        name: String,
-        location: Location,
-        source: datatypes::error::Error,
-    },
-
-    #[snafu(display("Unknown column {}", name))]
-    UnknownColumn { name: String, location: Location },
-
-    #[snafu(display("Failed to create record batch for write batch"))]
-    CreateRecordBatch {
-        location: Location,
-        source: common_recordbatch::error::Error,
-    },
-
-    #[snafu(display(
-        "Request is too large, max is {}, current is {}",
-        write_batch::MAX_BATCH_SIZE,
-        num_rows
-    ))]
-    RequestTooLarge { num_rows: usize, location: Location },
-
-    #[snafu(display(
-        "Type of column {} does not match type in schema, expect {:?}, given {:?}",
-        name,
-        expect,
-        given
-    ))]
-    TypeMismatch {
-        name: String,
-        expect: ConcreteDataType,
-        given: ConcreteDataType,
-        location: Location,
-    },
-
-    #[snafu(display("Column {} is not null but input has null", name))]
-    HasNull { name: String, location: Location },
-
-    #[snafu(display(
-        "Length of column {} not equals to other columns, expect {}, given {}",
-        name,
-        expect,
-        given
-    ))]
-    UnequalLengths {
-        name: String,
-        expect: usize,
-        given: usize,
-        location: Location,
-    },
-
-    #[snafu(display("Failed to decode write batch, corrupted data {}", message))]
-    BatchCorrupted { message: String, location: Location },
-
-    #[snafu(display("Failed to decode arrow data"))]
-    DecodeArrow {
-        location: Location,
-        #[snafu(source)]
-        error: ArrowError,
-    },
-
-    #[snafu(display("Failed to encode arrow data"))]
-    EncodeArrow {
-        location: Location,
-        #[snafu(source)]
-        error: ArrowError,
-    },
-
-    #[snafu(display("Failed to parse schema"))]
-    ParseSchema {
-        location: Location,
-        source: datatypes::error::Error,
-    },
-
-    #[snafu(display("More columns than expected in the request"))]
-    MoreColumnThanExpected { location: Location },
-
-    #[snafu(display("Failed to decode parquet file time range, msg: {}", msg))]
-    DecodeParquetTimeRange { msg: String, location: Location },
-
-    #[snafu(display("Scheduler rate limited, msg: {}", msg))]
-    RateLimited { msg: String },
-
-    #[snafu(display("Cannot schedule request, scheduler's already stopped"))]
-    IllegalSchedulerState { location: Location },
-
-    #[snafu(display("Failed to start manifest gc task"))]
-    StartManifestGcTask {
-        location: Location,
-        source: RuntimeError,
-    },
-
-    #[snafu(display("Failed to stop manifest gc task"))]
-    StopManifestGcTask {
-        location: Location,
-        source: RuntimeError,
-    },
-
-    #[snafu(display("Failed to stop scheduler"))]
-    StopScheduler {
-        #[snafu(source)]
-        error: JoinError,
-        location: Location,
-    },
-
-    #[snafu(display("Failed to delete SST file"))]
-    DeleteSst {
-        #[snafu(source)]
-        error: object_store::Error,
-        location: Location,
-    },
-
-    #[snafu(display("Failed to calculate SST expire time"))]
-    TtlCalculation {
-        location: Location,
-        source: common_time::error::Error,
-    },
-
-    #[snafu(display("Failed to create a checkpoint: {}", msg))]
-    ManifestCheckpoint { msg: String, location: Location },
-
-    #[snafu(display("The compaction task is cancelled, region_id: {}", region_id))]
-    CompactTaskCancel {
-        region_id: RegionId,
-        #[snafu(source)]
-        error: tokio::sync::oneshot::error::RecvError,
-    },
-
-    #[snafu(display(
-        "The flush request is duplicate, region_id: {}, sequence: {}",
-        region_id,
-        sequence
-    ))]
-    DuplicateFlush {
-        region_id: RegionId,
-        sequence: SequenceNumber,
-        location: Location,
-    },
-
-    #[snafu(display("Failed to start picking task for flush"))]
-    StartPickTask {
-        location: Location,
-        source: RuntimeError,
-    },
-
-    #[snafu(display("Failed to stop picking task for flush"))]
-    StopPickTask {
-        location: Location,
-        source: RuntimeError,
-    },
-
-    #[snafu(display("Failed to convert columns to rows"))]
-    ConvertColumnsToRows {
-        #[snafu(source)]
-        error: ArrowError,
-        location: Location,
-    },
-
-    #[snafu(display("Failed to sort arrays"))]
-    SortArrays {
-        #[snafu(source)]
-        error: ArrowError,
-        location: Location,
-    },
-
-    #[snafu(display("Failed to build scan predicate"))]
-    BuildPredicate {
-        source: table::error::Error,
-        location: Location,
-    },
-
-    #[snafu(display("Failed to join spawned tasks"))]
-    JoinError {
-        #[snafu(source)]
-        error: JoinError,
-        location: Location,
-    },
-}
-
-pub type Result<T> = std::result::Result<T, Error>;
-
-impl Error {
-    /// Returns true if the error is the object path to delete
-    /// doesn't exist.
-    pub(crate) fn is_object_to_delete_not_found(&self) -> bool {
-        if let Error::DeleteObject { error, .. } = self {
-            error.kind() == ErrorKind::NotFound
-        } else {
-            false
-        }
-    }
-}
-
-impl ErrorExt for Error {
-    fn status_code(&self) -> StatusCode {
-        use Error::*;
-
-        match self {
-            InvalidScanIndex { .. }
-            | BatchMissingColumn { .. }
-            | InvalidProjection { .. }
-            | BuildBatch { .. }
-            | NotInSchemaToCompat { .. }
-            | WriteToOldVersion { .. }
-            | CreateRecordBatch { .. }
-            | RequestTooLarge { .. }
-            | TypeMismatch { .. }
-            | HasNull { .. }
-            | UnequalLengths { .. }
-            | MoreColumnThanExpected { .. } => StatusCode::InvalidArguments,
-
-            Utf8 { .. }
-            | EncodeJson { .. }
-            | DecodeJson { .. }
-            | WaitFlush { .. }
-            | DecodeMetaActionList { .. }
-            | Readline { .. }
-            | WalDataCorrupted { .. }
-            | SequenceNotMonotonic { .. }
-            | ConvertStoreSchema { .. }
-            | InvalidRawRegion { .. }
-            | ClosedRegion { .. }
-            | FilterColumn { .. }
-            | AlterMetadata { .. }
-            | CompatRead { .. }
-            | CreateDefaultToRead { .. }
-            | NoDefaultToRead { .. }
-            | NewRecordBatch { .. }
-            | BatchCorrupted { .. }
-            | DecodeArrow { .. }
-            | EncodeArrow { .. }
-            | ManifestCheckpoint { .. }
-            | CompressObject { .. }
-            | DecompressObject { .. }
-            | ParseSchema { .. } => StatusCode::Unexpected,
-
-            WriteParquet { .. }
-            | ReadObject { .. }
-            | WriteObject { .. }
-            | ListObjects { .. }
-            | DeleteObject { .. }
-            | WriteWal { .. }
-            | DecodeWalHeader { .. }
-            | EncodeWalHeader { .. }
-            | ManifestProtocolForbidRead { .. }
-            | ManifestProtocolForbidWrite { .. }
-            | ReadParquet { .. }
-            | InvalidRegionState { .. }
-            | ReadWal { .. } => StatusCode::StorageUnavailable,
-
-            UnknownColumn { .. } => StatusCode::TableColumnNotFound,
-
-            InvalidAlterRequest { source, .. } | InvalidRegionDesc { source, .. } => {
-                source.status_code()
-            }
-            WriteBuffer { source, .. } => source.status_code(),
-            PushBatch { source, .. } => source.status_code(),
-            CreateDefault { source, .. } => source.status_code(),
-            ConvertChunk { source, .. } => source.status_code(),
-            MarkWalObsolete { source, .. } => source.status_code(),
-            DeleteWalNamespace { source, .. } => source.status_code(),
-            DecodeParquetTimeRange { .. } => StatusCode::Unexpected,
-            RateLimited { .. } | StopScheduler { .. } | CompactTaskCancel { .. } => {
-                StatusCode::Internal
-            }
-            DeleteSst { .. } => StatusCode::StorageUnavailable,
-
-            StartManifestGcTask { .. }
-            | StopManifestGcTask { .. }
-            | IllegalSchedulerState { .. }
-            | DuplicateFlush { .. }
-            | StartPickTask { .. }
-            | StopPickTask { .. } => StatusCode::Unexpected,
-
-            TtlCalculation { source, .. } => source.status_code(),
-            ConvertColumnsToRows { .. } | SortArrays { .. } => StatusCode::Unexpected,
-            BuildPredicate { source, .. } => source.status_code(),
-            JoinError { .. } => StatusCode::Unexpected,
-        }
-    }
-
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-}
--- a/src/storage/src/file_purger.rs
+++ b/src/storage/src/file_purger.rs
@@ -1,235 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-use std::sync::Arc;
-
-use common_telemetry::{debug, error};
-use store_api::storage::RegionId;
-use tokio::sync::Notify;
-
-use crate::error::Result;
-use crate::scheduler::rate_limit::{BoxedRateLimitToken, RateLimitToken};
-use crate::scheduler::{Handler, LocalScheduler, Request};
-use crate::sst::{AccessLayerRef, FileId};
-
-pub struct FilePurgeRequest {
-    pub region_id: RegionId,
-    pub file_id: FileId,
-    pub sst_layer: AccessLayerRef,
-}
-
-impl Request for FilePurgeRequest {
-    type Key = String;
-
-    fn key(&self) -> Self::Key {
-        format!("{}/{}", self.region_id, self.file_id)
-    }
-
-    fn complete(self, _result: Result<()>) {}
-}
-
-pub struct FilePurgeHandler;
-
-#[async_trait::async_trait]
-impl Handler for FilePurgeHandler {
-    type Request = FilePurgeRequest;
-
-    async fn handle_request(
-        &self,
-        req: Self::Request,
-        token: BoxedRateLimitToken,
-        finish_notifier: Arc<Notify>,
-    ) -> Result<()> {
-        req.sst_layer.delete_sst(req.file_id).await.map_err(|e| {
-            error!(e; "Failed to delete SST file, file: {}, region: {}", 
-                req.file_id.as_parquet(), req.region_id);
-            e
-        })?;
-        debug!(
-            "Successfully deleted SST file: {}, region: {}",
-            req.file_id.as_parquet(),
-            req.region_id
-        );
-        token.try_release();
-        finish_notifier.notify_one();
-        Ok(())
-    }
-}
-
-pub type FilePurgerRef = Arc<LocalScheduler<FilePurgeRequest>>;
-
-#[cfg(test)]
-pub mod noop {
-    use std::sync::Arc;
-
-    use tokio::sync::Notify;
-
-    use crate::error::Result;
-    use crate::file_purger::{FilePurgeRequest, FilePurgerRef};
-    use crate::scheduler::rate_limit::{BoxedRateLimitToken, RateLimitToken};
-    use crate::scheduler::{Handler, LocalScheduler, SchedulerConfig};
-
-    pub fn new_noop_file_purger() -> FilePurgerRef {
-        Arc::new(LocalScheduler::new(
-            SchedulerConfig::default(),
-            NoopFilePurgeHandler,
-        ))
-    }
-
-    #[derive(Debug)]
-    pub struct NoopFilePurgeHandler;
-
-    #[async_trait::async_trait]
-    impl Handler for NoopFilePurgeHandler {
-        type Request = FilePurgeRequest;
-
-        async fn handle_request(
-            &self,
-            _req: Self::Request,
-            token: BoxedRateLimitToken,
-            finish_notifier: Arc<Notify>,
-        ) -> Result<()> {
-            token.try_release();
-            finish_notifier.notify_one();
-            Ok(())
-        }
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use api::v1::OpType;
-    use common_test_util::temp_dir::create_temp_dir;
-    use object_store::services::Fs;
-    use object_store::ObjectStore;
-
-    use super::*;
-    use crate::file_purger::noop::NoopFilePurgeHandler;
-    use crate::memtable::tests::{schema_for_test, write_kvs};
-    use crate::memtable::{DefaultMemtableBuilder, IterContext, MemtableBuilder};
-    use crate::scheduler::{Scheduler, SchedulerConfig};
-    use crate::sst::{AccessLayer, FileHandle, FileMeta, FsAccessLayer, Source, WriteOptions};
-
-    struct MockRateLimitToken;
-
-    impl RateLimitToken for MockRateLimitToken {
-        fn try_release(&self) {}
-    }
-
-    async fn create_sst_file(
-        os: ObjectStore,
-        sst_file_id: FileId,
-        file_purger: FilePurgerRef,
-    ) -> (FileHandle, String, AccessLayerRef) {
-        let schema = schema_for_test();
-        let memtable = DefaultMemtableBuilder::default().build(schema.clone());
-
-        write_kvs(
-            &*memtable,
-            10,
-            OpType::Put,
-            &[1, 2],
-            &[(Some(1), Some(1)), (Some(2), Some(2))],
-        );
-
-        let iter = memtable.iter(IterContext::default()).unwrap();
-        let sst_path = "table1";
-        let layer = Arc::new(FsAccessLayer::new(sst_path, os.clone()));
-        let sst_info = layer
-            .write_sst(sst_file_id, Source::Iter(iter), &WriteOptions::default())
-            .await
-            .unwrap()
-            .unwrap();
-
-        (
-            FileHandle::new(
-                FileMeta {
-                    region_id: 0.into(),
-                    file_id: sst_file_id,
-                    time_range: None,
-                    level: 0,
-                    file_size: sst_info.file_size,
-                },
-                layer.clone(),
-                file_purger,
-            ),
-            sst_path.to_string(),
-            layer as _,
-        )
-    }
-
-    #[tokio::test]
-    async fn test_file_purger_handler() {
-        let dir = create_temp_dir("file-purge");
-        let mut builder = Fs::default();
-        let _ = builder.root(dir.path().to_str().unwrap());
-        let object_store = ObjectStore::new(builder).unwrap().finish();
-
-        let sst_file_id = FileId::random();
-
-        let noop_file_purger = Arc::new(LocalScheduler::new(
-            SchedulerConfig::default(),
-            NoopFilePurgeHandler,
-        ));
-        let (_file, path, layer) =
-            create_sst_file(object_store.clone(), sst_file_id, noop_file_purger).await;
-        let request = FilePurgeRequest {
-            region_id: 0.into(),
-            file_id: sst_file_id,
-            sst_layer: layer,
-        };
-
-        let handler = FilePurgeHandler;
-        let notify = Arc::new(Notify::new());
-        handler
-            .handle_request(request, Box::new(MockRateLimitToken {}), notify.clone())
-            .await
-            .unwrap();
-
-        notify.notified().await;
-        let exists = object_store
-            .is_exist(&format!("{}/{}", path, sst_file_id.as_parquet()))
-            .await
-            .unwrap();
-        assert!(!exists);
-    }
-
-    #[tokio::test]
-    async fn test_file_purge_loop() {
-        common_telemetry::init_default_ut_logging();
-        let dir = create_temp_dir("file-purge");
-        let mut builder = Fs::default();
-        let _ = builder.root(dir.path().to_str().unwrap());
-        let object_store = ObjectStore::new(builder).unwrap().finish();
-        let sst_file_id = FileId::random();
-        let scheduler = Arc::new(LocalScheduler::new(
-            SchedulerConfig::default(),
-            FilePurgeHandler,
-        ));
-        let (handle, path, _layer) =
-            create_sst_file(object_store.clone(), sst_file_id, scheduler.clone()).await;
-
-        {
-            // mark file as deleted and drop the handle, we expect the file is deleted.
-            handle.mark_deleted();
-            drop(handle);
-        }
-        scheduler.stop(true).await.unwrap();
-
-        assert!(!object_store
-            .is_exist(&format!("{}/{}", path, sst_file_id.as_parquet()))
-            .await
-            .unwrap());
-    }
-}
--- a/src/storage/src/flush.rs
+++ b/src/storage/src/flush.rs
@@ -1,495 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-mod picker;
-mod scheduler;
-
-use std::sync::atomic::{AtomicUsize, Ordering};
-use std::sync::Arc;
-
-use common_base::readable_size::ReadableSize;
-use common_telemetry::logging;
-pub use picker::{FlushPicker, PickerConfig};
-pub use scheduler::{
-    FlushHandle, FlushRegionRequest, FlushRequest, FlushScheduler, FlushSchedulerRef,
-};
-use store_api::logstore::LogStore;
-use store_api::storage::consts::WRITE_ROW_GROUP_SIZE;
-use store_api::storage::{RegionId, SequenceNumber};
-
-use crate::config::EngineConfig;
-use crate::error::Result;
-use crate::manifest::action::*;
-use crate::manifest::region::RegionManifest;
-use crate::memtable::{IterContext, MemtableId, MemtableRef};
-use crate::metrics::{FLUSH_BYTES_TOTAL, FLUSH_ELAPSED};
-use crate::region::{RegionWriterRef, SharedDataRef};
-use crate::sst::{AccessLayerRef, FileId, FileMeta, Source, SstInfo, WriteOptions};
-use crate::wal::Wal;
-
-/// Current flush-related status of a region.
-#[derive(Debug, Clone, Copy)]
-pub struct RegionStatus {
-    /// Id of the region this status belongs to.
-    pub region_id: RegionId,
-    /// Size of the mutable memtable.
-    pub bytes_mutable: usize,
-    /// Write buffer size of the region.
-    pub write_buffer_size: usize,
-}
-
-/// Type of flush request to send.
-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
-pub enum FlushType {
-    /// Flush current region.
-    Region,
-    /// Engine level flush. Find regions to flush globally.
-    Engine,
-}
-
-/// Strategy to control whether to flush a region before writing to the region.
-pub trait FlushStrategy: Send + Sync + std::fmt::Debug {
-    /// Returns whether to trigger a flush operation.
-    fn should_flush(&self, status: RegionStatus) -> Option<FlushType>;
-
-    /// Reserves `mem` bytes.
-    fn reserve_mem(&self, mem: usize);
-
-    /// Tells the strategy we are freeing `mem` bytes.
-    ///
-    /// We are in the process of freeing `mem` bytes, so it is not considered
-    /// when checking the soft limit.
-    fn schedule_free_mem(&self, mem: usize);
-
-    /// We have freed `mem` bytes.
-    fn free_mem(&self, mem: usize);
-}
-
-pub type FlushStrategyRef = Arc<dyn FlushStrategy>;
-
-/// Flush strategy based on memory usage.
-#[derive(Debug)]
-pub struct SizeBasedStrategy {
-    /// Write buffer size for all memtables.
-    global_write_buffer_size: Option<usize>,
-    /// Mutable memtable memory size limitation, only valid when `global_write_buffer_size`
-    /// is `Some`.
-    mutable_limitation: usize,
-    /// Memory in used (e.g. used by mutable and immutable memtables).
-    memory_used: AtomicUsize,
-    /// Memory that hasn't been scheduled to free (e.g. used by mutable memtables).
-    memory_active: AtomicUsize,
-}
-
-impl SizeBasedStrategy {
-    /// Returns a new [SizeBasedStrategy] with specific `global_write_buffer_size`.
-    pub fn new(global_write_buffer_size: Option<usize>) -> Self {
-        Self {
-            global_write_buffer_size,
-            mutable_limitation: get_mutable_limitation(global_write_buffer_size),
-            memory_used: AtomicUsize::new(0),
-            memory_active: AtomicUsize::new(0),
-        }
-    }
-
-    /// Returns whether to trigger an engine level flush.
-    ///
-    /// Inspired by RocksDB's WriteBufferManager.
-    /// <https://github.com/facebook/rocksdb/blob/main/include/rocksdb/write_buffer_manager.h#L94>
-    fn should_flush_engine(&self) -> bool {
-        // We only check global limit when it is Some.
-        let Some(global_write_buffer_size) = self.global_write_buffer_size else {
-            return false;
-        };
-
-        let mutable_memtable_memory_usage = self.memory_active.load(Ordering::Relaxed);
-        if mutable_memtable_memory_usage > self.mutable_limitation {
-            logging::info!(
-                "Engine should flush (over mutable limit), mutable_usage: {}, mutable_limitation: {}.",
-                mutable_memtable_memory_usage,
-                self.mutable_limitation,
-            );
-            return true;
-        }
-
-        let memory_usage = self.memory_used.load(Ordering::Relaxed);
-        // If the memory exceeds the buffer size, we trigger more aggressive
-        // flush. But if already more than half memory is being flushed,
-        // triggering more flush may not help. We will hold it instead.
-        if memory_usage >= global_write_buffer_size
-            && mutable_memtable_memory_usage >= global_write_buffer_size / 2
-        {
-            logging::info!(
-                "Engine should flush (over total limit), memory_usage: {}, global_write_buffer_size: {}, \
-                 mutable_usage: {}.",
-                memory_usage,
-                global_write_buffer_size,
-                mutable_memtable_memory_usage,
-            );
-            return true;
-        }
-
-        false
-    }
-
-    /// Returns true if the global memory limitation is enabled.
-    #[inline]
-    fn is_global_limit_enabled(&self) -> bool {
-        self.global_write_buffer_size.is_some()
-    }
-}
-
-#[inline]
-fn get_mutable_limitation(global_write_buffer_size: Option<usize>) -> usize {
-    // Inspired by RocksDB.
-    // https://github.com/facebook/rocksdb/blob/main/include/rocksdb/write_buffer_manager.h#L86
-    global_write_buffer_size
-        .map(|size| size * 7 / 8)
-        .unwrap_or(0)
-}
-
-impl Default for SizeBasedStrategy {
-    fn default() -> Self {
-        Self {
-            global_write_buffer_size: None,
-            mutable_limitation: 0,
-            memory_used: AtomicUsize::new(0),
-            memory_active: AtomicUsize::new(0),
-        }
-    }
-}
-
-impl FlushStrategy for SizeBasedStrategy {
-    fn should_flush(&self, status: RegionStatus) -> Option<FlushType> {
-        if status.bytes_mutable >= status.write_buffer_size {
-            // If the mutable memtable is full, we should freeze it and flush it.
-            logging::debug!(
-                "Region should flush as mutable memtable is full, region: {}, bytes_mutable: {}, \
-                write_buffer_size: {}.",
-                status.region_id,
-                status.bytes_mutable,
-                status.write_buffer_size,
-            );
-
-            return Some(FlushType::Region);
-        }
-
-        if self.should_flush_engine() {
-            return Some(FlushType::Engine);
-        }
-
-        None
-    }
-
-    fn reserve_mem(&self, mem: usize) {
-        if self.is_global_limit_enabled() {
-            let _ = self.memory_used.fetch_add(mem, Ordering::Relaxed);
-            let _ = self.memory_active.fetch_add(mem, Ordering::Relaxed);
-        }
-    }
-
-    fn schedule_free_mem(&self, mem: usize) {
-        if self.is_global_limit_enabled() {
-            let _ = self.memory_active.fetch_sub(mem, Ordering::Relaxed);
-        }
-    }
-
-    fn free_mem(&self, mem: usize) {
-        if self.is_global_limit_enabled() {
-            let _ = self.memory_used.fetch_sub(mem, Ordering::Relaxed);
-        }
-    }
-}
-
-pub struct FlushJob<S: LogStore> {
-    /// Max memtable id in these memtables,
-    /// used to remove immutable memtables in current version.
-    pub max_memtable_id: MemtableId,
-    /// Memtables to be flushed.
-    pub memtables: Vec<MemtableRef>,
-    /// Last sequence of data to be flushed.
-    pub flush_sequence: SequenceNumber,
-    /// Shared data of region to be flushed.
-    pub shared: SharedDataRef,
-    /// Sst access layer of the region.
-    pub sst_layer: AccessLayerRef,
-    /// Region writer, used to persist log entry that points to the latest manifest file.
-    pub writer: RegionWriterRef<S>,
-    /// Region write-ahead logging, used to write data/meta to the log file.
-    pub wal: Wal<S>,
-    /// Region manifest service, used to persist metadata.
-    pub manifest: RegionManifest,
-    /// Storage engine config
-    pub engine_config: Arc<EngineConfig>,
-}
-
-impl<S: LogStore> FlushJob<S> {
-    /// Execute the flush job.
-    async fn run(&mut self) -> Result<()> {
-        let _timer = FLUSH_ELAPSED.start_timer();
-
-        let file_metas = self.write_memtables_to_layer().await?;
-        if file_metas.is_empty() {
-            // skip writing manifest and wal if no files are flushed.
-            return Ok(());
-        }
-        self.write_manifest_and_apply(&file_metas).await?;
-
-        Ok(())
-    }
-
-    async fn write_memtables_to_layer(&mut self) -> Result<Vec<FileMeta>> {
-        let region_id = self.shared.id();
-        let mut futures = Vec::with_capacity(self.memtables.len());
-        let iter_ctx = IterContext {
-            // TODO(ruihang): dynamic row group size based on content (#412)
-            batch_size: WRITE_ROW_GROUP_SIZE,
-            // All sequences are visible by default.
-            ..Default::default()
-        };
-
-        for m in &self.memtables {
-            // skip empty memtable
-            if m.num_rows() == 0 {
-                continue;
-            }
-
-            let file_id = FileId::random();
-            // TODO(hl): Check if random file name already exists in meta.
-            let iter = m.iter(iter_ctx.clone())?;
-            let sst_layer = self.sst_layer.clone();
-            let write_options = WriteOptions {
-                sst_write_buffer_size: ReadableSize::mb(8), // deprecated usage
-            };
-            futures.push(async move {
-                Ok(sst_layer
-                    .write_sst(file_id, Source::Iter(iter), &write_options)
-                    .await?
-                    .map(
-                        |SstInfo {
-                             time_range,
-                             file_size,
-                             ..
-                         }| FileMeta {
-                            region_id,
-                            file_id,
-                            time_range,
-                            level: 0,
-                            file_size,
-                        },
-                    ))
-            });
-        }
-
-        let metas: Vec<_> = futures_util::future::try_join_all(futures)
-            .await?
-            .into_iter()
-            .flatten()
-            .collect();
-
-        let flush_bytes = metas.iter().map(|f| f.file_size).sum();
-
-        FLUSH_BYTES_TOTAL.inc_by(flush_bytes);
-
-        let file_ids = metas.iter().map(|f| f.file_id).collect::<Vec<_>>();
-        logging::info!("Successfully flush memtables, region:{region_id}, files: {file_ids:?}");
-        Ok(metas)
-    }
-
-    async fn write_manifest_and_apply(&mut self, file_metas: &[FileMeta]) -> Result<()> {
-        let edit = RegionEdit {
-            region_version: self.shared.version_control.metadata().version(),
-            flushed_sequence: Some(self.flush_sequence),
-            files_to_add: file_metas.to_vec(),
-            files_to_remove: Vec::default(),
-            compaction_time_window: None,
-        };
-
-        self.writer
-            .write_edit_and_apply(
-                &self.wal,
-                &self.shared,
-                &self.manifest,
-                edit,
-                Some(self.max_memtable_id),
-            )
-            .await?;
-        self.wal.obsolete(self.flush_sequence).await
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use crate::memtable::AllocTracker;
-
-    #[test]
-    fn test_get_mutable_limitation() {
-        assert_eq!(7, get_mutable_limitation(Some(8)));
-        assert_eq!(8, get_mutable_limitation(Some(10)));
-        assert_eq!(56, get_mutable_limitation(Some(64)));
-        assert_eq!(0, get_mutable_limitation(None));
-    }
-
-    #[test]
-    fn test_strategy_global_disabled() {
-        let strategy = SizeBasedStrategy::new(None);
-        strategy.reserve_mem(1000);
-        assert_eq!(0, strategy.memory_used.load(Ordering::Relaxed));
-        assert_eq!(0, strategy.memory_active.load(Ordering::Relaxed));
-        strategy.schedule_free_mem(1000);
-        assert_eq!(0, strategy.memory_used.load(Ordering::Relaxed));
-        assert_eq!(0, strategy.memory_active.load(Ordering::Relaxed));
-        strategy.free_mem(1000);
-        assert_eq!(0, strategy.memory_used.load(Ordering::Relaxed));
-        assert_eq!(0, strategy.memory_active.load(Ordering::Relaxed));
-
-        let status = RegionStatus {
-            region_id: 1.into(),
-            bytes_mutable: 400,
-            write_buffer_size: 300,
-        };
-        assert_eq!(Some(FlushType::Region), strategy.should_flush(status));
-        let status = RegionStatus {
-            region_id: 1.into(),
-            bytes_mutable: 100,
-            write_buffer_size: 300,
-        };
-        assert_eq!(None, strategy.should_flush(status));
-    }
-
-    #[test]
-    fn test_strategy_over_mutable_limit() {
-        let strategy = SizeBasedStrategy::new(Some(1000));
-        strategy.reserve_mem(500);
-        let status = RegionStatus {
-            region_id: 1.into(),
-            bytes_mutable: 300,
-            write_buffer_size: 500,
-        };
-        assert_eq!(None, strategy.should_flush(status));
-        strategy.reserve_mem(400);
-
-        // Flush region.
-        let status = RegionStatus {
-            region_id: 1.into(),
-            bytes_mutable: 400,
-            write_buffer_size: 300,
-        };
-        assert_eq!(Some(FlushType::Region), strategy.should_flush(status));
-
-        // More than mutable limitation, Flush global.
-        let status = RegionStatus {
-            region_id: 1.into(),
-            bytes_mutable: 100,
-            write_buffer_size: 300,
-        };
-        assert_eq!(Some(FlushType::Engine), strategy.should_flush(status));
-
-        strategy.schedule_free_mem(500);
-        assert_eq!(None, strategy.should_flush(status));
-        assert_eq!(900, strategy.memory_used.load(Ordering::Relaxed));
-        assert_eq!(400, strategy.memory_active.load(Ordering::Relaxed));
-
-        strategy.free_mem(500);
-        assert_eq!(400, strategy.memory_used.load(Ordering::Relaxed));
-        assert_eq!(400, strategy.memory_active.load(Ordering::Relaxed));
-    }
-
-    #[test]
-    fn test_strategy_over_global() {
-        common_telemetry::init_default_ut_logging();
-
-        let strategy = SizeBasedStrategy::new(Some(1000));
-        strategy.reserve_mem(1100);
-        strategy.schedule_free_mem(200);
-        // More than global limit.
-        let status = RegionStatus {
-            region_id: 1.into(),
-            bytes_mutable: 100,
-            write_buffer_size: 300,
-        };
-        assert_eq!(Some(FlushType::Engine), strategy.should_flush(status));
-
-        // More than global limit, but mutable not enough (< 500).
-        strategy.schedule_free_mem(450);
-        let status = RegionStatus {
-            region_id: 1.into(),
-            bytes_mutable: 100,
-            write_buffer_size: 300,
-        };
-        assert_eq!(None, strategy.should_flush(status));
-        strategy.schedule_free_mem(100);
-        assert_eq!(None, strategy.should_flush(status));
-
-        // Now mutable is enough.
-        strategy.reserve_mem(150);
-        // We can flush again.
-        assert_eq!(Some(FlushType::Engine), strategy.should_flush(status));
-        strategy.reserve_mem(100);
-        assert_eq!(Some(FlushType::Engine), strategy.should_flush(status));
-    }
-
-    #[test]
-    fn test_alloc_tracker_without_strategy() {
-        let tracker = AllocTracker::new(None);
-        assert_eq!(0, tracker.bytes_allocated());
-        tracker.on_allocate(100);
-        assert_eq!(100, tracker.bytes_allocated());
-        tracker.on_allocate(200);
-        assert_eq!(300, tracker.bytes_allocated());
-
-        tracker.done_allocating();
-        assert_eq!(300, tracker.bytes_allocated());
-    }
-
-    #[test]
-    fn test_alloc_tracker_with_strategy() {
-        let strategy = Arc::new(SizeBasedStrategy::new(Some(1000)));
-        {
-            let tracker = AllocTracker::new(Some(strategy.clone() as FlushStrategyRef));
-
-            tracker.on_allocate(100);
-            assert_eq!(100, tracker.bytes_allocated());
-            assert_eq!(100, strategy.memory_used.load(Ordering::Relaxed));
-            assert_eq!(100, strategy.memory_active.load(Ordering::Relaxed));
-
-            for _ in 0..2 {
-                // Done allocating won't free the same memory multiple times.
-                tracker.done_allocating();
-                assert_eq!(100, strategy.memory_used.load(Ordering::Relaxed));
-                assert_eq!(0, strategy.memory_active.load(Ordering::Relaxed));
-            }
-        }
-
-        assert_eq!(0, strategy.memory_used.load(Ordering::Relaxed));
-        assert_eq!(0, strategy.memory_active.load(Ordering::Relaxed));
-    }
-
-    #[test]
-    fn test_alloc_tracker_without_done_allocating() {
-        let strategy = Arc::new(SizeBasedStrategy::new(Some(1000)));
-        {
-            let tracker = AllocTracker::new(Some(strategy.clone() as FlushStrategyRef));
-
-            tracker.on_allocate(100);
-            assert_eq!(100, tracker.bytes_allocated());
-            assert_eq!(100, strategy.memory_used.load(Ordering::Relaxed));
-            assert_eq!(100, strategy.memory_active.load(Ordering::Relaxed));
-        }
-
-        assert_eq!(0, strategy.memory_used.load(Ordering::Relaxed));
-        assert_eq!(0, strategy.memory_active.load(Ordering::Relaxed));
-    }
-}
--- a/src/storage/src/flush/picker.rs
+++ b/src/storage/src/flush/picker.rs
@@ -1,263 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-use std::time::Duration;
-
-use async_trait::async_trait;
-use common_telemetry::logging;
-use common_time::util;
-use store_api::logstore::LogStore;
-use store_api::storage::{FlushContext, FlushReason, Region};
-
-use crate::config::{DEFAULT_AUTO_FLUSH_INTERVAL, DEFAULT_PICKER_SCHEDULE_INTERVAL};
-use crate::region::RegionImpl;
-
-/// Config for [FlushPicker].
-pub struct PickerConfig {
-    /// Interval to schedule the picker.
-    pub schedule_interval: Duration,
-    /// Interval to auto flush a region if it has not flushed yet.
-    pub auto_flush_interval: Duration,
-}
-
-impl PickerConfig {
-    /// Returns the auto flush interval in millis or a default value
-    /// if overflow occurs.
-    fn auto_flush_interval_millis(&self) -> i64 {
-        self.auto_flush_interval
-            .as_millis()
-            .try_into()
-            .unwrap_or(DEFAULT_AUTO_FLUSH_INTERVAL.into())
-    }
-}
-
-impl Default for PickerConfig {
-    fn default() -> Self {
-        PickerConfig {
-            schedule_interval: Duration::from_millis(DEFAULT_PICKER_SCHEDULE_INTERVAL.into()),
-            auto_flush_interval: Duration::from_millis(DEFAULT_AUTO_FLUSH_INTERVAL.into()),
-        }
-    }
-}
-
-/// Flush task picker.
-#[derive(Debug, Clone)]
-pub struct FlushPicker {
-    /// Interval to flush a region automatically.
-    auto_flush_interval_millis: i64,
-}
-
-impl FlushPicker {
-    /// Returns a new FlushPicker.
-    pub fn new(config: PickerConfig) -> FlushPicker {
-        FlushPicker {
-            auto_flush_interval_millis: config.auto_flush_interval_millis(),
-        }
-    }
-
-    /// Picks regions and flushes them by interval.
-    ///
-    /// Returns the number of flushed regions.
-    pub async fn pick_by_interval<T: FlushItem>(&self, regions: &[T]) -> usize {
-        let now = util::current_time_millis();
-        // Flush regions by interval.
-        if let Some(earliest_flush_millis) = now.checked_sub(self.auto_flush_interval_millis) {
-            flush_regions_by_interval(regions, earliest_flush_millis).await
-        } else {
-            0
-        }
-    }
-
-    /// Picks and flushes regions when the write buffer is full.
-    pub async fn pick_by_write_buffer_full<T: FlushItem>(&self, regions: &[T]) {
-        // In such case, we pick the oldest region to flush. If this is not enough,
-        // the next time the region writer will trigger the picker again. Then we
-        // can pick another region to flush. The total memory will go down eventually.
-        let target = regions
-            .iter()
-            .filter(|region| region.mutable_memtable_usage() > 0)
-            .min_by_key(|region| region.last_flush_time());
-        if let Some(region) = target {
-            logging::debug!(
-                "Request flush for region {} due to global buffer is full",
-                region.item_id()
-            );
-
-            region.request_flush(FlushReason::GlobalBufferFull).await;
-        }
-    }
-}
-
-/// Item for picker to flush.
-#[async_trait]
-pub trait FlushItem {
-    /// Id of the item.
-    fn item_id(&self) -> u64;
-
-    /// Last flush time in millis.
-    fn last_flush_time(&self) -> i64;
-
-    /// Mutable memtable usage.
-    fn mutable_memtable_usage(&self) -> usize;
-
-    /// Requests the item to schedule a flush for specific `reason`.
-    ///
-    /// The flush job itself should run in background.
-    async fn request_flush(&self, reason: FlushReason);
-}
-
-#[async_trait]
-impl<S: LogStore> FlushItem for RegionImpl<S> {
-    fn item_id(&self) -> u64 {
-        self.id().into()
-    }
-
-    fn last_flush_time(&self) -> i64 {
-        self.last_flush_millis()
-    }
-
-    fn mutable_memtable_usage(&self) -> usize {
-        let current = self.version_control().current();
-        let memtables = current.memtables();
-        memtables.mutable_bytes_allocated()
-    }
-
-    async fn request_flush(&self, reason: FlushReason) {
-        let ctx = FlushContext {
-            wait: false,
-            reason,
-            ..Default::default()
-        };
-
-        if let Err(e) = self.flush(&ctx).await {
-            logging::error!(e; "Failed to flush region {}", self.id());
-        }
-    }
-}
-
-/// Auto flush regions based on last flush time.
-///
-/// Returns the number of flushed regions.
-async fn flush_regions_by_interval<T: FlushItem>(
-    regions: &[T],
-    earliest_flush_millis: i64,
-) -> usize {
-    let mut flushed = 0;
-    for region in regions {
-        if region.last_flush_time() < earliest_flush_millis {
-            logging::debug!(
-                "Auto flush region {} due to last flush time ({} < {})",
-                region.item_id(),
-                region.last_flush_time(),
-                earliest_flush_millis,
-            );
-
-            flushed += 1;
-            region.request_flush(FlushReason::Periodically).await;
-        }
-    }
-
-    flushed
-}
-
-#[cfg(test)]
-mod tests {
-    use std::sync::Mutex;
-
-    use super::*;
-
-    struct MockItem {
-        id: u64,
-        last_flush_time: i64,
-        usage: usize,
-        flush_reason: Mutex<Option<FlushReason>>,
-    }
-
-    impl MockItem {
-        fn new(id: u64, last_flush_time: i64, usage: usize) -> MockItem {
-            MockItem {
-                id,
-                last_flush_time,
-                usage,
-                flush_reason: Mutex::new(None),
-            }
-        }
-
-        fn flush_reason(&self) -> Option<FlushReason> {
-            *self.flush_reason.lock().unwrap()
-        }
-    }
-
-    #[async_trait]
-    impl FlushItem for MockItem {
-        fn item_id(&self) -> u64 {
-            self.id
-        }
-
-        fn last_flush_time(&self) -> i64 {
-            self.last_flush_time
-        }
-
-        fn mutable_memtable_usage(&self) -> usize {
-            self.usage
-        }
-
-        async fn request_flush(&self, reason: FlushReason) {
-            let mut flush_reason = self.flush_reason.lock().unwrap();
-            *flush_reason = Some(reason);
-        }
-    }
-
-    #[tokio::test]
-    async fn test_pick_by_interval() {
-        let regions = [
-            MockItem::new(0, util::current_time_millis(), 1),
-            MockItem::new(1, util::current_time_millis() - 60 * 1000, 1),
-        ];
-        let picker = FlushPicker::new(PickerConfig {
-            // schedule_interval is unused in this test.
-            schedule_interval: Duration::from_millis(10),
-            auto_flush_interval: Duration::from_millis(30 * 1000),
-        });
-        let flushed = picker.pick_by_interval(&regions).await;
-        assert_eq!(1, flushed);
-        assert!(regions[0].flush_reason().is_none());
-        assert_eq!(Some(FlushReason::Periodically), regions[1].flush_reason());
-    }
-
-    #[tokio::test]
-    async fn test_pick_by_buffer_full() {
-        let regions = [
-            MockItem::new(0, util::current_time_millis(), 10),
-            MockItem::new(1, util::current_time_millis() - 60 * 1000, 0),
-            MockItem::new(1, util::current_time_millis() - 60 * 1000, 10),
-        ];
-        let picker = FlushPicker::new(PickerConfig {
-            schedule_interval: Duration::from_millis(10),
-            auto_flush_interval: Duration::from_millis(30 * 1000),
-        });
-        picker.pick_by_write_buffer_full(&regions).await;
-        assert!(regions[0].flush_reason().is_none());
-        assert!(regions[1].flush_reason().is_none());
-        assert_eq!(
-            Some(FlushReason::GlobalBufferFull),
-            regions[2].flush_reason()
-        );
-
-        // No target.
-        let regions = [MockItem::new(1, util::current_time_millis(), 0)];
-        picker.pick_by_write_buffer_full(&regions).await;
-        assert!(regions[0].flush_reason().is_none());
-    }
-}
--- a/src/storage/src/flush/scheduler.rs
+++ b/src/storage/src/flush/scheduler.rs
@@ -1,378 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-use std::sync::Arc;
-use std::time::Duration;
-
-use async_trait::async_trait;
-use common_base::readable_size::ReadableSize;
-use common_runtime::{RepeatedTask, TaskFunction};
-use common_telemetry::logging;
-use snafu::{ensure, ResultExt};
-use store_api::logstore::LogStore;
-use store_api::storage::{RegionId, SequenceNumber};
-use tokio::sync::oneshot::{Receiver, Sender};
-use tokio::sync::{oneshot, Notify};
-
-use crate::compaction::{CompactionPickerRef, CompactionRequestImpl, CompactionSchedulerRef};
-use crate::config::EngineConfig;
-use crate::engine::RegionMap;
-use crate::error::{
-    DuplicateFlushSnafu, Error, Result, StartPickTaskSnafu, StopPickTaskSnafu, WaitFlushSnafu,
-};
-use crate::flush::{FlushJob, FlushPicker, PickerConfig};
-use crate::manifest::region::RegionManifest;
-use crate::memtable::{MemtableId, MemtableRef};
-use crate::metrics::FLUSH_ERRORS_TOTAL;
-use crate::region;
-use crate::region::{RegionWriterRef, SharedDataRef};
-use crate::scheduler::rate_limit::BoxedRateLimitToken;
-use crate::scheduler::{Handler, LocalScheduler, Request, Scheduler, SchedulerConfig};
-use crate::sst::AccessLayerRef;
-use crate::wal::Wal;
-
-/// Key for [FlushRequest].
-#[derive(Debug, Clone, Hash, PartialEq, Eq)]
-pub enum FlushKey {
-    Engine,
-    Region(RegionId, SequenceNumber),
-}
-
-/// Flush request.
-pub enum FlushRequest<S: LogStore> {
-    /// Flush the engine.
-    Engine,
-    /// Flush a region.
-    Region {
-        /// Region flush request.
-        req: FlushRegionRequest<S>,
-        /// Flush result sender.
-        sender: Sender<Result<()>>,
-    },
-}
-
-impl<S: LogStore> Request for FlushRequest<S> {
-    type Key = FlushKey;
-
-    #[inline]
-    fn key(&self) -> FlushKey {
-        match &self {
-            FlushRequest::Engine => FlushKey::Engine,
-            FlushRequest::Region { req, .. } => {
-                FlushKey::Region(req.shared.id(), req.flush_sequence)
-            }
-        }
-    }
-
-    fn complete(self, result: Result<()>) {
-        if let FlushRequest::Region { sender, .. } = self {
-            let _ = sender.send(result);
-        }
-    }
-}
-
-/// Region flush request.
-pub struct FlushRegionRequest<S: LogStore> {
-    /// Max memtable id in these memtables,
-    /// used to remove immutable memtables in current version.
-    pub max_memtable_id: MemtableId,
-    /// Memtables to be flushed.
-    pub memtables: Vec<MemtableRef>,
-    /// Last sequence of data to be flushed.
-    pub flush_sequence: SequenceNumber,
-    /// Shared data of region to be flushed.
-    pub shared: SharedDataRef,
-    /// Sst access layer of the region.
-    pub sst_layer: AccessLayerRef,
-    /// Region writer, used to persist log entry that points to the latest manifest file.
-    pub writer: RegionWriterRef<S>,
-    /// Region write-ahead logging, used to write data/meta to the log file.
-    pub wal: Wal<S>,
-    /// Region manifest service, used to persist metadata.
-    pub manifest: RegionManifest,
-    /// Storage engine config
-    pub engine_config: Arc<EngineConfig>,
-
-    // Compaction related options:
-    /// TTL of the region.
-    pub ttl: Option<Duration>,
-    /// Time window for compaction.
-    pub compaction_time_window: Option<i64>,
-    pub compaction_picker: CompactionPickerRef<S>,
-}
-
-impl<S: LogStore> FlushRegionRequest<S> {
-    #[inline]
-    fn region_id(&self) -> RegionId {
-        self.shared.id()
-    }
-}
-
-impl<S: LogStore> From<&FlushRegionRequest<S>> for FlushJob<S> {
-    fn from(req: &FlushRegionRequest<S>) -> FlushJob<S> {
-        FlushJob {
-            max_memtable_id: req.max_memtable_id,
-            memtables: req.memtables.clone(),
-            flush_sequence: req.flush_sequence,
-            shared: req.shared.clone(),
-            sst_layer: req.sst_layer.clone(),
-            writer: req.writer.clone(),
-            wal: req.wal.clone(),
-            manifest: req.manifest.clone(),
-            engine_config: req.engine_config.clone(),
-        }
-    }
-}
-
-impl<S: LogStore> From<&FlushRegionRequest<S>> for CompactionRequestImpl<S> {
-    fn from(req: &FlushRegionRequest<S>) -> CompactionRequestImpl<S> {
-        CompactionRequestImpl {
-            region_id: req.region_id(),
-            sst_layer: req.sst_layer.clone(),
-            writer: req.writer.clone(),
-            shared: req.shared.clone(),
-            manifest: req.manifest.clone(),
-            wal: req.wal.clone(),
-            ttl: req.ttl,
-            compaction_time_window: req.compaction_time_window,
-            sender: None,
-            picker: req.compaction_picker.clone(),
-            sst_write_buffer_size: ReadableSize::mb(8), // deprecated usage
-            // compaction triggered by flush always reschedules
-            reschedule_on_finish: true,
-        }
-    }
-}
-
-/// A handle to get the flush result.
-#[derive(Debug)]
-pub struct FlushHandle {
-    region_id: RegionId,
-    receiver: Receiver<Result<()>>,
-}
-
-impl FlushHandle {
-    /// Waits until the flush job is finished.
-    pub async fn wait(self) -> Result<()> {
-        self.receiver.await.context(WaitFlushSnafu {
-            region_id: self.region_id,
-        })?
-    }
-}
-
-/// Flush scheduler.
-pub struct FlushScheduler<S: LogStore> {
-    /// Flush task scheduler.
-    scheduler: LocalScheduler<FlushRequest<S>>,
-    /// Auto flush task.
-    auto_flush_task: RepeatedTask<Error>,
-    #[cfg(test)]
-    pending_tasks: Arc<tokio::sync::RwLock<Vec<tokio::task::JoinHandle<()>>>>,
-}
-
-pub type FlushSchedulerRef<S> = Arc<FlushScheduler<S>>;
-
-impl<S: LogStore> FlushScheduler<S> {
-    /// Returns a new [FlushScheduler].
-    pub fn new(
-        config: SchedulerConfig,
-        compaction_scheduler: CompactionSchedulerRef<S>,
-        regions: Arc<RegionMap<S>>,
-        picker_config: PickerConfig,
-    ) -> Result<Self> {
-        let task_interval = picker_config.schedule_interval;
-        let picker = FlushPicker::new(picker_config);
-        // Now we just clone the picker since we don't need to share states and
-        // the clone of picker is cheap.
-        let task_fn = AutoFlushFunction {
-            regions: regions.clone(),
-            picker: picker.clone(),
-        };
-        let auto_flush_task = RepeatedTask::new(task_interval, Box::new(task_fn));
-        auto_flush_task
-            .start(common_runtime::bg_runtime())
-            .context(StartPickTaskSnafu)?;
-        #[cfg(test)]
-        let pending_tasks = Arc::new(tokio::sync::RwLock::new(vec![]));
-        let handler = FlushHandler {
-            compaction_scheduler,
-            regions,
-            picker,
-            #[cfg(test)]
-            pending_tasks: pending_tasks.clone(),
-        };
-
-        Ok(Self {
-            scheduler: LocalScheduler::new(config, handler),
-            auto_flush_task,
-            #[cfg(test)]
-            pending_tasks,
-        })
-    }
-
-    /// Schedules a region flush request and return the handle to the flush task.
-    pub fn schedule_region_flush(&self, req: FlushRegionRequest<S>) -> Result<FlushHandle> {
-        let region_id = req.region_id();
-        let sequence = req.flush_sequence;
-        let (sender, receiver) = oneshot::channel();
-
-        let scheduled = self
-            .scheduler
-            .schedule(FlushRequest::Region { req, sender })?;
-        // Normally we should not have duplicate flush request.
-        ensure!(
-            scheduled,
-            DuplicateFlushSnafu {
-                region_id,
-                sequence,
-            }
-        );
-
-        Ok(FlushHandle {
-            region_id,
-            receiver,
-        })
-    }
-
-    /// Schedules a engine flush request.
-    pub fn schedule_engine_flush(&self) -> Result<()> {
-        let _ = self.scheduler.schedule(FlushRequest::Engine)?;
-        Ok(())
-    }
-
-    /// Stop the scheduler.
-    pub async fn stop(&self) -> Result<()> {
-        self.auto_flush_task
-            .stop()
-            .await
-            .context(StopPickTaskSnafu)?;
-        self.scheduler.stop(true).await?;
-
-        #[cfg(test)]
-        let _ = futures::future::join_all(self.pending_tasks.write().await.drain(..)).await;
-
-        Ok(())
-    }
-}
-
-struct FlushHandler<S: LogStore> {
-    compaction_scheduler: CompactionSchedulerRef<S>,
-    regions: Arc<RegionMap<S>>,
-    picker: FlushPicker,
-    #[cfg(test)]
-    pending_tasks: Arc<tokio::sync::RwLock<Vec<tokio::task::JoinHandle<()>>>>,
-}
-
-#[async_trait::async_trait]
-impl<S: LogStore> Handler for FlushHandler<S> {
-    type Request = FlushRequest<S>;
-
-    async fn handle_request(
-        &self,
-        req: FlushRequest<S>,
-        token: BoxedRateLimitToken,
-        finish_notifier: Arc<Notify>,
-    ) -> Result<()> {
-        let compaction_scheduler = self.compaction_scheduler.clone();
-        let region_map = self.regions.clone();
-        let picker = self.picker.clone();
-        let _handle = common_runtime::spawn_bg(async move {
-            match req {
-                FlushRequest::Engine => {
-                    let regions = region_map.list_regions();
-                    picker.pick_by_write_buffer_full(&regions).await;
-                }
-                FlushRequest::Region { req, sender } => {
-                    execute_flush_region(req, sender, compaction_scheduler).await;
-                }
-            }
-
-            // releases rate limit token
-            token.try_release();
-            // notify scheduler to schedule next task when current task finishes.
-            finish_notifier.notify_one();
-        });
-
-        #[cfg(test)]
-        self.pending_tasks.write().await.push(_handle);
-        Ok(())
-    }
-}
-
-async fn execute_flush_region<S: LogStore>(
-    req: FlushRegionRequest<S>,
-    sender: Sender<Result<()>>,
-    compaction_scheduler: CompactionSchedulerRef<S>,
-) {
-    let mut flush_job = FlushJob::from(&req);
-
-    if let Err(e) = flush_job.run().await {
-        logging::error!(e; "Failed to flush region {}", req.region_id());
-
-        FLUSH_ERRORS_TOTAL.inc();
-
-        FlushRequest::Region { req, sender }.complete(Err(e));
-    } else {
-        logging::debug!("Successfully flush region: {}", req.region_id());
-
-        // Update last flush time.
-        req.shared.update_flush_millis();
-
-        let compaction_request = CompactionRequestImpl::from(&req);
-        let max_files_in_l0 = req.engine_config.max_files_in_l0;
-        let shared_data = req.shared.clone();
-
-        let level0_file_num = shared_data
-            .version_control
-            .current()
-            .ssts()
-            .level(0)
-            .file_num();
-        if level0_file_num <= max_files_in_l0 {
-            logging::debug!(
-                "No enough SST files in level 0 (threshold: {}), skip compaction",
-                max_files_in_l0
-            );
-        } else {
-            // If flush is success, schedule a compaction request for this region.
-            let _ =
-                region::schedule_compaction(shared_data, compaction_scheduler, compaction_request);
-        }
-
-        // Complete the request.
-        FlushRequest::Region { req, sender }.complete(Ok(()));
-    }
-}
-
-/// Task function to pick regions to flush.
-struct AutoFlushFunction<S: LogStore> {
-    /// Regions of the engine.
-    regions: Arc<RegionMap<S>>,
-    picker: FlushPicker,
-}
-
-#[async_trait]
-impl<S: LogStore> TaskFunction<Error> for AutoFlushFunction<S> {
-    async fn call(&mut self) -> Result<()> {
-        // Get all regions.
-        let regions = self.regions.list_regions();
-        let _ = self.picker.pick_by_interval(&regions).await;
-
-        Ok(())
-    }
-
-    fn name(&self) -> &str {
-        "FlushPicker-pick-task"
-    }
-}
--- a/src/storage/src/lib.rs
+++ b/src/storage/src/lib.rs
@@ -1,49 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-//! Storage engine implementation.
-
-#![feature(let_chains)]
-
-mod chunk;
-pub mod codec;
-pub mod compaction;
-pub mod config;
-mod engine;
-pub mod error;
-mod flush;
-pub mod manifest;
-pub mod memtable;
-pub mod metadata;
-pub mod proto;
-pub mod read;
-pub mod region;
-pub mod scheduler;
-pub mod schema;
-mod snapshot;
-pub mod sst;
-mod sync;
-#[cfg(test)]
-mod test_util;
-mod version;
-mod wal;
-pub mod write_batch;
-
-pub use engine::EngineImpl;
-mod file_purger;
-mod metrics;
-mod window_infer;
-
-pub use sst::parquet::ParquetWriter;
-pub use sst::Source;
--- a/src/storage/src/manifest.rs
+++ b/src/storage/src/manifest.rs
@@ -1,26 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-//! manifest storage
-pub(crate) mod action;
-pub mod checkpoint;
-pub mod helper;
-mod impl_;
-pub mod region;
-pub(crate) mod storage;
-#[cfg(test)]
-pub mod test_utils;
-
-pub use self::impl_::*;
-pub use self::storage::manifest_compress_type;
--- a/src/storage/src/manifest/action.rs
+++ b/src/storage/src/manifest/action.rs
@@ -1,443 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-use std::collections::HashMap;
-use std::io::{BufRead, BufReader};
-
-use serde::{Deserialize, Serialize};
-use serde_json as json;
-use snafu::{ensure, OptionExt, ResultExt};
-use store_api::manifest::action::{ProtocolAction, ProtocolVersion, VersionHeader};
-use store_api::manifest::{Checkpoint, ManifestVersion, MetaAction};
-use store_api::storage::{RegionId, SequenceNumber};
-
-use crate::error::{
-    self, DecodeJsonSnafu, DecodeMetaActionListSnafu, ManifestProtocolForbidReadSnafu,
-    ReadlineSnafu, Result,
-};
-use crate::manifest::helper;
-use crate::metadata::{ColumnFamilyMetadata, ColumnMetadata, VersionNumber};
-use crate::sst::{FileId, FileMeta};
-
-/// Minimal data that could be used to persist and recover [RegionMetadata](crate::metadata::RegionMetadata).
-#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Default)]
-pub struct RawRegionMetadata {
-    pub id: RegionId,
-    pub name: String,
-    pub columns: RawColumnsMetadata,
-    pub column_families: RawColumnFamiliesMetadata,
-    pub version: VersionNumber,
-}
-
-/// Minimal data that could be used to persist and recover [ColumnsMetadata](crate::metadata::ColumnsMetadata).
-#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Default)]
-pub struct RawColumnsMetadata {
-    pub columns: Vec<ColumnMetadata>,
-    pub row_key_end: usize,
-    pub timestamp_key_index: usize,
-    pub user_column_end: usize,
-}
-
-/// Minimal data that could be used to persist and recover [ColumnFamiliesMetadata](crate::metadata::ColumnFamiliesMetadata).
-#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Default)]
-pub struct RawColumnFamiliesMetadata {
-    pub column_families: Vec<ColumnFamilyMetadata>,
-}
-
-#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)]
-pub struct RegionChange {
-    /// The committed sequence of the region when this change happens. So the
-    /// data with sequence **greater than** this sequence would use the new
-    /// metadata.
-    pub committed_sequence: SequenceNumber,
-    /// The metadata after changed.
-    pub metadata: RawRegionMetadata,
-}
-
-#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)]
-pub struct RegionRemove {
-    pub region_id: RegionId,
-}
-
-#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)]
-pub struct RegionEdit {
-    pub region_version: VersionNumber,
-    pub flushed_sequence: Option<SequenceNumber>,
-    pub files_to_add: Vec<FileMeta>,
-    pub files_to_remove: Vec<FileMeta>,
-    pub compaction_time_window: Option<i64>,
-}
-
-#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)]
-pub struct RegionTruncate {
-    pub region_id: RegionId,
-    pub committed_sequence: SequenceNumber,
-}
-
-/// The region version checkpoint
-#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)]
-pub struct RegionVersion {
-    pub manifest_version: ManifestVersion,
-    pub flushed_sequence: Option<SequenceNumber>,
-    pub files: HashMap<FileId, FileMeta>,
-}
-
-/// The region manifest data checkpoint
-#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq, Default)]
-pub struct RegionManifestData {
-    pub committed_sequence: SequenceNumber,
-    pub metadata: RawRegionMetadata,
-    pub version: Option<RegionVersion>,
-}
-
-#[derive(Debug, Default)]
-pub struct RegionManifestDataBuilder {
-    committed_sequence: SequenceNumber,
-    metadata: RawRegionMetadata,
-    version: Option<RegionVersion>,
-}
-
-impl RegionManifestDataBuilder {
-    pub fn with_checkpoint(checkpoint: Option<RegionManifestData>) -> Self {
-        if let Some(s) = checkpoint {
-            Self {
-                metadata: s.metadata,
-                version: s.version,
-                committed_sequence: s.committed_sequence,
-            }
-        } else {
-            Default::default()
-        }
-    }
-
-    pub fn apply_change(&mut self, change: RegionChange) {
-        self.metadata = change.metadata;
-        self.committed_sequence = change.committed_sequence;
-    }
-
-    pub fn apply_edit(&mut self, manifest_version: ManifestVersion, edit: RegionEdit) {
-        if let Some(version) = &mut self.version {
-            version.manifest_version = manifest_version;
-            version.flushed_sequence = edit.flushed_sequence;
-            for file in edit.files_to_add {
-                let _ = version.files.insert(file.file_id, file);
-            }
-            for file in edit.files_to_remove {
-                let _ = version.files.remove(&file.file_id);
-            }
-        } else {
-            self.version = Some(RegionVersion {
-                manifest_version,
-                flushed_sequence: edit.flushed_sequence,
-                files: edit
-                    .files_to_add
-                    .into_iter()
-                    .map(|f| (f.file_id, f))
-                    .collect(),
-            });
-        }
-    }
-    pub fn build(self) -> RegionManifestData {
-        RegionManifestData {
-            metadata: self.metadata,
-            version: self.version,
-            committed_sequence: self.committed_sequence,
-        }
-    }
-}
-
-// The checkpoint of region manifest, generated by checkpoint.
-#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)]
-pub struct RegionCheckpoint {
-    /// The snasphot protocol
-    pub protocol: ProtocolAction,
-    /// The last manifest version that this checkpoint compacts(inclusive).
-    pub last_version: ManifestVersion,
-    // The number of manifest actions that this checkpoint compacts.
-    pub compacted_actions: usize,
-    // The checkpoint data
-    pub checkpoint: Option<RegionManifestData>,
-}
-
-impl Checkpoint for RegionCheckpoint {
-    type Error = error::Error;
-
-    fn set_protocol(&mut self, action: ProtocolAction) {
-        self.protocol = action;
-    }
-
-    fn last_version(&self) -> ManifestVersion {
-        self.last_version
-    }
-
-    fn encode(&self) -> Result<Vec<u8>> {
-        helper::encode_checkpoint(self)
-    }
-
-    fn decode(bs: &[u8], reader_version: ProtocolVersion) -> Result<Self> {
-        helper::decode_checkpoint(bs, reader_version)
-    }
-}
-
-#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)]
-pub enum RegionMetaAction {
-    Protocol(ProtocolAction),
-    Change(RegionChange),
-    Remove(RegionRemove),
-    Edit(RegionEdit),
-    Truncate(RegionTruncate),
-}
-
-#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)]
-pub struct RegionMetaActionList {
-    pub actions: Vec<RegionMetaAction>,
-    pub prev_version: ManifestVersion,
-}
-
-impl RegionMetaActionList {
-    pub fn with_action(action: RegionMetaAction) -> Self {
-        Self {
-            actions: vec![action],
-            prev_version: 0,
-        }
-    }
-
-    pub fn new(actions: Vec<RegionMetaAction>) -> Self {
-        Self {
-            actions,
-            prev_version: 0,
-        }
-    }
-}
-
-impl MetaAction for RegionMetaActionList {
-    type Error = error::Error;
-
-    fn set_protocol(&mut self, action: ProtocolAction) {
-        // The protocol action should be the first action in action list by convention.
-        self.actions.insert(0, RegionMetaAction::Protocol(action));
-    }
-
-    fn set_prev_version(&mut self, version: ManifestVersion) {
-        self.prev_version = version;
-    }
-
-    /// Encode self into json in the form of string lines, starts with prev_version and then action json list.
-    fn encode(&self) -> Result<Vec<u8>> {
-        helper::encode_actions(self.prev_version, &self.actions)
-    }
-
-    fn decode(
-        bs: &[u8],
-        reader_version: ProtocolVersion,
-    ) -> Result<(Self, Option<ProtocolAction>)> {
-        let mut lines = BufReader::new(bs).lines();
-
-        let mut action_list = RegionMetaActionList {
-            actions: Vec::default(),
-            prev_version: 0,
-        };
-
-        {
-            let first_line = lines
-                .next()
-                .with_context(|| DecodeMetaActionListSnafu {
-                    msg: format!(
-                        "Invalid content in manifest: {}",
-                        std::str::from_utf8(bs).unwrap_or("**invalid bytes**")
-                    ),
-                })?
-                .context(ReadlineSnafu)?;
-
-            // Decode prev_version
-            let v: VersionHeader = json::from_str(&first_line).context(DecodeJsonSnafu)?;
-            action_list.prev_version = v.prev_version;
-        }
-
-        // Decode actions
-        let mut protocol_action = None;
-        let mut actions = Vec::default();
-        for line in lines {
-            let line = &line.context(ReadlineSnafu)?;
-            let action: RegionMetaAction = json::from_str(line).context(DecodeJsonSnafu)?;
-
-            if let RegionMetaAction::Protocol(p) = &action {
-                ensure!(
-                    p.is_readable(reader_version),
-                    ManifestProtocolForbidReadSnafu {
-                        min_version: p.min_reader_version,
-                        supported_version: reader_version,
-                    }
-                );
-                protocol_action = Some(p.clone());
-            }
-
-            actions.push(action);
-        }
-        action_list.actions = actions;
-
-        Ok((action_list, protocol_action))
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use common_telemetry::logging;
-    use datatypes::type_id::LogicalTypeId;
-
-    use super::*;
-    use crate::manifest::test_utils;
-    use crate::metadata::RegionMetadata;
-    use crate::sst::FileId;
-    use crate::test_util::descriptor_util::RegionDescBuilder;
-
-    #[test]
-    fn test_encode_decode_action_list() {
-        common_telemetry::init_default_ut_logging();
-        let mut protocol = ProtocolAction::new();
-        protocol.min_reader_version = 1;
-        let mut action_list = RegionMetaActionList::new(vec![
-            RegionMetaAction::Protocol(protocol.clone()),
-            RegionMetaAction::Edit(test_utils::build_region_edit(
-                99,
-                &[FileId::random(), FileId::random()],
-                &[FileId::random()],
-            )),
-        ]);
-        action_list.set_prev_version(3);
-
-        let bs = action_list.encode().unwrap();
-        // {"prev_version":3}
-        // {"Protocol":{"min_reader_version":1,"min_writer_version":0}}
-        // {"Edit":{"region_version":0,"flush_sequence":99,"files_to_add":[{"file_name":"test1","level":1},{"file_name":"test2","level":2}],"files_to_remove":[{"file_name":"test0","level":0}]}}
-
-        logging::debug!(
-            "Encoded action list: \r\n{}",
-            String::from_utf8(bs.clone()).unwrap()
-        );
-
-        let e = RegionMetaActionList::decode(&bs, 0);
-        assert!(e.is_err());
-        assert_eq!(
-            "Manifest protocol forbid to read, min_version: 1, supported_version: 0",
-            format!("{}", e.err().unwrap())
-        );
-
-        let (decode_list, p) = RegionMetaActionList::decode(&bs, 1).unwrap();
-        assert_eq!(decode_list, action_list);
-        assert_eq!(p.unwrap(), protocol);
-    }
-
-    // These tests are used to ensure backward compatibility of manifest files.
-    // DO NOT modify the serialized string when they fail, check if your
-    // modification to manifest-related structs is compatible with older manifests.
-    #[test]
-    fn test_region_manifest_compatibility() {
-        let region_edit = r#"{"region_version":0,"flushed_sequence":null,"files_to_add":[{"region_id":4402341478400,"file_name":"4b220a70-2b03-4641-9687-b65d94641208.parquet","time_range":[{"value":1451609210000,"unit":"Millisecond"},{"value":1451609520000,"unit":"Millisecond"}],"level":1}],"files_to_remove":[{"region_id":4402341478400,"file_name":"34b6ebb9-b8a5-4a4b-b744-56f67defad02.parquet","time_range":[{"value":1451609210000,"unit":"Millisecond"},{"value":1451609520000,"unit":"Millisecond"}],"level":0}]}"#;
-        let _ = serde_json::from_str::<RegionEdit>(region_edit).unwrap();
-
-        let region_change = r#" {"committed_sequence":42,"metadata":{"id":0,"name":"region-0","columns":{"columns":[{"cf_id":0,"desc":{"id":2,"name":"k1","data_type":{"Int32":{}},"is_nullable":false,"is_time_index":false,"default_constraint":null,"comment":""}},{"cf_id":0,"desc":{"id":1,"name":"timestamp","data_type":{"Timestamp":{"Millisecond":null}},"is_nullable":false,"is_time_index":true,"default_constraint":null,"comment":""}},{"cf_id":1,"desc":{"id":3,"name":"v1","data_type":{"Float32":{}},"is_nullable":true,"is_time_index":false,"default_constraint":null,"comment":""}},{"cf_id":1,"desc":{"id":2147483649,"name":"__sequence","data_type":{"UInt64":{}},"is_nullable":false,"is_time_index":false,"default_constraint":null,"comment":""}},{"cf_id":1,"desc":{"id":2147483650,"name":"__op_type","data_type":{"UInt8":{}},"is_nullable":false,"is_time_index":false,"default_constraint":null,"comment":""}}],"row_key_end":2,"timestamp_key_index":1,"enable_version_column":false,"user_column_end":3},"column_families":{"column_families":[{"name":"default","cf_id":1,"column_index_start":2,"column_index_end":3}]},"version":0}}"#;
-        let _ = serde_json::from_str::<RegionChange>(region_change).unwrap();
-
-        let region_remove = r#"{"region_id":42}"#;
-        let _ = serde_json::from_str::<RegionRemove>(region_remove).unwrap();
-
-        let protocol_action = r#"{"min_reader_version":1,"min_writer_version":2}"#;
-        let _ = serde_json::from_str::<ProtocolAction>(protocol_action).unwrap();
-    }
-
-    fn mock_file_meta() -> FileMeta {
-        FileMeta {
-            region_id: 0.into(),
-            file_id: FileId::random(),
-            time_range: None,
-            level: 0,
-            file_size: 1024,
-        }
-    }
-
-    #[test]
-    fn test_region_manifest_builder() {
-        let desc = RegionDescBuilder::new("test_region_manifest_builder")
-            .push_field_column(("v0", LogicalTypeId::Int64, true))
-            .build();
-        let region_metadata: RegionMetadata = desc.try_into().unwrap();
-
-        let mut builder = RegionManifestDataBuilder::with_checkpoint(None);
-
-        builder.apply_change(RegionChange {
-            committed_sequence: 42,
-            metadata: RawRegionMetadata::from(&region_metadata),
-        });
-        let files = vec![mock_file_meta(), mock_file_meta()];
-        builder.apply_edit(
-            84,
-            RegionEdit {
-                region_version: 0,
-                flushed_sequence: Some(99),
-                files_to_add: files.clone(),
-                files_to_remove: vec![],
-                compaction_time_window: None,
-            },
-        );
-        builder.apply_edit(
-            85,
-            RegionEdit {
-                region_version: 0,
-                flushed_sequence: Some(100),
-                files_to_add: vec![],
-                files_to_remove: vec![files[0].clone()],
-                compaction_time_window: None,
-            },
-        );
-
-        let manifest = builder.build();
-        assert_eq!(manifest.metadata, RawRegionMetadata::from(&region_metadata));
-        assert_eq!(manifest.committed_sequence, 42);
-        assert_eq!(
-            manifest.version,
-            Some(RegionVersion {
-                manifest_version: 85,
-                flushed_sequence: Some(100),
-                files: files[1..].iter().map(|f| (f.file_id, f.clone())).collect(),
-            })
-        );
-    }
-
-    #[test]
-    fn test_encode_decode_region_checkpoint() {
-        let region_checkpoint = RegionCheckpoint {
-            protocol: ProtocolAction::default(),
-            last_version: 42,
-            compacted_actions: 10,
-            checkpoint: Some(RegionManifestData {
-                committed_sequence: 100,
-                metadata: RawRegionMetadata::default(),
-                version: Some(RegionVersion {
-                    manifest_version: 84,
-                    flushed_sequence: Some(99),
-                    files: vec![mock_file_meta(), mock_file_meta()]
-                        .into_iter()
-                        .map(|f| (f.file_id, f))
-                        .collect(),
-                }),
-            }),
-        };
-
-        let bytes = region_checkpoint.encode().unwrap();
-        assert!(!bytes.is_empty());
-        let decoded_checkpoint = RegionCheckpoint::decode(&bytes, 0).unwrap();
-        assert_eq!(region_checkpoint, decoded_checkpoint);
-    }
-}
--- a/src/storage/src/manifest/checkpoint.rs
+++ b/src/storage/src/manifest/checkpoint.rs
@@ -1,35 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-use std::any::Any;
-
-use async_trait::async_trait;
-use store_api::manifest::{Checkpoint, MetaAction};
-
-use crate::error::{Error, Result};
-use crate::manifest::ManifestImpl;
-
-#[async_trait]
-pub trait Checkpointer: Send + Sync + std::fmt::Debug {
-    type Checkpoint: Checkpoint<Error = Error>;
-    type MetaAction: MetaAction<Error = Error>;
-
-    /// Try to create a checkpoint, return the checkpoint if successes.
-    async fn do_checkpoint(
-        &self,
-        manifest: &ManifestImpl<Self::Checkpoint, Self::MetaAction>,
-    ) -> Result<Option<Self::Checkpoint>>;
-
-    fn as_any(&self) -> &dyn Any;
-}
--- a/src/storage/src/manifest/helper.rs
+++ b/src/storage/src/manifest/helper.rs
@@ -1,69 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-use std::io::Write;
-
-use serde::Serialize;
-use serde_json::to_writer;
-use snafu::{ensure, ResultExt};
-use store_api::manifest::action::{ProtocolVersion, VersionHeader};
-use store_api::manifest::ManifestVersion;
-
-use crate::error::{
-    DecodeJsonSnafu, EncodeJsonSnafu, ManifestProtocolForbidReadSnafu, Result, Utf8Snafu,
-};
-use crate::manifest::action::RegionCheckpoint;
-
-pub const NEWLINE: &[u8] = b"\n";
-
-pub fn encode_actions<T: Serialize>(
-    prev_version: ManifestVersion,
-    actions: &[T],
-) -> Result<Vec<u8>> {
-    let mut bytes = Vec::default();
-    {
-        // Encode prev_version
-        let v = VersionHeader { prev_version };
-
-        to_writer(&mut bytes, &v).context(EncodeJsonSnafu)?;
-        // unwrap is fine here, because we write into a buffer.
-        bytes.write_all(NEWLINE).unwrap();
-    }
-
-    for action in actions {
-        to_writer(&mut bytes, action).context(EncodeJsonSnafu)?;
-        bytes.write_all(NEWLINE).unwrap();
-    }
-
-    Ok(bytes)
-}
-
-pub fn encode_checkpoint(snasphot: &RegionCheckpoint) -> Result<Vec<u8>> {
-    let s = serde_json::to_string(snasphot).context(EncodeJsonSnafu)?;
-    Ok(s.into_bytes())
-}
-
-pub fn decode_checkpoint(bs: &[u8], reader_version: ProtocolVersion) -> Result<RegionCheckpoint> {
-    let s = std::str::from_utf8(bs).context(Utf8Snafu)?;
-    let checkpoint: RegionCheckpoint = serde_json::from_str(s).context(DecodeJsonSnafu)?;
-    ensure!(
-        checkpoint.protocol.is_readable(reader_version),
-        ManifestProtocolForbidReadSnafu {
-            min_version: checkpoint.protocol.min_reader_version,
-            supported_version: reader_version,
-        }
-    );
-
-    Ok(checkpoint)
-}
--- a/src/storage/src/manifest/impl_.rs
+++ b/src/storage/src/manifest/impl_.rs
@@ -1,405 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-use std::marker::PhantomData;
-use std::sync::atomic::{AtomicU64, Ordering};
-use std::sync::Arc;
-use std::time::Duration;
-
-use arc_swap::ArcSwap;
-use async_trait::async_trait;
-use common_datasource::compression::CompressionType;
-use common_runtime::{RepeatedTask, TaskFunction};
-use common_telemetry::{debug, logging, warn};
-use object_store::ObjectStore;
-use snafu::{ensure, ResultExt};
-use store_api::manifest::action::{self, ProtocolAction, ProtocolVersion};
-use store_api::manifest::*;
-
-use crate::error::{
-    Error, ManifestProtocolForbidWriteSnafu, Result, StartManifestGcTaskSnafu,
-    StopManifestGcTaskSnafu,
-};
-use crate::manifest::action::RegionCheckpoint;
-use crate::manifest::checkpoint::Checkpointer;
-use crate::manifest::storage::{ManifestObjectStore, ObjectStoreLogIterator};
-
-const CHECKPOINT_ACTIONS_MARGIN: u16 = 10;
-const GC_DURATION_SECS: u64 = 600;
-
-#[derive(Clone, Debug)]
-pub struct ManifestImpl<S: Checkpoint<Error = Error>, M: MetaAction<Error = Error>> {
-    inner: Arc<ManifestImplInner<S, M>>,
-    checkpointer: Option<Arc<dyn Checkpointer<Checkpoint = S, MetaAction = M>>>,
-    last_checkpoint_version: Arc<AtomicU64>,
-    checkpoint_actions_margin: u16,
-    gc_task: Option<Arc<RepeatedTask<Error>>>,
-}
-
-impl<S: 'static + Checkpoint<Error = Error>, M: 'static + MetaAction<Error = Error>>
-    ManifestImpl<S, M>
-{
-    pub fn new(
-        manifest_dir: &str,
-        object_store: ObjectStore,
-        compress_type: CompressionType,
-        checkpoint_actions_margin: Option<u16>,
-        gc_duration: Option<Duration>,
-        checkpointer: Option<Arc<dyn Checkpointer<Checkpoint = S, MetaAction = M>>>,
-    ) -> Self {
-        let inner = Arc::new(ManifestImplInner::new(
-            manifest_dir,
-            object_store,
-            compress_type,
-        ));
-        let gc_task = if checkpointer.is_some() {
-            // only start gc task when checkpoint is enabled.
-            Some(Arc::new(RepeatedTask::new(
-                gc_duration.unwrap_or_else(|| Duration::from_secs(GC_DURATION_SECS)),
-                Box::new(ManifestGcTask {
-                    inner: inner.clone(),
-                }),
-            )))
-        } else {
-            None
-        };
-
-        ManifestImpl {
-            inner,
-            checkpointer,
-            checkpoint_actions_margin: checkpoint_actions_margin
-                .unwrap_or(CHECKPOINT_ACTIONS_MARGIN),
-            last_checkpoint_version: Arc::new(AtomicU64::new(MIN_VERSION)),
-            gc_task,
-        }
-    }
-
-    pub fn create(
-        manifest_dir: &str,
-        object_store: ObjectStore,
-        compress_type: CompressionType,
-    ) -> Self {
-        Self::new(manifest_dir, object_store, compress_type, None, None, None)
-    }
-
-    #[inline]
-    pub(crate) fn checkpointer(
-        &self,
-    ) -> &Option<Arc<dyn Checkpointer<Checkpoint = S, MetaAction = M>>> {
-        &self.checkpointer
-    }
-
-    #[inline]
-    pub(crate) fn set_last_checkpoint_version(&self, version: ManifestVersion) {
-        self.last_checkpoint_version
-            .store(version, Ordering::Relaxed);
-    }
-
-    /// Update inner state.
-    pub fn update_state(&self, version: ManifestVersion, protocol: Option<ProtocolAction>) {
-        self.inner.update_state(version, protocol);
-    }
-
-    pub(crate) async fn save_checkpoint(&self, checkpoint: &RegionCheckpoint) -> Result<()> {
-        ensure!(
-            checkpoint
-                .protocol
-                .is_writable(self.inner.supported_writer_version),
-            ManifestProtocolForbidWriteSnafu {
-                min_version: checkpoint.protocol.min_writer_version,
-                supported_version: self.inner.supported_writer_version,
-            }
-        );
-        let bytes = checkpoint.encode()?;
-        self.manifest_store()
-            .save_checkpoint(checkpoint.last_version, &bytes)
-            .await
-    }
-
-    pub(crate) async fn may_do_checkpoint(&self, version: ManifestVersion) -> Result<()> {
-        if version - self.last_checkpoint_version.load(Ordering::Relaxed)
-            >= self.checkpoint_actions_margin as u64
-        {
-            let s = self.do_checkpoint().await?;
-            debug!("Manifest checkpoint, checkpoint: {:#?}", s);
-        }
-
-        Ok(())
-    }
-
-    #[inline]
-    pub(crate) fn manifest_store(&self) -> &Arc<ManifestObjectStore> {
-        self.inner.manifest_store()
-    }
-}
-
-#[async_trait]
-impl<S: 'static + Checkpoint<Error = Error>, M: 'static + MetaAction<Error = Error>> Manifest
-    for ManifestImpl<S, M>
-{
-    type Error = Error;
-    type Checkpoint = S;
-    type MetaAction = M;
-    type MetaActionIterator = MetaActionIteratorImpl<M>;
-
-    async fn update(&self, action_list: M) -> Result<ManifestVersion> {
-        let version = self.inner.save(action_list).await?;
-
-        self.may_do_checkpoint(version).await?;
-        Ok(version)
-    }
-
-    async fn scan(
-        &self,
-        start: ManifestVersion,
-        end: ManifestVersion,
-    ) -> Result<Self::MetaActionIterator> {
-        self.inner.scan(start, end).await
-    }
-
-    async fn do_checkpoint(&self) -> Result<Option<S>> {
-        if let Some(cp) = &self.checkpointer {
-            let checkpoint = cp.do_checkpoint(self).await?;
-            if let Some(checkpoint) = &checkpoint {
-                self.set_last_checkpoint_version(checkpoint.last_version());
-            }
-            return Ok(checkpoint);
-        }
-
-        Ok(None)
-    }
-
-    async fn last_checkpoint(&self) -> Result<Option<S>> {
-        self.inner.last_checkpoint().await
-    }
-
-    fn last_version(&self) -> ManifestVersion {
-        self.inner.last_version()
-    }
-
-    async fn start(&self) -> Result<()> {
-        if let Some(task) = &self.gc_task {
-            task.start(common_runtime::bg_runtime())
-                .context(StartManifestGcTaskSnafu)?;
-        }
-
-        Ok(())
-    }
-
-    async fn stop(&self) -> Result<()> {
-        if let Some(task) = &self.gc_task {
-            task.stop().await.context(StopManifestGcTaskSnafu)?;
-        }
-
-        Ok(())
-    }
-}
-
-#[derive(Debug)]
-struct ManifestImplInner<S: Checkpoint<Error = Error>, M: MetaAction<Error = Error>> {
-    store: Arc<ManifestObjectStore>,
-    version: AtomicU64,
-    /// Current using protocol
-    protocol: ArcSwap<ProtocolAction>,
-    /// Current node supported protocols (reader_version, writer_version)
-    supported_reader_version: ProtocolVersion,
-    supported_writer_version: ProtocolVersion,
-    _phantom: PhantomData<(S, M)>,
-}
-
-pub struct MetaActionIteratorImpl<M: MetaAction<Error = Error>> {
-    log_iter: ObjectStoreLogIterator,
-    reader_version: ProtocolVersion,
-    last_protocol: Option<ProtocolAction>,
-    _phantom: PhantomData<M>,
-}
-
-impl<M: MetaAction<Error = Error>> MetaActionIteratorImpl<M> {
-    pub fn last_protocol(&self) -> &Option<ProtocolAction> {
-        &self.last_protocol
-    }
-}
-
-#[async_trait]
-impl<M: MetaAction<Error = Error>> MetaActionIterator for MetaActionIteratorImpl<M> {
-    type Error = Error;
-    type MetaAction = M;
-
-    async fn next_action(&mut self) -> Result<Option<(ManifestVersion, M)>> {
-        match self.log_iter.next_log().await? {
-            Some((v, bytes)) => {
-                let (action_list, protocol) = M::decode(&bytes, self.reader_version)?;
-
-                if protocol.is_some() {
-                    self.last_protocol = protocol;
-                }
-
-                Ok(Some((v, action_list)))
-            }
-            None => Ok(None),
-        }
-    }
-}
-
-struct ManifestGcTask<S: Checkpoint<Error = Error>, M: MetaAction<Error = Error>> {
-    inner: Arc<ManifestImplInner<S, M>>,
-}
-
-#[async_trait::async_trait]
-impl<S: Checkpoint<Error = Error>, M: MetaAction<Error = Error>> TaskFunction<Error>
-    for ManifestGcTask<S, M>
-{
-    fn name(&self) -> &str {
-        "region-manifest-gc"
-    }
-
-    async fn call(&mut self) -> Result<()> {
-        if let Some((last_version, _)) = self.inner.store.load_last_checkpoint().await? {
-            // Purge all manifest <= last_version and checkpoint files < last_version.
-            let deleted = self
-                .inner
-                .store
-                .delete_until(last_version + 1, true)
-                .await?;
-            debug!(
-                "Deleted {} logs from region manifest storage(path={}), last_version: {}.",
-                deleted,
-                self.inner.store.path(),
-                last_version,
-            );
-        }
-
-        Ok(())
-    }
-}
-
-impl<S: Checkpoint<Error = Error>, M: MetaAction<Error = Error>> ManifestImplInner<S, M> {
-    fn new(manifest_dir: &str, object_store: ObjectStore, compress_type: CompressionType) -> Self {
-        let (reader_version, writer_version) = action::supported_protocol_version();
-
-        Self {
-            store: Arc::new(ManifestObjectStore::new(
-                manifest_dir,
-                object_store,
-                compress_type,
-            )),
-            version: AtomicU64::new(0),
-            protocol: ArcSwap::new(Arc::new(ProtocolAction::new())),
-            supported_reader_version: reader_version,
-            supported_writer_version: writer_version,
-            _phantom: PhantomData,
-        }
-    }
-
-    #[inline]
-    fn manifest_store(&self) -> &Arc<ManifestObjectStore> {
-        &self.store
-    }
-
-    #[inline]
-    fn inc_version(&self) -> ManifestVersion {
-        self.version.fetch_add(1, Ordering::Relaxed)
-    }
-
-    fn update_state(&self, version: ManifestVersion, protocol: Option<ProtocolAction>) {
-        self.version.store(version, Ordering::Relaxed);
-        if let Some(p) = protocol {
-            self.protocol.store(Arc::new(p));
-        }
-    }
-
-    #[inline]
-    fn last_version(&self) -> ManifestVersion {
-        self.version.load(Ordering::Relaxed)
-    }
-
-    async fn save(&self, mut action_list: M) -> Result<ManifestVersion> {
-        let protocol = self.protocol.load();
-
-        ensure!(
-            protocol.is_writable(self.supported_writer_version),
-            ManifestProtocolForbidWriteSnafu {
-                min_version: protocol.min_writer_version,
-                supported_version: self.supported_writer_version,
-            }
-        );
-
-        let version = self.inc_version();
-
-        if version == 0 || protocol.min_writer_version < self.supported_writer_version {
-            let new_protocol = ProtocolAction {
-                min_reader_version: self.supported_reader_version,
-                min_writer_version: self.supported_writer_version,
-            };
-            action_list.set_protocol(new_protocol.clone());
-
-            logging::info!(
-                "Updated manifest protocol from {} to {}.",
-                protocol,
-                new_protocol
-            );
-
-            self.protocol.store(Arc::new(new_protocol));
-        }
-
-        logging::debug!(
-            "Save region metadata action: {:?}, version: {}",
-            action_list,
-            version
-        );
-
-        self.store.save(version, &action_list.encode()?).await?;
-
-        Ok(version)
-    }
-
-    async fn scan(
-        &self,
-        start: ManifestVersion,
-        end: ManifestVersion,
-    ) -> Result<MetaActionIteratorImpl<M>> {
-        Ok(MetaActionIteratorImpl {
-            log_iter: self.store.scan(start, end).await?,
-            reader_version: self.supported_reader_version,
-            last_protocol: None,
-            _phantom: PhantomData,
-        })
-    }
-
-    async fn last_checkpoint(&self) -> Result<Option<S>> {
-        let protocol = self.protocol.load();
-        let last_checkpoint = self.store.load_last_checkpoint().await?;
-
-        if let Some((version, bytes)) = last_checkpoint {
-            let checkpoint = S::decode(&bytes, protocol.min_reader_version)?;
-            assert!(checkpoint.last_version() >= version);
-            if checkpoint.last_version() > version {
-                // It happens when saving checkpoint successfully, but failed at saving checkpoint metadata(the "__last_checkpoint" file).
-                // Then we try to use the old checkpoint and do the checkpoint next time.
-                // If the old checkpoint was deleted, it's fine that we return the latest checkpoint.
-                // The only side effect is leaving some unused checkpoint files,
-                // and they will be purged by gc task.
-                warn!("The checkpoint manifest version {} in {} is greater than checkpoint metadata version {}.", self.store.path(), checkpoint.last_version(), version);
-
-                if let Some((_, bytes)) = self.store.load_checkpoint(version).await? {
-                    let old_checkpoint = S::decode(&bytes, protocol.min_reader_version)?;
-                    return Ok(Some(old_checkpoint));
-                }
-            }
-            Ok(Some(checkpoint))
-        } else {
-            Ok(None)
-        }
-    }
-}
--- a/src/storage/src/manifest/region.rs
+++ b/src/storage/src/manifest/region.rs
@@ -1,690 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-//! Region manifest impl
-use std::any::Any;
-use std::sync::atomic::{AtomicU64, Ordering};
-use std::sync::Arc;
-use std::time::Duration;
-
-use async_trait::async_trait;
-use common_datasource::compression::CompressionType;
-use common_telemetry::{info, warn};
-use object_store::ObjectStore;
-use store_api::manifest::action::ProtocolAction;
-use store_api::manifest::{
-    Manifest, ManifestLogStorage, ManifestVersion, MetaActionIterator, MIN_VERSION,
-};
-
-use crate::error::{ManifestCheckpointSnafu, Result};
-use crate::manifest::action::*;
-use crate::manifest::checkpoint::Checkpointer;
-use crate::manifest::ManifestImpl;
-
-pub type RegionManifest = ManifestImpl<RegionCheckpoint, RegionMetaActionList>;
-
-#[derive(Debug)]
-pub struct RegionManifestCheckpointer {
-    // The latest manifest version when flushing memtables.
-    // Checkpoint can't exceed over flushed manifest version because we have to keep
-    // the region metadata for replaying WAL to ensure correct data schema.
-    flushed_manifest_version: AtomicU64,
-}
-
-impl RegionManifestCheckpointer {
-    pub(crate) fn set_flushed_manifest_version(&self, manifest_version: ManifestVersion) {
-        let current = self.flushed_manifest_version.load(Ordering::Relaxed);
-
-        self.flushed_manifest_version
-            .store(current.max(manifest_version), Ordering::Relaxed);
-    }
-}
-
-#[async_trait]
-impl Checkpointer for RegionManifestCheckpointer {
-    type Checkpoint = RegionCheckpoint;
-    type MetaAction = RegionMetaActionList;
-
-    async fn do_checkpoint(
-        &self,
-        manifest: &ManifestImpl<RegionCheckpoint, RegionMetaActionList>,
-    ) -> Result<Option<RegionCheckpoint>> {
-        let last_checkpoint = manifest.last_checkpoint().await?;
-
-        let current_version = manifest.last_version();
-        let (start_version, mut protocol, mut manifest_builder) =
-            if let Some(checkpoint) = last_checkpoint {
-                (
-                    checkpoint.last_version + 1,
-                    checkpoint.protocol,
-                    RegionManifestDataBuilder::with_checkpoint(checkpoint.checkpoint),
-                )
-            } else {
-                (
-                    MIN_VERSION,
-                    ProtocolAction::default(),
-                    RegionManifestDataBuilder::default(),
-                )
-            };
-
-        let end_version =
-            current_version.min(self.flushed_manifest_version.load(Ordering::Relaxed)) + 1;
-        if start_version >= end_version {
-            return Ok(None);
-        }
-
-        info!("Begin to do region manifest checkpoint, path: {}, start_version: {}, end_version: {}, flushed_manifest_version: {}",
-              manifest.manifest_store().path(),
-              start_version,
-              end_version,
-              self.flushed_manifest_version.load(Ordering::Relaxed));
-
-        let mut iter = manifest.scan(start_version, end_version).await?;
-
-        let mut last_version = start_version;
-        let mut compacted_actions = 0;
-        while let Some((version, action_list)) = iter.next_action().await? {
-            for action in action_list.actions {
-                match action {
-                    RegionMetaAction::Change(c) => manifest_builder.apply_change(c),
-                    RegionMetaAction::Edit(e) => manifest_builder.apply_edit(version, e),
-                    RegionMetaAction::Protocol(p) => protocol = p,
-                    action => {
-                        return ManifestCheckpointSnafu {
-                            msg: format!("can't apply region action: {:?}", action),
-                        }
-                        .fail();
-                    }
-                }
-            }
-            last_version = version;
-            compacted_actions += 1;
-        }
-
-        if compacted_actions == 0 {
-            return Ok(None);
-        }
-
-        let region_manifest = manifest_builder.build();
-        let checkpoint = RegionCheckpoint {
-            protocol,
-            last_version,
-            compacted_actions,
-            checkpoint: Some(region_manifest),
-        };
-
-        manifest.save_checkpoint(&checkpoint).await?;
-        if let Err(e) = manifest
-            .manifest_store()
-            .delete(start_version, last_version + 1)
-            .await
-        {
-            // We only log when the error kind isn't `NotFound`
-            if !e.is_object_to_delete_not_found() {
-                // It doesn't matter when deletion fails, they will be purged by gc task.
-                warn!(
-                    "Failed to delete manifest logs [{},{}] in path: {}. err: {}",
-                    start_version,
-                    last_version,
-                    manifest.manifest_store().path(),
-                    e
-                );
-            }
-        }
-
-        info!("Region manifest checkpoint, path: {}, start_version: {}, last_version: {}, compacted actions: {}",
-              manifest.manifest_store().path(),
-              start_version,
-              last_version,
-              compacted_actions);
-
-        Ok(Some(checkpoint))
-    }
-
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-}
-
-impl RegionManifest {
-    pub fn with_checkpointer(
-        manifest_dir: &str,
-        object_store: ObjectStore,
-        compress_type: CompressionType,
-        checkpoint_actions_margin: Option<u16>,
-        gc_duration: Option<Duration>,
-    ) -> Self {
-        Self::new(
-            manifest_dir,
-            object_store,
-            compress_type,
-            checkpoint_actions_margin,
-            gc_duration,
-            Some(Arc::new(RegionManifestCheckpointer {
-                flushed_manifest_version: AtomicU64::new(0),
-            })),
-        )
-    }
-
-    // Update flushed manifest version in checkpointer
-    pub fn set_flushed_manifest_version(&self, manifest_version: ManifestVersion) {
-        if let Some(checkpointer) = self.checkpointer() {
-            if let Some(checkpointer) = checkpointer
-                .as_any()
-                .downcast_ref::<RegionManifestCheckpointer>()
-            {
-                checkpointer.set_flushed_manifest_version(manifest_version);
-            }
-        }
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use std::sync::Arc;
-
-    use common_test_util::temp_dir::create_temp_dir;
-    use object_store::services::{Fs, S3};
-    use object_store::test_util::{s3_test_config, TempFolder};
-    use object_store::ObjectStore;
-    use store_api::manifest::action::ProtocolAction;
-    use store_api::manifest::{Manifest, MetaActionIterator, MAX_VERSION};
-
-    use super::*;
-    use crate::manifest::manifest_compress_type;
-    use crate::manifest::test_utils::*;
-    use crate::metadata::RegionMetadata;
-    use crate::sst::FileId;
-
-    #[tokio::test]
-    async fn test_fs_region_manifest_compress() {
-        let manifest = new_fs_manifest(true, None).await;
-        test_region_manifest(&manifest).await
-    }
-
-    #[tokio::test]
-    async fn test_fs_region_manifest_uncompress() {
-        let manifest = new_fs_manifest(false, None).await;
-        test_region_manifest(&manifest).await
-    }
-
-    #[tokio::test]
-    async fn test_s3_region_manifest_compress() {
-        if s3_test_config().is_some() {
-            let (manifest, temp_dir) = new_s3_manifest(true, None).await;
-            test_region_manifest(&manifest).await;
-            temp_dir.remove_all().await.unwrap();
-        }
-    }
-
-    #[tokio::test]
-    async fn test_s3_region_manifest_uncompress() {
-        if s3_test_config().is_some() {
-            let (manifest, temp_dir) = new_s3_manifest(false, None).await;
-            test_region_manifest(&manifest).await;
-            temp_dir.remove_all().await.unwrap();
-        }
-    }
-
-    async fn new_fs_manifest(compress: bool, gc_duration: Option<Duration>) -> RegionManifest {
-        let tmp_dir = create_temp_dir("test_region_manifest");
-        let mut builder = Fs::default();
-        let _ = builder.root(&tmp_dir.path().to_string_lossy());
-        let object_store = ObjectStore::new(builder).unwrap().finish();
-
-        let manifest = RegionManifest::with_checkpointer(
-            "/manifest/",
-            object_store,
-            manifest_compress_type(compress),
-            None,
-            gc_duration,
-        );
-        manifest.start().await.unwrap();
-        manifest
-    }
-
-    async fn new_s3_manifest(
-        compress: bool,
-        gc_duration: Option<Duration>,
-    ) -> (RegionManifest, TempFolder) {
-        let s3_config = s3_test_config().unwrap();
-        let mut builder = S3::default();
-        let _ = builder
-            .root(&s3_config.root)
-            .access_key_id(&s3_config.access_key_id)
-            .secret_access_key(&s3_config.secret_access_key)
-            .bucket(&s3_config.bucket);
-
-        if s3_config.region.is_some() {
-            let _ = builder.region(s3_config.region.as_ref().unwrap());
-        }
-        let store = ObjectStore::new(builder).unwrap().finish();
-        let temp_folder = TempFolder::new(&store, "/");
-        let manifest = RegionManifest::with_checkpointer(
-            "/manifest/",
-            store,
-            manifest_compress_type(compress),
-            None,
-            gc_duration,
-        );
-        manifest.start().await.unwrap();
-
-        (manifest, temp_folder)
-    }
-
-    async fn test_region_manifest(manifest: &RegionManifest) {
-        common_telemetry::init_default_ut_logging();
-
-        let region_meta = Arc::new(build_region_meta());
-
-        assert_eq!(
-            None,
-            manifest
-                .scan(0, MAX_VERSION)
-                .await
-                .unwrap()
-                .next_action()
-                .await
-                .unwrap()
-        );
-
-        assert!(manifest
-            .update(RegionMetaActionList::with_action(RegionMetaAction::Change(
-                RegionChange {
-                    metadata: region_meta.as_ref().into(),
-                    committed_sequence: 99,
-                },
-            )))
-            .await
-            .is_ok());
-
-        let mut iter = manifest.scan(0, MAX_VERSION).await.unwrap();
-
-        let (v, action_list) = iter.next_action().await.unwrap().unwrap();
-        assert_eq!(0, v);
-        assert_eq!(2, action_list.actions.len());
-        let protocol = &action_list.actions[0];
-        assert!(matches!(
-            protocol,
-            RegionMetaAction::Protocol(ProtocolAction { .. })
-        ));
-
-        let action = &action_list.actions[1];
-
-        match action {
-            RegionMetaAction::Change(c) => {
-                assert_eq!(
-                    RegionMetadata::try_from(c.metadata.clone()).unwrap(),
-                    *region_meta
-                );
-                assert_eq!(c.committed_sequence, 99);
-            }
-            _ => unreachable!(),
-        }
-
-        // Save some actions
-        assert!(manifest
-            .update(RegionMetaActionList::new(vec![
-                RegionMetaAction::Edit(build_region_edit(1, &[FileId::random()], &[])),
-                RegionMetaAction::Edit(build_region_edit(
-                    2,
-                    &[FileId::random(), FileId::random()],
-                    &[],
-                )),
-            ]))
-            .await
-            .is_ok());
-
-        let mut iter = manifest.scan(0, MAX_VERSION).await.unwrap();
-        let (v, action_list) = iter.next_action().await.unwrap().unwrap();
-        assert_eq!(0, v);
-        assert_eq!(2, action_list.actions.len());
-        let protocol = &action_list.actions[0];
-        assert!(matches!(
-            protocol,
-            RegionMetaAction::Protocol(ProtocolAction { .. })
-        ));
-
-        let action = &action_list.actions[1];
-        match action {
-            RegionMetaAction::Change(c) => {
-                assert_eq!(
-                    RegionMetadata::try_from(c.metadata.clone()).unwrap(),
-                    *region_meta
-                );
-                assert_eq!(c.committed_sequence, 99);
-            }
-            _ => unreachable!(),
-        }
-
-        let (v, action_list) = iter.next_action().await.unwrap().unwrap();
-        assert_eq!(1, v);
-        assert_eq!(2, action_list.actions.len());
-        assert!(matches!(&action_list.actions[0], RegionMetaAction::Edit(_)));
-        assert!(matches!(&action_list.actions[1], RegionMetaAction::Edit(_)));
-
-        // Reach end
-        assert!(iter.next_action().await.unwrap().is_none());
-
-        manifest.stop().await.unwrap();
-    }
-
-    async fn assert_scan(manifest: &RegionManifest, start_version: ManifestVersion, expected: u64) {
-        let mut iter = manifest.scan(0, MAX_VERSION).await.unwrap();
-        let mut actions = 0;
-        while let Some((v, _)) = iter.next_action().await.unwrap() {
-            assert_eq!(v, start_version + actions);
-            actions += 1;
-        }
-        assert_eq!(expected, actions);
-    }
-
-    #[tokio::test(flavor = "multi_thread")]
-    async fn test_fs_region_manifest_checkpoint_compress() {
-        let duration = Duration::from_millis(50);
-        let manifest = new_fs_manifest(true, Some(duration)).await;
-
-        test_region_manifest_checkpoint(&manifest, duration).await
-    }
-
-    #[tokio::test]
-    async fn test_fs_region_manifest_checkpoint_uncompress() {
-        let duration = Duration::from_millis(50);
-        let manifest = new_fs_manifest(false, Some(duration)).await;
-
-        test_region_manifest_checkpoint(&manifest, duration).await
-    }
-
-    #[tokio::test]
-    async fn test_s3_region_manifest_checkpoint_compress() {
-        if s3_test_config().is_some() {
-            let duration = Duration::from_millis(50);
-            let (manifest, temp_dir) = new_s3_manifest(true, Some(duration)).await;
-
-            test_region_manifest_checkpoint(&manifest, duration).await;
-            temp_dir.remove_all().await.unwrap();
-        }
-    }
-
-    #[tokio::test]
-    async fn test_s3_region_manifest_checkpoint_uncompress() {
-        if s3_test_config().is_some() {
-            let duration = Duration::from_millis(50);
-            let (manifest, temp_dir) = new_s3_manifest(false, Some(duration)).await;
-
-            test_region_manifest_checkpoint(&manifest, duration).await;
-            temp_dir.remove_all().await.unwrap();
-        }
-    }
-
-    async fn test_region_manifest_checkpoint(
-        manifest: &RegionManifest,
-        test_gc_duration: Duration,
-    ) {
-        common_telemetry::init_default_ut_logging();
-
-        let region_meta = Arc::new(build_region_meta());
-        let new_region_meta = Arc::new(build_altered_region_meta());
-
-        let file = FileId::random();
-        let file_ids = vec![FileId::random(), FileId::random()];
-
-        let actions: Vec<RegionMetaActionList> = vec![
-            RegionMetaActionList::with_action(RegionMetaAction::Change(RegionChange {
-                metadata: region_meta.as_ref().into(),
-                committed_sequence: 1,
-            })),
-            RegionMetaActionList::new(vec![
-                RegionMetaAction::Edit(build_region_edit(2, &[file], &[])),
-                RegionMetaAction::Edit(build_region_edit(3, &file_ids, &[file])),
-            ]),
-            RegionMetaActionList::with_action(RegionMetaAction::Change(RegionChange {
-                metadata: new_region_meta.as_ref().into(),
-                committed_sequence: 99,
-            })),
-        ];
-
-        for action in actions {
-            let _ = manifest.update(action).await.unwrap();
-        }
-        assert!(manifest.last_checkpoint().await.unwrap().is_none());
-        assert_scan(manifest, 0, 3).await;
-        // update flushed manifest version for doing checkpoint
-        manifest.set_flushed_manifest_version(2);
-
-        let mut checkpoint_versions = vec![];
-
-        // do a checkpoint
-        let checkpoint = manifest.do_checkpoint().await.unwrap().unwrap();
-        let last_checkpoint = manifest.last_checkpoint().await.unwrap().unwrap();
-        assert_eq!(checkpoint, last_checkpoint);
-        assert_eq!(checkpoint.compacted_actions, 3);
-        assert_eq!(checkpoint.last_version, 2);
-        checkpoint_versions.push(2);
-        let alterd_raw_meta = RawRegionMetadata::from(new_region_meta.as_ref());
-        assert!(matches!(&checkpoint.checkpoint, Some(RegionManifestData {
-            committed_sequence: 99,
-            metadata,
-            version: Some(RegionVersion {
-                manifest_version: 1,
-                flushed_sequence: Some(3),
-                files,
-            }),
-        }) if files.len() == 2 &&
-                         files.contains_key(&file_ids[0]) &&
-                         files.contains_key(&file_ids[1]) &&
-                         *metadata == alterd_raw_meta));
-        // all actions were compacted
-        assert_eq!(
-            None,
-            manifest
-                .scan(0, MAX_VERSION)
-                .await
-                .unwrap()
-                .next_action()
-                .await
-                .unwrap()
-        );
-
-        assert!(manifest.do_checkpoint().await.unwrap().is_none());
-        let last_checkpoint = manifest.last_checkpoint().await.unwrap().unwrap();
-        assert_eq!(checkpoint, last_checkpoint);
-
-        // add new actions
-        let new_file = FileId::random();
-        let actions: Vec<RegionMetaActionList> = vec![
-            RegionMetaActionList::with_action(RegionMetaAction::Change(RegionChange {
-                metadata: region_meta.as_ref().into(),
-                committed_sequence: 200,
-            })),
-            RegionMetaActionList::new(vec![RegionMetaAction::Edit(build_region_edit(
-                201,
-                &[new_file],
-                &file_ids,
-            ))]),
-        ];
-        for action in actions {
-            let _ = manifest.update(action).await.unwrap();
-        }
-
-        assert_scan(manifest, 3, 2).await;
-
-        // do another checkpoints
-        // compacted RegionChange
-        manifest.set_flushed_manifest_version(3);
-        let checkpoint = manifest.do_checkpoint().await.unwrap().unwrap();
-        let last_checkpoint = manifest.last_checkpoint().await.unwrap().unwrap();
-        assert_eq!(checkpoint, last_checkpoint);
-        assert_eq!(checkpoint.compacted_actions, 1);
-        assert_eq!(checkpoint.last_version, 3);
-        checkpoint_versions.push(3);
-        assert!(matches!(&checkpoint.checkpoint, Some(RegionManifestData {
-            committed_sequence: 200,
-            metadata,
-            version: Some(RegionVersion {
-                manifest_version: 1,
-                flushed_sequence: Some(3),
-                files,
-            }),
-        }) if files.len() == 2 &&
-                         files.contains_key(&file_ids[0]) &&
-                         files.contains_key(&file_ids[1]) &&
-                         *metadata == RawRegionMetadata::from(region_meta.as_ref())));
-
-        assert_scan(manifest, 4, 1).await;
-        // compacted RegionEdit
-        manifest.set_flushed_manifest_version(4);
-        let checkpoint = manifest.do_checkpoint().await.unwrap().unwrap();
-        let last_checkpoint = manifest.last_checkpoint().await.unwrap().unwrap();
-        assert_eq!(checkpoint, last_checkpoint);
-        assert_eq!(checkpoint.compacted_actions, 1);
-        assert_eq!(checkpoint.last_version, 4);
-        checkpoint_versions.push(4);
-        assert!(matches!(&checkpoint.checkpoint, Some(RegionManifestData {
-            committed_sequence: 200,
-            metadata,
-            version: Some(RegionVersion {
-                manifest_version: 4,
-                flushed_sequence: Some(201),
-                files,
-            }),
-        }) if files.len() == 1 &&
-                         files.contains_key(&new_file) &&
-                         *metadata == RawRegionMetadata::from(region_meta.as_ref())));
-
-        // all actions were compacted
-        assert_eq!(
-            None,
-            manifest
-                .scan(0, MAX_VERSION)
-                .await
-                .unwrap()
-                .next_action()
-                .await
-                .unwrap()
-        );
-
-        // wait for gc
-        tokio::time::sleep(test_gc_duration * 3).await;
-
-        for v in checkpoint_versions {
-            if v < 4 {
-                // ensure old checkpoints were purged.
-                assert!(manifest
-                    .manifest_store()
-                    .load_checkpoint(v)
-                    .await
-                    .unwrap()
-                    .is_none());
-            } else {
-                // the last checkpoints is still exists.
-                let last_checkpoint = manifest.last_checkpoint().await.unwrap().unwrap();
-                assert_eq!(checkpoint, last_checkpoint);
-            }
-        }
-
-        manifest.stop().await.unwrap();
-    }
-
-    #[tokio::test]
-    async fn test_region_manifest_truncate() {
-        common_telemetry::init_default_ut_logging();
-
-        let manifest = new_fs_manifest(false, None).await;
-        let region_meta = Arc::new(build_region_meta());
-        let committed_sequence = 99;
-
-        let file = FileId::random();
-        let file_ids = vec![FileId::random(), FileId::random()];
-
-        // Save some actions.
-        let actions: Vec<RegionMetaActionList> = vec![
-            RegionMetaActionList::with_action(RegionMetaAction::Change(RegionChange {
-                metadata: region_meta.as_ref().into(),
-                committed_sequence: 1,
-            })),
-            RegionMetaActionList::new(vec![
-                RegionMetaAction::Edit(build_region_edit(2, &[file], &[])),
-                RegionMetaAction::Edit(build_region_edit(3, &file_ids, &[file])),
-            ]),
-            RegionMetaActionList::with_action(RegionMetaAction::Truncate(RegionTruncate {
-                region_id: 0.into(),
-                committed_sequence,
-            })),
-            RegionMetaActionList::with_action(RegionMetaAction::Change(RegionChange {
-                metadata: region_meta.as_ref().into(),
-                committed_sequence: 1,
-            })),
-        ];
-
-        for action in actions {
-            manifest.update(action).await.unwrap();
-        }
-
-        // Scan manifest.
-        let mut iter = manifest.scan(0, MAX_VERSION).await.unwrap();
-
-        let (v, action_list) = iter.next_action().await.unwrap().unwrap();
-        info!("action_list = {:?}", action_list.actions);
-        assert_eq!(0, v);
-        assert_eq!(2, action_list.actions.len());
-        let protocol = &action_list.actions[0];
-        assert!(matches!(
-            protocol,
-            RegionMetaAction::Protocol(ProtocolAction { .. })
-        ));
-
-        let change = &action_list.actions[1];
-        assert!(matches!(
-            change,
-            RegionMetaAction::Change(RegionChange {
-                committed_sequence: 1,
-                ..
-            })
-        ));
-
-        let (v, action_list) = iter.next_action().await.unwrap().unwrap();
-        assert_eq!(1, v);
-        assert_eq!(2, action_list.actions.len());
-        assert!(matches!(&action_list.actions[0], RegionMetaAction::Edit(_)));
-        assert!(matches!(&action_list.actions[1], RegionMetaAction::Edit(_)));
-
-        let (v, action_list) = iter.next_action().await.unwrap().unwrap();
-        assert_eq!(2, v);
-        assert_eq!(1, action_list.actions.len());
-        let truncate = &action_list.actions[0];
-        assert!(matches!(
-            truncate,
-            RegionMetaAction::Truncate(RegionTruncate {
-                committed_sequence: 99,
-                ..
-            })
-        ));
-
-        let (v, action_list) = iter.next_action().await.unwrap().unwrap();
-        assert_eq!(3, v);
-        assert_eq!(1, action_list.actions.len());
-        let change = &action_list.actions[0];
-        assert!(matches!(
-            change,
-            RegionMetaAction::Change(RegionChange {
-                committed_sequence: 1,
-                ..
-            })
-        ));
-
-        // Reach end
-        assert!(iter.next_action().await.unwrap().is_none());
-    }
-}
--- a/src/storage/src/manifest/storage.rs
+++ b/src/storage/src/manifest/storage.rs
@@ -1,741 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-use std::collections::HashMap;
-use std::iter::Iterator;
-use std::str::FromStr;
-
-use async_trait::async_trait;
-use common_datasource::compression::CompressionType;
-use common_telemetry::logging;
-use futures::TryStreamExt;
-use lazy_static::lazy_static;
-use object_store::{raw_normalize_path, util, Entry, ErrorKind, ObjectStore};
-use regex::Regex;
-use serde::{Deserialize, Serialize};
-use snafu::{ensure, ResultExt};
-use store_api::manifest::{LogIterator, ManifestLogStorage, ManifestVersion};
-
-use crate::error::{
-    CompressObjectSnafu, DecodeJsonSnafu, DecompressObjectSnafu, DeleteObjectSnafu,
-    EncodeJsonSnafu, Error, InvalidScanIndexSnafu, ListObjectsSnafu, ReadObjectSnafu, Result,
-    Utf8Snafu, WriteObjectSnafu,
-};
-
-lazy_static! {
-    static ref DELTA_RE: Regex = Regex::new("^\\d+\\.json").unwrap();
-    static ref CHECKPOINT_RE: Regex = Regex::new("^\\d+\\.checkpoint").unwrap();
-}
-
-const LAST_CHECKPOINT_FILE: &str = "_last_checkpoint";
-const DEFAULT_MANIFEST_COMPRESSION_TYPE: CompressionType = CompressionType::Gzip;
-/// Due to backward compatibility, it is possible that the user's manifest file has not been compressed.
-/// So when we encounter problems, we need to fall back to `FALL_BACK_COMPRESS_TYPE` for processing.
-const FALL_BACK_COMPRESS_TYPE: CompressionType = CompressionType::Uncompressed;
-
-#[inline]
-pub const fn manifest_compress_type(compress: bool) -> CompressionType {
-    if compress {
-        DEFAULT_MANIFEST_COMPRESSION_TYPE
-    } else {
-        FALL_BACK_COMPRESS_TYPE
-    }
-}
-
-#[inline]
-pub fn delta_file(version: ManifestVersion) -> String {
-    format!("{version:020}.json")
-}
-
-#[inline]
-pub fn checkpoint_file(version: ManifestVersion) -> String {
-    format!("{version:020}.checkpoint")
-}
-
-#[inline]
-pub fn gen_path(path: &str, file: &str, compress_type: CompressionType) -> String {
-    if compress_type == CompressionType::Uncompressed {
-        format!("{}{}", path, file)
-    } else {
-        format!("{}{}.{}", path, file, compress_type.file_extension())
-    }
-}
-
-/// Return's the file manifest version from path
-///
-/// # Panics
-/// Panics if the file path is not a valid delta or checkpoint file.
-#[inline]
-pub fn file_version(path: &str) -> ManifestVersion {
-    let s = path.split('.').next().unwrap();
-    s.parse().unwrap_or_else(|_| panic!("Invalid file: {path}"))
-}
-
-/// Return's the file compress algorithm by file extension.
-///
-/// for example file
-/// `00000000000000000000.json.gz` -> `CompressionType::GZIP`
-#[inline]
-pub fn file_compress_type(path: &str) -> CompressionType {
-    let s = path.rsplit('.').next().unwrap_or("");
-    CompressionType::from_str(s).unwrap_or(CompressionType::Uncompressed)
-}
-
-#[inline]
-pub fn is_delta_file(file_name: &str) -> bool {
-    DELTA_RE.is_match(file_name)
-}
-
-#[inline]
-pub fn is_checkpoint_file(file_name: &str) -> bool {
-    CHECKPOINT_RE.is_match(file_name)
-}
-
-pub struct ObjectStoreLogIterator {
-    object_store: ObjectStore,
-    iter: Box<dyn Iterator<Item = (ManifestVersion, Entry)> + Send + Sync>,
-}
-
-#[async_trait]
-impl LogIterator for ObjectStoreLogIterator {
-    type Error = Error;
-
-    async fn next_log(&mut self) -> Result<Option<(ManifestVersion, Vec<u8>)>> {
-        match self.iter.next() {
-            Some((v, entry)) => {
-                let compress_type = file_compress_type(entry.name());
-                let bytes = self
-                    .object_store
-                    .read(entry.path())
-                    .await
-                    .context(ReadObjectSnafu { path: entry.path() })?;
-                let data = compress_type
-                    .decode(bytes)
-                    .await
-                    .context(DecompressObjectSnafu {
-                        compress_type,
-                        path: entry.path(),
-                    })?;
-                Ok(Some((v, data)))
-            }
-            None => Ok(None),
-        }
-    }
-}
-
-#[derive(Clone, Debug)]
-pub struct ManifestObjectStore {
-    object_store: ObjectStore,
-    compress_type: CompressionType,
-    path: String,
-}
-
-impl ManifestObjectStore {
-    pub fn new(path: &str, object_store: ObjectStore, compress_type: CompressionType) -> Self {
-        Self {
-            object_store,
-            compress_type,
-            path: util::normalize_dir(path),
-        }
-    }
-
-    #[inline]
-    /// Returns the delta file path under the **current** compression algorithm
-    fn delta_file_path(&self, version: ManifestVersion) -> String {
-        gen_path(&self.path, &delta_file(version), self.compress_type)
-    }
-
-    #[inline]
-    /// Returns the checkpoint file path under the **current** compression algorithm
-    fn checkpoint_file_path(&self, version: ManifestVersion) -> String {
-        gen_path(&self.path, &checkpoint_file(version), self.compress_type)
-    }
-
-    #[inline]
-    /// Returns the last checkpoint path, because the last checkpoint is not compressed,
-    /// so its path name has nothing to do with the compression algorithm used by `ManifestObjectStore`
-    fn last_checkpoint_path(&self) -> String {
-        format!("{}{}", self.path, LAST_CHECKPOINT_FILE)
-    }
-
-    /// Return all `R`s in the root directory that meet the `filter` conditions (that is, the `filter` closure returns `Some(R)`),
-    /// and discard `R` that does not meet the conditions (that is, the `filter` closure returns `None`)
-    async fn get_paths<F, R>(&self, filter: F) -> Result<Vec<R>>
-    where
-        F: Fn(Entry) -> Option<R>,
-    {
-        let streamer = self
-            .object_store
-            .lister_with(&self.path)
-            .await
-            .context(ListObjectsSnafu { path: &self.path })?;
-        streamer
-            .try_filter_map(|e| async { Ok(filter(e)) })
-            .try_collect::<Vec<_>>()
-            .await
-            .context(ListObjectsSnafu { path: &self.path })
-    }
-
-    pub(crate) fn path(&self) -> &str {
-        &self.path
-    }
-}
-
-#[derive(Serialize, Deserialize, Debug)]
-struct CheckpointMetadata {
-    pub size: usize,
-    /// The latest version this checkpoint contains.
-    pub version: ManifestVersion,
-    pub checksum: Option<String>,
-    pub extend_metadata: Option<HashMap<String, String>>,
-}
-
-impl CheckpointMetadata {
-    fn encode(&self) -> Result<impl AsRef<[u8]>> {
-        serde_json::to_string(self).context(EncodeJsonSnafu)
-    }
-
-    fn decode(bs: &[u8]) -> Result<Self> {
-        let data = std::str::from_utf8(bs).context(Utf8Snafu)?;
-
-        serde_json::from_str(data).context(DecodeJsonSnafu)
-    }
-}
-
-#[async_trait]
-impl ManifestLogStorage for ManifestObjectStore {
-    type Error = Error;
-    type Iter = ObjectStoreLogIterator;
-
-    async fn scan(
-        &self,
-        start: ManifestVersion,
-        end: ManifestVersion,
-    ) -> Result<ObjectStoreLogIterator> {
-        ensure!(start <= end, InvalidScanIndexSnafu { start, end });
-
-        let mut entries: Vec<(ManifestVersion, Entry)> = self
-            .get_paths(|entry| {
-                let file_name = entry.name();
-                if is_delta_file(file_name) {
-                    let version = file_version(file_name);
-                    if start <= version && version < end {
-                        return Some((version, entry));
-                    }
-                }
-                None
-            })
-            .await?;
-
-        entries.sort_unstable_by(|(v1, _), (v2, _)| v1.cmp(v2));
-
-        Ok(ObjectStoreLogIterator {
-            object_store: self.object_store.clone(),
-            iter: Box::new(entries.into_iter()),
-        })
-    }
-
-    async fn delete_until(
-        &self,
-        end: ManifestVersion,
-        keep_last_checkpoint: bool,
-    ) -> Result<usize> {
-        // Stores (entry, is_checkpoint, version) in a Vec.
-        let entries: Vec<_> = self
-            .get_paths(|entry| {
-                let file_name = entry.name();
-                let is_checkpoint = is_checkpoint_file(file_name);
-                if is_delta_file(file_name) || is_checkpoint_file(file_name) {
-                    let version = file_version(file_name);
-                    if version < end {
-                        return Some((entry, is_checkpoint, version));
-                    }
-                }
-                None
-            })
-            .await?;
-        let checkpoint_version = if keep_last_checkpoint {
-            // Note that the order of entries is unspecific.
-            entries
-                .iter()
-                .filter_map(
-                    |(_e, is_checkpoint, version)| {
-                        if *is_checkpoint {
-                            Some(version)
-                        } else {
-                            None
-                        }
-                    },
-                )
-                .max()
-        } else {
-            None
-        };
-        let paths: Vec<_> = entries
-            .iter()
-            .filter(|(_e, is_checkpoint, version)| {
-                if let Some(max_version) = checkpoint_version {
-                    if *is_checkpoint {
-                        // We need to keep the checkpoint file.
-                        version < max_version
-                    } else {
-                        // We can delete the log file with max_version as the checkpoint
-                        // file contains the log file's content.
-                        version <= max_version
-                    }
-                } else {
-                    true
-                }
-            })
-            .map(|e| e.0.path().to_string())
-            .collect();
-        let ret = paths.len();
-
-        logging::debug!(
-            "Deleting {} logs from manifest storage path {} until {}, checkpoint: {:?}, paths: {:?}",
-            ret,
-            self.path,
-            end,
-            checkpoint_version,
-            paths,
-        );
-
-        self.object_store
-            .remove(paths)
-            .await
-            .with_context(|_| DeleteObjectSnafu {
-                path: self.path.clone(),
-            })?;
-
-        Ok(ret)
-    }
-
-    async fn delete_all(&self, remove_action_manifest: ManifestVersion) -> Result<()> {
-        let entries: Vec<Entry> = self.get_paths(Some).await?;
-
-        // Filter out the latest delta file.
-        let paths: Vec<_> = entries
-            .iter()
-            .filter(|e| {
-                let name = e.name();
-                if is_delta_file(name) && file_version(name) == remove_action_manifest {
-                    return false;
-                }
-                true
-            })
-            .map(|e| e.path().to_string())
-            .collect();
-
-        logging::info!(
-            "Deleting {} from manifest storage path {} paths: {:?}",
-            paths.len(),
-            self.path,
-            paths,
-        );
-
-        // Delete all files except the latest delta file.
-        self.object_store
-            .remove(paths)
-            .await
-            .with_context(|_| DeleteObjectSnafu {
-                path: self.path.clone(),
-            })?;
-
-        // Delete the latest delta file and the manifest directory.
-        self.object_store
-            .remove_all(&self.path)
-            .await
-            .with_context(|_| DeleteObjectSnafu {
-                path: self.path.clone(),
-            })?;
-        logging::info!("Deleted manifest storage path {}", self.path);
-
-        Ok(())
-    }
-
-    async fn save(&self, version: ManifestVersion, bytes: &[u8]) -> Result<()> {
-        let path = self.delta_file_path(version);
-        logging::debug!("Save log to manifest storage, version: {}", version);
-        let data = self
-            .compress_type
-            .encode(bytes)
-            .await
-            .context(CompressObjectSnafu {
-                compress_type: self.compress_type,
-                path: &path,
-            })?;
-        self.object_store
-            .write(&path, data)
-            .await
-            .context(WriteObjectSnafu { path })
-    }
-
-    async fn delete(&self, start: ManifestVersion, end: ManifestVersion) -> Result<()> {
-        ensure!(start <= end, InvalidScanIndexSnafu { start, end });
-
-        // Due to backward compatibility, it is possible that the user's log between start and end has not been compressed,
-        // so we need to delete the uncompressed file corresponding to that version, even if the uncompressed file in that version do not exist.
-        let mut paths = Vec::with_capacity(((end - start) * 2) as usize);
-        for version in start..end {
-            paths.push(raw_normalize_path(&self.delta_file_path(version)));
-            if self.compress_type != FALL_BACK_COMPRESS_TYPE {
-                paths.push(raw_normalize_path(&gen_path(
-                    &self.path,
-                    &delta_file(version),
-                    FALL_BACK_COMPRESS_TYPE,
-                )));
-            }
-        }
-
-        logging::debug!(
-            "Deleting logs from manifest storage, start: {}, end: {}",
-            start,
-            end
-        );
-
-        self.object_store
-            .remove(paths.clone())
-            .await
-            .with_context(|_| DeleteObjectSnafu {
-                path: paths.join(","),
-            })?;
-
-        Ok(())
-    }
-
-    async fn save_checkpoint(&self, version: ManifestVersion, bytes: &[u8]) -> Result<()> {
-        let path = self.checkpoint_file_path(version);
-        let data = self
-            .compress_type
-            .encode(bytes)
-            .await
-            .context(CompressObjectSnafu {
-                compress_type: self.compress_type,
-                path: &path,
-            })?;
-        self.object_store
-            .write(&path, data)
-            .await
-            .context(WriteObjectSnafu { path })?;
-
-        // Because last checkpoint file only contain size and version, which is tiny, so we don't compress it.
-        let last_checkpoint_path = self.last_checkpoint_path();
-
-        let checkpoint_metadata = CheckpointMetadata {
-            size: bytes.len(),
-            version,
-            checksum: None,
-            extend_metadata: None,
-        };
-
-        logging::debug!(
-            "Save checkpoint in path: {},  metadata: {:?}",
-            last_checkpoint_path,
-            checkpoint_metadata
-        );
-
-        let bs = checkpoint_metadata.encode()?;
-        self.object_store
-            .write(&last_checkpoint_path, bs.as_ref().to_vec())
-            .await
-            .context(WriteObjectSnafu {
-                path: last_checkpoint_path,
-            })?;
-
-        Ok(())
-    }
-
-    async fn load_checkpoint(
-        &self,
-        version: ManifestVersion,
-    ) -> Result<Option<(ManifestVersion, Vec<u8>)>> {
-        let path = self.checkpoint_file_path(version);
-        // Due to backward compatibility, it is possible that the user's checkpoint not compressed,
-        // so if we don't find file by compressed type. fall back to checkpoint not compressed find again.
-        let checkpoint_data =
-            match self.object_store.read(&path).await {
-                Ok(checkpoint) => {
-                    let decompress_data = self.compress_type.decode(checkpoint).await.context(
-                        DecompressObjectSnafu {
-                            compress_type: self.compress_type,
-                            path,
-                        },
-                    )?;
-                    Ok(Some(decompress_data))
-                }
-                Err(e) => {
-                    if e.kind() == ErrorKind::NotFound {
-                        if self.compress_type != FALL_BACK_COMPRESS_TYPE {
-                            let fall_back_path = gen_path(
-                                &self.path,
-                                &checkpoint_file(version),
-                                FALL_BACK_COMPRESS_TYPE,
-                            );
-                            logging::debug!(
-                                "Failed to load checkpoint from path: {}, fall back to path: {}",
-                                path,
-                                fall_back_path
-                            );
-                            match self.object_store.read(&fall_back_path).await {
-                                Ok(checkpoint) => {
-                                    let decompress_data = FALL_BACK_COMPRESS_TYPE
-                                        .decode(checkpoint)
-                                        .await
-                                        .context(DecompressObjectSnafu {
-                                            compress_type: FALL_BACK_COMPRESS_TYPE,
-                                            path,
-                                        })?;
-                                    Ok(Some(decompress_data))
-                                }
-                                Err(e) if e.kind() == ErrorKind::NotFound => Ok(None),
-                                Err(e) => Err(e).context(ReadObjectSnafu {
-                                    path: &fall_back_path,
-                                }),
-                            }
-                        } else {
-                            Ok(None)
-                        }
-                    } else {
-                        Err(e).context(ReadObjectSnafu { path: &path })
-                    }
-                }
-            }?;
-        Ok(checkpoint_data.map(|data| (version, data)))
-    }
-
-    async fn delete_checkpoint(&self, version: ManifestVersion) -> Result<()> {
-        // Due to backward compatibility, it is possible that the user's checkpoint file has not been compressed,
-        // so we need to delete the uncompressed checkpoint file corresponding to that version, even if the uncompressed checkpoint file in that version do not exist.
-        let paths = if self.compress_type != FALL_BACK_COMPRESS_TYPE {
-            vec![
-                raw_normalize_path(&self.checkpoint_file_path(version)),
-                raw_normalize_path(&gen_path(
-                    &self.path,
-                    &checkpoint_file(version),
-                    FALL_BACK_COMPRESS_TYPE,
-                )),
-            ]
-        } else {
-            vec![raw_normalize_path(&self.checkpoint_file_path(version))]
-        };
-
-        self.object_store
-            .remove(paths.clone())
-            .await
-            .context(DeleteObjectSnafu {
-                path: paths.join(","),
-            })?;
-        Ok(())
-    }
-
-    async fn load_last_checkpoint(&self) -> Result<Option<(ManifestVersion, Vec<u8>)>> {
-        let last_checkpoint_path = self.last_checkpoint_path();
-        let last_checkpoint_data = match self.object_store.read(&last_checkpoint_path).await {
-            Ok(data) => data,
-            Err(e) if e.kind() == ErrorKind::NotFound => {
-                return Ok(None);
-            }
-            Err(e) => {
-                return Err(e).context(ReadObjectSnafu {
-                    path: last_checkpoint_path,
-                });
-            }
-        };
-
-        let checkpoint_metadata = CheckpointMetadata::decode(&last_checkpoint_data)?;
-
-        logging::debug!(
-            "Load checkpoint in path: {}, metadata: {:?}",
-            last_checkpoint_path,
-            checkpoint_metadata
-        );
-
-        self.load_checkpoint(checkpoint_metadata.version).await
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use common_test_util::temp_dir::create_temp_dir;
-    use object_store::services::Fs;
-    use object_store::ObjectStore;
-
-    use super::*;
-
-    fn new_test_manifest_store() -> ManifestObjectStore {
-        common_telemetry::init_default_ut_logging();
-        let tmp_dir = create_temp_dir("test_manifest_log_store");
-        let mut builder = Fs::default();
-        let _ = builder.root(&tmp_dir.path().to_string_lossy());
-        let object_store = ObjectStore::new(builder).unwrap().finish();
-        ManifestObjectStore::new("/", object_store, CompressionType::Uncompressed)
-    }
-
-    #[test]
-    // Define this test mainly to prevent future unintentional changes may break the backward compatibility.
-    fn test_compress_file_path_generation() {
-        let path = "/foo/bar/";
-        let version: ManifestVersion = 0;
-        let file_path = gen_path(path, &delta_file(version), CompressionType::Gzip);
-        assert_eq!(file_path.as_str(), "/foo/bar/00000000000000000000.json.gz")
-    }
-
-    #[tokio::test]
-    async fn test_manifest_log_store_uncompress() {
-        let mut log_store = new_test_manifest_store();
-        log_store.compress_type = CompressionType::Uncompressed;
-        test_manifest_log_store_case(log_store).await;
-    }
-
-    #[tokio::test]
-    async fn test_manifest_log_store_compress() {
-        let mut log_store = new_test_manifest_store();
-        log_store.compress_type = CompressionType::Gzip;
-        test_manifest_log_store_case(log_store).await;
-    }
-
-    async fn test_manifest_log_store_case(log_store: ManifestObjectStore) {
-        for v in 0..5 {
-            log_store
-                .save(v, format!("hello, {v}").as_bytes())
-                .await
-                .unwrap();
-        }
-
-        let mut it = log_store.scan(1, 4).await.unwrap();
-        for v in 1..4 {
-            let (version, bytes) = it.next_log().await.unwrap().unwrap();
-            assert_eq!(v, version);
-            assert_eq!(format!("hello, {v}").as_bytes(), bytes);
-        }
-        assert!(it.next_log().await.unwrap().is_none());
-
-        let mut it = log_store.scan(0, 11).await.unwrap();
-        for v in 0..5 {
-            let (version, bytes) = it.next_log().await.unwrap().unwrap();
-            assert_eq!(v, version);
-            assert_eq!(format!("hello, {v}").as_bytes(), bytes);
-        }
-        assert!(it.next_log().await.unwrap().is_none());
-
-        // Delete [0, 3)
-        log_store.delete(0, 3).await.unwrap();
-
-        // [3, 5) remains
-        let mut it = log_store.scan(0, 11).await.unwrap();
-        for v in 3..5 {
-            let (version, bytes) = it.next_log().await.unwrap().unwrap();
-            assert_eq!(v, version);
-            assert_eq!(format!("hello, {v}").as_bytes(), bytes);
-        }
-        assert!(it.next_log().await.unwrap().is_none());
-
-        // test checkpoint
-        assert!(log_store.load_last_checkpoint().await.unwrap().is_none());
-        log_store
-            .save_checkpoint(3, "checkpoint".as_bytes())
-            .await
-            .unwrap();
-
-        let (v, checkpoint) = log_store.load_last_checkpoint().await.unwrap().unwrap();
-        assert_eq!(checkpoint, "checkpoint".as_bytes());
-        assert_eq!(3, v);
-
-        //delete (,4) logs and keep checkpoint 3.
-        let _ = log_store.delete_until(4, true).await.unwrap();
-        let _ = log_store.load_checkpoint(3).await.unwrap().unwrap();
-        let _ = log_store.load_last_checkpoint().await.unwrap().unwrap();
-        let mut it = log_store.scan(0, 11).await.unwrap();
-        let (version, bytes) = it.next_log().await.unwrap().unwrap();
-        assert_eq!(4, version);
-        assert_eq!("hello, 4".as_bytes(), bytes);
-        assert!(it.next_log().await.unwrap().is_none());
-
-        // delete all logs and checkpoints
-        let _ = log_store.delete_until(11, false).await.unwrap();
-        assert!(log_store.load_checkpoint(3).await.unwrap().is_none());
-        assert!(log_store.load_last_checkpoint().await.unwrap().is_none());
-        let mut it = log_store.scan(0, 11).await.unwrap();
-        assert!(it.next_log().await.unwrap().is_none());
-    }
-
-    #[tokio::test]
-    // test ManifestObjectStore can read/delete previously uncompressed data correctly
-    async fn test_compress_backward_compatible() {
-        let mut log_store = new_test_manifest_store();
-
-        // write uncompress data to stimulate previously uncompressed data
-        log_store.compress_type = CompressionType::Uncompressed;
-        for v in 0..5 {
-            log_store
-                .save(v, format!("hello, {v}").as_bytes())
-                .await
-                .unwrap();
-        }
-        log_store
-            .save_checkpoint(5, "checkpoint_uncompressed".as_bytes())
-            .await
-            .unwrap();
-
-        // change compress type
-        log_store.compress_type = CompressionType::Gzip;
-
-        // test load_last_checkpoint work correctly for previously uncompressed data
-        let (v, checkpoint) = log_store.load_last_checkpoint().await.unwrap().unwrap();
-        assert_eq!(v, 5);
-        assert_eq!(checkpoint, "checkpoint_uncompressed".as_bytes());
-
-        // write compressed data to stimulate compress alogorithom take effect
-        for v in 5..10 {
-            log_store
-                .save(v, format!("hello, {v}").as_bytes())
-                .await
-                .unwrap();
-        }
-        log_store
-            .save_checkpoint(10, "checkpoint_compressed".as_bytes())
-            .await
-            .unwrap();
-
-        // test data reading
-        let mut it = log_store.scan(0, 10).await.unwrap();
-        for v in 0..10 {
-            let (version, bytes) = it.next_log().await.unwrap().unwrap();
-            assert_eq!(v, version);
-            assert_eq!(format!("hello, {v}").as_bytes(), bytes);
-        }
-        let (v, checkpoint) = log_store.load_checkpoint(5).await.unwrap().unwrap();
-        assert_eq!(v, 5);
-        assert_eq!(checkpoint, "checkpoint_uncompressed".as_bytes());
-        let (v, checkpoint) = log_store.load_last_checkpoint().await.unwrap().unwrap();
-        assert_eq!(v, 10);
-        assert_eq!(checkpoint, "checkpoint_compressed".as_bytes());
-
-        // Delete previously uncompressed checkpoint
-        log_store.delete_checkpoint(5).await.unwrap();
-        assert!(log_store.load_checkpoint(5).await.unwrap().is_none());
-
-        // Delete [3, 7), contain uncompressed/compressed data
-        log_store.delete(3, 7).await.unwrap();
-        // [3, 7) deleted
-        let mut it = log_store.scan(3, 7).await.unwrap();
-        assert!(it.next_log().await.unwrap().is_none());
-
-        // Delete util 10, contain uncompressed/compressed data
-        // log 0, 1, 2, 7, 8, 9 will be delete
-        assert_eq!(6, log_store.delete_until(10, false).await.unwrap());
-        let mut it = log_store.scan(0, 10).await.unwrap();
-        assert!(it.next_log().await.unwrap().is_none());
-    }
-}
--- a/src/storage/src/manifest/test_utils.rs
+++ b/src/storage/src/manifest/test_utils.rs
@@ -1,83 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-use datatypes::type_id::LogicalTypeId;
-use store_api::storage::SequenceNumber;
-
-use crate::manifest::action::*;
-use crate::metadata::RegionMetadata;
-use crate::sst::{FileId, FileMeta};
-use crate::test_util::descriptor_util::RegionDescBuilder;
-
-pub const DEFAULT_TEST_FILE_SIZE: u64 = 1024;
-
-pub fn build_region_meta() -> RegionMetadata {
-    let region_name = "region-0";
-    let desc = RegionDescBuilder::new(region_name)
-        .id(0)
-        .push_key_column(("k1", LogicalTypeId::Int32, false))
-        .push_field_column(("v1", LogicalTypeId::Float32, true))
-        .build();
-    desc.try_into().unwrap()
-}
-
-pub fn build_altered_region_meta() -> RegionMetadata {
-    let region_name = "region-0";
-    let desc = RegionDescBuilder::new(region_name)
-        .id(0)
-        .push_key_column(("k1", LogicalTypeId::Int32, false))
-        .push_field_column(("v1", LogicalTypeId::Float32, true))
-        .push_field_column(("v2", LogicalTypeId::Float32, true))
-        .build();
-    desc.try_into().unwrap()
-}
-
-pub fn build_region_edit(
-    sequence: SequenceNumber,
-    files_to_add: &[FileId],
-    files_to_remove: &[FileId],
-) -> RegionEdit {
-    RegionEdit {
-        region_version: 0,
-        flushed_sequence: Some(sequence),
-        files_to_add: files_to_add
-            .iter()
-            .map(|f| FileMeta {
-                region_id: 0.into(),
-                file_id: *f,
-                time_range: None,
-                level: 0,
-                file_size: DEFAULT_TEST_FILE_SIZE,
-            })
-            .collect(),
-        files_to_remove: files_to_remove
-            .iter()
-            .map(|f| FileMeta {
-                region_id: 0.into(),
-                file_id: *f,
-                time_range: None,
-                level: 0,
-                file_size: DEFAULT_TEST_FILE_SIZE,
-            })
-            .collect(),
-        compaction_time_window: None,
-    }
-}
-
-pub fn build_region_truncate(committed_sequence: u64) -> RegionTruncate {
-    RegionTruncate {
-        region_id: 0.into(),
-        committed_sequence,
-    }
-}
--- a/src/storage/src/memtable.rs
+++ b/src/storage/src/memtable.rs
@@ -1,294 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-mod btree;
-mod inserter;
-#[cfg(test)]
-pub mod tests;
-mod version;
-
-use std::fmt;
-use std::sync::atomic::{AtomicBool, AtomicU32, AtomicUsize, Ordering};
-use std::sync::Arc;
-
-use api::v1::OpType;
-use common_time::range::TimestampRange;
-use common_time::Timestamp;
-use datatypes::vectors::VectorRef;
-use store_api::storage::{consts, SequenceNumber};
-
-use crate::error::Result;
-use crate::flush::FlushStrategyRef;
-use crate::memtable::btree::BTreeMemtable;
-pub use crate::memtable::inserter::Inserter;
-pub use crate::memtable::version::MemtableVersion;
-use crate::metrics::WRITE_BUFFER_BYTES;
-use crate::read::Batch;
-use crate::schema::{ProjectedSchemaRef, RegionSchemaRef};
-
-/// Unique id for memtables under same region.
-pub type MemtableId = u32;
-
-#[derive(Debug, Default)]
-pub struct MemtableStats {
-    /// The  estimated bytes allocated by this memtable from heap. Result
-    /// of this method may be larger than the estimated based on `num_rows` because
-    /// of the implementor's pre-alloc behavior.
-    pub estimated_bytes: usize,
-    /// The max timestamp that this memtable contains.
-    pub max_timestamp: Timestamp,
-    /// The min timestamp that this memtable contains.
-    pub min_timestamp: Timestamp,
-}
-
-impl MemtableStats {
-    pub fn bytes_allocated(&self) -> usize {
-        self.estimated_bytes
-    }
-}
-
-/// In memory storage.
-pub trait Memtable: Send + Sync + fmt::Debug {
-    /// Returns id of this memtable.
-    fn id(&self) -> MemtableId;
-
-    /// Returns schema of the memtable.
-    fn schema(&self) -> RegionSchemaRef;
-
-    /// Write key/values to the memtable.
-    ///
-    /// # Panics
-    /// Panics if the schema of key/value differs from memtable's schema.
-    fn write(&self, kvs: &KeyValues) -> Result<()>;
-
-    /// Iterates the memtable.
-    fn iter(&self, ctx: IterContext) -> Result<BoxedBatchIterator>;
-
-    /// Returns the number of rows in the memtable.
-    fn num_rows(&self) -> usize;
-
-    /// Returns stats of this memtable.
-    fn stats(&self) -> MemtableStats;
-
-    /// Mark the memtable is immutable.
-    ///
-    /// The region MUST call this inside the region writer's write lock.
-    fn mark_immutable(&self);
-}
-
-pub type MemtableRef = Arc<dyn Memtable>;
-
-/// Context for iterating memtable.
-///
-/// Should be cheap to clone.
-#[derive(Debug, Clone)]
-pub struct IterContext {
-    /// The suggested batch size of the iterator.
-    pub batch_size: usize,
-    /// Max visible sequence (inclusive).
-    pub visible_sequence: SequenceNumber,
-
-    /// Schema the reader expect to read.
-    ///
-    /// Set to `None` to read all columns.
-    pub projected_schema: Option<ProjectedSchemaRef>,
-
-    /// Timestamp range
-    pub time_range: Option<TimestampRange>,
-}
-
-impl Default for IterContext {
-    fn default() -> Self {
-        Self {
-            batch_size: consts::READ_BATCH_SIZE,
-            // All data in memory is visible by default.
-            visible_sequence: SequenceNumber::MAX,
-            projected_schema: None,
-            time_range: None,
-        }
-    }
-}
-
-/// The ordering of the iterator output.
-#[derive(Debug, PartialEq, Eq)]
-pub enum RowOrdering {
-    /// The output rows are unordered.
-    Unordered,
-
-    /// The output rows are ordered by key.
-    Key,
-}
-
-/// Iterator of memtable.
-///
-/// Since data of memtable are stored in memory, so avoid defining this trait
-/// as an async trait.
-pub trait BatchIterator: Iterator<Item = Result<Batch>> + Send + Sync {
-    /// Returns the schema of this iterator.
-    fn schema(&self) -> ProjectedSchemaRef;
-
-    /// Returns the ordering of the output rows from this iterator.
-    fn ordering(&self) -> RowOrdering;
-}
-
-pub type BoxedBatchIterator = Box<dyn BatchIterator>;
-
-pub trait MemtableBuilder: Send + Sync + fmt::Debug {
-    fn build(&self, schema: RegionSchemaRef) -> MemtableRef;
-}
-
-pub type MemtableBuilderRef = Arc<dyn MemtableBuilder>;
-
-/// Key-value pairs in columnar format.
-pub struct KeyValues {
-    pub sequence: SequenceNumber,
-    pub op_type: OpType,
-    /// Start index of these key-value paris in batch. Each row in the same batch has
-    /// a unique index to identify it.
-    pub start_index_in_batch: usize,
-    pub keys: Vec<VectorRef>,
-    pub values: Vec<VectorRef>,
-    pub timestamp: Option<VectorRef>,
-}
-
-impl KeyValues {
-    // Note that `sequence` is not reset.
-    fn reset(&mut self, op_type: OpType, index_in_batch: usize) {
-        self.op_type = op_type;
-        self.start_index_in_batch = index_in_batch;
-        self.keys.clear();
-        self.values.clear();
-        self.timestamp = None;
-    }
-
-    pub fn len(&self) -> usize {
-        self.timestamp.as_ref().map(|v| v.len()).unwrap_or_default()
-    }
-
-    pub fn is_empty(&self) -> bool {
-        self.len() == 0
-    }
-
-    pub fn estimated_memory_size(&self) -> usize {
-        self.keys.iter().fold(0, |acc, v| acc + v.memory_size())
-            + self.values.iter().fold(0, |acc, v| acc + v.memory_size())
-            + self
-                .timestamp
-                .as_ref()
-                .map(|t| t.memory_size())
-                .unwrap_or_default()
-    }
-}
-
-/// Memtable memory allocation tracker.
-pub struct AllocTracker {
-    flush_strategy: Option<FlushStrategyRef>,
-    /// Bytes allocated by the tracker.
-    bytes_allocated: AtomicUsize,
-    /// Whether allocating is done.
-    is_done_allocating: AtomicBool,
-}
-
-impl fmt::Debug for AllocTracker {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        f.debug_struct("AllocTracker")
-            .field("bytes_allocated", &self.bytes_allocated)
-            .field("is_done_allocating", &self.is_done_allocating)
-            .finish()
-    }
-}
-
-impl AllocTracker {
-    /// Returns a new [AllocTracker].
-    pub fn new(flush_strategy: Option<FlushStrategyRef>) -> AllocTracker {
-        AllocTracker {
-            flush_strategy,
-            bytes_allocated: AtomicUsize::new(0),
-            is_done_allocating: AtomicBool::new(false),
-        }
-    }
-
-    /// Tracks `bytes` memory is allocated.
-    pub(crate) fn on_allocate(&self, bytes: usize) {
-        let _ = self.bytes_allocated.fetch_add(bytes, Ordering::Relaxed);
-        WRITE_BUFFER_BYTES.add(bytes as i64);
-        if let Some(flush_strategy) = &self.flush_strategy {
-            flush_strategy.reserve_mem(bytes);
-        }
-    }
-
-    /// Marks we have finished allocating memory so we can free it from
-    /// the write buffer's limit.
-    ///
-    /// The region MUST ensure that it calls this method inside the region writer's write lock.
-    pub(crate) fn done_allocating(&self) {
-        if let Some(flush_strategy) = &self.flush_strategy {
-            if self
-                .is_done_allocating
-                .compare_exchange(false, true, Ordering::Relaxed, Ordering::Relaxed)
-                .is_ok()
-            {
-                flush_strategy.schedule_free_mem(self.bytes_allocated.load(Ordering::Relaxed));
-            }
-        }
-    }
-
-    /// Returns bytes allocated.
-    pub(crate) fn bytes_allocated(&self) -> usize {
-        self.bytes_allocated.load(Ordering::Relaxed)
-    }
-}
-
-impl Drop for AllocTracker {
-    fn drop(&mut self) {
-        if !self.is_done_allocating.load(Ordering::Relaxed) {
-            self.done_allocating();
-        }
-
-        let bytes_allocated = self.bytes_allocated.load(Ordering::Relaxed);
-        WRITE_BUFFER_BYTES.sub(bytes_allocated as i64);
-
-        // Memory tracked by this tracker is freed.
-        if let Some(flush_strategy) = &self.flush_strategy {
-            flush_strategy.free_mem(bytes_allocated);
-        }
-    }
-}
-
-/// Default memtable builder that builds `BTreeMemtable`.
-#[derive(Debug, Default)]
-pub struct DefaultMemtableBuilder {
-    memtable_id: AtomicU32,
-    flush_strategy: Option<FlushStrategyRef>,
-}
-
-impl DefaultMemtableBuilder {
-    /// Returns a new [DefaultMemtableBuilder] with specific `flush_strategy`.
-    ///
-    /// If `flush_strategy` is `Some`, the memtable will report its memory usage
-    /// to the `flush_strategy`.
-    pub fn with_flush_strategy(flush_strategy: Option<FlushStrategyRef>) -> Self {
-        Self {
-            memtable_id: AtomicU32::new(0),
-            flush_strategy,
-        }
-    }
-}
-
-impl MemtableBuilder for DefaultMemtableBuilder {
-    fn build(&self, schema: RegionSchemaRef) -> MemtableRef {
-        let id = self.memtable_id.fetch_add(1, Ordering::Relaxed);
-        Arc::new(BTreeMemtable::new(id, schema, self.flush_strategy.clone()))
-    }
-}
--- a/src/storage/src/memtable/btree.rs
+++ b/src/storage/src/memtable/btree.rs
@@ -1,573 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-use std::cmp::Ordering;
-use std::collections::{btree_map, BTreeMap};
-use std::fmt;
-use std::ops::Bound;
-use std::sync::atomic::{AtomicI64, Ordering as AtomicOrdering};
-use std::sync::{Arc, RwLock};
-
-use api::v1::OpType;
-use common_time::range::TimestampRange;
-use datatypes::data_type::DataType;
-use datatypes::prelude::*;
-use datatypes::value::Value;
-use datatypes::vectors::{UInt64Vector, UInt64VectorBuilder, UInt8Vector, UInt8VectorBuilder};
-use store_api::storage::{SequenceNumber, MIN_OP_TYPE};
-
-use crate::error::Result;
-use crate::flush::FlushStrategyRef;
-use crate::memtable::{
-    AllocTracker, BatchIterator, BoxedBatchIterator, IterContext, KeyValues, Memtable, MemtableId,
-    MemtableStats, RowOrdering,
-};
-use crate::read::Batch;
-use crate::schema::compat::ReadAdapter;
-use crate::schema::{ProjectedSchema, ProjectedSchemaRef, RegionSchemaRef};
-
-type RwLockMap = RwLock<BTreeMap<InnerKey, RowValue>>;
-
-/// A simple memtable implementation based on std's [`BTreeMap`].
-///
-/// Mainly for test purpose, don't use in production.
-pub struct BTreeMemtable {
-    id: MemtableId,
-    schema: RegionSchemaRef,
-    map: Arc<RwLockMap>,
-    alloc_tracker: AllocTracker,
-    max_timestamp: AtomicI64,
-    min_timestamp: AtomicI64,
-}
-
-impl BTreeMemtable {
-    pub fn new(
-        id: MemtableId,
-        schema: RegionSchemaRef,
-        flush_strategy: Option<FlushStrategyRef>,
-    ) -> BTreeMemtable {
-        BTreeMemtable {
-            id,
-            schema,
-            map: Arc::new(RwLock::new(BTreeMap::new())),
-            alloc_tracker: AllocTracker::new(flush_strategy),
-            max_timestamp: AtomicI64::new(i64::MIN),
-            min_timestamp: AtomicI64::new(i64::MAX),
-        }
-    }
-
-    /// Updates memtable stats.
-    /// This function is guarded by `BTreeMemtable::map` so that store-after-load is safe.
-    fn update_stats(&self, request_size: usize, min: Option<Value>, max: Option<Value>) {
-        self.alloc_tracker.on_allocate(request_size);
-
-        if let Some(min) = min {
-            let min_val = min
-                .as_timestamp()
-                .expect("Min timestamp must be a valid timestamp value")
-                .value();
-            let cur_min = self.min_timestamp.load(AtomicOrdering::Relaxed);
-            if min_val < cur_min {
-                self.min_timestamp.store(min_val, AtomicOrdering::Relaxed);
-            }
-        }
-
-        if let Some(max) = max {
-            let cur_max = self.max_timestamp.load(AtomicOrdering::Relaxed);
-            let max_val = max
-                .as_timestamp()
-                .expect("Max timestamp must be a valid timestamp value")
-                .value();
-            if max_val > cur_max {
-                self.max_timestamp.store(max_val, AtomicOrdering::Relaxed);
-            }
-        }
-    }
-}
-
-impl fmt::Debug for BTreeMemtable {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        let len = self.map.read().unwrap().len();
-
-        f.debug_struct("BTreeMemtable")
-            .field("id", &self.id)
-            // Only show StoreSchema
-            .field("schema", &self.schema)
-            .field("rows", &len)
-            .field("alloc_tracker", &self.alloc_tracker)
-            .field("max_timestamp", &self.max_timestamp)
-            .field("min_timestamp", &self.min_timestamp)
-            .finish()
-    }
-}
-
-impl Memtable for BTreeMemtable {
-    fn id(&self) -> MemtableId {
-        self.id
-    }
-
-    fn schema(&self) -> RegionSchemaRef {
-        self.schema.clone()
-    }
-
-    fn write(&self, kvs: &KeyValues) -> Result<()> {
-        debug_assert!(kvs.timestamp.is_some());
-        let iter_row = IterRow::new(kvs);
-        let mut map = self.map.write().unwrap();
-
-        let mut min_ts = None;
-        let mut max_ts = None;
-        for (inner_key, row_value) in iter_row {
-            let ts = inner_key.timestamp();
-            let min_ts = min_ts.get_or_insert_with(|| ts.clone());
-            let max_ts = max_ts.get_or_insert_with(|| ts.clone());
-            if ts < min_ts {
-                *min_ts = ts.clone();
-            }
-            if ts > max_ts {
-                *max_ts = ts.clone();
-            }
-            let _ = map.insert(inner_key, row_value);
-        }
-
-        self.update_stats(kvs.estimated_memory_size(), min_ts, max_ts);
-
-        Ok(())
-    }
-
-    fn iter(&self, ctx: IterContext) -> Result<BoxedBatchIterator> {
-        assert!(ctx.batch_size > 0);
-
-        let iter = BTreeIterator::new(ctx, self.schema.clone(), self.map.clone())?;
-
-        Ok(Box::new(iter))
-    }
-
-    fn num_rows(&self) -> usize {
-        self.map.read().unwrap().len()
-    }
-
-    fn stats(&self) -> MemtableStats {
-        let ts_meta = self.schema.column_metadata(self.schema.timestamp_index());
-
-        let Some(timestamp_type) = ts_meta.desc.data_type.as_timestamp() else {
-            // safety: timestamp column always has timestamp type, otherwise it's a bug.
-            panic!(
-                "Timestamp column is not a valid timestamp type: {:?}",
-                self.schema
-            );
-        };
-
-        MemtableStats {
-            estimated_bytes: self.alloc_tracker.bytes_allocated(),
-            max_timestamp: timestamp_type
-                .create_timestamp(self.max_timestamp.load(AtomicOrdering::Relaxed)),
-            min_timestamp: timestamp_type
-                .create_timestamp(self.min_timestamp.load(AtomicOrdering::Relaxed)),
-        }
-    }
-
-    fn mark_immutable(&self) {
-        self.alloc_tracker.done_allocating();
-    }
-}
-
-struct BTreeIterator {
-    ctx: IterContext,
-    /// Schema of this memtable.
-    schema: RegionSchemaRef,
-    /// Projected schema that user expect to read.
-    projected_schema: ProjectedSchemaRef,
-    adapter: ReadAdapter,
-    map: Arc<RwLockMap>,
-    last_key: Option<InnerKey>,
-}
-
-impl BatchIterator for BTreeIterator {
-    fn schema(&self) -> ProjectedSchemaRef {
-        self.projected_schema.clone()
-    }
-
-    fn ordering(&self) -> RowOrdering {
-        RowOrdering::Key
-    }
-}
-
-impl Iterator for BTreeIterator {
-    type Item = Result<Batch>;
-
-    fn next(&mut self) -> Option<Result<Batch>> {
-        self.next_batch().transpose()
-    }
-}
-
-impl BTreeIterator {
-    fn new(
-        ctx: IterContext,
-        schema: RegionSchemaRef,
-        map: Arc<RwLockMap>,
-    ) -> Result<BTreeIterator> {
-        let projected_schema = ctx
-            .projected_schema
-            .clone()
-            .unwrap_or_else(|| Arc::new(ProjectedSchema::no_projection(schema.clone())));
-        let adapter = ReadAdapter::new(schema.store_schema().clone(), projected_schema.clone())?;
-
-        Ok(BTreeIterator {
-            ctx,
-            schema,
-            projected_schema,
-            adapter,
-            map,
-            last_key: None,
-        })
-    }
-
-    fn next_batch(&mut self) -> Result<Option<Batch>> {
-        let map = self.map.read().unwrap();
-        let iter = if let Some(last_key) = &self.last_key {
-            map.range((Bound::Excluded(last_key), Bound::Unbounded))
-        } else {
-            map.range(..)
-        };
-
-        let iter = MapIterWrapper::new(iter, self.ctx.visible_sequence, self.ctx.time_range);
-        let (keys, sequences, op_types, values) = collect_iter(iter, self.ctx.batch_size);
-
-        if keys.is_empty() {
-            return Ok(None);
-        }
-        self.last_key = keys.last().map(|k| {
-            let mut last_key = (*k).clone();
-            last_key.reset_for_seek();
-            last_key
-        });
-
-        let key_data_types = self
-            .schema
-            .row_key_columns()
-            .map(|column_meta| column_meta.desc.data_type.clone());
-        let value_data_types = self
-            .schema
-            .field_columns()
-            .map(|column_meta| column_meta.desc.data_type.clone());
-
-        let key_columns = rows_to_vectors(
-            key_data_types,
-            self.adapter.source_key_needed(),
-            keys.as_slice(),
-        );
-        let field_columns = rows_to_vectors(
-            value_data_types,
-            self.adapter.source_value_needed(),
-            values.as_slice(),
-        );
-
-        let batch = self.adapter.batch_from_parts(
-            key_columns,
-            field_columns,
-            Arc::new(sequences),
-            Arc::new(op_types),
-        )?;
-
-        Ok(Some(batch))
-    }
-}
-
-fn collect_iter<'a, I: Iterator<Item = (&'a InnerKey, &'a RowValue)>>(
-    iter: I,
-    batch_size: usize,
-) -> (
-    Vec<&'a InnerKey>,
-    UInt64Vector,
-    UInt8Vector,
-    Vec<&'a RowValue>,
-) {
-    let mut keys = Vec::with_capacity(batch_size);
-    let mut sequences = UInt64VectorBuilder::with_capacity(batch_size);
-    let mut op_types = UInt8VectorBuilder::with_capacity(batch_size);
-    let mut values = Vec::with_capacity(batch_size);
-    for (inner_key, row_value) in iter.take(batch_size) {
-        keys.push(inner_key);
-        sequences.push(Some(inner_key.sequence));
-        op_types.push(Some(inner_key.op_type as u8));
-        values.push(row_value);
-    }
-
-    (keys, sequences.finish(), op_types.finish(), values)
-}
-
-/// `MapIterWrapper` removes same user key with invisible sequence.
-struct MapIterWrapper<'a, InnerKey, RowValue> {
-    iter: btree_map::Range<'a, InnerKey, RowValue>,
-    prev_key: Option<InnerKey>,
-    visible_sequence: SequenceNumber,
-    time_range: Option<TimestampRange>,
-}
-
-impl<'a> MapIterWrapper<'a, InnerKey, RowValue> {
-    fn new(
-        iter: btree_map::Range<'a, InnerKey, RowValue>,
-        visible_sequence: SequenceNumber,
-        time_range: Option<TimestampRange>,
-    ) -> MapIterWrapper<'a, InnerKey, RowValue> {
-        MapIterWrapper {
-            iter,
-            prev_key: None,
-            visible_sequence,
-            time_range,
-        }
-    }
-
-    fn next_visible_entry(&mut self) -> Option<(&'a InnerKey, &'a RowValue)> {
-        for (k, v) in self.iter.by_ref() {
-            if k.is_visible(self.visible_sequence) && k.is_in_time_range(&self.time_range) {
-                return Some((k, v));
-            }
-        }
-
-        None
-    }
-}
-
-impl<'a> Iterator for MapIterWrapper<'a, InnerKey, RowValue> {
-    type Item = (&'a InnerKey, &'a RowValue);
-
-    fn next(&mut self) -> Option<(&'a InnerKey, &'a RowValue)> {
-        let (mut current_key, mut current_value) = self.next_visible_entry()?;
-        if self.prev_key.is_none() {
-            self.prev_key = Some(current_key.clone());
-            return Some((current_key, current_value));
-        }
-
-        let prev_key = self.prev_key.take().unwrap();
-        while prev_key.is_row_key_equal(current_key) {
-            if let Some((next_key, next_value)) = self.next_visible_entry() {
-                (current_key, current_value) = (next_key, next_value);
-            } else {
-                return None;
-            }
-        }
-
-        self.prev_key = Some(current_key.clone());
-
-        Some((current_key, current_value))
-    }
-}
-
-struct IterRow<'a> {
-    kvs: &'a KeyValues,
-    index: usize,
-    len: usize,
-}
-
-impl<'a> IterRow<'a> {
-    fn new(kvs: &KeyValues) -> IterRow {
-        IterRow {
-            kvs,
-            index: 0,
-            len: kvs.len(),
-        }
-    }
-
-    fn fetch_row(&mut self) -> (InnerKey, RowValue) {
-        let mut row_key: Vec<_> = self
-            .kvs
-            .keys
-            .iter()
-            .map(|vector| vector.get(self.index))
-            .collect();
-
-        // unwrap safety: KeyValues always contains a timestamp as guaranteed in [Inserter::write_one_mutation]
-        row_key.push(self.kvs.timestamp.as_ref().unwrap().get(self.index));
-        let inner_key = InnerKey {
-            row_key,
-            sequence: self.kvs.sequence,
-            index_in_batch: self.kvs.start_index_in_batch + self.index,
-            op_type: self.kvs.op_type,
-        };
-
-        let row_value = RowValue {
-            values: self
-                .kvs
-                .values
-                .iter()
-                .map(|vector| vector.get(self.index))
-                .collect(),
-        };
-
-        self.index += 1;
-
-        (inner_key, row_value)
-    }
-}
-
-impl<'a> Iterator for IterRow<'a> {
-    type Item = (InnerKey, RowValue);
-
-    fn next(&mut self) -> Option<(InnerKey, RowValue)> {
-        if self.index >= self.len {
-            return None;
-        }
-
-        Some(self.fetch_row())
-    }
-
-    fn size_hint(&self) -> (usize, Option<usize>) {
-        (self.kvs.keys.len(), Some(self.kvs.keys.len()))
-    }
-}
-
-#[derive(Clone, Debug, PartialEq, Eq)]
-struct InnerKey {
-    /// User defined primary keys
-    row_key: Vec<Value>,
-    /// Sequence number of row
-    sequence: SequenceNumber,
-    index_in_batch: usize,
-    op_type: OpType,
-}
-
-impl Ord for InnerKey {
-    fn cmp(&self, other: &InnerKey) -> Ordering {
-        // Order by (row_key asc, sequence desc, index_in_batch desc, op_type desc), though (key,
-        // sequence, index_in_batch) should be enough to disambiguate.
-        self.row_key
-            .cmp(&other.row_key)
-            .then_with(|| other.sequence.cmp(&self.sequence))
-            .then_with(|| other.index_in_batch.cmp(&self.index_in_batch))
-            .then_with(|| other.op_type.cmp(&self.op_type))
-    }
-}
-
-impl PartialOrd for InnerKey {
-    fn partial_cmp(&self, other: &InnerKey) -> Option<Ordering> {
-        Some(self.cmp(other))
-    }
-}
-
-impl InnerKey {
-    #[inline]
-    fn timestamp(&self) -> &Value {
-        // safety: row key shall at least contain a timestamp column
-        self.row_key.last().unwrap()
-    }
-
-    #[inline]
-    fn is_row_key_equal(&self, other: &InnerKey) -> bool {
-        self.row_key == other.row_key
-    }
-
-    #[inline]
-    fn is_visible(&self, sequence: SequenceNumber) -> bool {
-        self.sequence <= sequence
-    }
-
-    #[inline]
-    fn is_in_time_range(&self, range: &Option<TimestampRange>) -> bool {
-        let Some(range) = range else {
-            return true;
-        };
-        range.contains(
-            &self
-                .timestamp()
-                .as_timestamp()
-                .expect("Timestamp field must be a valid timestamp value"),
-        )
-    }
-
-    /// Reset the `InnerKey` so that we can use it to seek next key that
-    /// has different row key.
-    fn reset_for_seek(&mut self) {
-        // sequence, index_in_batch, op_type are ordered in desc order, so
-        // we can represent the last inner key with same row key by setting them
-        // to zero (Minimum value).
-        self.sequence = 0;
-        self.index_in_batch = 0;
-        self.op_type = MIN_OP_TYPE;
-    }
-}
-
-#[derive(Clone, Debug)]
-struct RowValue {
-    values: Vec<Value>,
-}
-
-trait RowsProvider {
-    fn row_num(&self) -> usize;
-
-    fn column_num(&self) -> usize {
-        self.row_by_index(0).len()
-    }
-
-    fn is_empty(&self) -> bool {
-        self.row_num() == 0
-    }
-
-    fn row_by_index(&self, idx: usize) -> &Vec<Value>;
-}
-
-impl<'a> RowsProvider for &'a [&InnerKey] {
-    fn row_num(&self) -> usize {
-        self.len()
-    }
-
-    fn row_by_index(&self, idx: usize) -> &Vec<Value> {
-        &self[idx].row_key
-    }
-}
-
-impl<'a> RowsProvider for &'a [&RowValue] {
-    fn row_num(&self) -> usize {
-        self.len()
-    }
-
-    fn row_by_index(&self, idx: usize) -> &Vec<Value> {
-        &self[idx].values
-    }
-}
-
-fn rows_to_vectors<I: Iterator<Item = ConcreteDataType>, T: RowsProvider>(
-    data_types: I,
-    column_needed: &[bool],
-    provider: T,
-) -> Vec<VectorRef> {
-    if provider.is_empty() {
-        return Vec::new();
-    }
-
-    let column_num = provider.column_num();
-    let row_num = provider.row_num();
-    let mut builders = Vec::with_capacity(column_num);
-    for data_type in data_types {
-        builders.push(data_type.create_mutable_vector(row_num));
-    }
-
-    let mut vectors = Vec::with_capacity(column_num);
-    for (col_idx, builder) in builders.iter_mut().enumerate() {
-        if !column_needed[col_idx] {
-            continue;
-        }
-
-        for row_idx in 0..row_num {
-            let row = provider.row_by_index(row_idx);
-            let value = &row[col_idx];
-            builder.as_mut().push_value_ref(value.as_value_ref());
-        }
-
-        vectors.push(builder.to_vector());
-    }
-
-    vectors
-}
--- a/src/storage/src/memtable/inserter.rs
+++ b/src/storage/src/memtable/inserter.rs
@@ -1,251 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-use api::v1::OpType;
-use store_api::storage::SequenceNumber;
-
-use super::MemtableRef;
-use crate::error::Result;
-use crate::memtable::KeyValues;
-use crate::metrics::MEMTABLE_WRITE_ELAPSED;
-use crate::write_batch::{Mutation, Payload};
-
-/// Wraps logic of inserting key/values in [WriteBatch](crate::write_batch::WriteBatch) to [Memtable](crate::memtable::Memtable).
-pub struct Inserter {
-    /// Sequence of the batch to be inserted.
-    sequence: SequenceNumber,
-    /// Used to calculate the start index in batch for `KeyValues`.
-    index_in_batch: usize,
-}
-
-impl Inserter {
-    pub fn new(sequence: SequenceNumber) -> Inserter {
-        Inserter {
-            sequence,
-            index_in_batch: 0,
-        }
-    }
-
-    /// Insert write batch payload into memtable.
-    ///
-    /// Won't do schema validation if not configured. Caller (mostly the `RegionWriter` should ensure the
-    /// schemas of `memtable` are consistent with `payload`'s.
-    pub fn insert_memtable(&mut self, payload: &Payload, memtable: &MemtableRef) -> Result<()> {
-        let _timer = MEMTABLE_WRITE_ELAPSED.start_timer();
-
-        if payload.is_empty() {
-            return Ok(());
-        }
-
-        // This function only makes effect in debug mode.
-        validate_input_and_memtable_schemas(payload, memtable);
-
-        // Enough to hold all key or value columns.
-        let total_column_num = payload.schema.num_columns();
-        // Reusable KeyValues buffer.
-        let mut kvs = KeyValues {
-            sequence: self.sequence,
-            op_type: OpType::Put,
-            start_index_in_batch: self.index_in_batch,
-            keys: Vec::with_capacity(total_column_num),
-            values: Vec::with_capacity(total_column_num),
-            timestamp: None,
-        };
-
-        for mutation in &payload.mutations {
-            self.write_one_mutation(mutation, memtable, &mut kvs)?;
-        }
-
-        Ok(())
-    }
-
-    fn write_one_mutation(
-        &mut self,
-        mutation: &Mutation,
-        memtable: &MemtableRef,
-        kvs: &mut KeyValues,
-    ) -> Result<()> {
-        let schema = memtable.schema();
-        let num_rows = mutation.record_batch.num_rows();
-
-        kvs.reset(mutation.op_type, self.index_in_batch);
-
-        let ts_idx = schema.timestamp_index();
-        kvs.timestamp = Some(mutation.record_batch.column(ts_idx).clone());
-        for key_idx in 0..ts_idx {
-            kvs.keys.push(mutation.record_batch.column(key_idx).clone());
-        }
-        for value_idx in schema.value_indices() {
-            kvs.values
-                .push(mutation.record_batch.column(value_idx).clone());
-        }
-
-        memtable.write(kvs)?;
-
-        self.index_in_batch += num_rows;
-
-        Ok(())
-    }
-}
-
-fn validate_input_and_memtable_schemas(payload: &Payload, memtable: &MemtableRef) {
-    if cfg!(debug_assertions) {
-        let payload_schema = &payload.schema;
-        let memtable_schema = memtable.schema();
-        let user_schema = memtable_schema.user_schema();
-        debug_assert_eq!(payload_schema.version(), user_schema.version());
-        // Only validate column schemas.
-        debug_assert_eq!(
-            payload_schema.column_schemas(),
-            user_schema.column_schemas()
-        );
-    }
-}
-
-/// Holds `start` and `end` indexes to get a slice `[start, end)` from the vector whose
-/// timestamps belong to same time range at `range_index`.
-#[derive(Debug, PartialEq)]
-struct SliceIndex {
-    start: usize,
-    end: usize,
-    /// Index in time ranges.
-    range_index: usize,
-}
-
-#[cfg(test)]
-mod tests {
-    use std::collections::HashMap;
-    use std::sync::Arc;
-
-    use common_time::timestamp::Timestamp;
-    use datatypes::type_id::LogicalTypeId;
-    use datatypes::value::Value;
-    use datatypes::vectors::{Int64Vector, TimestampMillisecondVector, VectorRef};
-    use store_api::storage::WriteRequest;
-
-    use super::*;
-    use crate::memtable::{DefaultMemtableBuilder, IterContext, MemtableBuilder};
-    use crate::metadata::RegionMetadata;
-    use crate::schema::RegionSchemaRef;
-    use crate::test_util::descriptor_util::RegionDescBuilder;
-    use crate::test_util::write_batch_util;
-    use crate::write_batch::WriteBatch;
-
-    fn new_test_write_batch() -> WriteBatch {
-        write_batch_util::new_write_batch(
-            &[
-                ("ts", LogicalTypeId::TimestampMillisecond, false),
-                ("value", LogicalTypeId::Int64, true),
-            ],
-            Some(0),
-            1,
-        )
-    }
-
-    fn new_region_schema() -> RegionSchemaRef {
-        let desc = RegionDescBuilder::new("test")
-            .timestamp(("ts", LogicalTypeId::TimestampMillisecond, false))
-            .push_field_column(("value", LogicalTypeId::Int64, true))
-            .build();
-        let metadata: RegionMetadata = desc.try_into().unwrap();
-
-        metadata.schema().clone()
-    }
-
-    fn put_batch(batch: &mut WriteBatch, data: &[(i64, Option<i64>)]) {
-        let ts = TimestampMillisecondVector::from_values(data.iter().map(|v| v.0));
-        let value = Int64Vector::from(data.iter().map(|v| v.1).collect::<Vec<_>>());
-        let put_data = HashMap::from([
-            ("ts".to_string(), Arc::new(ts) as VectorRef),
-            ("value".to_string(), Arc::new(value) as VectorRef),
-        ]);
-
-        batch.put(put_data).unwrap();
-    }
-
-    fn check_memtable_content(
-        mem: &MemtableRef,
-        sequence: SequenceNumber,
-        data: &[(i64, Option<i64>)],
-        max_ts: i64,
-        min_ts: i64,
-    ) {
-        let iter = mem.iter(IterContext::default()).unwrap();
-        assert_eq!(min_ts, mem.stats().min_timestamp.value());
-        assert_eq!(max_ts, mem.stats().max_timestamp.value());
-
-        let mut index = 0;
-        for batch in iter {
-            let batch = batch.unwrap();
-            let row_num = batch.column(0).len();
-            for i in 0..row_num {
-                let ts = batch.column(0).get(i);
-                let v = batch.column(1).get(i);
-                assert_eq!(
-                    Value::Timestamp(Timestamp::new_millisecond(data[index].0)),
-                    ts
-                );
-                assert_eq!(Value::from(data[index].1), v);
-                assert_eq!(Value::from(sequence), batch.column(2).get(i));
-
-                index += 1;
-            }
-        }
-
-        assert_eq!(data.len(), index);
-    }
-
-    #[test]
-    fn test_inserter_put_one_memtable() {
-        let sequence = 11111;
-        let memtable_schema = new_region_schema();
-        let mutable_memtable = DefaultMemtableBuilder::default().build(memtable_schema);
-        let mut inserter = Inserter::new(sequence);
-
-        let mut batch = new_test_write_batch();
-        put_batch(&mut batch, &[(1, Some(1)), (2, None)]);
-        // Also test multiple put data in one batch.
-        put_batch(
-            &mut batch,
-            &[
-                (3, None),
-                (2, None), // Duplicate entries in same put data.
-                (2, Some(2)),
-                (4, Some(4)),
-                (201, Some(201)),
-                (102, None),
-                (101, Some(101)),
-            ],
-        );
-
-        inserter
-            .insert_memtable(batch.payload(), &mutable_memtable)
-            .unwrap();
-        check_memtable_content(
-            &mutable_memtable,
-            sequence,
-            &[
-                (1, Some(1)),
-                (2, Some(2)),
-                (3, None),
-                (4, Some(4)),
-                (101, Some(101)),
-                (102, None),
-                (201, Some(201)),
-            ],
-            201,
-            1,
-        );
-    }
-}
--- a/src/storage/src/memtable/tests.rs
+++ b/src/storage/src/memtable/tests.rs
@@ -1,595 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-use common_time::Timestamp;
-use datatypes::prelude::*;
-use datatypes::timestamp::TimestampMillisecond;
-use datatypes::type_id::LogicalTypeId;
-use datatypes::vectors::{
-    TimestampMillisecondVector, TimestampMillisecondVectorBuilder, UInt64Vector,
-    UInt64VectorBuilder, UInt8Vector,
-};
-
-use super::*;
-use crate::metadata::RegionMetadata;
-use crate::schema::{ProjectedSchema, RegionSchemaRef};
-use crate::test_util::descriptor_util::RegionDescBuilder;
-
-// Schema for testing memtable:
-// - key: Int64(timestamp), UInt64(version),
-// - value: UInt64, UInt64
-pub fn schema_for_test() -> RegionSchemaRef {
-    // Just build a region desc and use its columns metadata.
-    let desc = RegionDescBuilder::new("test")
-        .push_field_column(("v0", LogicalTypeId::UInt64, true))
-        .push_field_column(("v1", LogicalTypeId::UInt64, true))
-        .build();
-    let metadata: RegionMetadata = desc.try_into().unwrap();
-
-    metadata.schema().clone()
-}
-
-fn kvs_for_test_with_index(
-    sequence: SequenceNumber,
-    op_type: OpType,
-    start_index_in_batch: usize,
-    keys: &[TimestampMillisecond],
-    values: &[(Option<u64>, Option<u64>)],
-) -> KeyValues {
-    assert_eq!(keys.len(), values.len());
-
-    let mut key_builders = TimestampMillisecondVectorBuilder::with_capacity(keys.len());
-    for key in keys {
-        key_builders.push(Some(*key));
-    }
-    let ts_col = Arc::new(key_builders.finish()) as _;
-
-    let mut value_builders = (
-        UInt64VectorBuilder::with_capacity(values.len()),
-        UInt64VectorBuilder::with_capacity(values.len()),
-    );
-    for value in values {
-        value_builders.0.push(value.0);
-        value_builders.1.push(value.1);
-    }
-    let row_values = vec![
-        Arc::new(value_builders.0.finish()) as _,
-        Arc::new(value_builders.1.finish()) as _,
-    ];
-
-    let kvs = KeyValues {
-        sequence,
-        op_type,
-        start_index_in_batch,
-        keys: vec![],
-        values: row_values,
-        timestamp: Some(ts_col),
-    };
-
-    assert_eq!(keys.len(), kvs.len());
-    assert_eq!(keys.is_empty(), kvs.is_empty());
-
-    kvs
-}
-
-fn kvs_for_test(
-    sequence: SequenceNumber,
-    op_type: OpType,
-    keys: &[TimestampMillisecond],
-    values: &[(Option<u64>, Option<u64>)],
-) -> KeyValues {
-    kvs_for_test_with_index(sequence, op_type, 0, keys, values)
-}
-
-pub fn write_kvs(
-    memtable: &dyn Memtable,
-    sequence: SequenceNumber,
-    op_type: OpType,
-    keys: &[i64],
-    values: &[(Option<u64>, Option<u64>)],
-) {
-    let keys: Vec<TimestampMillisecond> = keys.iter().map(|l| ((*l).into())).collect();
-
-    let kvs = kvs_for_test(sequence, op_type, &keys, values);
-
-    memtable.write(&kvs).unwrap();
-}
-
-fn check_batch_valid(batch: &Batch) {
-    assert_eq!(5, batch.num_columns());
-    let row_num = batch.column(0).len();
-    for i in 1..5 {
-        assert_eq!(row_num, batch.column(i).len());
-    }
-}
-
-fn check_iter_content(
-    iter: &mut dyn BatchIterator,
-    keys: &[i64],
-    sequences: &[u64],
-    op_types: &[OpType],
-    values: &[(Option<u64>, Option<u64>)],
-) {
-    let keys: Vec<TimestampMillisecond> = keys.iter().map(|l| (*l).into()).collect();
-
-    let mut index = 0;
-    for batch in iter {
-        let batch = batch.unwrap();
-        check_batch_valid(&batch);
-
-        let row_num = batch.column(0).len();
-        for i in 0..row_num {
-            let k0 = batch.column(0).get(i);
-            let (v0, v1) = (batch.column(1).get(i), batch.column(2).get(i));
-            let sequence = batch.column(3).get(i);
-            let op_type = batch.column(4).get(i);
-
-            assert_eq!(Value::from(keys[index]), k0);
-            assert_eq!(Value::from(values[index].0), v0);
-            assert_eq!(Value::from(values[index].1), v1);
-            assert_eq!(Value::from(sequences[index]), sequence);
-            assert_eq!(Value::from(op_types[index] as u8), op_type);
-
-            index += 1;
-        }
-    }
-
-    assert_eq!(keys.len(), index);
-}
-
-struct MemtableTester {
-    schema: RegionSchemaRef,
-    builders: Vec<MemtableBuilderRef>,
-}
-
-impl Default for MemtableTester {
-    fn default() -> MemtableTester {
-        MemtableTester::new()
-    }
-}
-
-impl MemtableTester {
-    fn new() -> MemtableTester {
-        let schema = schema_for_test();
-        let builders = vec![Arc::new(DefaultMemtableBuilder::default()) as _];
-
-        MemtableTester { schema, builders }
-    }
-
-    fn new_memtables(&self) -> Vec<MemtableRef> {
-        self.builders
-            .iter()
-            .map(|b| b.build(self.schema.clone()))
-            .collect()
-    }
-
-    fn run_testcase<F>(&self, testcase: F)
-    where
-        F: Fn(TestContext),
-    {
-        for memtable in self.new_memtables() {
-            let test_ctx = TestContext {
-                schema: self.schema.clone(),
-                memtable,
-            };
-
-            testcase(test_ctx);
-        }
-    }
-}
-
-struct TestContext {
-    schema: RegionSchemaRef,
-    memtable: MemtableRef,
-}
-
-fn write_iter_memtable_case(ctx: &TestContext) {
-    // Test iterating an empty memtable.
-    let mut iter = ctx.memtable.iter(IterContext::default()).unwrap();
-    assert!(iter.next().is_none());
-    // Poll the empty iterator again.
-    assert!(iter.next().is_none());
-    assert_eq!(0, ctx.memtable.stats().bytes_allocated());
-
-    // Init test data.
-    write_kvs(
-        &*ctx.memtable,
-        10, // sequence
-        OpType::Put,
-        &[1000, 1000, 2002, 2003, 2003, 1001], // keys
-        &[
-            (Some(1), None),
-            (Some(2), None),
-            (Some(7), None),
-            (Some(8), None),
-            (Some(9), None),
-            (Some(3), None),
-        ], // values
-    );
-    write_kvs(
-        &*ctx.memtable,
-        11, // sequence
-        OpType::Put,
-        &[1002, 1003, 1004],                            // keys
-        &[(None, None), (Some(5), None), (None, None)], // values
-    );
-
-    // 9 key value pairs (6 + 3).
-    assert_eq!(576, ctx.memtable.stats().bytes_allocated());
-
-    let batch_sizes = [1, 4, 8, consts::READ_BATCH_SIZE];
-    for batch_size in batch_sizes {
-        let iter_ctx = IterContext {
-            batch_size,
-            ..Default::default()
-        };
-        let mut iter = ctx.memtable.iter(iter_ctx.clone()).unwrap();
-        assert_eq!(
-            ctx.schema.user_schema(),
-            iter.schema().projected_user_schema()
-        );
-        assert_eq!(RowOrdering::Key, iter.ordering());
-
-        check_iter_content(
-            &mut *iter,
-            &[1000, 1001, 1002, 1003, 1004, 2002, 2003], // keys
-            &[10, 10, 11, 11, 11, 10, 10],               // sequences
-            &[
-                OpType::Put,
-                OpType::Put,
-                OpType::Put,
-                OpType::Put,
-                OpType::Put,
-                OpType::Put,
-                OpType::Put,
-            ], // op_types
-            &[
-                (Some(2), None),
-                (Some(3), None),
-                (None, None),
-                (Some(5), None),
-                (None, None),
-                (Some(7), None),
-                (Some(9), None),
-            ], // values
-        );
-    }
-}
-
-#[test]
-fn test_iter_context_default() {
-    let ctx = IterContext::default();
-    assert_eq!(SequenceNumber::MAX, ctx.visible_sequence);
-}
-
-#[test]
-fn test_write_iter_memtable() {
-    let tester = MemtableTester::default();
-    tester.run_testcase(|ctx| {
-        write_iter_memtable_case(&ctx);
-    });
-}
-
-fn check_iter_batch_size(iter: &mut dyn BatchIterator, total: usize, batch_size: usize) {
-    let mut remains = total;
-    for batch in iter {
-        let batch = batch.unwrap();
-        check_batch_valid(&batch);
-
-        let row_num = batch.column(0).len();
-        if remains >= batch_size {
-            assert_eq!(batch_size, row_num);
-            remains -= batch_size;
-        } else {
-            assert_eq!(remains, row_num);
-            remains = 0;
-        }
-    }
-
-    assert_eq!(0, remains);
-}
-
-#[test]
-fn test_iter_batch_size() {
-    let tester = MemtableTester::default();
-    tester.run_testcase(|ctx| {
-        write_kvs(
-            &*ctx.memtable,
-            10, // sequence
-            OpType::Put,
-            &[1000, 1000, 1001, 2002, 2003, 2003], // keys
-            &[
-                (Some(1), None),
-                (Some(2), None),
-                (Some(3), None),
-                (Some(4), None),
-                (None, None),
-                (None, None),
-            ], // values
-        );
-
-        let total = 4;
-        // Batch size [less than, equal to, greater than] total
-        let batch_sizes = [1, 6, 8];
-        for batch_size in batch_sizes {
-            let iter_ctx = IterContext {
-                batch_size,
-                ..Default::default()
-            };
-
-            let mut iter = ctx.memtable.iter(iter_ctx.clone()).unwrap();
-            check_iter_batch_size(&mut *iter, total, batch_size);
-        }
-    });
-}
-
-#[test]
-fn test_duplicate_key_across_batch() {
-    let tester = MemtableTester::default();
-    tester.run_testcase(|ctx| {
-        write_kvs(
-            &*ctx.memtable,
-            10, // sequence
-            OpType::Put,
-            &[1000, 1001, 2000, 2001], // keys
-            &[(Some(1), None), (None, None), (None, None), (None, None)], // values
-        );
-
-        write_kvs(
-            &*ctx.memtable,
-            11, // sequence
-            OpType::Put,
-            &[1000, 2001],                             // keys
-            &[(Some(1231), None), (Some(1232), None)], // values
-        );
-
-        let batch_sizes = [1, 2, 3, 4, 5];
-        for batch_size in batch_sizes {
-            let iter_ctx = IterContext {
-                batch_size,
-                ..Default::default()
-            };
-
-            let mut iter = ctx.memtable.iter(iter_ctx.clone()).unwrap();
-            check_iter_content(
-                &mut *iter,
-                &[1000, 1001, 2000, 2001], // keys
-                &[11, 10, 10, 11],         // sequences
-                &[OpType::Put, OpType::Put, OpType::Put, OpType::Put], // op_types
-                &[
-                    (Some(1231), None),
-                    (None, None),
-                    (None, None),
-                    (Some(1232), None),
-                ], // values
-            );
-        }
-    });
-}
-
-#[test]
-fn test_duplicate_key_in_batch() {
-    let tester = MemtableTester::default();
-    tester.run_testcase(|ctx| {
-        write_kvs(
-            &*ctx.memtable,
-            10, // sequence
-            OpType::Put,
-            &[1000, 1000, 1001, 2001], // keys
-            &[(None, None), (None, None), (Some(1234), None), (None, None)], // values
-        );
-
-        let batch_sizes = [1, 2, 3, 4, 5];
-        for batch_size in batch_sizes {
-            let iter_ctx = IterContext {
-                batch_size,
-                ..Default::default()
-            };
-
-            let mut iter = ctx.memtable.iter(iter_ctx.clone()).unwrap();
-            check_iter_content(
-                &mut *iter,
-                &[1000, 1001, 2001],                               // keys
-                &[10, 10, 10],                                     // sequences
-                &[OpType::Put, OpType::Put, OpType::Put],          // op_types
-                &[(None, None), (Some(1234), None), (None, None)], // values
-            );
-        }
-    });
-}
-
-#[test]
-fn test_sequence_visibility() {
-    let tester = MemtableTester::default();
-    tester.run_testcase(|ctx| {
-        write_kvs(
-            &*ctx.memtable,
-            10, // sequence
-            OpType::Put,
-            &[1000, 1000],                       // keys
-            &[(Some(1), None), (Some(2), None)], // values
-        );
-
-        write_kvs(
-            &*ctx.memtable,
-            11, // sequence
-            OpType::Put,
-            &[1000, 1000],                         // keys
-            &[(Some(11), None), (Some(12), None)], // values
-        );
-
-        write_kvs(
-            &*ctx.memtable,
-            12, // sequence
-            OpType::Put,
-            &[1000, 1000],                         // keys
-            &[(Some(21), None), (Some(22), None)], // values
-        );
-
-        {
-            let iter_ctx = IterContext {
-                batch_size: 1,
-                visible_sequence: 9,
-                projected_schema: None,
-                time_range: None,
-            };
-
-            let mut iter = ctx.memtable.iter(iter_ctx).unwrap();
-            check_iter_content(
-                &mut *iter,
-                &[], // keys
-                &[], // sequences
-                &[], // op_types
-                &[], // values
-            );
-        }
-
-        {
-            let iter_ctx = IterContext {
-                batch_size: 1,
-                visible_sequence: 10,
-                projected_schema: None,
-                time_range: None,
-            };
-
-            let mut iter = ctx.memtable.iter(iter_ctx).unwrap();
-            check_iter_content(
-                &mut *iter,
-                &[1000],                     // keys
-                &[10],                       // sequences
-                &[OpType::Put, OpType::Put], // op_types
-                &[(Some(2), None)],          // values
-            );
-        }
-
-        {
-            let iter_ctx = IterContext {
-                batch_size: 1,
-                visible_sequence: 11,
-                projected_schema: None,
-                time_range: None,
-            };
-
-            let mut iter = ctx.memtable.iter(iter_ctx).unwrap();
-            check_iter_content(
-                &mut *iter,
-                &[1000],                     // keys
-                &[11],                       // sequences
-                &[OpType::Put, OpType::Put], // op_types
-                &[(Some(12), None)],         // values
-            );
-        }
-    });
-}
-
-#[test]
-fn test_iter_after_none() {
-    let tester = MemtableTester::default();
-    tester.run_testcase(|ctx| {
-        write_kvs(
-            &*ctx.memtable,
-            10, // sequence
-            OpType::Put,
-            &[1000, 1001, 1002],                                  // keys
-            &[(Some(0), None), (Some(1), None), (Some(2), None)], // values
-        );
-
-        let iter_ctx = IterContext {
-            batch_size: 4,
-            ..Default::default()
-        };
-
-        let mut iter = ctx.memtable.iter(iter_ctx).unwrap();
-        let _ = iter.next().unwrap();
-        assert!(iter.next().is_none());
-        assert!(iter.next().is_none());
-    });
-}
-
-#[test]
-fn test_filter_memtable() {
-    let tester = MemtableTester::default();
-    tester.run_testcase(|ctx| {
-        write_kvs(
-            &*ctx.memtable,
-            10, // sequence
-            OpType::Put,
-            &[1000, 1001, 1002],                                  // keys
-            &[(Some(0), None), (Some(1), None), (Some(2), None)], // values
-        );
-
-        let iter_ctx = IterContext {
-            batch_size: 4,
-            time_range: Some(
-                TimestampRange::new(
-                    Timestamp::new_millisecond(0),
-                    Timestamp::new_millisecond(1001),
-                )
-                .unwrap(),
-            ),
-            ..Default::default()
-        };
-
-        let mut iter = ctx.memtable.iter(iter_ctx).unwrap();
-        let batch = iter.next().unwrap().unwrap();
-        assert_eq!(5, batch.columns.len());
-        assert_eq!(
-            Arc::new(TimestampMillisecondVector::from_slice([1000])) as Arc<_>,
-            batch.columns[0]
-        );
-    });
-}
-
-#[test]
-fn test_memtable_projection() {
-    let tester = MemtableTester::default();
-    // Only need v0, but row key columns and internal columns would also be read.
-    let projected_schema =
-        Arc::new(ProjectedSchema::new(tester.schema.clone(), Some(vec![2])).unwrap());
-
-    tester.run_testcase(|ctx| {
-        write_kvs(
-            &*ctx.memtable,
-            9, // sequence
-            OpType::Put,
-            &[1000, 1001, 1002], // keys
-            &[
-                (Some(10), Some(20)),
-                (Some(11), Some(21)),
-                (Some(12), Some(22)),
-            ], // values
-        );
-
-        let iter_ctx = IterContext {
-            batch_size: 4,
-            projected_schema: Some(projected_schema.clone()),
-            ..Default::default()
-        };
-
-        let mut iter = ctx.memtable.iter(iter_ctx).unwrap();
-        let batch = iter.next().unwrap().unwrap();
-        assert!(iter.next().is_none());
-
-        assert_eq!(4, batch.num_columns());
-        let k0 = Arc::new(TimestampMillisecondVector::from_slice([1000, 1001, 1002])) as VectorRef;
-        let v0 = Arc::new(UInt64Vector::from_slice([20, 21, 22])) as VectorRef;
-        let sequences = Arc::new(UInt64Vector::from_slice([9, 9, 9])) as VectorRef;
-        let op_types = Arc::new(UInt8Vector::from_slice([1, 1, 1])) as VectorRef;
-
-        assert_eq!(k0, *batch.column(0));
-        assert_eq!(v0, *batch.column(1));
-        assert_eq!(sequences, *batch.column(2));
-        assert_eq!(op_types, *batch.column(3));
-    });
-}
--- a/src/storage/src/memtable/version.rs
+++ b/src/storage/src/memtable/version.rs
@@ -1,166 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-use std::cmp::Ordering;
-
-use common_time::RangeMillis;
-
-use crate::memtable::{MemtableId, MemtableRef};
-
-/// A version of all memtables.
-///
-/// This structure is immutable now.
-#[derive(Debug)]
-pub struct MemtableVersion {
-    mutable: MemtableRef,
-    /// Immutable memtables.
-    immutables: Vec<MemtableRef>,
-}
-
-impl MemtableVersion {
-    pub fn new(mutable: MemtableRef) -> MemtableVersion {
-        Self {
-            mutable,
-            immutables: vec![],
-        }
-    }
-
-    #[inline]
-    pub fn mutable_memtable(&self) -> &MemtableRef {
-        &self.mutable
-    }
-
-    #[inline]
-    pub fn immutable_memtables(&self) -> &[MemtableRef] {
-        &self.immutables
-    }
-
-    pub fn num_memtables(&self) -> usize {
-        // the last `1` is for `mutable`
-        self.immutable_memtables().len() + 1
-    }
-
-    /// Clone current memtable version and freeze its mutable memtables, which moves
-    /// all mutable memtables to immutable memtable list.
-    ///
-    /// This method also calls [Memtable::mark_immutable()](crate::memtable::Memtable::mark_immutable()) to
-    /// mark the mutable memtable as immutable.
-    pub fn freeze_mutable(&self, new_mutable: MemtableRef) -> MemtableVersion {
-        let mut immutables = self.immutables.clone();
-        // Marks the mutable memtable as immutable so it can free the memory usage from our
-        // soft limit.
-        self.mutable.mark_immutable();
-        immutables.push(self.mutable.clone());
-
-        MemtableVersion {
-            mutable: new_mutable,
-            immutables,
-        }
-    }
-
-    pub fn mutable_bytes_allocated(&self) -> usize {
-        self.mutable.stats().bytes_allocated()
-    }
-
-    pub fn total_bytes_allocated(&self) -> usize {
-        self.immutables
-            .iter()
-            .map(|m| m.stats().bytes_allocated())
-            .sum::<usize>()
-            + self.mutable.stats().bytes_allocated()
-    }
-
-    /// Creates a new `MemtableVersion` that removes immutable memtables
-    /// less than or equal to max_memtable_id.
-    pub fn remove_immutables(&self, max_memtable_id: MemtableId) -> MemtableVersion {
-        let immutables = self
-            .immutables
-            .iter()
-            .filter(|immem| immem.id() > max_memtable_id)
-            .cloned()
-            .collect();
-
-        MemtableVersion {
-            mutable: self.mutable.clone(),
-            immutables,
-        }
-    }
-
-    pub fn memtables_to_flush(&self) -> (Option<MemtableId>, Vec<MemtableRef>) {
-        let max_memtable_id = self.immutables.iter().map(|immem| immem.id()).max();
-        let memtables = self.immutables.clone();
-
-        (max_memtable_id, memtables)
-    }
-}
-
-// We use a new type to order time ranges by (end, start).
-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
-struct RangeKey(RangeMillis);
-
-impl Ord for RangeKey {
-    fn cmp(&self, other: &RangeKey) -> Ordering {
-        self.0
-            .end()
-            .cmp(other.0.end())
-            .then_with(|| self.0.start().cmp(other.0.start()))
-    }
-}
-
-impl PartialOrd for RangeKey {
-    fn partial_cmp(&self, other: &RangeKey) -> Option<Ordering> {
-        Some(self.cmp(other))
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use std::sync::Arc;
-
-    use super::*;
-    use crate::memtable::{DefaultMemtableBuilder, MemtableBuilder};
-    use crate::test_util::schema_util;
-
-    #[test]
-    fn test_memtable_version() {
-        let memtable_builder = DefaultMemtableBuilder::default();
-        let region_schema = Arc::new(schema_util::new_region_schema(1, 1));
-
-        let memtable_1 = memtable_builder.build(region_schema.clone());
-        let v1 = MemtableVersion::new(memtable_1);
-        assert_eq!(1, v1.num_memtables());
-
-        // Freeze and add new mutable.
-        let memtable_2 = memtable_builder.build(region_schema.clone());
-        let v2 = v1.freeze_mutable(memtable_2);
-        let v2_immutables = v2.immutable_memtables();
-        assert_eq!(1, v2_immutables.len());
-        assert_eq!(0, v2_immutables[0].id());
-        assert_eq!(1, v2.mutable_memtable().id());
-        assert_eq!(2, v2.num_memtables());
-
-        // Add another one and check immutable memtables that need flush
-        let memtable_3 = memtable_builder.build(region_schema);
-        let v3 = v2.freeze_mutable(memtable_3);
-        let (max_table_id, immutables) = v3.memtables_to_flush();
-        assert_eq!(1, max_table_id.unwrap());
-        assert_eq!(2, immutables.len());
-
-        // Remove memtables
-        let v4 = v3.remove_immutables(1);
-        assert_eq!(1, v4.num_memtables());
-        assert_eq!(0, v4.immutable_memtables().len());
-        assert_eq!(2, v4.mutable_memtable().id());
-    }
-}
--- a/src/storage/src/metadata.rs
+++ b/src/storage/src/metadata.rs
--- a/src/storage/src/metrics.rs
+++ b/src/storage/src/metrics.rs
@@ -1,66 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-//! storage metrics
-
-use lazy_static::lazy_static;
-use prometheus::*;
-
-/// Reason to flush.
-pub const FLUSH_REASON: &str = "reason";
-
-lazy_static! {
-    /// Elapsed time of updating manifest when creating regions.
-    pub static ref CREATE_REGION_UPDATE_MANIFEST: Histogram =
-        register_histogram!("storage_create_region_update_manifest", "storage create region update manifest").unwrap();
-    /// Counter of scheduled flush requests.
-    pub static ref FLUSH_REQUESTS_TOTAL: IntCounterVec =
-        register_int_counter_vec!("storage_flush_requests_total", "storage flush requests total", &[FLUSH_REASON]).unwrap();
-    /// Counter of scheduled failed flush jobs.
-    pub static ref FLUSH_ERRORS_TOTAL: IntCounter =
-        register_int_counter!("storage_flush_errors_total", "storage flush errors total").unwrap();
-    //// Elapsed time of a flush job.
-    pub static ref FLUSH_ELAPSED: Histogram =
-        register_histogram!("storage_flush_elapsed", "storage flush elapsed").unwrap();
-    /// Counter of flushed bytes.
-    pub static ref FLUSH_BYTES_TOTAL: IntCounter =
-        register_int_counter!("storage_flush_bytes_total", "storage flush bytes total").unwrap();
-    /// Gauge for open regions
-    pub static ref REGION_COUNT: IntGauge =
-        register_int_gauge!("storage_region_count", "storage region count").unwrap();
-    /// Timer for logstore write
-    pub static ref LOG_STORE_WRITE_ELAPSED: Histogram =
-        register_histogram!("storage_logstore_write_elapsed", "storage logstore write elapsed").unwrap();
-    /// Elapsed time of a compact job.
-    pub static ref COMPACT_ELAPSED: Histogram =
-        register_histogram!("storage_compact_elapsed", "storage compact elapsed").unwrap();
-    /// Elapsed time for merging SST files.
-    pub static ref MERGE_ELAPSED: Histogram =
-        register_histogram!("storage_compaction_merge_elapsed", "storage compaction merge elapsed").unwrap();
-    /// Global write buffer size in bytes.
-    pub static ref WRITE_BUFFER_BYTES: IntGauge =
-        register_int_gauge!("storage_write_buffer_bytes", "storage write buffer bytes").unwrap();
-    /// Elapsed time of inserting memtable.
-    pub static ref MEMTABLE_WRITE_ELAPSED: Histogram =
-        register_histogram!("storage_memtable_write_elapsed", "storage memtable write elapsed").unwrap();
-    /// Elapsed time of preprocessing write batch.
-    pub static ref PREPROCESS_ELAPSED: Histogram =
-        register_histogram!("storage_write_preprocess_elapsed", "storage write preprocess elapsed").unwrap();
-    /// Elapsed time for windowed scan
-    pub static ref WINDOW_SCAN_ELAPSED: Histogram =
-        register_histogram!("query_scan_window_scan_elapsed", "query scan window scan elapsed").unwrap();
-    /// Rows per window during window scan
-    pub static ref WINDOW_SCAN_ROWS_PER_WINDOW: Histogram =
-        register_histogram!("query_scan_window_scan_window_row_size", "query scan window scan window row size").unwrap();
-}
--- a/src/storage/src/proto.rs
+++ b/src/storage/src/proto.rs
@@ -1,15 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-pub mod wal;
--- a/src/storage/src/proto/wal.rs
+++ b/src/storage/src/proto/wal.rs
@@ -1,40 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#![allow(clippy::all)]
-tonic::include_proto!("greptime.storage.wal.v1");
-
-use api::v1::OpType;
-
-use crate::write_batch::Payload;
-
-pub fn gen_mutation_types(payload: &Payload) -> Vec<i32> {
-    payload
-        .mutations
-        .iter()
-        .map(|m| match m.op_type {
-            OpType::Delete => MutationType::Delete.into(),
-            OpType::Put => MutationType::Put.into(),
-        })
-        .collect::<Vec<_>>()
-}
-
-impl WalHeader {
-    pub fn with_last_manifest_version(last_manifest_version: u64) -> Self {
-        Self {
-            last_manifest_version,
-            ..Default::default()
-        }
-    }
-}
--- a/src/storage/src/read.rs
+++ b/src/storage/src/read.rs
@@ -1,271 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-//! Common structs and utilities for read.
-
-mod chain;
-mod dedup;
-mod merge;
-mod windowed;
-
-use std::cmp::Ordering;
-
-use async_trait::async_trait;
-use common_base::BitVec;
-use datatypes::data_type::DataType;
-use datatypes::prelude::ConcreteDataType;
-use datatypes::vectors::{BooleanVector, MutableVector, VectorRef};
-use snafu::{ensure, ResultExt};
-
-use crate::error::{self, Result};
-pub use crate::read::chain::ChainReader;
-pub use crate::read::dedup::DedupReader;
-pub use crate::read::merge::{MergeReader, MergeReaderBuilder};
-pub use crate::read::windowed::WindowedReader;
-
-/// Storage internal representation of a batch of rows.
-// Now the structure of `Batch` is still unstable, all pub fields may be changed.
-#[derive(Debug, Default, PartialEq, Eq, Clone)]
-pub struct Batch {
-    /// Rows organized in columnar format.
-    ///
-    /// Columns follow the same order convention of region schema:
-    /// key, value, internal columns.
-    pub columns: Vec<VectorRef>,
-}
-
-impl Batch {
-    /// Create a new `Batch` from `columns`.
-    ///
-    /// # Panics
-    /// Panics if vectors in `columns` have different length.
-    pub fn new(columns: Vec<VectorRef>) -> Batch {
-        Self::assert_columns(&columns);
-
-        Batch { columns }
-    }
-
-    #[inline]
-    pub fn num_columns(&self) -> usize {
-        self.columns.len()
-    }
-
-    #[inline]
-    pub fn num_rows(&self) -> usize {
-        self.columns.get(0).map(|v| v.len()).unwrap_or(0)
-    }
-
-    #[inline]
-    pub fn is_empty(&self) -> bool {
-        self.num_rows() == 0
-    }
-
-    #[inline]
-    pub fn columns(&self) -> &[VectorRef] {
-        &self.columns
-    }
-
-    #[inline]
-    pub fn column(&self, idx: usize) -> &VectorRef {
-        &self.columns[idx]
-    }
-
-    /// Slice the batch, returning a new batch.
-    ///
-    /// # Panics
-    /// Panics if `offset + length > self.num_rows()`.
-    fn slice(&self, offset: usize, length: usize) -> Batch {
-        let columns = self
-            .columns
-            .iter()
-            .map(|v| v.slice(offset, length))
-            .collect();
-        Batch { columns }
-    }
-
-    fn assert_columns(columns: &[VectorRef]) {
-        if columns.is_empty() {
-            return;
-        }
-
-        let length = columns[0].len();
-        assert!(columns.iter().all(|col| col.len() == length));
-    }
-}
-
-/// Compute operations for Batch.
-pub trait BatchOp {
-    /// Compare `i-th` in `left` to `j-th` row in `right` by key (row key + internal columns).
-    ///
-    /// The caller should ensure `left` and `right` have same schema as `self`.
-    ///
-    /// # Panics
-    /// Panics if
-    /// - `i` or `j` is out of bound.
-    /// - `left` or `right` has insufficient column num.
-    fn compare_row(&self, left: &Batch, i: usize, right: &Batch, j: usize) -> Ordering;
-
-    /// Find unique rows in `batch` by row key.
-    ///
-    /// If `prev` is `Some` and not empty, the last row of `prev` would be used to dedup
-    /// current `batch`. Set `i-th` bit of `selected` to `true` if `i-th` row is unique,
-    /// which means the row key of `i-th` row is different from `i+1-th`'s.
-    ///
-    /// The caller could use `selected` to build a [BooleanVector] to filter the
-    /// batch, and must ensure `selected` is initialized by filling `batch.num_rows()` bits
-    /// to zero.
-    ///
-    /// # Panics
-    /// Panics if
-    /// - `batch` and `prev` have different number of columns (unless `prev` is
-    /// empty).
-    /// - `selected.len()` is less than the number of rows.
-    fn find_unique(&self, batch: &Batch, selected: &mut BitVec, prev: Option<&Batch>);
-
-    /// Filters the `batch`, returns elements matching the `filter` (i.e. where the values
-    /// are true).
-    ///
-    /// Note that the nulls of `filter` are interpreted as `false` will lead to these elements
-    /// being masked out.
-    fn filter(&self, batch: &Batch, filter: &BooleanVector) -> Result<Batch>;
-
-    /// Unselect deleted rows according to the [`OpType`](api::v1::OpType).
-    ///
-    /// # Panics
-    /// Panics if
-    /// - `batch` doesn't have a valid op type column.
-    /// - `selected.len()` is less than the number of rows.
-    fn unselect_deleted(&self, batch: &Batch, selected: &mut BitVec);
-}
-
-/// Reusable [Batch] builder.
-pub struct BatchBuilder {
-    builders: Vec<Box<dyn MutableVector>>,
-}
-
-impl BatchBuilder {
-    /// Create a new `BatchBuilder` from data types with given `capacity`.
-    ///
-    /// # Panics
-    /// Panics if `types` is empty.
-    pub fn with_capacity<'a, I>(types: I, capacity: usize) -> BatchBuilder
-    where
-        I: IntoIterator<Item = &'a ConcreteDataType>,
-    {
-        let builders: Vec<_> = types
-            .into_iter()
-            .map(|t| t.create_mutable_vector(capacity))
-            .collect();
-        assert!(!builders.is_empty());
-
-        BatchBuilder { builders }
-    }
-
-    /// Returns number of rows already in this builder.
-    #[inline]
-    pub fn num_rows(&self) -> usize {
-        self.builders[0].len()
-    }
-
-    /// Returns true if no rows in this builder.
-    #[inline]
-    pub fn is_empty(&self) -> bool {
-        self.num_rows() == 0
-    }
-
-    /// Extend the builder by slice of batch.
-    ///
-    /// # Panics
-    /// Panics if
-    /// - `offset + length > batch.num_rows()`.
-    /// - Number of columns in `batch` is not equal to the builder's.
-    pub fn extend_slice_of(&mut self, batch: &Batch, offset: usize, length: usize) -> Result<()> {
-        assert_eq!(self.builders.len(), batch.num_columns());
-
-        for (builder, column) in self.builders.iter_mut().zip(batch.columns()) {
-            builder
-                .extend_slice_of(&**column, offset, length)
-                .context(error::PushBatchSnafu)?;
-        }
-
-        Ok(())
-    }
-
-    /// Push `i-th` row of batch into the builder.
-    ///
-    /// # Panics
-    /// Panics if
-    /// - `i` is out of bound.
-    /// - Number of columns in `batch` is not equal to the builder's.
-    pub fn push_row_of(&mut self, batch: &Batch, i: usize) -> Result<()> {
-        assert_eq!(self.builders.len(), batch.num_columns());
-
-        for (builder, column) in self.builders.iter_mut().zip(batch.columns()) {
-            let value = column.get_ref(i);
-            builder
-                .try_push_value_ref(value)
-                .context(error::PushBatchSnafu)?;
-        }
-
-        Ok(())
-    }
-
-    /// Create a new [Batch] and reset this builder.
-    pub fn build(&mut self) -> Result<Batch> {
-        // Checks length of each builder.
-        let rows = self.num_rows();
-        for (i, builder) in self.builders.iter().enumerate() {
-            ensure!(
-                rows == builder.len(),
-                error::BuildBatchSnafu {
-                    msg: format!(
-                        "expect row num {} but builder {} has {}",
-                        rows,
-                        i,
-                        builder.len()
-                    ),
-                }
-            );
-        }
-
-        let columns = self.builders.iter_mut().map(|b| b.to_vector()).collect();
-
-        Ok(Batch { columns })
-    }
-}
-
-/// Async batch reader.
-#[async_trait]
-pub trait BatchReader: Send {
-    // TODO(yingwen): Schema of batch.
-
-    /// Fetch next [Batch].
-    ///
-    /// Returns `Ok(None)` when the reader has reached its end and calling `next_batch()`
-    /// again won't return batch again.
-    ///
-    /// If `Err` is returned, caller **must** not call this method again, the implementor
-    /// may or may not panic in such case.
-    async fn next_batch(&mut self) -> Result<Option<Batch>>;
-}
-
-/// Pointer to [BatchReader].
-pub type BoxedBatchReader = Box<dyn BatchReader>;
-
-#[async_trait::async_trait]
-impl<T: BatchReader + ?Sized> BatchReader for Box<T> {
-    async fn next_batch(&mut self) -> Result<Option<Batch>> {
-        (**self).next_batch().await
-    }
-}
--- a/src/storage/src/read/chain.rs
+++ b/src/storage/src/read/chain.rs
@@ -1,124 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-use crate::error::Result;
-use crate::read::{Batch, BatchReader};
-use crate::schema::ProjectedSchemaRef;
-
-/// A reader that simply chain the outputs of input readers.
-pub struct ChainReader<R> {
-    /// Schema to read
-    pub schema: ProjectedSchemaRef,
-    /// Each reader reads a slice of time window
-    pub readers: Vec<R>,
-}
-
-impl<R> ChainReader<R> {
-    /// Returns a new [ChainReader] with specific input `readers`.
-    pub fn new(schema: ProjectedSchemaRef, mut readers: Vec<R>) -> Self {
-        // Reverse readers since we iter them backward.
-        readers.reverse();
-        Self { schema, readers }
-    }
-}
-
-#[async_trait::async_trait]
-impl<R> BatchReader for ChainReader<R>
-where
-    R: BatchReader,
-{
-    async fn next_batch(&mut self) -> Result<Option<Batch>> {
-        while let Some(reader) = self.readers.last_mut() {
-            if let Some(batch) = reader.next_batch().await? {
-                return Ok(Some(batch));
-            } else {
-                // Remove the exhausted reader.
-                self.readers.pop();
-            }
-        }
-        Ok(None)
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use crate::test_util::read_util::{self, Batches, VecBatchReader};
-
-    fn build_chain_reader(sources: &[Batches]) -> ChainReader<VecBatchReader> {
-        let schema = read_util::new_projected_schema();
-        let readers = sources
-            .iter()
-            .map(|source| read_util::build_vec_reader(source))
-            .collect();
-
-        ChainReader::new(schema, readers)
-    }
-
-    async fn check_chain_reader_result(
-        mut reader: ChainReader<VecBatchReader>,
-        input: &[Batches<'_>],
-    ) {
-        let expect: Vec<_> = input
-            .iter()
-            .flat_map(|v| v.iter())
-            .flat_map(|v| v.iter().copied())
-            .collect();
-
-        let result = read_util::collect_kv_batch(&mut reader).await;
-        assert_eq!(expect, result);
-
-        // Call next_batch() again is allowed.
-        assert!(reader.next_batch().await.unwrap().is_none());
-    }
-
-    #[tokio::test]
-    async fn test_chain_empty() {
-        let mut reader = build_chain_reader(&[]);
-
-        assert!(reader.next_batch().await.unwrap().is_none());
-        // Call next_batch() again is allowed.
-        assert!(reader.next_batch().await.unwrap().is_none());
-    }
-
-    #[tokio::test]
-    async fn test_chain_one() {
-        let input: &[Batches] = &[&[
-            &[(1, Some(1)), (2, Some(2))],
-            &[(3, Some(3)), (4, Some(4))],
-            &[(5, Some(5))],
-        ]];
-
-        let reader = build_chain_reader(input);
-
-        check_chain_reader_result(reader, input).await;
-    }
-
-    #[tokio::test]
-    async fn test_chain_multi() {
-        let input: &[Batches] = &[
-            &[
-                &[(1, Some(1)), (2, Some(2))],
-                &[(3, Some(3)), (4, Some(4))],
-                &[(5, Some(5))],
-            ],
-            &[&[(6, Some(3)), (7, Some(4)), (8, Some(8))], &[(9, Some(9))]],
-            &[&[(10, Some(10)), (11, Some(11))], &[(12, Some(12))]],
-        ];
-
-        let reader = build_chain_reader(input);
-
-        check_chain_reader_result(reader, input).await;
-    }
-}
--- a/src/storage/src/read/dedup.rs
+++ b/src/storage/src/read/dedup.rs
@@ -1,181 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-use async_trait::async_trait;
-use common_base::BitVec;
-use datatypes::prelude::ScalarVector;
-use datatypes::vectors::BooleanVector;
-
-use crate::error::Result;
-use crate::read::{Batch, BatchOp, BatchReader};
-use crate::schema::ProjectedSchemaRef;
-
-/// A reader that dedup rows from inner reader.
-pub struct DedupReader<R> {
-    /// Projected schema to read.
-    schema: ProjectedSchemaRef,
-    /// The inner reader.
-    reader: R,
-    /// Previous batch from the reader.
-    prev_batch: Option<Batch>,
-    /// Reused bitmap buffer.
-    selected: BitVec,
-}
-
-impl<R> DedupReader<R> {
-    pub fn new(schema: ProjectedSchemaRef, reader: R) -> DedupReader<R> {
-        DedupReader {
-            schema,
-            reader,
-            prev_batch: None,
-            selected: BitVec::default(),
-        }
-    }
-
-    /// Take `batch` and then returns a new batch with no duplicated rows.
-    ///
-    /// This method may returns empty `Batch`.
-    fn dedup_batch(&mut self, batch: Batch) -> Result<Batch> {
-        if batch.is_empty() {
-            // No need to update `prev_batch` if current batch is empty.
-            return Ok(batch);
-        }
-
-        // Reinitialize the bit map to zeros.
-        self.selected.clear();
-        self.selected.resize(batch.num_rows(), false);
-        self.schema
-            .find_unique(&batch, &mut self.selected, self.prev_batch.as_ref());
-
-        // Store current batch to `prev_batch` so we could compare the next batch
-        // with this batch. We store batch before filtering it mainly for correctness, as
-        // once we supports `DELETE`, rows with `OpType::Delete` would be removed from the
-        // batch after filter, then we may store an incorrect `last row` of previous batch.
-        self.prev_batch
-            .get_or_insert_with(Batch::default)
-            .clone_from(&batch); // Use `clone_from` to reuse allocated memory if possible.
-
-        // Find all rows whose op_types are `OpType::Delete`, mark their `selected` to false.
-        self.schema.unselect_deleted(&batch, &mut self.selected);
-
-        let filter = BooleanVector::from_iterator(self.selected.iter().by_vals());
-        // Filter duplicate rows.
-        self.schema.filter(&batch, &filter)
-    }
-}
-
-#[async_trait]
-impl<R: BatchReader> BatchReader for DedupReader<R> {
-    async fn next_batch(&mut self) -> Result<Option<Batch>> {
-        while let Some(batch) = self.reader.next_batch().await? {
-            let filtered = self.dedup_batch(batch)?;
-            // Skip empty batch.
-            if !filtered.is_empty() {
-                return Ok(Some(filtered));
-            }
-        }
-
-        Ok(None)
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use api::v1::OpType;
-
-    use super::*;
-    use crate::test_util::read_util;
-
-    #[tokio::test]
-    async fn test_dedup_reader_empty() {
-        let schema = read_util::new_projected_schema();
-        let reader = read_util::build_vec_reader(&[]);
-        let mut reader = DedupReader::new(schema, reader);
-
-        assert!(reader.next_batch().await.unwrap().is_none());
-        // Call next_batch() again is allowed.
-        assert!(reader.next_batch().await.unwrap().is_none());
-    }
-
-    #[tokio::test]
-    async fn test_dedup_by_sequence() {
-        let schema = read_util::new_projected_schema();
-        let reader = read_util::build_full_vec_reader(&[
-            // key, value, sequence, op_type
-            &[
-                (100, 1, 1000, OpType::Put),
-                (100, 2, 999, OpType::Put),
-                (100, 3, 998, OpType::Put),
-                (101, 1, 1000, OpType::Put),
-            ],
-            &[
-                (101, 2, 999, OpType::Put),
-                (102, 12, 1000, OpType::Put),
-                (103, 13, 1000, OpType::Put),
-            ],
-            &[(103, 2, 999, OpType::Put)],
-        ]);
-        let mut reader = DedupReader::new(schema, reader);
-
-        let result = read_util::collect_kv_batch(&mut reader).await;
-        let expect = [
-            (100, Some(1)),
-            (101, Some(1)),
-            (102, Some(12)),
-            (103, Some(13)),
-        ];
-        assert_eq!(&expect, &result[..]);
-    }
-
-    #[tokio::test]
-    async fn test_dedup_contains_empty_input() {
-        let schema = read_util::new_projected_schema();
-        let reader = read_util::build_full_vec_reader(&[
-            // key, value, sequence, op_type
-            &[
-                (100, 1, 1000, OpType::Put),
-                (100, 2, 999, OpType::Put),
-                (101, 1, 1000, OpType::Put),
-            ],
-            &[],
-            &[(101, 2, 999, OpType::Put), (102, 12, 1000, OpType::Put)],
-        ]);
-        let mut reader = DedupReader::new(schema, reader);
-
-        let result = read_util::collect_kv_batch(&mut reader).await;
-        let expect = [(100, Some(1)), (101, Some(1)), (102, Some(12))];
-        assert_eq!(&expect, &result[..]);
-    }
-
-    #[tokio::test]
-    async fn test_dedup_contains_empty_output() {
-        let schema = read_util::new_projected_schema();
-        let reader = read_util::build_full_vec_reader(&[
-            // key, value, sequence, op_type
-            &[
-                (100, 1, 1000, OpType::Put),
-                (100, 2, 999, OpType::Put),
-                (101, 1, 1000, OpType::Put),
-            ],
-            &[(101, 2, 999, OpType::Put)],
-            &[(101, 3, 998, OpType::Put), (101, 4, 997, OpType::Put)],
-            &[(102, 12, 998, OpType::Put)],
-        ]);
-        let mut reader = DedupReader::new(schema, reader);
-
-        let result = read_util::collect_kv_batch(&mut reader).await;
-        let expect = [(100, Some(1)), (101, Some(1)), (102, Some(12))];
-        assert_eq!(&expect, &result[..]);
-    }
-}
--- a/src/storage/src/read/merge.rs
+++ b/src/storage/src/read/merge.rs
@@ -1,828 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-//! Merge reader.
-//!
-//! The implementation of [`MergeReader`] is inspired by
-//!  [`kudu's MergeIterator`](https://github.com/apache/kudu/blob/9021f275824faa2bdfe699786957c40c219697c1/src/kudu/common/generic_iterators.cc#L107)
-//! and [`CeresDB's MergeIterator`](https://github.com/CeresDB/ceresdb/blob/02a7e3100f47cf16aa6c245ed529a6978be20fbd/analytic_engine/src/row_iter/merge.rs)
-//!
-//! The main idea of the merge algorithm is to maintain a `merge window`. The window describes,
-//! at any given time, the key range where we expect to find the row with the smallest key.
-//! A [`Node`] (known as the sub-iterator in kudu) whose NEXT overlaps with the `merge window`
-//! is said to be actively participating in the merge.
-//!
-//! The `merge window` is defined as follows:
-//! 1.  The window's start is the smallest lower bound of all nodes. We
-//!     refer to the node that owns this lower bound as LOW.
-//! 2.  The window’s end is the smallest upper bound of all nodes whose
-//!     lower bounds are less than or equal to LOW's upper bound.
-//! 2a. The window's end could be LOW's upper bound itself, if it is the smallest
-//!     upper bound, but this isn't necessarily the case.
-//! 3.  The merge window's dimensions change as the merge proceeds, though it
-//!     only ever moves "to the right" (i.e. the window start/end only increase).
-//!
-//! We can divide the nodes into two sets, one for whose next rows overlap with the `merge window`,
-//! another for whose next rows do not. The merge steady state resembles that of a traditional
-//! heap-based merge: the top-most node is popped from HOT, the lower bound is copied to the output
-//! and advanced, and the node is pushed back to HOT.
-//!
-//! In the steady state, we need to move nodes from COLD to HOT whenever the end of the merge window
-//! moves; that's a sign that the window may now overlap with a NEXT belonging to a nodes in the
-//! second set (COLD). The end of the merge window moves when a node is fully exhausted (i.e. all rows have
-//! been copied to the output), or when a node finishes its NEXT and needs to peek again.
-//!
-//! At any given time, the NEXT belonging to the top-most node in COLD is nearest the merge window.
-//! When the merge window's end has moved and we need to refill HOT, the top-most node in COLD is
-//! the best candidate. To figure out whether it should be moved, we compare its NEXT's lower bound
-//! against the upper bound in HOT's first node: if the lower bound is less than or equal to the key,
-//! we move the node from COLD to HOT. On the flip side, when a node from HOT finishes its NEXT and peeks
-//! again, we also need to check whether it has exited the merge window. The approach is similar: if
-//! its NEXT's lower bound is greater than the upper bound of HOT'S first node, it's time to move it to COLD.
-//!
-//! A full description of the merge algorithm could be found in [`kudu's comment`](https://github.com/apache/kudu/blob/9021f275824faa2bdfe699786957c40c219697c1/src/kudu/common/generic_iterators.cc#L349)
-//!  and the [google doc](https://docs.google.com/document/d/1uP0ubjM6ulnKVCRrXtwT_dqrTWjF9tlFSRk0JN2e_O0/edit#).
-
-use std::cmp::Ordering;
-use std::collections::BinaryHeap;
-use std::fmt;
-
-use async_trait::async_trait;
-use store_api::storage::consts;
-
-use crate::error::Result;
-use crate::memtable::BoxedBatchIterator;
-use crate::read::{Batch, BatchBuilder, BatchOp, BatchReader, BoxedBatchReader};
-use crate::schema::{ProjectedSchema, ProjectedSchemaRef};
-
-/// Batch data source.
-enum Source {
-    // To avoid the overhead of async-trait (typically a heap allocation), wraps the
-    // BatchIterator into an enum instead of converting the iterator into a BatchReader.
-    Iter(BoxedBatchIterator),
-    Reader(BoxedBatchReader),
-}
-
-impl Source {
-    async fn next_batch(&mut self) -> Result<Option<Batch>> {
-        match self {
-            Source::Iter(iter) => iter.next().transpose(),
-            Source::Reader(reader) => reader.next_batch().await,
-        }
-    }
-
-    /// Fetch next non empty batch.
-    async fn next_non_empty_batch(&mut self) -> Result<Option<Batch>> {
-        while let Some(batch) = self.next_batch().await? {
-            if !batch.is_empty() {
-                return Ok(Some(batch));
-            }
-        }
-        Ok(None)
-    }
-}
-
-impl fmt::Debug for Source {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        match self {
-            Source::Iter(_) => write!(f, "Iter(..)"),
-            Source::Reader(_) => write!(f, "Reader(..)"),
-        }
-    }
-}
-
-/// Reference to a row in [BatchCursor].
-#[derive(Debug)]
-struct RowCursor<'a> {
-    batch: &'a Batch,
-    pos: usize,
-}
-
-impl<'a> RowCursor<'a> {
-    #[inline]
-    fn compare(&self, schema: &ProjectedSchema, other: &RowCursor) -> Ordering {
-        schema.compare_row(self.batch, self.pos, other.batch, other.pos)
-    }
-}
-
-/// A `BatchCursor` wraps the `Batch` and allows reading the `Batch` by row.
-#[derive(Debug)]
-struct BatchCursor {
-    /// Current buffered `Batch`.
-    ///
-    /// `Batch` must contains at least one row.
-    batch: Batch,
-    /// Index of current row.
-    ///
-    /// `pos == batch.num_rows()` indicates no more rows to read.
-    pos: usize,
-}
-
-impl BatchCursor {
-    /// Create a new `BatchCursor`.
-    ///
-    /// # Panics
-    /// Panics if `batch` is empty.
-    fn new(batch: Batch) -> BatchCursor {
-        assert!(!batch.is_empty());
-
-        BatchCursor { batch, pos: 0 }
-    }
-
-    /// Returns true if there are remaining rows to read.
-    #[inline]
-    fn is_valid(&self) -> bool {
-        !self.is_empty()
-    }
-
-    /// Returns first row of current batch.
-    ///
-    /// # Panics
-    /// Panics if `self` is invalid.
-    fn first_row(&self) -> RowCursor {
-        assert!(self.is_valid());
-
-        RowCursor {
-            batch: &self.batch,
-            pos: self.pos,
-        }
-    }
-
-    /// Returns last row of current batch.
-    ///
-    /// # Panics
-    /// Panics if `self` is invalid.
-    fn last_row(&self) -> RowCursor {
-        assert!(self.is_valid());
-
-        RowCursor {
-            batch: &self.batch,
-            pos: self.batch.num_rows() - 1,
-        }
-    }
-
-    #[inline]
-    fn is_empty(&self) -> bool {
-        self.pos >= self.batch.num_rows()
-    }
-
-    /// Take slice of batch with at most `length` rows from the cursor, then
-    /// advance the cursor.
-    ///
-    /// # Panics
-    /// Panics if `self` is invalid.
-    fn take_batch_slice(&mut self, length: usize) -> Batch {
-        let length = length.min(self.batch.num_rows() - self.pos);
-        let batch = self.batch.slice(self.pos, length);
-        self.pos += batch.num_rows();
-
-        batch
-    }
-
-    /// Push at most `length` rows from `self` to the `builder` and advance the cursor.
-    ///
-    /// # Panics
-    /// Panics if `self` is invalid.
-    fn push_rows_to(&mut self, builder: &mut BatchBuilder, length: usize) -> Result<()> {
-        let length = length.min(self.batch.num_rows() - self.pos);
-        builder.extend_slice_of(&self.batch, self.pos, length)?;
-        self.pos += length;
-
-        Ok(())
-    }
-
-    /// Push next row from `self` to the `builder` and advance the cursor.
-    ///
-    /// # Panics
-    /// Panics if `self` is invalid.
-    fn push_next_row_to(&mut self, builder: &mut BatchBuilder) -> Result<()> {
-        builder.push_row_of(&self.batch, self.pos)?;
-        self.pos += 1;
-
-        Ok(())
-    }
-}
-
-/// A `Node` represent an individual input data source to be merged.
-struct Node {
-    /// Schema of data source.
-    schema: ProjectedSchemaRef,
-    /// Data source of this `Node`.
-    source: Source,
-    /// Current batch to be read.
-    ///
-    /// `None` means the `source` has reached EOF.
-    cursor: Option<BatchCursor>,
-}
-
-impl fmt::Debug for Node {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        f.debug_struct("Node")
-            .field("source", &self.source)
-            .field("cursor", &self.cursor)
-            .finish_non_exhaustive()
-    }
-}
-
-impl Node {
-    async fn new(schema: ProjectedSchemaRef, mut source: Source) -> Result<Node> {
-        let cursor = source.next_non_empty_batch().await?.map(BatchCursor::new);
-        Ok(Node {
-            schema,
-            source,
-            cursor,
-        })
-    }
-
-    /// Returns the reference to the cursor.
-    ///
-    /// # Panics
-    /// Panics if `self` is EOF.
-    fn cursor_ref(&self) -> &BatchCursor {
-        self.cursor.as_ref().unwrap()
-    }
-
-    /// Returns first row in cursor.
-    ///
-    /// # Panics
-    /// Panics if `self` is EOF.
-    fn first_row(&self) -> RowCursor {
-        self.cursor_ref().first_row()
-    }
-
-    /// Returns last row in cursor.
-    ///
-    /// # Panics
-    /// Panics if `self` is EOF.
-    fn last_row(&self) -> RowCursor {
-        self.cursor_ref().last_row()
-    }
-
-    /// Compare first row of two nodes.
-    ///
-    /// # Panics
-    /// Panics if
-    /// - either `self` or `other` is EOF.
-    fn compare_first_row(&self, other: &Node) -> Ordering {
-        self.first_row().compare(&self.schema, &other.first_row())
-    }
-
-    /// Returns true if no more batch could be fetched from this node.
-    fn is_eof(&self) -> bool {
-        self.cursor.is_none()
-    }
-
-    /// Returns true if the key range of current batch in `self` is behind (exclusive) current
-    /// batch in `other`.
-    ///
-    /// # Panics
-    /// Panics if
-    /// - either `self` or `other` is EOF.
-    fn is_behind(&self, other: &Node) -> bool {
-        let first = self.first_row();
-        let last = other.last_row();
-        // `self` is after `other` if min (first) row of `self` is greater than
-        // max (last) row of `other`.
-        first.compare(&self.schema, &last) == Ordering::Greater
-    }
-
-    /// Fetch next batch and reset its cursor if `self` isn't EOF and the cursor
-    /// is empty.
-    ///
-    /// Returns true if a new batch has been fetched.
-    async fn maybe_fetch_next_batch(&mut self) -> Result<bool> {
-        let need_fetch = !self.is_eof() && self.cursor_ref().is_empty();
-        if !need_fetch {
-            // Still has remaining rows, no need to fetch.
-            return Ok(false);
-        }
-
-        // This ensure the cursor is either non empty or None (EOF).
-        match self.source.next_non_empty_batch().await? {
-            Some(batch) => {
-                self.cursor = Some(BatchCursor::new(batch));
-                Ok(true)
-            }
-            None => {
-                // EOF
-                self.cursor = None;
-                Ok(false)
-            }
-        }
-    }
-
-    /// Returns the mutable reference to the cursor.
-    ///
-    /// # Panics
-    /// Panics if `self` is EOF.
-    fn cursor_mut(&mut self) -> &mut BatchCursor {
-        self.cursor.as_mut().unwrap()
-    }
-
-    /// Take batch from this node.
-    ///
-    /// # Panics
-    /// Panics if `self` is EOF.
-    fn take_batch_slice(&mut self, length: usize) -> Batch {
-        self.cursor_mut().take_batch_slice(length)
-    }
-
-    /// Push at most `length` rows from `self` to the `builder`.
-    ///
-    /// # Panics
-    /// Panics if `self` is EOF.
-    fn push_rows_to(&mut self, builder: &mut BatchBuilder, length: usize) -> Result<()> {
-        self.cursor_mut().push_rows_to(builder, length)
-    }
-
-    /// Push next row from `self` to the `builder`.
-    ///
-    /// # Panics
-    /// Panics if `self` is EOF.
-    fn push_next_row_to(&mut self, builder: &mut BatchBuilder) -> Result<()> {
-        self.cursor_mut().push_next_row_to(builder)
-    }
-}
-
-impl PartialEq for Node {
-    fn eq(&self, other: &Node) -> bool {
-        self.compare_first_row(other) == Ordering::Equal
-    }
-}
-
-impl Eq for Node {}
-
-impl PartialOrd for Node {
-    fn partial_cmp(&self, other: &Node) -> Option<Ordering> {
-        Some(self.cmp(other))
-    }
-}
-
-impl Ord for Node {
-    fn cmp(&self, other: &Node) -> Ordering {
-        // The std binary heap is a max heap, but we want the nodes are ordered in
-        // ascend order, so we compare the nodes in reverse order.
-        other.compare_first_row(self)
-    }
-}
-
-/// A reader that would sort and merge `Batch` from multiple sources by key.
-///
-/// `Batch` from each `Source` **must** be sorted.
-pub struct MergeReader {
-    /// Whether the reader has been initialized.
-    initialized: bool,
-    /// Schema of data source.
-    schema: ProjectedSchemaRef,
-    /// Input data sources.
-    ///
-    /// All data source must have same schema. Initialize the reader would
-    /// convert all `Source`s into `Node`s and then clear this vector.
-    sources: Vec<Source>,
-    /// Holds `Node` whose key range of current batch **is** overlapped with the merge window.
-    ///
-    /// `Node` in this heap **must** not be empty. A `merge window` is the key range of the
-    /// root node in the `hot` heap.
-    hot: BinaryHeap<Node>,
-    /// Holds `Node` whose key range of current batch **isn't** overlapped with the merge window.
-    ///
-    /// `Node` in this heap **must** not be empty.
-    cold: BinaryHeap<Node>,
-    /// Suggested row number of each batch.
-    ///
-    /// The size of the batch yield from this reader may not always equal to this suggested size.
-    batch_size: usize,
-    /// Buffered batch.
-    batch_builder: BatchBuilder,
-}
-
-#[async_trait]
-impl BatchReader for MergeReader {
-    async fn next_batch(&mut self) -> Result<Option<Batch>> {
-        self.fetch_next_batch().await
-    }
-}
-
-pub struct MergeReaderBuilder {
-    schema: ProjectedSchemaRef,
-    sources: Vec<Source>,
-    batch_size: usize,
-}
-
-impl MergeReaderBuilder {
-    pub fn new(schema: ProjectedSchemaRef) -> Self {
-        MergeReaderBuilder::with_capacity(schema, 0)
-    }
-
-    pub fn with_capacity(schema: ProjectedSchemaRef, capacity: usize) -> Self {
-        MergeReaderBuilder {
-            schema,
-            sources: Vec::with_capacity(capacity),
-            batch_size: consts::READ_BATCH_SIZE,
-        }
-    }
-
-    pub fn push_batch_iter(mut self, iter: BoxedBatchIterator) -> Self {
-        self.sources.push(Source::Iter(iter));
-        self
-    }
-
-    pub fn push_batch_reader(mut self, reader: BoxedBatchReader) -> Self {
-        self.sources.push(Source::Reader(reader));
-        self
-    }
-
-    pub fn batch_size(mut self, size: usize) -> Self {
-        self.batch_size = size;
-        self
-    }
-
-    pub fn build(self) -> MergeReader {
-        let num_sources = self.sources.len();
-        let column_schemas = self.schema.schema_to_read().schema().column_schemas();
-        let batch_builder = BatchBuilder::with_capacity(
-            column_schemas.iter().map(|c| &c.data_type),
-            self.batch_size,
-        );
-
-        MergeReader {
-            initialized: false,
-            schema: self.schema,
-            sources: self.sources,
-            hot: BinaryHeap::with_capacity(num_sources),
-            cold: BinaryHeap::with_capacity(num_sources),
-            batch_size: self.batch_size,
-            batch_builder,
-        }
-    }
-}
-
-impl MergeReader {
-    /// Initialize the reader if it has not yet been initialized.
-    async fn try_init(&mut self) -> Result<()> {
-        if self.initialized {
-            return Ok(());
-        }
-
-        if self.sources.is_empty() {
-            self.initialized = true;
-            return Ok(());
-        }
-
-        for source in self.sources.drain(..) {
-            let node = Node::new(self.schema.clone(), source).await?;
-
-            if !node.is_eof() {
-                self.cold.push(node);
-            }
-        }
-
-        self.refill_hot();
-
-        self.initialized = true;
-
-        Ok(())
-    }
-
-    async fn fetch_next_batch(&mut self) -> Result<Option<Batch>> {
-        self.try_init().await?;
-
-        while !self.hot.is_empty() && self.batch_builder.num_rows() < self.batch_size {
-            if self.hot.len() == 1 {
-                // No need to do merge sort if only one batch in the hot heap.
-                let fetch_row_num = self.batch_size - self.batch_builder.num_rows();
-                if let Some(batch) = self.fetch_batch_from_hottest(fetch_row_num).await? {
-                    // The builder is empty and we have fetched a new batch from this node.
-                    return Ok(Some(batch));
-                }
-                // Otherwise, some rows may have been pushed into the builder.
-            } else {
-                // We could only fetch one row from the hottest node.
-                self.fetch_one_row_from_hottest().await?;
-            }
-        }
-
-        // Check buffered rows in the builder.
-        if self.batch_builder.is_empty() {
-            Ok(None)
-        } else {
-            self.batch_builder.build().map(Some)
-        }
-    }
-
-    /// Move nodes in `cold` heap, whose key range is overlapped with current merge
-    /// window to `hot` heap.
-    fn refill_hot(&mut self) {
-        while !self.cold.is_empty() {
-            if let Some(merge_window) = self.hot.peek() {
-                let warmest = self.cold.peek().unwrap();
-                if warmest.is_behind(merge_window) {
-                    // if the warmest node in the `cold` heap is totally after the
-                    // `merge_window`, then no need to add more nodes into the `hot`
-                    // heap for merge sorting.
-                    break;
-                }
-            }
-
-            let warmest = self.cold.pop().unwrap();
-            self.hot.push(warmest);
-        }
-    }
-
-    /// Fetch at most `fetch_row_num` from the hottest node and attempt to return them directly
-    /// instead of pushing into the builder if the `self.batch_builder` is empty.
-    async fn fetch_batch_from_hottest(&mut self, fetch_row_num: usize) -> Result<Option<Batch>> {
-        assert_eq!(1, self.hot.len());
-
-        let mut hottest = self.hot.pop().unwrap();
-        let batch = if self.batch_builder.is_empty() {
-            Some(hottest.take_batch_slice(fetch_row_num))
-        } else {
-            hottest.push_rows_to(&mut self.batch_builder, fetch_row_num)?;
-
-            None
-        };
-
-        self.reheap(hottest).await?;
-
-        Ok(batch)
-    }
-
-    /// Fetch one row from the hottest node.
-    async fn fetch_one_row_from_hottest(&mut self) -> Result<()> {
-        let mut hottest = self.hot.pop().unwrap();
-        hottest.push_next_row_to(&mut self.batch_builder)?;
-
-        self.reheap(hottest).await
-    }
-
-    /// Fetch next batch from this node and reset its cursor, then push the node back to a
-    /// proper heap.
-    async fn reheap(&mut self, mut node: Node) -> Result<()> {
-        let fetched_new_batch = node.maybe_fetch_next_batch().await?;
-
-        if node.is_eof() {
-            // The merge window would be updated, need to refill the hot heap.
-            self.refill_hot();
-        } else if fetched_new_batch {
-            // A new batch has been fetched from the node, thus the key range of this node
-            // has been changed. Try to find a proper heap for this node.
-            let node_is_cold = if let Some(hottest) = self.hot.peek() {
-                // Now key range of this node is behind the hottest node's.
-                node.is_behind(hottest)
-            } else {
-                // Setting this to false should not affect correctness but performance because
-                // `refille_hot()` ensures the hottest node is correct.
-                true
-            };
-
-            if node_is_cold {
-                self.cold.push(node);
-            } else {
-                self.hot.push(node);
-            }
-            // Anyway, the merge window has been changed, we need to refill the hot heap.
-            self.refill_hot();
-        } else {
-            // No new batch has been fetched, so the end key of merge window has not been
-            // changed, we could just put the node back to the hot heap.
-            self.hot.push(node);
-        }
-
-        Ok(())
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use datatypes::prelude::ScalarVector;
-    use datatypes::vectors::{Int64Vector, TimestampMillisecondVector};
-
-    use super::*;
-    use crate::test_util::read_util::{self, Batches};
-
-    #[tokio::test]
-    async fn test_merge_reader_empty() {
-        let schema = read_util::new_projected_schema();
-
-        let mut reader = MergeReaderBuilder::new(schema).build();
-
-        assert!(reader.next_batch().await.unwrap().is_none());
-        // Call next_batch() again is allowed.
-        assert!(reader.next_batch().await.unwrap().is_none());
-    }
-
-    #[tokio::test]
-    async fn test_node() {
-        let schema = read_util::new_projected_schema();
-        let left_source = read_util::build_boxed_iter(&[&[(1, None), (3, None), (5, None)]]);
-        let mut left = Node::new(schema.clone(), Source::Iter(left_source))
-            .await
-            .unwrap();
-
-        let right_source = read_util::build_boxed_reader(&[&[(2, None), (3, None), (6, None)]]);
-        let mut right = Node::new(schema.clone(), Source::Reader(right_source))
-            .await
-            .unwrap();
-
-        // We use reverse order for a node.
-        assert!(left > right);
-        assert_ne!(left, right);
-
-        // Advance the left and right node.
-        left.cursor_mut().pos += 1;
-        right.cursor_mut().pos += 1;
-        assert_eq!(left, right);
-
-        // Check Debug is implemented.
-        let output = format!("{left:?}");
-        assert!(output.contains("cursor"));
-        assert!(output.contains("pos: 1"));
-        let output = format!("{right:?}");
-        assert!(output.contains("cursor"));
-        let output = format!("{:?}", left.first_row());
-        assert!(output.contains("pos: 1"));
-    }
-
-    fn build_merge_reader(sources: &[Batches], num_iter: usize, batch_size: usize) -> MergeReader {
-        let schema = read_util::new_projected_schema();
-        let mut builder =
-            MergeReaderBuilder::with_capacity(schema, sources.len()).batch_size(batch_size);
-
-        for (i, source) in sources.iter().enumerate() {
-            if i < num_iter {
-                builder = builder.push_batch_iter(read_util::build_boxed_iter(source));
-            } else {
-                builder = builder.push_batch_reader(read_util::build_boxed_reader(source));
-            }
-        }
-
-        builder.build()
-    }
-
-    async fn check_merge_reader_result(mut reader: MergeReader, input: &[Batches<'_>]) {
-        let mut expect: Vec<_> = input
-            .iter()
-            .flat_map(|v| v.iter())
-            .flat_map(|v| v.iter().copied())
-            .collect();
-        expect.sort_by_key(|k| k.0);
-
-        let result = read_util::collect_kv_batch(&mut reader).await;
-        assert_eq!(expect, result);
-
-        // Call next_batch() again is allowed.
-        assert!(reader.next_batch().await.unwrap().is_none());
-    }
-
-    async fn check_merge_reader_by_batch(mut reader: MergeReader, expect_batches: Batches<'_>) {
-        let mut result = Vec::new();
-        while let Some(batch) = reader.next_batch().await.unwrap() {
-            let key = batch
-                .column(0)
-                .as_any()
-                .downcast_ref::<TimestampMillisecondVector>()
-                .unwrap();
-            let value = batch
-                .column(1)
-                .as_any()
-                .downcast_ref::<Int64Vector>()
-                .unwrap();
-
-            let batch: Vec<_> = key
-                .iter_data()
-                .zip(value.iter_data())
-                .map(|(k, v)| (k.unwrap().into(), v))
-                .collect();
-            result.push(batch);
-        }
-
-        for (expect, actual) in expect_batches.iter().zip(result.iter()) {
-            assert_eq!(expect, actual);
-        }
-    }
-
-    #[tokio::test]
-    async fn test_merge_multiple_interleave() {
-        common_telemetry::init_default_ut_logging();
-
-        let input: &[Batches] = &[
-            &[&[(1, Some(1)), (5, Some(5)), (9, Some(9))]],
-            &[&[(2, Some(2)), (3, Some(3)), (8, Some(8))]],
-            &[&[(7, Some(7)), (12, Some(12))]],
-        ];
-        let reader = build_merge_reader(input, 1, 3);
-        check_merge_reader_result(reader, input).await;
-
-        let input: &[Batches] = &[
-            &[
-                &[(1, Some(1)), (2, Some(2))],
-                &[(3, Some(3)), (4, Some(4))],
-                &[(5, Some(5)), (12, Some(12))],
-            ],
-            &[&[(6, Some(6)), (7, Some(7)), (18, Some(18))]],
-            &[&[(13, Some(13)), (15, Some(15))]],
-        ];
-        let reader = build_merge_reader(input, 1, 3);
-        check_merge_reader_by_batch(
-            reader,
-            &[
-                // The former two batches could be returned directly.
-                &[(1, Some(1)), (2, Some(2))],
-                &[(3, Some(3)), (4, Some(4))],
-                &[(5, Some(5)), (6, Some(6)), (7, Some(7))],
-                &[(12, Some(12)), (13, Some(13)), (15, Some(15))],
-                &[(18, Some(18))],
-            ],
-        )
-        .await;
-
-        let input: &[Batches] = &[
-            &[
-                &[(1, Some(1)), (2, Some(2))],
-                &[(5, Some(5)), (9, Some(9))],
-                &[(14, Some(14)), (17, Some(17))],
-            ],
-            &[&[(6, Some(6)), (7, Some(7))], &[(15, Some(15))]],
-        ];
-        let reader = build_merge_reader(input, 1, 2);
-        check_merge_reader_by_batch(
-            reader,
-            &[
-                &[(1, Some(1)), (2, Some(2))],
-                // Could not return batch (6, 7) directly.
-                &[(5, Some(5)), (6, Some(6))],
-                &[(7, Some(7)), (9, Some(9))],
-                &[(14, Some(14)), (15, Some(15))],
-                &[(17, Some(17))],
-            ],
-        )
-        .await;
-    }
-
-    #[tokio::test]
-    async fn test_merge_one_source() {
-        common_telemetry::init_default_ut_logging();
-
-        let input: &[Batches] = &[&[
-            &[(1, Some(1)), (2, Some(2)), (3, Some(3))],
-            &[(4, Some(4)), (5, Some(5)), (6, Some(6))],
-        ]];
-        let reader = build_merge_reader(input, 1, 2);
-
-        check_merge_reader_result(reader, input).await;
-    }
-
-    #[tokio::test]
-    async fn test_merge_with_empty_batch() {
-        let input: &[Batches] = &[
-            &[
-                &[(1, Some(1)), (2, Some(2))],
-                &[(3, Some(3)), (6, Some(6))],
-                &[],
-                &[],
-                &[(8, Some(8)), (12, Some(12))],
-                &[],
-            ],
-            &[
-                &[(4, Some(4)), (5, Some(5))],
-                &[],
-                &[(15, None), (18, None), (20, None)],
-            ],
-            &[&[(13, Some(13)), (19, None)], &[], &[]],
-        ];
-        let reader = build_merge_reader(input, 1, 2);
-
-        check_merge_reader_result(reader, input).await;
-    }
-
-    #[tokio::test]
-    async fn test_merge_duplicate_key() {
-        let input: &[Batches] = &[
-            &[
-                &[(1, Some(1)), (5, Some(5)), (8, Some(8))],
-                &[(9, None), (11, None)],
-                &[(12, Some(12)), (15, None)],
-            ],
-            &[&[(1, Some(1)), (3, Some(3)), (8, Some(8))], &[(16, None)]],
-            &[
-                &[(7, Some(7)), (12, Some(12))],
-                &[(15, None), (16, None), (17, None)],
-            ],
-            &[&[(15, None)]],
-        ];
-        let reader = build_merge_reader(input, 2, 2);
-        check_merge_reader_result(reader, input).await;
-    }
-}
--- a/src/storage/src/read/windowed.rs
+++ b/src/storage/src/read/windowed.rs
@@ -1,171 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-use arrow::compute::SortOptions;
-use arrow::row::{RowConverter, SortField};
-use arrow_array::{Array, ArrayRef};
-use common_recordbatch::OrderOption;
-use datatypes::data_type::DataType;
-use datatypes::vectors::Helper;
-use snafu::ResultExt;
-
-use crate::error::{self, Result};
-use crate::read::{Batch, BatchReader};
-use crate::schema::{ProjectedSchemaRef, StoreSchema};
-
-/// [WindowedReader] provides a windowed record batch reader that scans all rows within a window
-/// at a time and sort these rows ordered in `[<timestamp>, <PK>]` order.
-pub struct WindowedReader<R> {
-    /// Schema to read
-    pub schema: ProjectedSchemaRef,
-    /// Each reader reads a slice of time window
-    pub readers: Vec<R>,
-    /// `order_options` defines how records within windows are sorted.
-    pub order_options: Vec<OrderOption>,
-}
-
-impl<R> WindowedReader<R> {
-    /// Creates a new [WindowedReader] from given schema and a set of boxed readers.
-    ///
-    /// ### Note
-    /// [WindowedReader] always reads the readers in a reverse order. The last reader in `readers`
-    /// gets polled first.
-    pub fn new(
-        schema: ProjectedSchemaRef,
-        readers: Vec<R>,
-        order_options: Vec<OrderOption>,
-    ) -> Self {
-        Self {
-            schema,
-            readers,
-            order_options,
-        }
-    }
-}
-
-#[async_trait::async_trait]
-impl<R> BatchReader for WindowedReader<R>
-where
-    R: BatchReader,
-{
-    async fn next_batch(&mut self) -> Result<Option<Batch>> {
-        let _window_scan_elapsed = crate::metrics::WINDOW_SCAN_ELAPSED.start_timer();
-        let Some(mut reader) = self.readers.pop() else {
-            return Ok(None);
-        };
-
-        let store_schema = self.schema.schema_to_read();
-        let mut batches = vec![];
-        while let Some(batch) = reader.next_batch().await? {
-            batches.push(
-                batch
-                    .columns
-                    .into_iter()
-                    .map(|v| v.to_arrow_array())
-                    .collect::<Vec<_>>(),
-            );
-        }
-
-        let Some(num_columns) = batches.get(0).map(|b| b.len()) else {
-            // the reader does not yield data, a batch of empty vectors must be returned instead of
-            // an empty batch without any column.
-            let empty_columns = store_schema
-                .columns()
-                .iter()
-                .map(|s| s.desc.data_type.create_mutable_vector(0).to_vector())
-                .collect();
-            return Ok(Some(Batch::new(empty_columns)));
-        };
-        let mut vectors_in_batch = Vec::with_capacity(num_columns);
-
-        for idx in 0..num_columns {
-            let columns: Vec<&dyn Array> =
-                batches.iter().map(|b| b[idx].as_ref()).collect::<Vec<_>>();
-            vectors_in_batch
-                .push(arrow::compute::concat(&columns).context(error::ConvertColumnsToRowsSnafu)?);
-        }
-        if let Some(v) = vectors_in_batch.get(0) {
-            crate::metrics::WINDOW_SCAN_ROWS_PER_WINDOW.observe(v.len() as f64);
-        }
-        let sorted = sort_by_rows(&self.schema, vectors_in_batch, &self.order_options)?;
-        let vectors = sorted
-            .iter()
-            .zip(store_schema.columns().iter().map(|c| &c.desc.name))
-            .map(|(arr, name)| {
-                Helper::try_into_vector(arr).context(error::ConvertChunkSnafu { name })
-            })
-            .collect::<Result<_>>()?;
-        Ok(Some(Batch::new(vectors)))
-    }
-}
-
-fn sort_by_rows(
-    schema: &ProjectedSchemaRef,
-    arrays: Vec<ArrayRef>,
-    order_options: &[OrderOption],
-) -> Result<Vec<ArrayRef>> {
-    let store_schema = schema.schema_to_read();
-    let sort_columns = build_sorted_columns(store_schema, order_options);
-    // Convert columns to rows to speed lexicographic sort
-    // TODO(hl): maybe optimize to lexsort_to_index when only timestamp column is involved.
-    let row_converter = RowConverter::new(
-        sort_columns
-            .iter()
-            .map(|(idx, descending)| {
-                SortField::new_with_options(
-                    store_schema.columns()[*idx].desc.data_type.as_arrow_type(),
-                    SortOptions {
-                        descending: *descending,
-                        nulls_first: true,
-                    },
-                )
-            })
-            .collect(),
-    )
-    .context(error::ConvertColumnsToRowsSnafu)?;
-
-    let columns_to_sort = sort_columns
-        .into_iter()
-        .map(|(idx, _)| arrays[idx].clone())
-        .collect::<Vec<_>>();
-
-    let rows_to_sort = row_converter
-        .convert_columns(&columns_to_sort)
-        .context(error::ConvertColumnsToRowsSnafu)?;
-
-    let mut sort_pairs = rows_to_sort.iter().enumerate().collect::<Vec<_>>();
-    sort_pairs.sort_unstable_by(|(_, a), (_, b)| a.cmp(b));
-
-    let idx =
-        arrow::array::UInt32Array::from_iter_values(sort_pairs.iter().map(|(i, _)| *i as u32));
-
-    let sorted = arrays
-        .iter()
-        .map(|arr| arrow::compute::take(arr, &idx, None))
-        .collect::<arrow::error::Result<Vec<_>>>()
-        .context(error::SortArraysSnafu)?;
-
-    debug_assert_eq!(sorted.len(), store_schema.num_columns());
-
-    Ok(sorted)
-}
-
-/// Builds sorted columns from `order_options`.
-/// Returns a vector of columns indices to sort and sort orders (true means descending order).
-fn build_sorted_columns(schema: &StoreSchema, order_options: &[OrderOption]) -> Vec<(usize, bool)> {
-    order_options
-        .iter()
-        .map(|o| (schema.column_index(&o.name), o.options.descending))
-        .collect()
-}
--- a/src/storage/src/region.rs
+++ b/src/storage/src/region.rs
@@ -1,808 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#[cfg(test)]
-mod tests;
-mod writer;
-
-use std::collections::BTreeMap;
-use std::fmt;
-use std::sync::atomic::{AtomicI64, Ordering};
-use std::sync::Arc;
-use std::time::Duration;
-
-use async_trait::async_trait;
-use common_telemetry::{info, logging};
-use common_time::util;
-use snafu::ResultExt;
-use store_api::logstore::LogStore;
-use store_api::manifest::{
-    self, Manifest, ManifestLogStorage, ManifestVersion, MetaActionIterator,
-};
-use store_api::storage::{
-    AlterRequest, CloseContext, CompactContext, CompactionStrategy, FlushContext, FlushReason,
-    OpenOptions, ReadContext, Region, RegionId, SequenceNumber, WriteContext, WriteResponse,
-};
-
-use crate::compaction::{
-    compaction_strategy_to_picker, CompactionPickerRef, CompactionSchedulerRef,
-};
-use crate::config::EngineConfig;
-use crate::error::{self, Error, Result};
-use crate::file_purger::FilePurgerRef;
-use crate::flush::{FlushSchedulerRef, FlushStrategyRef};
-use crate::manifest::action::{
-    RawRegionMetadata, RegionChange, RegionCheckpoint, RegionMetaAction, RegionMetaActionList,
-};
-use crate::manifest::region::RegionManifest;
-use crate::memtable::{MemtableBuilderRef, MemtableVersion};
-use crate::metadata::{RegionMetaImpl, RegionMetadata, RegionMetadataRef};
-pub(crate) use crate::region::writer::schedule_compaction;
-pub use crate::region::writer::{
-    AlterContext, RegionWriter, RegionWriterRef, WriterCompactRequest, WriterContext,
-};
-use crate::region::writer::{DropContext, TruncateContext};
-use crate::schema::compat::CompatWrite;
-use crate::snapshot::SnapshotImpl;
-use crate::sst::{AccessLayerRef, LevelMetas};
-use crate::version::{
-    Version, VersionControl, VersionControlRef, VersionEdit, INIT_COMMITTED_SEQUENCE,
-};
-use crate::wal::Wal;
-use crate::write_batch::WriteBatch;
-
-/// [Region] implementation.
-pub struct RegionImpl<S: LogStore> {
-    inner: Arc<RegionInner<S>>,
-}
-
-impl<S: LogStore> Clone for RegionImpl<S> {
-    fn clone(&self) -> Self {
-        Self {
-            inner: self.inner.clone(),
-        }
-    }
-}
-
-impl<S: LogStore> fmt::Debug for RegionImpl<S> {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        f.debug_struct("RegionImpl")
-            .field("id", &self.inner.shared.id)
-            .field("name", &self.inner.shared.name)
-            .field("wal", &self.inner.wal)
-            .field("flush_strategy", &self.inner.flush_strategy)
-            .field("compaction_scheduler", &self.inner.compaction_scheduler)
-            .field("sst_layer", &self.inner.sst_layer)
-            .field("manifest", &self.inner.manifest)
-            .finish()
-    }
-}
-
-#[async_trait]
-impl<S: LogStore> Region for RegionImpl<S> {
-    type Error = Error;
-    type Meta = RegionMetaImpl;
-    type WriteRequest = WriteBatch;
-    type Snapshot = SnapshotImpl;
-
-    fn id(&self) -> RegionId {
-        self.inner.shared.id
-    }
-
-    fn name(&self) -> &str {
-        &self.inner.shared.name
-    }
-
-    fn in_memory_metadata(&self) -> RegionMetaImpl {
-        self.inner.in_memory_metadata()
-    }
-
-    async fn write(&self, ctx: &WriteContext, mut request: WriteBatch) -> Result<WriteResponse> {
-        // Compat the schema of the write batch outside of the write lock.
-        self.inner.compat_write_batch(&mut request)?;
-
-        self.inner.write(ctx, request).await
-    }
-
-    fn snapshot(&self, _ctx: &ReadContext) -> Result<SnapshotImpl> {
-        Ok(self.inner.create_snapshot())
-    }
-
-    fn write_request(&self) -> Self::WriteRequest {
-        let metadata = self.inner.version_control().metadata();
-        let user_schema = metadata.user_schema().clone();
-        let row_key_end = metadata.schema().store_schema().row_key_end();
-
-        WriteBatch::new(user_schema, row_key_end)
-    }
-
-    async fn alter(&self, request: AlterRequest) -> Result<()> {
-        self.inner.alter(request).await
-    }
-
-    async fn drop_region(&self) -> Result<()> {
-        crate::metrics::REGION_COUNT.dec();
-        self.inner.drop_region().await
-    }
-
-    fn disk_usage_bytes(&self) -> u64 {
-        let version = self.inner.version_control().current();
-        version
-            .ssts()
-            .levels()
-            .iter()
-            .map(|level_ssts| level_ssts.files().map(|sst| sst.file_size()).sum::<u64>())
-            .sum()
-    }
-
-    async fn flush(&self, ctx: &FlushContext) -> Result<()> {
-        self.inner.flush(ctx).await
-    }
-
-    async fn compact(&self, ctx: &CompactContext) -> std::result::Result<(), Self::Error> {
-        self.inner.compact(ctx).await
-    }
-
-    async fn truncate(&self) -> Result<()> {
-        self.inner.truncate().await
-    }
-}
-
-/// Storage related config for region.
-///
-/// Contains all necessary storage related components needed by the region, such as logstore,
-/// manifest, memtable builder.
-pub struct StoreConfig<S: LogStore> {
-    pub log_store: Arc<S>,
-    pub sst_layer: AccessLayerRef,
-    pub manifest: RegionManifest,
-    pub memtable_builder: MemtableBuilderRef,
-    pub flush_scheduler: FlushSchedulerRef<S>,
-    pub flush_strategy: FlushStrategyRef,
-    pub compaction_scheduler: CompactionSchedulerRef<S>,
-    pub engine_config: Arc<EngineConfig>,
-    pub file_purger: FilePurgerRef,
-    pub ttl: Option<Duration>,
-    pub write_buffer_size: usize,
-    pub compaction_strategy: CompactionStrategy,
-}
-
-pub type RecoveredMetadata = (SequenceNumber, (ManifestVersion, RawRegionMetadata));
-pub type RecoveredMetadataMap = BTreeMap<SequenceNumber, (ManifestVersion, RawRegionMetadata)>;
-
-impl<S: LogStore> RegionImpl<S> {
-    /// Create a new region and also persist the region metadata to manifest.
-    ///
-    /// The caller should avoid calling this method simultaneously.
-    pub async fn create(
-        metadata: RegionMetadata,
-        store_config: StoreConfig<S>,
-    ) -> Result<RegionImpl<S>> {
-        let metadata = Arc::new(metadata);
-
-        // Try to persist region data to manifest, ensure the new region could be recovered from
-        // the manifest.
-        let manifest_version = {
-            let _timer = crate::metrics::CREATE_REGION_UPDATE_MANIFEST.start_timer();
-            store_config
-                .manifest
-                .update(RegionMetaActionList::with_action(RegionMetaAction::Change(
-                    RegionChange {
-                        metadata: metadata.as_ref().into(),
-                        committed_sequence: INIT_COMMITTED_SEQUENCE,
-                    },
-                )))
-                .await?
-        };
-
-        let mutable_memtable = store_config
-            .memtable_builder
-            .build(metadata.schema().clone());
-        let version = Version::with_manifest_version(
-            metadata,
-            manifest_version,
-            mutable_memtable,
-            store_config.sst_layer.clone(),
-            store_config.file_purger.clone(),
-        );
-        let region = RegionImpl::new(version, store_config);
-        crate::metrics::REGION_COUNT.inc();
-
-        Ok(region)
-    }
-
-    /// Create a new region without persisting manifest.
-    fn new(version: Version, store_config: StoreConfig<S>) -> RegionImpl<S> {
-        let metadata = version.metadata();
-        let id = metadata.id();
-        let name = metadata.name().to_string();
-        let version_control = VersionControl::with_version(version);
-        let wal = Wal::new(id, store_config.log_store);
-
-        let compaction_picker = compaction_strategy_to_picker(&store_config.compaction_strategy);
-        let inner = Arc::new(RegionInner {
-            shared: Arc::new(SharedData {
-                id,
-                name,
-                version_control: Arc::new(version_control),
-                last_flush_millis: AtomicI64::new(0),
-            }),
-            writer: Arc::new(RegionWriter::new(
-                store_config.memtable_builder,
-                store_config.engine_config.clone(),
-                store_config.ttl,
-                store_config.write_buffer_size,
-                store_config.compaction_scheduler.clone(),
-                compaction_picker.clone(),
-            )),
-            wal,
-            flush_strategy: store_config.flush_strategy,
-            flush_scheduler: store_config.flush_scheduler,
-            compaction_scheduler: store_config.compaction_scheduler,
-            compaction_picker,
-            sst_layer: store_config.sst_layer,
-            manifest: store_config.manifest,
-        });
-
-        RegionImpl { inner }
-    }
-
-    /// Open an existing region and recover its data.
-    ///
-    /// The caller should avoid calling this method simultaneously.
-    pub async fn open(
-        name: String,
-        store_config: StoreConfig<S>,
-        _opts: &OpenOptions,
-    ) -> Result<Option<RegionImpl<S>>> {
-        // Load version meta data from manifest.
-        let (version, mut recovered_metadata) = match Self::recover_from_manifest(
-            &store_config.manifest,
-            &store_config.memtable_builder,
-            &store_config.sst_layer,
-            &store_config.file_purger,
-        )
-        .await?
-        {
-            (None, _) => return Ok(None),
-            (Some(v), m) => (v, m),
-        };
-
-        logging::debug!(
-            "Region recovered version from manifest, version: {:?}",
-            version
-        );
-
-        let metadata = version.metadata().clone();
-        let flushed_sequence = version.flushed_sequence();
-        let version_control = Arc::new(VersionControl::with_version(version));
-
-        let recovered_metadata_after_flushed =
-            recovered_metadata.split_off(&(flushed_sequence + 1));
-        // apply the last flushed metadata
-        if let Some((sequence, (manifest_version, metadata))) = recovered_metadata.pop_last() {
-            let metadata: RegionMetadataRef = Arc::new(
-                metadata
-                    .try_into()
-                    .context(error::InvalidRawRegionSnafu { region: &name })?,
-            );
-            let mutable_memtable = store_config
-                .memtable_builder
-                .build(metadata.schema().clone());
-            version_control.freeze_mutable_and_apply_metadata(
-                metadata,
-                manifest_version,
-                mutable_memtable,
-            );
-
-            logging::debug!(
-                "Applied the last flushed metadata to region: {}, sequence: {}, manifest: {}",
-                name,
-                sequence,
-                manifest_version,
-            );
-        }
-
-        let wal = Wal::new(metadata.id(), store_config.log_store);
-        wal.obsolete(flushed_sequence).await?;
-        info!(
-            "Obsolete WAL entries on startup, region: {}, flushed sequence: {}",
-            metadata.id(),
-            flushed_sequence
-        );
-
-        let shared = Arc::new(SharedData {
-            id: metadata.id(),
-            name,
-            version_control,
-            last_flush_millis: AtomicI64::new(0),
-        });
-
-        let compaction_picker = compaction_strategy_to_picker(&store_config.compaction_strategy);
-        let writer = Arc::new(RegionWriter::new(
-            store_config.memtable_builder,
-            store_config.engine_config.clone(),
-            store_config.ttl,
-            store_config.write_buffer_size,
-            store_config.compaction_scheduler.clone(),
-            compaction_picker.clone(),
-        ));
-
-        let writer_ctx = WriterContext {
-            shared: &shared,
-            flush_strategy: &store_config.flush_strategy,
-            flush_scheduler: &store_config.flush_scheduler,
-            compaction_scheduler: &store_config.compaction_scheduler,
-            sst_layer: &store_config.sst_layer,
-            wal: &wal,
-            writer: &writer,
-            manifest: &store_config.manifest,
-            compaction_picker: compaction_picker.clone(),
-        };
-        // Replay all unflushed data.
-        writer
-            .replay(recovered_metadata_after_flushed, writer_ctx)
-            .await?;
-
-        let inner = Arc::new(RegionInner {
-            shared,
-            writer,
-            wal,
-            flush_strategy: store_config.flush_strategy,
-            flush_scheduler: store_config.flush_scheduler,
-            compaction_scheduler: store_config.compaction_scheduler,
-            compaction_picker,
-            sst_layer: store_config.sst_layer,
-            manifest: store_config.manifest,
-        });
-
-        crate::metrics::REGION_COUNT.inc();
-        Ok(Some(RegionImpl { inner }))
-    }
-
-    /// Get ID of this region.
-    pub fn id(&self) -> RegionId {
-        self.inner.shared.id()
-    }
-
-    /// Returns last flush timestamp in millis.
-    pub(crate) fn last_flush_millis(&self) -> i64 {
-        self.inner.shared.last_flush_millis()
-    }
-
-    /// Returns the [VersionControl] of the region.
-    pub(crate) fn version_control(&self) -> &VersionControl {
-        self.inner.version_control()
-    }
-
-    fn create_version_with_checkpoint(
-        checkpoint: RegionCheckpoint,
-        memtable_builder: &MemtableBuilderRef,
-        sst_layer: &AccessLayerRef,
-        file_purger: &FilePurgerRef,
-    ) -> Result<Option<Version>> {
-        if checkpoint.checkpoint.is_none() {
-            return Ok(None);
-        }
-        // Safety: it's safe to unwrap here, checking it above.
-        let s = checkpoint.checkpoint.unwrap();
-
-        let region = s.metadata.name.clone();
-        let region_metadata: RegionMetadata = s
-            .metadata
-            .try_into()
-            .context(error::InvalidRawRegionSnafu { region })?;
-
-        let memtable = memtable_builder.build(region_metadata.schema().clone());
-        let mut version = Version::with_manifest_version(
-            Arc::new(region_metadata),
-            checkpoint.last_version,
-            memtable,
-            sst_layer.clone(),
-            file_purger.clone(),
-        );
-
-        if let Some(v) = s.version {
-            version.apply_checkpoint(
-                v.flushed_sequence,
-                v.manifest_version,
-                v.files.into_values(),
-            );
-        }
-
-        Ok(Some(version))
-    }
-
-    async fn recover_from_manifest(
-        manifest: &RegionManifest,
-        memtable_builder: &MemtableBuilderRef,
-        sst_layer: &AccessLayerRef,
-        file_purger: &FilePurgerRef,
-    ) -> Result<(Option<Version>, RecoveredMetadataMap)> {
-        let checkpoint = manifest.last_checkpoint().await?;
-
-        let (start, end, mut version) = if let Some(checkpoint) = checkpoint {
-            (
-                checkpoint.last_version + 1,
-                manifest::MAX_VERSION,
-                Self::create_version_with_checkpoint(
-                    checkpoint,
-                    memtable_builder,
-                    sst_layer,
-                    file_purger,
-                )?,
-            )
-        } else {
-            (manifest::MIN_VERSION, manifest::MAX_VERSION, None)
-        };
-
-        let mut iter = manifest.scan(start, end).await?;
-
-        let mut actions = Vec::new();
-        let mut last_manifest_version = manifest::MIN_VERSION;
-        let mut recovered_metadata = BTreeMap::new();
-
-        while let Some((manifest_version, action_list)) = iter.next_action().await? {
-            last_manifest_version = manifest_version;
-
-            for action in action_list.actions {
-                match (action, version) {
-                    (RegionMetaAction::Change(c), None) => {
-                        let region = c.metadata.name.clone();
-                        let region_metadata: RegionMetadata = c
-                            .metadata
-                            .try_into()
-                            .context(error::InvalidRawRegionSnafu { region })?;
-                        // Use current schema to build a memtable. This might be replaced later
-                        // in `freeze_mutable_and_apply_metadata()`.
-                        let memtable = memtable_builder.build(region_metadata.schema().clone());
-                        version = Some(Version::with_manifest_version(
-                            Arc::new(region_metadata),
-                            last_manifest_version,
-                            memtable,
-                            sst_layer.clone(),
-                            file_purger.clone(),
-                        ));
-                        for (manifest_version, action) in actions.drain(..) {
-                            version = Self::replay_edit(manifest_version, action, version);
-                        }
-                    }
-                    (RegionMetaAction::Change(c), Some(v)) => {
-                        let _ = recovered_metadata
-                            .insert(c.committed_sequence, (manifest_version, c.metadata));
-                        version = Some(v);
-                    }
-                    (RegionMetaAction::Remove(r), Some(v)) => {
-                        manifest.stop().await?;
-
-                        let files = v.ssts().mark_all_files_deleted();
-                        logging::info!(
-                            "Try to remove all SSTs, region: {}, files: {:?}",
-                            r.region_id,
-                            files
-                        );
-
-                        manifest
-                            .manifest_store()
-                            .delete_all(v.manifest_version())
-                            .await?;
-                        return Ok((None, recovered_metadata));
-                    }
-                    (RegionMetaAction::Truncate(t), Some(mut v)) => {
-                        let files = v.ssts().mark_all_files_deleted();
-                        logging::info!(
-                            "Try to remove all SSTs on truncate, region: {}, files: {:?}",
-                            t.region_id,
-                            files
-                        );
-                        let region_metadata = v.metadata().clone();
-                        let memtables = Arc::new(MemtableVersion::new(
-                            memtable_builder.build(region_metadata.schema().clone()),
-                        ));
-                        let ssts =
-                            Arc::new(LevelMetas::new(sst_layer.clone(), file_purger.clone()));
-                        v.reset(
-                            v.manifest_version() + 1,
-                            memtables,
-                            ssts,
-                            t.committed_sequence,
-                        );
-                        version = Some(v);
-                    }
-                    (action, None) => {
-                        actions.push((manifest_version, action));
-                        version = None;
-                    }
-                    (action, Some(v)) => {
-                        version = Self::replay_edit(manifest_version, action, Some(v));
-                    }
-                }
-            }
-        }
-
-        assert!(actions.is_empty() || version.is_none());
-
-        if let Some(version) = &version {
-            // update manifest state after recovering
-            let protocol = iter.last_protocol();
-            manifest.update_state(last_manifest_version + 1, protocol.clone());
-            manifest.set_flushed_manifest_version(version.manifest_version());
-        }
-
-        Ok((version, recovered_metadata))
-    }
-
-    fn replay_edit(
-        manifest_version: ManifestVersion,
-        action: RegionMetaAction,
-        version: Option<Version>,
-    ) -> Option<Version> {
-        if let RegionMetaAction::Edit(e) = action {
-            let edit = VersionEdit {
-                files_to_add: e.files_to_add,
-                files_to_remove: e.files_to_remove,
-                flushed_sequence: e.flushed_sequence,
-                manifest_version,
-                max_memtable_id: None,
-                compaction_time_window: e.compaction_time_window,
-            };
-            version.map(|mut v| {
-                v.apply_edit(edit);
-                v
-            })
-        } else {
-            version
-        }
-    }
-
-    /// Compact the region manually.
-    pub async fn compact(&self, ctx: &CompactContext) -> Result<()> {
-        self.inner.compact(ctx).await
-    }
-
-    pub async fn close(&self, ctx: &CloseContext) -> Result<()> {
-        crate::metrics::REGION_COUNT.dec();
-        self.inner.close(ctx).await
-    }
-}
-
-// Private methods for tests.
-#[cfg(test)]
-impl<S: LogStore> RegionImpl<S> {
-    #[inline]
-    fn committed_sequence(&self) -> store_api::storage::SequenceNumber {
-        self.inner.version_control().committed_sequence()
-    }
-
-    fn current_manifest_version(&self) -> ManifestVersion {
-        self.inner.version_control().current_manifest_version()
-    }
-
-    /// Write to inner, also the `RegionWriter` directly.
-    async fn write_inner(&self, ctx: &WriteContext, request: WriteBatch) -> Result<WriteResponse> {
-        self.inner.write(ctx, request).await
-    }
-
-    // Replay metadata to inner.
-    async fn replay_inner(&self, recovered_metadata: RecoveredMetadataMap) -> Result<()> {
-        let inner = &self.inner;
-        let writer_ctx = WriterContext {
-            shared: &inner.shared,
-            flush_strategy: &inner.flush_strategy,
-            flush_scheduler: &inner.flush_scheduler,
-            compaction_scheduler: &inner.compaction_scheduler,
-            sst_layer: &inner.sst_layer,
-            wal: &inner.wal,
-            writer: &inner.writer,
-            manifest: &inner.manifest,
-            compaction_picker: inner.compaction_picker.clone(),
-        };
-
-        inner.writer.replay(recovered_metadata, writer_ctx).await
-    }
-
-    pub(crate) async fn write_buffer_size(&self) -> usize {
-        self.inner.writer.write_buffer_size().await
-    }
-}
-
-/// Shared data of region.
-#[derive(Debug)]
-pub struct SharedData {
-    // Region id and name is immutable, so we cache them in shared data to avoid loading
-    // current version from `version_control` each time we need to access them.
-    id: RegionId,
-    name: String,
-    // TODO(yingwen): Maybe no need to use Arc for version control.
-    pub version_control: VersionControlRef,
-
-    /// Last flush time in millis.
-    last_flush_millis: AtomicI64,
-}
-
-impl SharedData {
-    #[inline]
-    pub fn id(&self) -> RegionId {
-        self.id
-    }
-
-    #[inline]
-    pub fn name(&self) -> &str {
-        &self.name
-    }
-
-    /// Update flush time to current time.
-    pub(crate) fn update_flush_millis(&self) {
-        let now = util::current_time_millis();
-        self.last_flush_millis.store(now, Ordering::Relaxed);
-    }
-
-    /// Returns last flush timestamp in millis.
-    fn last_flush_millis(&self) -> i64 {
-        self.last_flush_millis.load(Ordering::Relaxed)
-    }
-}
-
-pub type SharedDataRef = Arc<SharedData>;
-
-struct RegionInner<S: LogStore> {
-    shared: SharedDataRef,
-    writer: RegionWriterRef<S>,
-    wal: Wal<S>,
-    flush_strategy: FlushStrategyRef,
-    flush_scheduler: FlushSchedulerRef<S>,
-    compaction_scheduler: CompactionSchedulerRef<S>,
-    compaction_picker: CompactionPickerRef<S>,
-    sst_layer: AccessLayerRef,
-    manifest: RegionManifest,
-}
-
-impl<S: LogStore> RegionInner<S> {
-    #[inline]
-    fn version_control(&self) -> &VersionControl {
-        &self.shared.version_control
-    }
-
-    fn in_memory_metadata(&self) -> RegionMetaImpl {
-        let metadata = self.version_control().metadata();
-
-        RegionMetaImpl::new(metadata)
-    }
-
-    fn create_snapshot(&self) -> SnapshotImpl {
-        let version = self.version_control().current();
-        let sequence = self.version_control().committed_sequence();
-
-        SnapshotImpl::new(version, sequence, self.sst_layer.clone())
-    }
-
-    fn compat_write_batch(&self, request: &mut WriteBatch) -> Result<()> {
-        let metadata = self.version_control().metadata();
-        let schema = metadata.schema();
-
-        // Try to make request schema compatible with region's outside of write lock. Note that
-        // schema might be altered after this step.
-        request.compat_write(schema.user_schema())
-    }
-
-    /// Write to writer directly.
-    async fn write(&self, ctx: &WriteContext, request: WriteBatch) -> Result<WriteResponse> {
-        let writer_ctx = WriterContext {
-            shared: &self.shared,
-            flush_strategy: &self.flush_strategy,
-            flush_scheduler: &self.flush_scheduler,
-            compaction_scheduler: &self.compaction_scheduler,
-            sst_layer: &self.sst_layer,
-            wal: &self.wal,
-            writer: &self.writer,
-            manifest: &self.manifest,
-            compaction_picker: self.compaction_picker.clone(),
-        };
-        // The writer would also try to compat the schema of write batch if it finds out the
-        // schema version of request is less than current schema version.
-        self.writer.write(ctx, request, writer_ctx).await
-    }
-
-    async fn alter(&self, request: AlterRequest) -> Result<()> {
-        logging::info!(
-            "Alter region {}, name: {}, request: {:?}",
-            self.shared.id,
-            self.shared.name,
-            request
-        );
-
-        let alter_ctx = AlterContext {
-            shared: &self.shared,
-            wal: &self.wal,
-            manifest: &self.manifest,
-        };
-
-        self.writer.alter(alter_ctx, request).await
-    }
-
-    async fn close(&self, ctx: &CloseContext) -> Result<()> {
-        self.writer.close().await?;
-        if ctx.flush {
-            let ctx = FlushContext {
-                wait: true,
-                reason: FlushReason::Manually,
-                force: true,
-            };
-            self.flush(&ctx).await?;
-        }
-        self.manifest.stop().await
-    }
-
-    async fn drop_region(&self) -> Result<()> {
-        logging::info!("Drop region {}, name: {}", self.shared.id, self.shared.name);
-        let drop_ctx = DropContext {
-            shared: &self.shared,
-            wal: &self.wal,
-            manifest: &self.manifest,
-            flush_scheduler: &self.flush_scheduler,
-            compaction_scheduler: &self.compaction_scheduler,
-            sst_layer: &self.sst_layer,
-        };
-
-        self.manifest.stop().await?;
-        self.writer.on_drop(drop_ctx).await
-    }
-
-    async fn flush(&self, ctx: &FlushContext) -> Result<()> {
-        let writer_ctx = WriterContext {
-            shared: &self.shared,
-            flush_strategy: &self.flush_strategy,
-            flush_scheduler: &self.flush_scheduler,
-            compaction_scheduler: &self.compaction_scheduler,
-            sst_layer: &self.sst_layer,
-            wal: &self.wal,
-            writer: &self.writer,
-            manifest: &self.manifest,
-            compaction_picker: self.compaction_picker.clone(),
-        };
-        self.writer.flush(writer_ctx, ctx).await
-    }
-
-    /// Compact the region manually.
-    async fn compact(&self, compact_ctx: &CompactContext) -> Result<()> {
-        self.writer
-            .compact(WriterCompactRequest {
-                shared_data: self.shared.clone(),
-                sst_layer: self.sst_layer.clone(),
-                manifest: self.manifest.clone(),
-                wal: self.wal.clone(),
-                region_writer: self.writer.clone(),
-                compact_ctx: *compact_ctx,
-            })
-            .await
-    }
-
-    async fn truncate(&self) -> Result<()> {
-        logging::info!(
-            "Truncate region {}, name: {}",
-            self.shared.id,
-            self.shared.name
-        );
-
-        let ctx = TruncateContext {
-            shared: &self.shared,
-            wal: &self.wal,
-            manifest: &self.manifest,
-            sst_layer: &self.sst_layer,
-        };
-
-        self.writer.truncate(&ctx).await?;
-        Ok(())
-    }
-}
--- a/src/storage/src/region/tests.rs
+++ b/src/storage/src/region/tests.rs
@@ -1,833 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-//! Region tests.
-
-use std::collections::{HashMap, HashSet};
-
-use arrow::compute::SortOptions;
-use common_base::readable_size::ReadableSize;
-use common_datasource::compression::CompressionType;
-use common_recordbatch::OrderOption;
-use common_telemetry::logging;
-use common_test_util::temp_dir::{create_temp_dir, TempDir};
-use datatypes::prelude::{LogicalTypeId, ScalarVector, WrapperType};
-use datatypes::timestamp::TimestampMillisecond;
-use datatypes::vectors::{
-    BooleanVector, Int64Vector, StringVector, TimestampMillisecondVector, VectorRef,
-};
-use log_store::raft_engine::log_store::RaftEngineLogStore;
-use log_store::NoopLogStore;
-use object_store::services::Fs;
-use object_store::ObjectStore;
-use store_api::manifest::{Manifest, MAX_VERSION};
-use store_api::storage::{
-    Chunk, ChunkReader, FlushContext, FlushReason, ReadContext, Region, RegionMeta, ScanRequest,
-    SequenceNumber, Snapshot, WriteContext, WriteRequest,
-};
-
-use super::*;
-use crate::chunk::ChunkReaderImpl;
-use crate::compaction::noop::NoopCompactionScheduler;
-use crate::engine;
-use crate::engine::RegionMap;
-use crate::file_purger::noop::NoopFilePurgeHandler;
-use crate::flush::{FlushScheduler, PickerConfig, SizeBasedStrategy};
-use crate::manifest::action::{RegionChange, RegionMetaActionList};
-use crate::manifest::manifest_compress_type;
-use crate::manifest::region::RegionManifest;
-use crate::manifest::test_utils::*;
-use crate::memtable::DefaultMemtableBuilder;
-use crate::metadata::RegionMetadata;
-use crate::region::{RegionImpl, StoreConfig};
-use crate::scheduler::{LocalScheduler, SchedulerConfig};
-use crate::sst::{FileId, FsAccessLayer};
-use crate::test_util::descriptor_util::RegionDescBuilder;
-use crate::test_util::{self, config_util, schema_util, write_batch_util};
-
-mod alter;
-mod basic;
-mod close;
-mod compact;
-mod drop;
-mod flush;
-mod projection;
-mod truncate;
-
-/// Create metadata of a region with schema: (timestamp, v0).
-pub fn new_metadata(region_name: &str) -> RegionMetadata {
-    let desc = RegionDescBuilder::new(region_name)
-        .id(123)
-        .push_field_column(("v0", LogicalTypeId::String, true))
-        .build();
-    desc.try_into().unwrap()
-}
-
-/// Test region with schema (timestamp, v0).
-pub struct TesterBase<S: LogStore> {
-    pub region: RegionImpl<S>,
-    pub write_ctx: WriteContext,
-    pub read_ctx: ReadContext,
-}
-
-impl<S: LogStore> TesterBase<S> {
-    pub fn with_region(region: RegionImpl<S>) -> TesterBase<S> {
-        TesterBase {
-            region,
-            write_ctx: WriteContext::default(),
-            read_ctx: ReadContext::default(),
-        }
-    }
-
-    pub async fn checkpoint_manifest(&self) {
-        let manifest = &self.region.inner.manifest;
-        manifest.set_flushed_manifest_version(manifest.last_version() - 1);
-        let _ = manifest.do_checkpoint().await.unwrap().unwrap();
-    }
-
-    pub async fn close(&self) {
-        self.region.inner.flush_scheduler.stop().await.unwrap();
-        self.region
-            .inner
-            .compaction_scheduler
-            .stop(true)
-            .await
-            .unwrap();
-        self.region.close(&CloseContext::default()).await.unwrap();
-        self.region.inner.wal.close().await.unwrap();
-    }
-
-    /// Put without version specified.
-    ///
-    /// Format of data: (timestamp, v0), timestamp is key, v0 is value.
-    pub async fn put(&self, data: &[(i64, Option<String>)]) -> WriteResponse {
-        self.try_put(data).await.unwrap()
-    }
-
-    /// Put without version specified, returns [`Result<WriteResponse>`]
-    ///
-    /// Format of data: (timestamp, v0), timestamp is key, v0 is value.
-    pub async fn try_put(&self, data: &[(i64, Option<String>)]) -> Result<WriteResponse> {
-        let data: Vec<(TimestampMillisecond, Option<String>)> =
-            data.iter().map(|(l, r)| ((*l).into(), r.clone())).collect();
-        // Build a batch without version.
-        let mut batch = new_write_batch_for_test(false);
-        let put_data = new_put_data(&data);
-        batch.put(put_data).unwrap();
-
-        self.region.write(&self.write_ctx, batch).await
-    }
-
-    /// Put without version specified directly to inner writer.
-    pub async fn put_inner(&self, data: &[(i64, Option<String>)]) -> WriteResponse {
-        let data: Vec<(TimestampMillisecond, Option<String>)> =
-            data.iter().map(|(l, r)| ((*l).into(), r.clone())).collect();
-        let mut batch = new_write_batch_for_test(false);
-        let put_data = new_put_data(&data);
-        batch.put(put_data).unwrap();
-
-        self.region
-            .write_inner(&self.write_ctx, batch)
-            .await
-            .unwrap()
-    }
-
-    pub async fn replay_inner(&self, recovered_metadata: RecoveredMetadataMap) {
-        self.region.replay_inner(recovered_metadata).await.unwrap()
-    }
-
-    /// Scan all data.
-    pub async fn full_scan(&self) -> Vec<(i64, Option<String>)> {
-        logging::info!("Full scan with ctx {:?}", self.read_ctx);
-        let snapshot = self.region.snapshot(&self.read_ctx).unwrap();
-
-        let resp = snapshot
-            .scan(&self.read_ctx, ScanRequest::default())
-            .await
-            .unwrap();
-        let mut reader = resp.reader;
-
-        let metadata = self.region.in_memory_metadata();
-        assert_eq!(metadata.schema(), reader.user_schema());
-
-        let mut dst = Vec::new();
-        while let Some(chunk) = reader.next_chunk().await.unwrap() {
-            let chunk = reader.project_chunk(chunk);
-            append_chunk_to(&chunk, &mut dst);
-        }
-
-        dst
-    }
-
-    pub async fn scan(&self, req: ScanRequest) -> Vec<(i64, Option<String>)> {
-        logging::info!("Full scan with ctx {:?}", self.read_ctx);
-        let snapshot = self.region.snapshot(&self.read_ctx).unwrap();
-
-        let resp = snapshot.scan(&self.read_ctx, req).await.unwrap();
-        let mut reader = resp.reader;
-
-        let metadata = self.region.in_memory_metadata();
-        assert_eq!(metadata.schema(), reader.user_schema());
-
-        let mut dst = Vec::new();
-        while let Some(chunk) = reader.next_chunk().await.unwrap() {
-            let chunk = reader.project_chunk(chunk);
-            append_chunk_to(&chunk, &mut dst);
-        }
-        dst
-    }
-
-    pub fn committed_sequence(&self) -> SequenceNumber {
-        self.region.committed_sequence()
-    }
-
-    /// Delete by keys (timestamp).
-    pub async fn delete(&self, keys: &[i64]) -> WriteResponse {
-        let keys: Vec<TimestampMillisecond> = keys.iter().map(|v| (*v).into()).collect();
-        // Build a batch without version.
-        let mut batch = new_write_batch_for_test(false);
-        let keys = new_delete_data(&keys);
-        batch.delete(keys).unwrap();
-
-        self.region.write(&self.write_ctx, batch).await.unwrap()
-    }
-
-    /// Returns a reader to scan all data.
-    pub async fn full_scan_reader(&self) -> ChunkReaderImpl {
-        let snapshot = self.region.snapshot(&self.read_ctx).unwrap();
-
-        let resp = snapshot
-            .scan(&self.read_ctx, ScanRequest::default())
-            .await
-            .unwrap();
-        resp.reader
-    }
-
-    /// Collect data from the reader.
-    pub async fn collect_reader(&self, mut reader: ChunkReaderImpl) -> Vec<(i64, Option<String>)> {
-        let mut dst = Vec::new();
-        while let Some(chunk) = reader.next_chunk().await.unwrap() {
-            let chunk = reader.project_chunk(chunk);
-            append_chunk_to(&chunk, &mut dst);
-        }
-
-        dst
-    }
-}
-
-pub type FileTesterBase = TesterBase<RaftEngineLogStore>;
-
-fn new_write_batch_for_test(enable_version_column: bool) -> WriteBatch {
-    if enable_version_column {
-        write_batch_util::new_write_batch(
-            &[
-                (
-                    test_util::TIMESTAMP_NAME,
-                    LogicalTypeId::TimestampMillisecond,
-                    false,
-                ),
-                ("v0", LogicalTypeId::String, true),
-            ],
-            Some(0),
-            2,
-        )
-    } else {
-        write_batch_util::new_write_batch(
-            &[
-                (
-                    test_util::TIMESTAMP_NAME,
-                    LogicalTypeId::TimestampMillisecond,
-                    false,
-                ),
-                ("v0", LogicalTypeId::String, true),
-            ],
-            Some(0),
-            1,
-        )
-    }
-}
-
-fn new_put_data(data: &[(TimestampMillisecond, Option<String>)]) -> HashMap<String, VectorRef> {
-    let timestamps =
-        TimestampMillisecondVector::from_vec(data.iter().map(|v| v.0.into()).collect());
-    let values = StringVector::from(data.iter().map(|kv| kv.1.clone()).collect::<Vec<_>>());
-
-    HashMap::from([
-        (
-            test_util::TIMESTAMP_NAME.to_string(),
-            Arc::new(timestamps) as VectorRef,
-        ),
-        ("v0".to_string(), Arc::new(values) as VectorRef),
-    ])
-}
-
-fn new_delete_data(keys: &[TimestampMillisecond]) -> HashMap<String, VectorRef> {
-    let timestamps =
-        TimestampMillisecondVector::from_vec(keys.iter().map(|v| v.0.into()).collect());
-    HashMap::from([(
-        test_util::TIMESTAMP_NAME.to_string(),
-        Arc::new(timestamps) as VectorRef,
-    )])
-}
-
-fn append_chunk_to(chunk: &Chunk, dst: &mut Vec<(i64, Option<String>)>) {
-    assert_eq!(2, chunk.columns.len());
-
-    let timestamps = chunk.columns[0]
-        .as_any()
-        .downcast_ref::<TimestampMillisecondVector>()
-        .unwrap();
-    let values = chunk.columns[1]
-        .as_any()
-        .downcast_ref::<StringVector>()
-        .unwrap();
-    for (ts, value) in timestamps.iter_data().zip(values.iter_data()) {
-        dst.push((ts.unwrap().into_native(), value.map(|s| s.to_string())));
-    }
-}
-
-#[tokio::test]
-async fn test_new_region() {
-    let region_name = "region-0";
-    let desc = RegionDescBuilder::new(region_name)
-        .push_key_column(("k1", LogicalTypeId::Int32, false))
-        .push_field_column(("v0", LogicalTypeId::Float32, true))
-        .build();
-    let metadata: RegionMetadata = desc.try_into().unwrap();
-
-    let dir = create_temp_dir("test_new_region");
-    let store_dir = dir.path().to_str().unwrap();
-
-    let store_config =
-        config_util::new_store_config(region_name, store_dir, EngineConfig::default()).await;
-    let placeholder_memtable = store_config
-        .memtable_builder
-        .build(metadata.schema().clone());
-
-    let region = RegionImpl::new(
-        Version::new(Arc::new(metadata), placeholder_memtable),
-        store_config,
-    );
-
-    let expect_schema = schema_util::new_schema_ref(
-        &[
-            ("k1", LogicalTypeId::Int32, false),
-            (
-                test_util::TIMESTAMP_NAME,
-                LogicalTypeId::TimestampMillisecond,
-                false,
-            ),
-            ("v0", LogicalTypeId::Float32, true),
-        ],
-        Some(1),
-    );
-
-    assert_eq!(region_name, region.name());
-    assert_eq!(expect_schema, *region.in_memory_metadata().schema());
-}
-
-#[tokio::test]
-async fn test_recover_region_manifets_compress() {
-    test_recover_region_manifets(true).await;
-}
-
-#[tokio::test]
-async fn test_recover_region_manifets_uncompress() {
-    test_recover_region_manifets(false).await;
-}
-
-async fn test_recover_region_manifets(compress: bool) {
-    common_telemetry::init_default_ut_logging();
-    let tmp_dir = create_temp_dir("test_recover_region_manifets");
-    let memtable_builder = Arc::new(DefaultMemtableBuilder::default()) as _;
-
-    let mut builder = Fs::default();
-    let _ = builder.root(&tmp_dir.path().to_string_lossy());
-    let object_store = ObjectStore::new(builder).unwrap().finish();
-
-    let manifest = RegionManifest::with_checkpointer(
-        "/manifest/",
-        object_store.clone(),
-        manifest_compress_type(compress),
-        None,
-        None,
-    );
-    let region_meta = Arc::new(build_region_meta());
-
-    let sst_layer = Arc::new(FsAccessLayer::new("sst", object_store)) as _;
-    let file_purger = Arc::new(LocalScheduler::new(
-        SchedulerConfig::default(),
-        NoopFilePurgeHandler,
-    ));
-    // Recover from empty
-    assert!(RegionImpl::<NoopLogStore>::recover_from_manifest(
-        &manifest,
-        &memtable_builder,
-        &sst_layer,
-        &file_purger,
-    )
-    .await
-    .unwrap()
-    .0
-    .is_none());
-
-    let file_id_a = FileId::random();
-    let file_id_b = FileId::random();
-    let file_id_c = FileId::random();
-
-    {
-        // save some actions into region_meta
-        assert!(manifest
-            .update(RegionMetaActionList::with_action(RegionMetaAction::Change(
-                RegionChange {
-                    metadata: region_meta.as_ref().into(),
-                    committed_sequence: 40,
-                },
-            )))
-            .await
-            .is_ok());
-
-        assert!(manifest
-            .update(RegionMetaActionList::new(vec![
-                RegionMetaAction::Edit(build_region_edit(1, &[file_id_a], &[])),
-                RegionMetaAction::Edit(build_region_edit(2, &[file_id_b, file_id_c], &[])),
-            ]))
-            .await
-            .is_ok());
-
-        assert!(manifest
-            .update(RegionMetaActionList::with_action(RegionMetaAction::Change(
-                RegionChange {
-                    metadata: region_meta.as_ref().into(),
-                    committed_sequence: 42,
-                },
-            )))
-            .await
-            .is_ok());
-    }
-
-    // try to recover
-    let (version, recovered_metadata) = RegionImpl::<NoopLogStore>::recover_from_manifest(
-        &manifest,
-        &memtable_builder,
-        &sst_layer,
-        &file_purger,
-    )
-    .await
-    .unwrap();
-
-    assert_recovered_manifest(
-        version,
-        recovered_metadata,
-        &file_id_a,
-        &file_id_b,
-        &file_id_c,
-        &region_meta,
-    );
-
-    // do a manifest checkpoint
-    let checkpoint = manifest.do_checkpoint().await.unwrap().unwrap();
-    assert_eq!(1, checkpoint.last_version);
-    assert_eq!(2, checkpoint.compacted_actions);
-    assert_eq!(
-        manifest.last_checkpoint().await.unwrap().unwrap(),
-        checkpoint
-    );
-    // recover from checkpoint
-    let (version, recovered_metadata) = RegionImpl::<NoopLogStore>::recover_from_manifest(
-        &manifest,
-        &memtable_builder,
-        &sst_layer,
-        &file_purger,
-    )
-    .await
-    .unwrap();
-
-    assert_recovered_manifest(
-        version,
-        recovered_metadata,
-        &file_id_a,
-        &file_id_b,
-        &file_id_c,
-        &region_meta,
-    );
-
-    // check manifest state
-    assert_eq!(3, manifest.last_version());
-    let mut iter = manifest.scan(0, MAX_VERSION).await.unwrap();
-    let (version, action) = iter.next_action().await.unwrap().unwrap();
-    assert_eq!(2, version);
-    assert!(matches!(action.actions[0], RegionMetaAction::Change(..)));
-    assert!(iter.next_action().await.unwrap().is_none());
-}
-
-fn assert_recovered_manifest(
-    version: Option<Version>,
-    recovered_metadata: RecoveredMetadataMap,
-    file_id_a: &FileId,
-    file_id_b: &FileId,
-    file_id_c: &FileId,
-    region_meta: &Arc<RegionMetadata>,
-) {
-    assert_eq!(42, *recovered_metadata.first_key_value().unwrap().0);
-    let version = version.unwrap();
-    assert_eq!(*version.metadata(), *region_meta);
-    assert_eq!(version.flushed_sequence(), 2);
-    assert_eq!(version.manifest_version(), 1);
-    let ssts = version.ssts();
-    let files = ssts.levels()[0]
-        .files()
-        .map(|f| f.file_name())
-        .collect::<HashSet<_>>();
-    assert_eq!(3, files.len());
-    assert_eq!(
-        HashSet::from([
-            file_id_a.as_parquet(),
-            file_id_b.as_parquet(),
-            file_id_c.as_parquet()
-        ]),
-        files
-    );
-}
-
-fn create_region_meta(region_name: &str) -> RegionMetadata {
-    let desc = RegionDescBuilder::new(region_name)
-        .push_field_column(("v0", LogicalTypeId::Int64, true))
-        .push_field_column(("v1", LogicalTypeId::String, true))
-        .push_field_column(("v2", LogicalTypeId::Boolean, true))
-        .build();
-    desc.try_into().unwrap()
-}
-
-async fn create_store_config(region_name: &str, root: &str) -> StoreConfig<NoopLogStore> {
-    let mut builder = Fs::default();
-    let _ = builder.root(root);
-    let object_store = ObjectStore::new(builder).unwrap().finish();
-    let parent_dir = "";
-    let sst_dir = engine::region_sst_dir(parent_dir, region_name);
-    let manifest_dir = engine::region_manifest_dir(parent_dir, region_name);
-
-    let sst_layer = Arc::new(FsAccessLayer::new(&sst_dir, object_store.clone()));
-    let manifest = RegionManifest::with_checkpointer(
-        &manifest_dir,
-        object_store,
-        CompressionType::Uncompressed,
-        None,
-        None,
-    );
-    manifest.start().await.unwrap();
-
-    let compaction_scheduler = Arc::new(NoopCompactionScheduler::default());
-
-    let regions = Arc::new(RegionMap::new());
-
-    let flush_scheduler = Arc::new(
-        FlushScheduler::new(
-            SchedulerConfig::default(),
-            compaction_scheduler.clone(),
-            regions,
-            PickerConfig::default(),
-        )
-        .unwrap(),
-    );
-
-    let log_store = Arc::new(NoopLogStore);
-
-    let file_purger = Arc::new(LocalScheduler::new(
-        SchedulerConfig::default(),
-        NoopFilePurgeHandler,
-    ));
-    StoreConfig {
-        log_store,
-        sst_layer,
-        manifest,
-        memtable_builder: Arc::new(DefaultMemtableBuilder::default()),
-        flush_scheduler,
-        flush_strategy: Arc::new(SizeBasedStrategy::default()),
-        compaction_scheduler,
-        engine_config: Default::default(),
-        file_purger,
-        ttl: None,
-        write_buffer_size: ReadableSize::mb(32).0 as usize,
-        compaction_strategy: Default::default(),
-    }
-}
-
-struct WindowedReaderTester {
-    data_written: Vec<Vec<(i64, i64, String, bool)>>,
-    expected: Vec<(i64, i64, String, bool)>,
-    region: RegionImpl<NoopLogStore>,
-    _temp_dir: TempDir,
-}
-
-impl WindowedReaderTester {
-    async fn new(
-        region_name: &'static str,
-        data_written: Vec<Vec<(i64, i64, String, bool)>>,
-        expected: Vec<(i64, i64, String, bool)>,
-    ) -> Self {
-        let temp_dir = create_temp_dir(&format!("write_and_read_windowed_{}", region_name));
-        let root = temp_dir.path().to_str().unwrap();
-        let metadata = create_region_meta(region_name);
-        let store_config = create_store_config(region_name, root).await;
-        let region = RegionImpl::create(metadata, store_config).await.unwrap();
-
-        let tester = Self {
-            data_written,
-            expected,
-            region,
-            _temp_dir: temp_dir,
-        };
-        tester.prepare().await;
-        tester
-    }
-
-    async fn prepare(&self) {
-        for batch in &self.data_written {
-            let mut write_batch = self.region.write_request();
-            let ts = TimestampMillisecondVector::from_iterator(
-                batch
-                    .iter()
-                    .map(|(v, _, _, _)| TimestampMillisecond::new(*v)),
-            );
-            let v0 = Int64Vector::from_iterator(batch.iter().map(|(_, v, _, _)| *v));
-            let v1 = StringVector::from_iterator(batch.iter().map(|(_, _, v, _)| v.as_str()));
-            let v2 = BooleanVector::from_iterator(batch.iter().map(|(_, _, _, v)| *v));
-
-            let columns = [
-                ("timestamp".to_string(), Arc::new(ts) as VectorRef),
-                ("v0".to_string(), Arc::new(v0) as VectorRef),
-                ("v1".to_string(), Arc::new(v1) as VectorRef),
-                ("v2".to_string(), Arc::new(v2) as VectorRef),
-            ]
-            .into_iter()
-            .collect::<HashMap<String, VectorRef>>();
-            write_batch.put(columns).unwrap();
-
-            assert!(self
-                .region
-                .write(&WriteContext {}, write_batch)
-                .await
-                .is_ok());
-
-            // flush the region to ensure data resides across SST files.
-            self.region
-                .flush(&FlushContext {
-                    wait: true,
-                    reason: FlushReason::Others,
-                    ..Default::default()
-                })
-                .await
-                .unwrap();
-        }
-    }
-
-    async fn check(&self, order_options: Vec<OrderOption>) {
-        let read_context = ReadContext::default();
-        let snapshot = self.region.snapshot(&read_context).unwrap();
-        let response = snapshot
-            .scan(
-                &read_context,
-                ScanRequest {
-                    sequence: None,
-                    projection: None,
-                    filters: vec![],
-                    limit: None,
-                    output_ordering: Some(order_options),
-                },
-            )
-            .await
-            .unwrap();
-
-        let mut timestamps = Vec::with_capacity(self.expected.len());
-        let mut col1 = Vec::with_capacity(self.expected.len());
-        let mut col2 = Vec::with_capacity(self.expected.len());
-        let mut col3 = Vec::with_capacity(self.expected.len());
-
-        let mut reader = response.reader;
-        let ts_index = reader.user_schema().timestamp_index().unwrap();
-        while let Some(chunk) = reader.next_chunk().await.unwrap() {
-            let ts_col = &chunk.columns[ts_index];
-            let ts_col = ts_col
-                .as_any()
-                .downcast_ref::<TimestampMillisecondVector>()
-                .unwrap();
-            let v1_col = chunk.columns[1]
-                .as_any()
-                .downcast_ref::<Int64Vector>()
-                .unwrap();
-            let v2_col = chunk.columns[2]
-                .as_any()
-                .downcast_ref::<StringVector>()
-                .unwrap();
-            let v3_col = chunk.columns[3]
-                .as_any()
-                .downcast_ref::<BooleanVector>()
-                .unwrap();
-
-            for ts in ts_col.iter_data() {
-                timestamps.push(ts.unwrap().0.value());
-            }
-            for v in v1_col.iter_data() {
-                col1.push(v.unwrap());
-            }
-            for v in v2_col.iter_data() {
-                col2.push(v.unwrap().to_string());
-            }
-            for v in v3_col.iter_data() {
-                col3.push(v.unwrap());
-            }
-        }
-
-        assert_eq!(
-            timestamps,
-            self.expected
-                .iter()
-                .map(|(v, _, _, _)| *v)
-                .collect::<Vec<_>>()
-        );
-        assert_eq!(
-            col1,
-            self.expected
-                .iter()
-                .map(|(_, v, _, _)| *v)
-                .collect::<Vec<_>>()
-        );
-        assert_eq!(
-            col2,
-            self.expected
-                .iter()
-                .map(|(_, _, v, _)| v.clone())
-                .collect::<Vec<_>>()
-        );
-        assert_eq!(
-            col3,
-            self.expected
-                .iter()
-                .map(|(_, _, _, v)| *v)
-                .collect::<Vec<_>>()
-        );
-    }
-}
-
-#[tokio::test]
-async fn test_read_by_chunk_reader() {
-    common_telemetry::init_default_ut_logging();
-
-    WindowedReaderTester::new(
-        "test_region",
-        vec![vec![(1, 1, "1".to_string(), false)]],
-        vec![(1, 1, "1".to_string(), false)],
-    )
-    .await
-    .check(vec![OrderOption {
-        name: "timestamp".to_string(),
-        options: SortOptions {
-            descending: true,
-            nulls_first: true,
-        },
-    }])
-    .await;
-
-    WindowedReaderTester::new(
-        "test_region",
-        vec![
-            vec![
-                (1, 1, "1".to_string(), false),
-                (2, 2, "2".to_string(), false),
-            ],
-            vec![
-                (3, 3, "3".to_string(), false),
-                (4, 4, "4".to_string(), false),
-            ],
-        ],
-        vec![
-            (4, 4, "4".to_string(), false),
-            (3, 3, "3".to_string(), false),
-            (2, 2, "2".to_string(), false),
-            (1, 1, "1".to_string(), false),
-        ],
-    )
-    .await
-    .check(vec![OrderOption {
-        name: "timestamp".to_string(),
-        options: SortOptions {
-            descending: true,
-            nulls_first: true,
-        },
-    }])
-    .await;
-
-    WindowedReaderTester::new(
-        "test_region",
-        vec![
-            vec![
-                (1, 1, "1".to_string(), false),
-                (2, 2, "2".to_string(), false),
-                (60000, 60000, "60".to_string(), false),
-            ],
-            vec![
-                (3, 3, "3".to_string(), false),
-                (61000, 61000, "61".to_string(), false),
-            ],
-        ],
-        vec![
-            (61000, 61000, "61".to_string(), false),
-            (60000, 60000, "60".to_string(), false),
-            (3, 3, "3".to_string(), false),
-            (2, 2, "2".to_string(), false),
-            (1, 1, "1".to_string(), false),
-        ],
-    )
-    .await
-    .check(vec![OrderOption {
-        name: "timestamp".to_string(),
-        options: SortOptions {
-            descending: true,
-            nulls_first: true,
-        },
-    }])
-    .await;
-
-    WindowedReaderTester::new(
-        "test_region",
-        vec![
-            vec![
-                (1, 1, "1".to_string(), false),
-                (2, 2, "2".to_string(), false),
-                (60000, 60000, "60".to_string(), false),
-            ],
-            vec![
-                (3, 3, "3".to_string(), false),
-                (61000, 61000, "61".to_string(), false),
-            ],
-        ],
-        vec![
-            (1, 1, "1".to_string(), false),
-            (2, 2, "2".to_string(), false),
-            (3, 3, "3".to_string(), false),
-            (60000, 60000, "60".to_string(), false),
-            (61000, 61000, "61".to_string(), false),
-        ],
-    )
-    .await
-    .check(vec![OrderOption {
-        name: "timestamp".to_string(),
-        options: SortOptions {
-            descending: false,
-            nulls_first: true,
-        },
-    }])
-    .await;
-}
--- a/src/storage/src/region/tests/alter.rs
+++ b/src/storage/src/region/tests/alter.rs
@@ -1,491 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-use std::collections::{BTreeMap, HashMap};
-use std::sync::Arc;
-
-use common_test_util::temp_dir::create_temp_dir;
-use datatypes::prelude::*;
-use datatypes::timestamp::TimestampMillisecond;
-use datatypes::vectors::{Int64Vector, StringVector, TimestampMillisecondVector, VectorRef};
-use log_store::raft_engine::log_store::RaftEngineLogStore;
-use store_api::storage::{
-    AddColumn, AlterOperation, AlterRequest, Chunk, ChunkReader, ColumnDescriptor,
-    ColumnDescriptorBuilder, ColumnId, FlushContext, FlushReason, Region, RegionMeta, ScanRequest,
-    SchemaRef, Snapshot, WriteRequest,
-};
-
-use crate::config::EngineConfig;
-use crate::region::tests::{self, FileTesterBase};
-use crate::region::{OpenOptions, RawRegionMetadata, RegionImpl, RegionMetadata};
-use crate::test_util;
-use crate::test_util::config_util;
-use crate::test_util::descriptor_util::RegionDescBuilder;
-
-const REGION_NAME: &str = "region-alter-0";
-
-async fn create_region_for_alter(store_dir: &str) -> RegionImpl<RaftEngineLogStore> {
-    // Always disable version column in this test.
-    let metadata = tests::new_metadata(REGION_NAME);
-
-    let store_config =
-        config_util::new_store_config(REGION_NAME, store_dir, EngineConfig::default()).await;
-
-    RegionImpl::create(metadata, store_config).await.unwrap()
-}
-
-/// Tester for region alter.
-struct AlterTester {
-    store_dir: String,
-    base: Option<FileTesterBase>,
-}
-
-#[derive(Debug, Clone, PartialEq)]
-struct DataRow {
-    key: Option<i64>,
-    ts: TimestampMillisecond,
-    v0: Option<String>,
-    v1: Option<i64>,
-}
-
-impl DataRow {
-    fn new_with_string(key: Option<i64>, ts: i64, v0: Option<String>, v1: Option<i64>) -> Self {
-        DataRow {
-            key,
-            ts: ts.into(),
-            v0,
-            v1,
-        }
-    }
-
-    fn new(key: Option<i64>, ts: i64, v0: Option<i64>, v1: Option<i64>) -> Self {
-        Self::new_with_string(key, ts, v0.map(|s| s.to_string()), v1)
-    }
-}
-
-fn new_put_data(data: &[DataRow]) -> HashMap<String, VectorRef> {
-    let keys = Int64Vector::from(data.iter().map(|v| v.key).collect::<Vec<_>>());
-    let timestamps = TimestampMillisecondVector::from(
-        data.iter()
-            .map(|v| Some(v.ts.into_native()))
-            .collect::<Vec<_>>(),
-    );
-    let values1 = StringVector::from(data.iter().map(|v| v.v0.clone()).collect::<Vec<_>>());
-    let values2 = Int64Vector::from(data.iter().map(|kv| kv.v1).collect::<Vec<_>>());
-
-    HashMap::from([
-        ("k0".to_string(), Arc::new(keys) as VectorRef),
-        (
-            test_util::TIMESTAMP_NAME.to_string(),
-            Arc::new(timestamps) as VectorRef,
-        ),
-        ("v0".to_string(), Arc::new(values1) as VectorRef),
-        ("v1".to_string(), Arc::new(values2) as VectorRef),
-    ])
-}
-
-impl AlterTester {
-    async fn new(store_dir: &str) -> AlterTester {
-        let region = create_region_for_alter(store_dir).await;
-
-        AlterTester {
-            base: Some(FileTesterBase::with_region(region)),
-            store_dir: store_dir.to_string(),
-        }
-    }
-
-    async fn reopen(&mut self) {
-        // Close the old region.
-        if let Some(base) = self.base.as_ref() {
-            base.close().await;
-        }
-        self.base = None;
-        // Reopen the region.
-        let store_config =
-            config_util::new_store_config(REGION_NAME, &self.store_dir, EngineConfig::default())
-                .await;
-        let opts = OpenOptions::default();
-        let region = RegionImpl::open(REGION_NAME.to_string(), store_config, &opts)
-            .await
-            .unwrap()
-            .unwrap();
-        self.base = Some(FileTesterBase::with_region(region));
-    }
-
-    async fn flush(&self, wait: Option<bool>) {
-        let ctx = wait
-            .map(|wait| FlushContext {
-                wait,
-                reason: FlushReason::Manually,
-                ..Default::default()
-            })
-            .unwrap_or_default();
-        self.base().region.flush(&ctx).await.unwrap();
-    }
-
-    async fn checkpoint_manifest(&self) {
-        self.base().checkpoint_manifest().await
-    }
-
-    #[inline]
-    fn base(&self) -> &FileTesterBase {
-        self.base.as_ref().unwrap()
-    }
-
-    fn schema(&self) -> SchemaRef {
-        let metadata = self.base().region.in_memory_metadata();
-        metadata.schema().clone()
-    }
-
-    // Put with schema k0, ts, v0, v1
-    async fn put(&self, data: &[DataRow]) {
-        let mut batch = self.base().region.write_request();
-        let put_data = new_put_data(data);
-        batch.put(put_data).unwrap();
-
-        assert!(self
-            .base()
-            .region
-            .write(&self.base().write_ctx, batch)
-            .await
-            .is_ok());
-    }
-
-    /// Put data with initial schema.
-    async fn put_with_init_schema(&self, data: &[(i64, Option<i64>)]) {
-        // put of FileTesterBase always use initial schema version.
-        let data = data
-            .iter()
-            .map(|(ts, v0)| (*ts, v0.map(|v| v.to_string())))
-            .collect::<Vec<_>>();
-        let _ = self.base().put(&data).await;
-    }
-
-    /// Put data to inner writer with initial schema.
-    async fn put_inner_with_init_schema(&self, data: &[(i64, Option<i64>)]) {
-        let data = data
-            .iter()
-            .map(|(ts, v0)| (*ts, v0.map(|v| v.to_string())))
-            .collect::<Vec<_>>();
-        // put of FileTesterBase always use initial schema version.
-        let _ = self.base().put_inner(&data).await;
-    }
-
-    async fn alter(&self, mut req: AlterRequest) {
-        let version = self.version();
-        req.version = version;
-
-        self.base().region.alter(req).await.unwrap();
-    }
-
-    fn version(&self) -> u32 {
-        let metadata = self.base().region.in_memory_metadata();
-        metadata.version()
-    }
-
-    async fn full_scan_with_init_schema(&self) -> Vec<(i64, Option<String>)> {
-        self.base().full_scan().await
-    }
-
-    async fn full_scan(&self) -> Vec<DataRow> {
-        let read_ctx = &self.base().read_ctx;
-        let snapshot = self.base().region.snapshot(read_ctx).unwrap();
-
-        let resp = snapshot
-            .scan(read_ctx, ScanRequest::default())
-            .await
-            .unwrap();
-        let mut reader = resp.reader;
-
-        let metadata = self.base().region.in_memory_metadata();
-        assert_eq!(metadata.schema(), reader.user_schema());
-
-        let mut dst = Vec::new();
-        while let Some(chunk) = reader.next_chunk().await.unwrap() {
-            let chunk = reader.project_chunk(chunk);
-            append_chunk_to(&chunk, &mut dst);
-        }
-
-        dst
-    }
-}
-
-fn append_chunk_to(chunk: &Chunk, dst: &mut Vec<DataRow>) {
-    assert_eq!(4, chunk.columns.len());
-
-    let k0_vector = chunk.columns[0]
-        .as_any()
-        .downcast_ref::<Int64Vector>()
-        .unwrap();
-    let ts_vector = chunk.columns[1]
-        .as_any()
-        .downcast_ref::<TimestampMillisecondVector>()
-        .unwrap();
-    let v0_vector = chunk.columns[2]
-        .as_any()
-        .downcast_ref::<StringVector>()
-        .unwrap();
-    let v1_vector = chunk.columns[3]
-        .as_any()
-        .downcast_ref::<Int64Vector>()
-        .unwrap();
-    for i in 0..k0_vector.len() {
-        dst.push(DataRow::new_with_string(
-            k0_vector.get_data(i),
-            ts_vector.get_data(i).unwrap().into(),
-            v0_vector.get_data(i).map(|s| s.to_string()),
-            v1_vector.get_data(i),
-        ));
-    }
-}
-
-fn new_column_desc(id: ColumnId, name: &str) -> ColumnDescriptor {
-    ColumnDescriptorBuilder::new(id, name, ConcreteDataType::int64_datatype())
-        .is_nullable(true)
-        .build()
-        .unwrap()
-}
-
-fn add_column_req(desc_and_is_key: &[(ColumnDescriptor, bool)]) -> AlterRequest {
-    let columns = desc_and_is_key
-        .iter()
-        .map(|(desc, is_key)| AddColumn {
-            desc: desc.clone(),
-            is_key: *is_key,
-        })
-        .collect();
-    let operation = AlterOperation::AddColumns { columns };
-
-    AlterRequest {
-        operation,
-        version: 0,
-    }
-}
-
-fn drop_column_req(names: &[&str]) -> AlterRequest {
-    let names = names.iter().map(|s| s.to_string()).collect();
-    let operation = AlterOperation::DropColumns { names };
-
-    AlterRequest {
-        operation,
-        version: 0,
-    }
-}
-
-fn check_schema_names(schema: &SchemaRef, names: &[&str]) {
-    assert_eq!(names.len(), schema.num_columns());
-
-    for (idx, name) in names.iter().enumerate() {
-        assert_eq!(*name, schema.column_name_by_index(idx));
-        let _ = schema.column_schema_by_name(name).unwrap();
-    }
-}
-
-#[tokio::test]
-async fn test_alter_region_with_reopen() {
-    test_alter_region_with_reopen0(true).await;
-    test_alter_region_with_reopen0(false).await;
-}
-
-async fn test_alter_region_with_reopen0(flush_and_checkpoint: bool) {
-    common_telemetry::init_default_ut_logging();
-
-    let dir = create_temp_dir("alter-region");
-    let store_dir = dir.path().to_str().unwrap();
-    let mut tester = AlterTester::new(store_dir).await;
-
-    let data = vec![(1000, Some(100)), (1001, Some(101)), (1002, Some(102))];
-    tester.put_with_init_schema(&data).await;
-    assert_eq!(3, tester.full_scan_with_init_schema().await.len());
-
-    let req = add_column_req(&[
-        (new_column_desc(4, "k0"), true),  // key column k0
-        (new_column_desc(5, "v1"), false), // value column v1
-    ]);
-    tester.alter(req).await;
-
-    let schema = tester.schema();
-    check_schema_names(&schema, &["k0", "timestamp", "v0", "v1"]);
-
-    // Put data after schema altered.
-    let data = vec![
-        DataRow::new(Some(10000), 1003, Some(103), Some(201)),
-        DataRow::new(Some(10001), 1004, Some(104), Some(202)),
-        DataRow::new(Some(10002), 1005, Some(105), Some(203)),
-    ];
-    tester.put(&data).await;
-
-    if flush_and_checkpoint {
-        tester.flush(None).await;
-        tester.checkpoint_manifest().await;
-    }
-
-    // Scan with new schema before reopen.
-    let mut expect = vec![
-        DataRow::new(None, 1000, Some(100), None),
-        DataRow::new(None, 1001, Some(101), None),
-        DataRow::new(None, 1002, Some(102), None),
-    ];
-    expect.extend_from_slice(&data);
-    let scanned = tester.full_scan().await;
-    assert_eq!(expect, scanned);
-
-    // Reopen and put more data.
-    tester.reopen().await;
-    let data = vec![
-        DataRow::new(Some(10003), 1006, Some(106), Some(204)),
-        DataRow::new(Some(10004), 1007, Some(107), Some(205)),
-        DataRow::new(Some(10005), 1008, Some(108), Some(206)),
-    ];
-    tester.put(&data).await;
-    // Extend expected result.
-    expect.extend_from_slice(&data);
-
-    // add columns,then remove them without writing data.
-    let req = add_column_req(&[
-        (new_column_desc(6, "v2"), false), // key column k0
-        (new_column_desc(7, "v3"), false), // value column v1
-    ]);
-    tester.alter(req).await;
-
-    let req = drop_column_req(&["v2", "v3"]);
-    tester.alter(req).await;
-
-    if flush_and_checkpoint {
-        tester.flush(None).await;
-        tester.checkpoint_manifest().await;
-    }
-
-    // reopen and write again
-    tester.reopen().await;
-    let schema = tester.schema();
-    check_schema_names(&schema, &["k0", "timestamp", "v0", "v1"]);
-
-    let data = vec![DataRow::new(Some(10006), 1009, Some(109), Some(207))];
-    tester.put(&data).await;
-    expect.extend_from_slice(&data);
-
-    // Scan with new schema after reopen and write.
-    let scanned = tester.full_scan().await;
-    assert_eq!(expect, scanned);
-}
-
-#[tokio::test]
-async fn test_alter_region() {
-    let dir = create_temp_dir("alter-region");
-    let store_dir = dir.path().to_str().unwrap();
-    let tester = AlterTester::new(store_dir).await;
-
-    let data = vec![(1000, Some(100)), (1001, Some(101)), (1002, Some(102))];
-
-    tester.put_with_init_schema(&data).await;
-
-    let schema = tester.schema();
-    check_schema_names(&schema, &["timestamp", "v0"]);
-
-    let req = add_column_req(&[
-        (new_column_desc(4, "k0"), true),  // key column k0
-        (new_column_desc(5, "v1"), false), // value column v1
-    ]);
-    tester.alter(req).await;
-
-    let schema = tester.schema();
-    check_schema_names(&schema, &["k0", "timestamp", "v0", "v1"]);
-
-    let req = add_column_req(&[
-        (new_column_desc(6, "v2"), false),
-        (new_column_desc(7, "v3"), false),
-    ]);
-    tester.alter(req).await;
-
-    let schema = tester.schema();
-    check_schema_names(&schema, &["k0", "timestamp", "v0", "v1", "v2", "v3"]);
-
-    // Remove v0, v1
-    let req = drop_column_req(&["v0", "v1"]);
-    tester.alter(req).await;
-
-    let schema = tester.schema();
-    check_schema_names(&schema, &["k0", "timestamp", "v2", "v3"]);
-}
-
-#[tokio::test]
-async fn test_put_old_schema_after_alter() {
-    let dir = create_temp_dir("put-old");
-    let store_dir = dir.path().to_str().unwrap();
-    let tester = AlterTester::new(store_dir).await;
-
-    let data = vec![(1000, Some(100)), (1001, Some(101)), (1002, Some(102))];
-
-    tester.put_with_init_schema(&data).await;
-
-    let req = add_column_req(&[
-        (new_column_desc(4, "k0"), true),  // key column k0
-        (new_column_desc(5, "v1"), false), // value column v1
-    ]);
-    tester.alter(req).await;
-
-    // Put with old schema.
-    let data = vec![(1005, Some(105)), (1006, Some(106))];
-    tester.put_with_init_schema(&data).await;
-
-    // Put data with old schema directly to the inner writer, to check that the region
-    // writer could compat the schema of write batch.
-    let data = vec![(1003, Some(103)), (1004, Some(104))];
-    tester.put_inner_with_init_schema(&data).await;
-
-    let expect = vec![
-        DataRow::new(None, 1000, Some(100), None),
-        DataRow::new(None, 1001, Some(101), None),
-        DataRow::new(None, 1002, Some(102), None),
-        DataRow::new(None, 1003, Some(103), None),
-        DataRow::new(None, 1004, Some(104), None),
-        DataRow::new(None, 1005, Some(105), None),
-        DataRow::new(None, 1006, Some(106), None),
-    ];
-    let scanned = tester.full_scan().await;
-    assert_eq!(expect, scanned);
-}
-
-#[tokio::test]
-async fn test_replay_metadata_after_open() {
-    let dir = create_temp_dir("replay-metadata-after-open");
-    let store_dir = dir.path().to_str().unwrap();
-    let mut tester = AlterTester::new(store_dir).await;
-
-    let data = vec![(1000, Some(100)), (1001, Some(101)), (1002, Some(102))];
-
-    tester.put_with_init_schema(&data).await;
-
-    tester.reopen().await;
-
-    let committed_sequence = tester.base().committed_sequence();
-    let manifest_version = tester.base().region.current_manifest_version();
-    let version = tester.version();
-
-    let desc = RegionDescBuilder::new(REGION_NAME)
-        .push_key_column(("k1", LogicalTypeId::Int32, false))
-        .push_field_column(("v0", LogicalTypeId::Float32, true))
-        .build();
-    let metadata: &RegionMetadata = &desc.try_into().unwrap();
-    let mut raw_metadata: RawRegionMetadata = metadata.into();
-    raw_metadata.version = version + 1;
-
-    let recovered_metadata =
-        BTreeMap::from([(committed_sequence, (manifest_version + 1, raw_metadata))]);
-
-    tester.base().replay_inner(recovered_metadata).await;
-    let schema = tester.schema();
-    check_schema_names(&schema, &["k1", "timestamp", "v0"]);
-}
--- a/src/storage/src/region/tests/basic.rs
+++ b/src/storage/src/region/tests/basic.rs
@@ -1,288 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-//! Region read/write tests.
-
-use common_telemetry::info;
-use common_test_util::temp_dir::create_temp_dir;
-use log_store::raft_engine::log_store::RaftEngineLogStore;
-use store_api::storage::{OpenOptions, SequenceNumber};
-
-use crate::config::EngineConfig;
-use crate::error::Result;
-use crate::region::tests::{self, FileTesterBase};
-use crate::region::RegionImpl;
-use crate::test_util::config_util;
-
-const REGION_NAME: &str = "region-basic-0";
-
-/// Create a new region for basic tests.
-async fn create_region_for_basic(
-    region_name: &str,
-    store_dir: &str,
-) -> RegionImpl<RaftEngineLogStore> {
-    let metadata = tests::new_metadata(region_name);
-    let store_config =
-        config_util::new_store_config(region_name, store_dir, EngineConfig::default()).await;
-    RegionImpl::create(metadata, store_config).await.unwrap()
-}
-
-/// Tester for basic tests.
-struct Tester {
-    region_name: String,
-    store_dir: String,
-    base: Option<FileTesterBase>,
-}
-
-impl Tester {
-    async fn new(region_name: &str, store_dir: &str) -> Tester {
-        let region = create_region_for_basic(region_name, store_dir).await;
-
-        Tester {
-            region_name: region_name.to_string(),
-            store_dir: store_dir.to_string(),
-            base: Some(FileTesterBase::with_region(region)),
-        }
-    }
-
-    async fn empty(region_name: &str, store_dir: &str) -> Tester {
-        Tester {
-            region_name: region_name.to_string(),
-            store_dir: store_dir.to_string(),
-            base: None,
-        }
-    }
-
-    async fn reopen(&mut self) {
-        let _ = self.try_reopen().await.unwrap();
-    }
-
-    async fn try_reopen(&mut self) -> Result<bool> {
-        // Close the old region.
-        if let Some(base) = self.base.as_ref() {
-            info!("Reopen tester base");
-            base.close().await;
-        }
-
-        self.base = None;
-        // Reopen the region.
-        let store_config = config_util::new_store_config(
-            &self.region_name,
-            &self.store_dir,
-            EngineConfig::default(),
-        )
-        .await;
-        let opts = OpenOptions::default();
-        let region = RegionImpl::open(self.region_name.clone(), store_config, &opts).await?;
-        match region {
-            None => Ok(false),
-            Some(region) => {
-                let base = FileTesterBase::with_region(region);
-                self.base = Some(base);
-                Ok(true)
-            }
-        }
-    }
-
-    #[inline]
-    fn base(&self) -> &FileTesterBase {
-        self.base.as_ref().unwrap()
-    }
-
-    #[inline]
-    fn set_batch_size(&mut self, batch_size: usize) {
-        self.base.as_mut().unwrap().read_ctx.batch_size = batch_size;
-    }
-
-    async fn put(&self, data: &[(i64, Option<String>)]) {
-        let _ = self.base().put(data).await;
-    }
-
-    async fn full_scan(&self) -> Vec<(i64, Option<String>)> {
-        self.base().full_scan().await
-    }
-
-    fn committed_sequence(&self) -> SequenceNumber {
-        self.base().committed_sequence()
-    }
-
-    async fn delete(&self, keys: &[i64]) {
-        let _ = self.base().delete(keys).await;
-    }
-}
-
-#[tokio::test]
-async fn test_simple_put_scan() {
-    let dir = create_temp_dir("put-scan");
-    let store_dir = dir.path().to_str().unwrap();
-    let tester = Tester::new(REGION_NAME, store_dir).await;
-
-    let data = vec![
-        (1000, Some(100.to_string())),
-        (1001, Some(101.to_string())),
-        (1002, None),
-        (1003, Some(103.to_string())),
-        (1004, Some(104.to_string())),
-    ];
-
-    tester.put(&data).await;
-
-    let output = tester.full_scan().await;
-    assert_eq!(data, output);
-}
-
-#[tokio::test]
-async fn test_sequence_increase() {
-    let dir = create_temp_dir("sequence");
-    let store_dir = dir.path().to_str().unwrap();
-    let tester = Tester::new(REGION_NAME, store_dir).await;
-
-    let mut committed_sequence = tester.committed_sequence();
-    for i in 0..100 {
-        tester.put(&[(i, Some(1234.to_string()))]).await;
-        committed_sequence += 1;
-
-        assert_eq!(committed_sequence, tester.committed_sequence());
-    }
-}
-
-#[tokio::test]
-async fn test_reopen() {
-    common_telemetry::logging::init_default_ut_logging();
-
-    let dir = create_temp_dir("reopen");
-    let store_dir = dir.path().to_str().unwrap();
-    let mut tester = Tester::new(REGION_NAME, store_dir).await;
-
-    let mut all_data = Vec::new();
-    // Reopen region multiple times.
-    for i in 0..5 {
-        let data = (i, Some(i.to_string()));
-        tester.put(&[data.clone()]).await;
-        all_data.push(data.clone());
-
-        let output = tester.full_scan().await;
-        assert_eq!(all_data, output);
-
-        tester.reopen().await;
-
-        // Scan after reopen.
-        let output = tester.full_scan().await;
-        assert_eq!(all_data, output);
-
-        // Check committed sequence.
-        assert_eq!(i + 1, tester.committed_sequence() as i64);
-    }
-}
-
-#[tokio::test]
-async fn test_open_empty() {
-    let dir = create_temp_dir("open-empty");
-    let store_dir = dir.path().to_str().unwrap();
-    let mut tester = Tester::empty(REGION_NAME, store_dir).await;
-
-    let ret = tester.try_reopen().await;
-    assert!(!ret.unwrap());
-}
-
-#[tokio::test]
-async fn test_scan_different_batch() {
-    let dir = create_temp_dir("different-batch");
-    let store_dir = dir.path().to_str().unwrap();
-    let mut tester = Tester::new(REGION_NAME, store_dir).await;
-
-    let data: Vec<_> = (0..=2000).map(|i| (i, Some(i.to_string()))).collect();
-
-    for chunk in data.chunks(100) {
-        tester.put(chunk).await;
-    }
-
-    let batch_sizes = [1, 2, 4, 16, 64, 128, 256, 512];
-    for batch_size in batch_sizes {
-        tester.set_batch_size(batch_size);
-
-        let output = tester.full_scan().await;
-        assert_eq!(data, output);
-    }
-}
-
-#[tokio::test]
-async fn test_put_delete_scan() {
-    common_telemetry::init_default_ut_logging();
-    let dir = create_temp_dir("put-delete-scan");
-    let store_dir = dir.path().to_str().unwrap();
-    let mut tester = Tester::new(REGION_NAME, store_dir).await;
-
-    let data = vec![
-        (1000, Some(100.to_string())),
-        (1001, Some(101.to_string())),
-        (1002, None),
-        (1003, None),
-        (1004, Some(104.to_string())),
-    ];
-
-    tester.put(&data).await;
-
-    let keys = [1001, 1003];
-
-    tester.delete(&keys).await;
-
-    let output = tester.full_scan().await;
-    let expect = vec![
-        (1000, Some(100.to_string())),
-        (1002, None),
-        (1004, Some(104.to_string())),
-    ];
-    assert_eq!(expect, output);
-
-    // Deletion is also persistent.
-    let _ = tester.try_reopen().await.unwrap();
-    let output = tester.full_scan().await;
-    assert_eq!(expect, output);
-}
-
-#[tokio::test]
-async fn test_put_delete_absent_key() {
-    let dir = create_temp_dir("put-delete-scan");
-    let store_dir = dir.path().to_str().unwrap();
-    let mut tester = Tester::new(REGION_NAME, store_dir).await;
-
-    let data = vec![
-        (1000, Some(100.to_string())),
-        (1001, Some(101.to_string())),
-        (1002, None),
-        (1003, None),
-        (1004, Some(104.to_string())),
-    ];
-
-    tester.put(&data).await;
-
-    // 999 and 1006 is absent.
-    let keys = [999, 1002, 1004, 1006];
-
-    tester.delete(&keys).await;
-
-    let output = tester.full_scan().await;
-    let expect = vec![
-        (1000, Some(100.to_string())),
-        (1001, Some(101.to_string())),
-        (1003, None),
-    ];
-    assert_eq!(expect, output);
-
-    // Deletion is also persistent.
-    let _ = tester.try_reopen().await.unwrap();
-    let output = tester.full_scan().await;
-    assert_eq!(expect, output);
-}
--- a/src/storage/src/region/tests/close.rs
+++ b/src/storage/src/region/tests/close.rs
@@ -1,168 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-//! Region close tests.
-
-use std::sync::Arc;
-
-use common_test_util::temp_dir::create_temp_dir;
-use log_store::raft_engine::log_store::RaftEngineLogStore;
-use store_api::storage::{
-    AlterOperation, AlterRequest, CloseContext, Region, RegionMeta, WriteResponse,
-};
-
-use crate::config::EngineConfig;
-use crate::engine;
-use crate::error::Error;
-use crate::flush::FlushStrategyRef;
-use crate::region::tests::{self, FileTesterBase};
-use crate::region::RegionImpl;
-use crate::test_util::config_util;
-use crate::test_util::flush_switch::{has_parquet_file, FlushSwitch};
-
-const REGION_NAME: &str = "region-close-0";
-
-/// Tester for region close
-struct CloseTester {
-    base: Option<FileTesterBase>,
-}
-
-/// Create a new region for close test
-async fn create_region_for_close(
-    store_dir: &str,
-    flush_strategy: FlushStrategyRef,
-) -> RegionImpl<RaftEngineLogStore> {
-    let metadata = tests::new_metadata(REGION_NAME);
-
-    let mut store_config =
-        config_util::new_store_config(REGION_NAME, store_dir, EngineConfig::default()).await;
-    store_config.flush_strategy = flush_strategy;
-
-    RegionImpl::create(metadata, store_config).await.unwrap()
-}
-
-impl CloseTester {
-    async fn new(store_dir: &str, flush_strategy: FlushStrategyRef) -> CloseTester {
-        let region = create_region_for_close(store_dir, flush_strategy.clone()).await;
-
-        CloseTester {
-            base: Some(FileTesterBase::with_region(region)),
-        }
-    }
-
-    #[inline]
-    fn base(&self) -> &FileTesterBase {
-        self.base.as_ref().unwrap()
-    }
-
-    async fn put(&self, data: &[(i64, Option<i64>)]) {
-        let data = data
-            .iter()
-            .map(|(ts, v0)| (*ts, v0.map(|v| v.to_string())))
-            .collect::<Vec<_>>();
-        let _ = self.base().put(&data).await;
-    }
-
-    async fn try_put(&self, data: &[(i64, Option<i64>)]) -> Result<WriteResponse, Error> {
-        let data = data
-            .iter()
-            .map(|(ts, v0)| (*ts, v0.map(|v| v.to_string())))
-            .collect::<Vec<_>>();
-        self.base().try_put(&data).await
-    }
-
-    async fn try_alter(&self, mut req: AlterRequest) -> Result<(), Error> {
-        let version = self.version();
-        req.version = version;
-
-        self.base().region.alter(req).await
-    }
-
-    fn version(&self) -> u32 {
-        let metadata = self.base().region.in_memory_metadata();
-        metadata.version()
-    }
-}
-
-#[tokio::test]
-async fn test_close_basic() {
-    common_telemetry::init_default_ut_logging();
-    let dir = create_temp_dir("close-basic");
-    let store_dir = dir.path().to_str().unwrap();
-
-    let flush_switch = Arc::new(FlushSwitch::default());
-    let tester = CloseTester::new(store_dir, flush_switch).await;
-
-    tester
-        .base()
-        .region
-        .close(&CloseContext::default())
-        .await
-        .unwrap();
-
-    let data = [(1000, Some(100))];
-
-    let closed_region_error = "Try to write the closed region".to_string();
-    // Put one element should return ClosedRegion error
-    assert_eq!(
-        tester.try_put(&data).await.unwrap_err().to_string(),
-        closed_region_error
-    );
-
-    // Alter table should return ClosedRegion error
-    assert_eq!(
-        tester
-            .try_alter(AlterRequest {
-                operation: AlterOperation::AddColumns {
-                    columns: Vec::new(),
-                },
-                version: 0,
-            })
-            .await
-            .unwrap_err()
-            .to_string(),
-        closed_region_error
-    );
-}
-
-#[tokio::test]
-async fn test_close_wait_flush_done() {
-    common_telemetry::init_default_ut_logging();
-    let dir = create_temp_dir("close-basic");
-    let store_dir = dir.path().to_str().unwrap();
-
-    let flush_switch = Arc::new(FlushSwitch::default());
-    let tester = CloseTester::new(store_dir, flush_switch.clone()).await;
-
-    let data = [(1000, Some(100))];
-
-    // Now set should flush to true to trigger flush.
-    flush_switch.set_should_flush(true);
-
-    // Put one element so we have content to flush.
-    tester.put(&data).await;
-
-    let sst_dir = format!("{}/{}", store_dir, engine::region_sst_dir("", REGION_NAME));
-    assert!(!has_parquet_file(&sst_dir));
-
-    // Close should cancel the flush.
-    tester
-        .base()
-        .region
-        .close(&CloseContext::default())
-        .await
-        .unwrap();
-
-    assert!(!has_parquet_file(&sst_dir));
-}
--- a/src/storage/src/region/tests/compact.rs
+++ b/src/storage/src/region/tests/compact.rs
@@ -1,458 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-//! Region compaction tests.
-
-use std::env;
-use std::sync::atomic::{AtomicUsize, Ordering};
-use std::sync::Arc;
-
-use common_telemetry::logging;
-use common_test_util::temp_dir::create_temp_dir;
-use log_store::raft_engine::log_store::RaftEngineLogStore;
-use object_store::services::{Fs, S3};
-use object_store::ObjectStore;
-use store_api::storage::{FlushContext, FlushReason, OpenOptions, Region};
-use tokio::sync::{Notify, RwLock};
-
-use crate::compaction::CompactionHandler;
-use crate::config::EngineConfig;
-use crate::error::Result;
-use crate::file_purger::{FilePurgeHandler, FilePurgeRequest};
-use crate::region::tests::{self, FileTesterBase};
-use crate::region::{CompactContext, FlushStrategyRef, RegionImpl};
-use crate::scheduler::rate_limit::BoxedRateLimitToken;
-use crate::scheduler::{Handler, LocalScheduler, SchedulerConfig};
-use crate::test_util::config_util;
-use crate::test_util::flush_switch::FlushSwitch;
-
-const REGION_NAME: &str = "region-compact-0";
-
-fn new_object_store(store_dir: &str, s3_bucket: Option<String>) -> ObjectStore {
-    if let Some(bucket) = s3_bucket {
-        if !bucket.is_empty() {
-            logging::info!("Use S3 object store");
-
-            let root = uuid::Uuid::new_v4().to_string();
-
-            let mut builder = S3::default();
-            let _ = builder
-                .root(&root)
-                .access_key_id(&env::var("GT_S3_ACCESS_KEY_ID").unwrap())
-                .secret_access_key(&env::var("GT_S3_ACCESS_KEY").unwrap())
-                .region(&env::var("GT_S3_REGION").unwrap())
-                .bucket(&bucket);
-
-            return ObjectStore::new(builder).unwrap().finish();
-        }
-    }
-
-    logging::info!("Use local fs object store");
-
-    let mut builder = Fs::default();
-    let _ = builder.root(store_dir);
-    ObjectStore::new(builder).unwrap().finish()
-}
-
-/// Create a new region for compaction test
-async fn create_region_for_compaction<
-    H: Handler<Request = FilePurgeRequest> + Send + Sync + 'static,
->(
-    store_dir: &str,
-    engine_config: EngineConfig,
-    purge_handler: H,
-    flush_strategy: FlushStrategyRef,
-    s3_bucket: Option<String>,
-) -> (
-    RegionImpl<RaftEngineLogStore>,
-    ObjectStore,
-    Arc<tokio::sync::RwLock<Vec<tokio::task::JoinHandle<()>>>>,
-) {
-    let metadata = tests::new_metadata(REGION_NAME);
-
-    let object_store = new_object_store(store_dir, s3_bucket);
-
-    let (mut store_config, _) = config_util::new_store_config_with_object_store(
-        REGION_NAME,
-        store_dir,
-        object_store.clone(),
-        EngineConfig::default(),
-    )
-    .await;
-    store_config.engine_config = Arc::new(engine_config);
-    store_config.flush_strategy = flush_strategy;
-
-    let pending_compaction_tasks = Arc::new(RwLock::new(vec![]));
-    let handler = CompactionHandler::new_with_pending_tasks(pending_compaction_tasks.clone());
-    let config = SchedulerConfig::default();
-    // Overwrite test compaction scheduler and file purger.
-    store_config.compaction_scheduler = Arc::new(LocalScheduler::new(config, handler));
-    store_config.file_purger = Arc::new(LocalScheduler::new(
-        SchedulerConfig {
-            max_inflight_tasks: store_config.engine_config.max_purge_tasks,
-        },
-        purge_handler,
-    ));
-
-    (
-        RegionImpl::create(metadata, store_config).await.unwrap(),
-        object_store,
-        pending_compaction_tasks,
-    )
-}
-
-#[derive(Debug, Default, Clone)]
-struct MockFilePurgeHandler {
-    num_deleted: Arc<AtomicUsize>,
-}
-
-#[async_trait::async_trait]
-impl Handler for MockFilePurgeHandler {
-    type Request = FilePurgeRequest;
-
-    async fn handle_request(
-        &self,
-        req: Self::Request,
-        token: BoxedRateLimitToken,
-        finish_notifier: Arc<Notify>,
-    ) -> Result<()> {
-        logging::info!(
-            "Try to delete file: {:?}, num_deleted: {:?}",
-            req.file_id,
-            self.num_deleted
-        );
-
-        let handler = FilePurgeHandler;
-        handler
-            .handle_request(req, token, finish_notifier)
-            .await
-            .unwrap();
-
-        let _ = self.num_deleted.fetch_add(1, Ordering::Relaxed);
-
-        Ok(())
-    }
-}
-
-impl MockFilePurgeHandler {
-    fn num_deleted(&self) -> usize {
-        self.num_deleted.load(Ordering::Relaxed)
-    }
-}
-
-/// Tester for region compaction.
-struct CompactionTester {
-    base: Option<FileTesterBase>,
-    purge_handler: MockFilePurgeHandler,
-    object_store: ObjectStore,
-    store_dir: String,
-    engine_config: EngineConfig,
-    flush_strategy: FlushStrategyRef,
-    pending_tasks: Arc<RwLock<Vec<tokio::task::JoinHandle<()>>>>,
-}
-
-impl CompactionTester {
-    async fn new(
-        store_dir: &str,
-        engine_config: EngineConfig,
-        flush_strategy: FlushStrategyRef,
-        s3_bucket: Option<String>,
-    ) -> CompactionTester {
-        let purge_handler = MockFilePurgeHandler::default();
-        let (region, object_store, pending_tasks) = create_region_for_compaction(
-            store_dir,
-            engine_config.clone(),
-            purge_handler.clone(),
-            flush_strategy.clone(),
-            s3_bucket,
-        )
-        .await;
-
-        CompactionTester {
-            base: Some(FileTesterBase::with_region(region)),
-            purge_handler,
-            object_store,
-            store_dir: store_dir.to_string(),
-            engine_config,
-            flush_strategy,
-            pending_tasks,
-        }
-    }
-
-    #[inline]
-    fn base(&self) -> &FileTesterBase {
-        self.base.as_ref().unwrap()
-    }
-
-    #[inline]
-    fn base_mut(&mut self) -> &mut FileTesterBase {
-        self.base.as_mut().unwrap()
-    }
-
-    async fn put(&self, data: &[(i64, Option<i64>)]) {
-        let data = data
-            .iter()
-            .map(|(ts, v0)| (*ts, v0.map(|v| v.to_string())))
-            .collect::<Vec<_>>();
-        let _ = self.base().put(&data).await;
-    }
-
-    async fn flush(&self, wait: Option<bool>) {
-        let ctx = wait
-            .map(|wait| FlushContext {
-                wait,
-                reason: FlushReason::Manually,
-                ..Default::default()
-            })
-            .unwrap_or_default();
-        self.base().region.flush(&ctx).await.unwrap();
-    }
-
-    async fn compact(&self) {
-        // Trigger compaction and wait until it is done.
-        self.base()
-            .region
-            .compact(&CompactContext::default())
-            .await
-            .unwrap();
-    }
-
-    /// Close region and clean up files.
-    async fn clean_up(mut self) {
-        self.base = None;
-
-        self.object_store.remove_all("/").await.unwrap();
-    }
-
-    async fn reopen(&mut self) -> Result<bool> {
-        // Close the old region.
-        if let Some(base) = self.base.take() {
-            let _ = futures::future::join_all(self.pending_tasks.write().await.drain(..)).await;
-            base.close().await;
-        }
-
-        // Reopen the region.
-        let object_store = new_object_store(&self.store_dir, None);
-        let (mut store_config, _) = config_util::new_store_config_with_object_store(
-            REGION_NAME,
-            &self.store_dir,
-            object_store.clone(),
-            EngineConfig {
-                max_files_in_l0: usize::MAX,
-                ..Default::default()
-            },
-        )
-        .await;
-        store_config.engine_config = Arc::new(self.engine_config.clone());
-        store_config.flush_strategy = self.flush_strategy.clone();
-
-        let handler = CompactionHandler::new_with_pending_tasks(Arc::new(Default::default()));
-        let config = SchedulerConfig::default();
-        // Overwrite test compaction scheduler and file purger.
-        store_config.compaction_scheduler = Arc::new(LocalScheduler::new(config, handler));
-        store_config.file_purger = Arc::new(LocalScheduler::new(
-            SchedulerConfig {
-                max_inflight_tasks: store_config.engine_config.max_purge_tasks,
-            },
-            MockFilePurgeHandler::default(),
-        ));
-
-        let Some(region) = RegionImpl::open(
-            REGION_NAME.to_string(),
-            store_config,
-            &OpenOptions::default(),
-        )
-        .await?
-        else {
-            return Ok(false);
-        };
-        self.base = Some(FileTesterBase::with_region(region));
-        Ok(true)
-    }
-}
-
-async fn compact_during_read(s3_bucket: Option<String>) {
-    let dir = create_temp_dir("compact_read");
-    let store_dir = dir.path().to_str().unwrap();
-
-    // Use a large max_files_in_l0 to avoid compaction automatically.
-    let mut tester = CompactionTester::new(
-        store_dir,
-        EngineConfig {
-            max_files_in_l0: 100,
-            ..Default::default()
-        },
-        // Disable auto-flush.
-        Arc::new(FlushSwitch::default()),
-        s3_bucket,
-    )
-    .await;
-
-    let expect: Vec<_> = (0..200).map(|v| (v, Some(v))).collect();
-    // Put elements so we have content to flush (In SST1).
-    tester.put(&expect[0..100]).await;
-
-    // Flush content to SST1.
-    tester.flush(None).await;
-
-    // Put element (In SST2).
-    tester.put(&expect[100..200]).await;
-
-    // Flush content to SST2.
-    tester.flush(None).await;
-
-    tester.base_mut().read_ctx.batch_size = 1;
-    // Create a reader.
-    let reader = tester.base().full_scan_reader().await;
-
-    assert_eq!(0, tester.purge_handler.num_deleted());
-
-    // Trigger compaction.
-    tester.compact().await;
-
-    // The files are still referenced.
-    assert_eq!(0, tester.purge_handler.num_deleted());
-
-    // Read from the reader.
-    let output = tester.base().collect_reader(reader).await;
-
-    assert_eq!(expect.len(), output.len());
-
-    tester.clean_up().await;
-}
-
-#[tokio::test]
-async fn test_compact_during_read_on_fs() {
-    common_telemetry::init_default_ut_logging();
-
-    compact_during_read(None).await;
-}
-
-#[tokio::test]
-async fn test_compact_during_read_on_s3() {
-    common_telemetry::init_default_ut_logging();
-
-    if let Ok(bucket) = env::var("GT_S3_BUCKET") {
-        if !bucket.is_empty() {
-            compact_during_read(Some(bucket)).await;
-        }
-    }
-}
-
-#[tokio::test]
-async fn test_persist_region_compaction_time_window() {
-    common_telemetry::init_default_ut_logging();
-    let dir = create_temp_dir("put-delete-scan");
-    let store_dir = dir.path().to_str().unwrap();
-    let mut tester = CompactionTester::new(
-        store_dir,
-        EngineConfig {
-            max_files_in_l0: 100,
-            ..Default::default()
-        },
-        // Disable auto-flush.
-        Arc::new(FlushSwitch::default()),
-        None,
-    )
-    .await;
-
-    // initially the time window is not present since no compaction ever happened.
-    assert_eq!(
-        None,
-        tester
-            .base
-            .as_ref()
-            .unwrap()
-            .region
-            .inner
-            .shared
-            .version_control
-            .current()
-            .ssts()
-            .compaction_time_window()
-    );
-
-    // write some data with one hour span
-    for idx in 0..10 {
-        tester
-            .put(&[(idx * 1000, Some(idx)), ((idx + 360) * 1000, Some(idx))])
-            .await;
-        tester.flush(Some(true)).await;
-    }
-
-    tester.compact().await;
-    // the inferred and persisted compaction time window should be 3600 seconds.
-    assert_eq!(
-        3600,
-        tester
-            .base
-            .as_ref()
-            .unwrap()
-            .region
-            .inner
-            .shared
-            .version_control
-            .current()
-            .ssts()
-            .compaction_time_window()
-            .unwrap()
-    );
-
-    // try write data with a larger time window
-    for idx in 0..10 {
-        tester
-            .put(&[
-                (idx * 1000, Some(idx)),
-                ((idx + 2 * 60 * 60) * 1000, Some(idx)),
-            ])
-            .await;
-        tester.flush(Some(true)).await;
-    }
-    tester.compact().await;
-
-    // but we won't changed persisted compaction window for now, so it remains unchanged.
-    assert_eq!(
-        3600,
-        tester
-            .base
-            .as_ref()
-            .unwrap()
-            .region
-            .inner
-            .shared
-            .version_control
-            .current()
-            .ssts()
-            .compaction_time_window()
-            .unwrap()
-    );
-
-    let reopened = tester.reopen().await.unwrap();
-    assert!(reopened);
-    assert_eq!(
-        3600,
-        tester
-            .base
-            .as_ref()
-            .unwrap()
-            .region
-            .inner
-            .shared
-            .version_control
-            .current()
-            .ssts()
-            .compaction_time_window()
-            .unwrap()
-    );
-}
--- a/src/storage/src/region/tests/drop.rs
+++ b/src/storage/src/region/tests/drop.rs
@@ -1,192 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-//! Region drop tests.
-
-use std::path::Path;
-use std::sync::Arc;
-
-use common_telemetry::info;
-use common_test_util::temp_dir::create_temp_dir;
-use log_store::raft_engine::log_store::RaftEngineLogStore;
-use store_api::manifest::{Manifest, MetaAction};
-use store_api::storage::{FlushContext, OpenOptions, Region};
-
-use crate::config::EngineConfig;
-use crate::engine;
-use crate::flush::FlushStrategyRef;
-use crate::manifest::action::{RegionMetaAction, RegionMetaActionList, RegionRemove};
-use crate::region::tests::{self, FileTesterBase};
-use crate::region::RegionImpl;
-use crate::test_util::config_util;
-use crate::test_util::flush_switch::{has_parquet_file, FlushSwitch};
-
-const REGION_NAME: &str = "region-drop-0";
-
-/// Create a new region for drop tests.
-async fn create_region_for_drop(
-    store_dir: &str,
-    flush_strategy: FlushStrategyRef,
-) -> RegionImpl<RaftEngineLogStore> {
-    let metadata = tests::new_metadata(REGION_NAME);
-
-    let mut store_config =
-        config_util::new_store_config(REGION_NAME, store_dir, EngineConfig::default()).await;
-    store_config.flush_strategy = flush_strategy;
-
-    RegionImpl::create(metadata, store_config).await.unwrap()
-}
-
-/// Tester for drop tests.
-struct DropTester {
-    base: Option<FileTesterBase>,
-}
-
-impl DropTester {
-    async fn new(store_dir: &str, flush_strategy: FlushStrategyRef) -> DropTester {
-        let region = create_region_for_drop(store_dir, flush_strategy).await;
-        DropTester {
-            base: Some(FileTesterBase::with_region(region)),
-        }
-    }
-
-    #[inline]
-    fn base(&self) -> &FileTesterBase {
-        self.base.as_ref().unwrap()
-    }
-
-    async fn put(&self, data: &[(i64, Option<i64>)]) {
-        let data = data
-            .iter()
-            .map(|(ts, v0)| (*ts, v0.map(|v| v.to_string())))
-            .collect::<Vec<_>>();
-        let _ = self.base().put(&data).await;
-    }
-
-    async fn flush(&self) {
-        let ctx = FlushContext::default();
-        self.base().region.flush(&ctx).await.unwrap();
-    }
-
-    async fn close(&mut self) {
-        if let Some(base) = self.base.take() {
-            base.close().await;
-        }
-    }
-}
-
-fn get_all_files(path: &str) -> Vec<String> {
-    let mut files = Vec::new();
-    for entry in std::fs::read_dir(path).unwrap() {
-        let entry = entry.unwrap();
-        let path = entry.path();
-        if path.is_file() {
-            files.push(path.to_str().unwrap().to_string());
-        } else if path.is_dir() {
-            files.extend(get_all_files(path.to_str().unwrap()));
-        }
-    }
-    files
-}
-
-#[tokio::test]
-async fn test_drop_basic() {
-    let dir = create_temp_dir("drop-basic");
-    common_telemetry::init_default_ut_logging();
-    let store_dir = dir.path().to_str().unwrap();
-
-    let sst_dir = format!("{}/{}", store_dir, engine::region_sst_dir("", REGION_NAME));
-    let manifest_dir = format!(
-        "{}/{}",
-        store_dir,
-        engine::region_manifest_dir("", REGION_NAME)
-    );
-    let flush_switch = Arc::new(FlushSwitch::default());
-    let mut tester = DropTester::new(store_dir, flush_switch.clone()).await;
-
-    let data = [(1000, Some(100))];
-
-    // Put one element so we have content to flush.
-    tester.put(&data).await;
-
-    // Manually trigger flush.
-    tester.flush().await;
-
-    assert!(has_parquet_file(&sst_dir));
-
-    tester.base().checkpoint_manifest().await;
-    let manifest_files = get_all_files(&manifest_dir);
-    info!("manifest_files: {:?}", manifest_files);
-
-    tester.base().region.drop_region().await.unwrap();
-    tester.close().await;
-
-    assert!(!Path::new(&manifest_dir).exists());
-}
-
-#[tokio::test]
-async fn test_drop_reopen() {
-    let dir = create_temp_dir("drop-basic");
-    common_telemetry::init_default_ut_logging();
-    let store_dir = dir.path().to_str().unwrap();
-
-    let sst_dir = format!("{}/{}", store_dir, engine::region_sst_dir("", REGION_NAME));
-    let manifest_dir = format!(
-        "{}/{}",
-        store_dir,
-        engine::region_manifest_dir("", REGION_NAME)
-    );
-    let flush_switch = Arc::new(FlushSwitch::default());
-    let mut tester = DropTester::new(store_dir, flush_switch.clone()).await;
-
-    let data = [(1000, Some(100))];
-
-    // Put one element so we have content to flush.
-    tester.put(&data).await;
-    // Manually trigger flush.
-    tester.flush().await;
-
-    assert!(has_parquet_file(&sst_dir));
-
-    tester.base().checkpoint_manifest().await;
-    let version_control = tester.base().region.version_control();
-
-    let mut action_list =
-        RegionMetaActionList::with_action(RegionMetaAction::Remove(RegionRemove {
-            region_id: tester.base().region.id(),
-        }));
-    let prev_version = version_control.current_manifest_version();
-    action_list.set_prev_version(prev_version);
-    let manifest = &tester.base().region.inner.manifest;
-    let _ = manifest.update(action_list).await.unwrap();
-    tester.close().await;
-
-    // Reopen the region.
-    let store_config = config_util::new_store_config(
-        REGION_NAME,
-        store_dir,
-        EngineConfig {
-            max_files_in_l0: usize::MAX,
-            ..Default::default()
-        },
-    )
-    .await;
-
-    let opts = OpenOptions::default();
-    let region = RegionImpl::open(REGION_NAME.to_string(), store_config, &opts)
-        .await
-        .unwrap();
-    assert!(region.is_none());
-    assert!(!Path::new(&manifest_dir).exists());
-}
--- a/src/storage/src/region/tests/flush.rs
+++ b/src/storage/src/region/tests/flush.rs
@@ -1,462 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-//! Region flush tests.
-
-use std::sync::Arc;
-use std::time::Duration;
-
-use arrow::compute::SortOptions;
-use common_query::prelude::Expr;
-use common_recordbatch::OrderOption;
-use common_test_util::temp_dir::create_temp_dir;
-use common_time::timestamp::TimeUnit;
-use datafusion_common::Column;
-use datatypes::value::timestamp_to_scalar_value;
-use log_store::raft_engine::log_store::RaftEngineLogStore;
-use store_api::storage::{FlushContext, FlushReason, OpenOptions, Region, ScanRequest};
-
-use crate::config::EngineConfig;
-use crate::engine::{self, RegionMap};
-use crate::flush::{FlushStrategyRef, FlushType};
-use crate::region::tests::{self, FileTesterBase};
-use crate::region::RegionImpl;
-use crate::test_util::config_util;
-use crate::test_util::flush_switch::{has_parquet_file, FlushSwitch};
-
-const REGION_NAME: &str = "region-flush-0";
-
-/// Create a new region for flush test
-async fn create_region_for_flush(
-    store_dir: &str,
-    flush_strategy: FlushStrategyRef,
-) -> (
-    RegionImpl<RaftEngineLogStore>,
-    Arc<RegionMap<RaftEngineLogStore>>,
-) {
-    let metadata = tests::new_metadata(REGION_NAME);
-
-    let (mut store_config, regions) = config_util::new_store_config_and_region_map(
-        REGION_NAME,
-        store_dir,
-        EngineConfig {
-            max_files_in_l0: usize::MAX,
-            ..Default::default()
-        },
-    )
-    .await;
-    store_config.flush_strategy = flush_strategy;
-
-    (
-        RegionImpl::create(metadata, store_config).await.unwrap(),
-        regions,
-    )
-}
-
-/// Tester for region flush.
-struct FlushTester {
-    base: Option<FileTesterBase>,
-    store_dir: String,
-    flush_strategy: FlushStrategyRef,
-    regions: Arc<RegionMap<RaftEngineLogStore>>,
-}
-
-impl FlushTester {
-    async fn new(store_dir: &str, flush_strategy: FlushStrategyRef) -> FlushTester {
-        let (region, regions) = create_region_for_flush(store_dir, flush_strategy.clone()).await;
-
-        FlushTester {
-            base: Some(FileTesterBase::with_region(region)),
-            store_dir: store_dir.to_string(),
-            flush_strategy: flush_strategy.clone(),
-            regions,
-        }
-    }
-
-    async fn reopen(&mut self) {
-        self.regions.clear();
-        // Close the old region.
-        if let Some(base) = self.base.take() {
-            base.close().await;
-        }
-        // Reopen the region.
-        let mut store_config = config_util::new_store_config(
-            REGION_NAME,
-            &self.store_dir,
-            EngineConfig {
-                max_files_in_l0: usize::MAX,
-                ..Default::default()
-            },
-        )
-        .await;
-        store_config.flush_strategy = self.flush_strategy.clone();
-        let opts = OpenOptions::default();
-        let region = RegionImpl::open(REGION_NAME.to_string(), store_config, &opts)
-            .await
-            .unwrap()
-            .unwrap();
-        self.base = Some(FileTesterBase::with_region(region));
-    }
-
-    #[inline]
-    fn base(&self) -> &FileTesterBase {
-        self.base.as_ref().unwrap()
-    }
-
-    async fn put(&self, data: &[(i64, Option<i64>)]) {
-        let data = data
-            .iter()
-            .map(|(ts, v0)| (*ts, v0.map(|v| v.to_string())))
-            .collect::<Vec<_>>();
-        let _ = self.base().put(&data).await;
-    }
-
-    async fn full_scan(&self) -> Vec<(i64, Option<String>)> {
-        self.base().full_scan().await
-    }
-
-    async fn scan(&self, req: ScanRequest) -> Vec<(i64, Option<String>)> {
-        self.base().scan(req).await
-    }
-
-    async fn flush(&self, wait: Option<bool>) {
-        let ctx = wait
-            .map(|wait| FlushContext {
-                wait,
-                reason: FlushReason::Manually,
-                ..Default::default()
-            })
-            .unwrap_or_default();
-        self.base().region.flush(&ctx).await.unwrap();
-    }
-}
-
-impl Drop for FlushTester {
-    fn drop(&mut self) {
-        self.regions.clear();
-    }
-}
-
-#[tokio::test]
-async fn test_flush_and_stall() {
-    common_telemetry::init_default_ut_logging();
-
-    let dir = create_temp_dir("flush-stall");
-    let store_dir = dir.path().to_str().unwrap();
-
-    let flush_switch = Arc::new(FlushSwitch::default());
-    let tester = FlushTester::new(store_dir, flush_switch.clone()).await;
-
-    let data = [(1000, Some(100))];
-    // Put one element so we have content to flush.
-    tester.put(&data).await;
-
-    // Now set should flush to true to trigger flush.
-    flush_switch.set_should_flush(true);
-    // Put element to trigger flush.
-    tester.put(&data).await;
-
-    // Now put another data to trigger write stall and wait until last flush done to
-    // ensure at least one parquet file is generated.
-    tester.put(&data).await;
-
-    // Check parquet files.
-    let sst_dir = format!("{}/{}", store_dir, engine::region_sst_dir("", REGION_NAME));
-    assert!(has_parquet_file(&sst_dir));
-}
-
-#[tokio::test]
-async fn test_manual_flush() {
-    common_telemetry::init_default_ut_logging();
-    let dir = create_temp_dir("manual_flush");
-
-    let store_dir = dir.path().to_str().unwrap();
-
-    let flush_switch = Arc::new(FlushSwitch::default());
-    let tester = FlushTester::new(store_dir, flush_switch.clone()).await;
-
-    let data = [(1000, Some(100))];
-    // Put one element so we have content to flush.
-    tester.put(&data).await;
-
-    // No parquet file should be flushed.
-    let sst_dir = format!("{}/{}", store_dir, engine::region_sst_dir("", REGION_NAME));
-    assert!(!has_parquet_file(&sst_dir));
-
-    tester.flush(None).await;
-
-    assert!(has_parquet_file(&sst_dir));
-}
-
-#[tokio::test]
-async fn test_flush_and_reopen() {
-    common_telemetry::init_default_ut_logging();
-    let dir = create_temp_dir("manual_flush");
-    let store_dir = dir.path().to_str().unwrap();
-    let flush_switch = Arc::new(FlushSwitch::default());
-    let mut tester = FlushTester::new(store_dir, flush_switch.clone()).await;
-
-    tester.put(&[(1000, Some(100))]).await;
-    tester.flush(Some(true)).await;
-    tester.reopen().await;
-    let i = tester
-        .base()
-        .region
-        .inner
-        .shared
-        .version_control
-        .committed_sequence();
-
-    // we wrote a request and flushed the region (involving writing a manifest), thus
-    // committed_sequence should be 2.
-    assert_eq!(2, i);
-}
-
-#[tokio::test]
-async fn test_flush_empty() {
-    let dir = create_temp_dir("flush-empty");
-    let store_dir = dir.path().to_str().unwrap();
-
-    let flush_switch = Arc::new(FlushSwitch::default());
-    let tester = FlushTester::new(store_dir, flush_switch.clone()).await;
-
-    // Flush empty table.
-    tester.flush(None).await;
-    let data = [(1000, Some(100))];
-    // Put element to trigger flush.
-    tester.put(&data).await;
-
-    // Put again.
-    let data = [(2000, Some(200))];
-    tester.put(&data).await;
-
-    // No parquet file should be flushed.
-    let sst_dir = format!("{}/{}", store_dir, engine::region_sst_dir("", REGION_NAME));
-    assert!(!has_parquet_file(&sst_dir));
-
-    let expect = vec![(1000, Some(100.to_string())), (2000, Some(200.to_string()))];
-
-    let output = tester.full_scan().await;
-    assert_eq!(expect, output);
-}
-
-#[tokio::test]
-async fn test_read_after_flush_across_window() {
-    common_telemetry::init_default_ut_logging();
-
-    let dir = create_temp_dir("read-flush");
-    let store_dir = dir.path().to_str().unwrap();
-
-    let flush_switch = Arc::new(FlushSwitch::default());
-    let tester = FlushTester::new(store_dir, flush_switch.clone()).await;
-
-    // Put elements so we have content to flush.
-    tester.put(&[(1000, Some(100))]).await;
-    tester.put(&[(2000, Some(200))]).await;
-
-    // Flush.
-    tester.flush(None).await;
-
-    // Put element again.
-    tester.put(&[(3000, Some(300))]).await;
-
-    let expect = vec![
-        (1000, Some(100.to_string())),
-        (2000, Some(200.to_string())),
-        (3000, Some(300.to_string())),
-    ];
-
-    let output = tester.full_scan().await;
-    assert_eq!(expect, output);
-
-    // Reopen
-    let mut tester = tester;
-    tester.reopen().await;
-
-    // Scan after reopen.
-    let output = tester.full_scan().await;
-    assert_eq!(expect, output);
-}
-
-#[tokio::test]
-async fn test_read_after_flush_same_window() {
-    common_telemetry::init_default_ut_logging();
-
-    let dir = create_temp_dir("read-flush");
-    let store_dir = dir.path().to_str().unwrap();
-
-    let flush_switch = Arc::new(FlushSwitch::default());
-    let tester = FlushTester::new(store_dir, flush_switch.clone()).await;
-
-    // Put elements so we have content to flush.
-    tester.put(&[(1000, Some(100))]).await;
-    tester.put(&[(2000, Some(200))]).await;
-
-    // Flush.
-    tester.flush(None).await;
-
-    // Put element again.
-    tester.put(&[(1003, Some(300))]).await;
-
-    let expect = vec![
-        (1000, Some(100.to_string())),
-        (1003, Some(300.to_string())),
-        (2000, Some(200.to_string())),
-    ];
-
-    let output = tester.full_scan().await;
-    assert_eq!(expect, output);
-
-    // Reopen
-    let mut tester = tester;
-    tester.reopen().await;
-
-    // Scan after reopen.
-    let output = tester.full_scan().await;
-    assert_eq!(expect, output);
-}
-
-#[tokio::test]
-async fn test_merge_read_after_flush() {
-    let dir = create_temp_dir("merge-read-flush");
-    let store_dir = dir.path().to_str().unwrap();
-
-    let flush_switch = Arc::new(FlushSwitch::default());
-    let tester = FlushTester::new(store_dir, flush_switch.clone()).await;
-
-    // Put elements so we have content to flush (In SST1).
-    tester.put(&[(3000, Some(300))]).await;
-    tester.put(&[(2000, Some(200))]).await;
-
-    // Flush content to SST1.
-    tester.flush(None).await;
-
-    // Put element (In SST2).
-    tester.put(&[(2000, Some(201))]).await;
-
-    // In SST2.
-    tester.put(&[(2000, Some(202))]).await;
-    tester.put(&[(1000, Some(100))]).await;
-
-    // Trigger flush.
-    tester.flush(None).await;
-
-    // Overwrite row (In memtable).
-    tester.put(&[(2000, Some(203))]).await;
-
-    let expect = vec![
-        (1000, Some(100.to_string())),
-        (2000, Some(203.to_string())),
-        (3000, Some(300.to_string())),
-    ];
-
-    let output = tester.full_scan().await;
-    assert_eq!(expect, output);
-
-    // Reopen
-    let mut tester = tester;
-    tester.reopen().await;
-
-    // Scan after reopen.
-    let output = tester.full_scan().await;
-    assert_eq!(expect, output);
-}
-
-#[tokio::test]
-async fn test_schedule_engine_flush() {
-    common_telemetry::init_default_ut_logging();
-
-    let dir = create_temp_dir("engine-flush");
-    let store_dir = dir.path().to_str().unwrap();
-
-    let flush_switch = Arc::new(FlushSwitch::default());
-    let tester = FlushTester::new(store_dir, flush_switch.clone()).await;
-    assert_eq!(0, tester.base().region.last_flush_millis());
-
-    // Insert the region to the region map.
-    let _ = tester.regions.get_or_occupy_slot(
-        REGION_NAME,
-        engine::RegionSlot::Ready(tester.base().region.clone()),
-    );
-
-    // Put elements so we have content to flush.
-    tester.put(&[(1000, Some(100))]).await;
-    tester.put(&[(2000, Some(200))]).await;
-
-    flush_switch.set_flush_type(FlushType::Engine);
-
-    // Put element and trigger an engine level flush.
-    tester.put(&[(3000, Some(300))]).await;
-
-    // Wait for flush.
-    let mut count = 0;
-    while tester.base().region.last_flush_millis() == 0 && count < 50 {
-        tokio::time::sleep(Duration::from_millis(100)).await;
-        count += 1;
-    }
-
-    // Check parquet files.
-    let sst_dir = format!("{}/{}", store_dir, engine::region_sst_dir("", REGION_NAME));
-    assert!(has_parquet_file(&sst_dir));
-}
-
-#[tokio::test]
-async fn test_flush_and_query_empty() {
-    common_telemetry::init_default_ut_logging();
-    let dir = create_temp_dir("flush_and_query_empty_range");
-    let store_dir = dir.path().to_str().unwrap();
-    let flush_switch = Arc::new(FlushSwitch::default());
-    let tester = FlushTester::new(store_dir, flush_switch.clone()).await;
-
-    tester
-        .put(
-            &(20000..30000)
-                .map(|v| (v as i64, Some(v as i64)))
-                .collect::<Vec<_>>(),
-        )
-        .await;
-    tester.flush(Some(true)).await;
-
-    tester
-        .put(
-            &(20100..20200)
-                .map(|v| (v as i64, Some(v as i64)))
-                .collect::<Vec<_>>(),
-        )
-        .await;
-    tester.flush(Some(true)).await;
-
-    use datafusion_expr::Expr as DfExpr;
-    let req = ScanRequest {
-        sequence: None,
-        projection: None,
-        filters: vec![Expr::from(datafusion_expr::binary_expr(
-            DfExpr::Column(Column::from("timestamp")),
-            datafusion_expr::Operator::GtEq,
-            datafusion_expr::lit(timestamp_to_scalar_value(
-                TimeUnit::Millisecond,
-                Some(20000),
-            )),
-        ))],
-        output_ordering: Some(vec![OrderOption {
-            name: "timestamp".to_string(),
-            options: SortOptions {
-                descending: true,
-                nulls_first: true,
-            },
-        }]),
-        limit: Some(1),
-    };
-    let _ = tester.scan(req).await;
-}
--- a/src/storage/src/region/tests/projection.rs
+++ b/src/storage/src/region/tests/projection.rs
@@ -1,206 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-use std::collections::HashMap;
-use std::sync::Arc;
-
-use common_test_util::temp_dir::create_temp_dir;
-use datatypes::data_type::ConcreteDataType;
-use datatypes::prelude::ScalarVector;
-use datatypes::type_id::LogicalTypeId;
-use datatypes::vectors::{Int64Vector, TimestampMillisecondVector, VectorRef};
-use log_store::raft_engine::log_store::RaftEngineLogStore;
-use store_api::logstore::LogStore;
-use store_api::storage::{
-    Chunk, ChunkReader, ReadContext, Region, ScanRequest, Snapshot, WriteContext, WriteRequest,
-};
-
-use crate::config::EngineConfig;
-use crate::region::{RegionImpl, RegionMetadata};
-use crate::test_util::{self, config_util, descriptor_util, write_batch_util};
-use crate::write_batch::WriteBatch;
-
-/// Create metadata with schema (k0, timestamp, v0, v1)
-fn new_metadata(region_name: &str) -> RegionMetadata {
-    let desc = descriptor_util::desc_with_field_columns(region_name, 2);
-    desc.try_into().unwrap()
-}
-
-fn new_write_batch_for_test() -> WriteBatch {
-    write_batch_util::new_write_batch(
-        &[
-            ("k0", LogicalTypeId::Int64, false),
-            (
-                test_util::TIMESTAMP_NAME,
-                LogicalTypeId::TimestampMillisecond,
-                false,
-            ),
-            ("v0", LogicalTypeId::Int64, true),
-            ("v1", LogicalTypeId::Int64, true),
-        ],
-        Some(1),
-        2,
-    )
-}
-
-/// Build put data
-///
-/// ```text
-/// k0: [key_start, key_start + 1, ... key_start + len - 1]
-/// timestamp: [ts_start, ts_start + 1, ... ts_start + len - 1]
-/// v0: [initial_value, ...., initial_value]
-/// v1: [initial_value, ..., initial_value + len - 1]
-/// ```
-fn new_put_data(
-    len: usize,
-    key_start: i64,
-    ts_start: i64,
-    initial_value: i64,
-) -> HashMap<String, VectorRef> {
-    let k0 = Arc::new(Int64Vector::from_values(
-        (0..len).map(|v| key_start + v as i64),
-    )) as VectorRef;
-    let ts = Arc::new(TimestampMillisecondVector::from_values(
-        (0..len).map(|v| ts_start + v as i64),
-    )) as VectorRef;
-    let v0 = Arc::new(Int64Vector::from_values(
-        std::iter::repeat(initial_value).take(len),
-    )) as VectorRef;
-    let v1 = Arc::new(Int64Vector::from_values(
-        (0..len).map(|v| initial_value + v as i64),
-    )) as VectorRef;
-
-    HashMap::from([
-        ("k0".to_string(), k0),
-        (test_util::TIMESTAMP_NAME.to_string(), ts),
-        ("v0".to_string(), v0),
-        ("v1".to_string(), v1),
-    ])
-}
-
-fn append_chunk_to(chunk: &Chunk, dst: &mut Vec<Vec<i64>>) {
-    if chunk.columns.is_empty() {
-        return;
-    }
-    let num_rows = chunk.columns[0].len();
-    dst.resize(num_rows, Vec::new());
-    for (i, row) in dst.iter_mut().enumerate() {
-        for col in &chunk.columns {
-            match col.data_type() {
-                ConcreteDataType::Int64(_) => {
-                    let val = col
-                        .as_any()
-                        .downcast_ref::<Int64Vector>()
-                        .unwrap()
-                        .get_data(i)
-                        .unwrap();
-                    row.push(val);
-                }
-                ConcreteDataType::Timestamp(_) => {
-                    let val = col
-                        .as_any()
-                        .downcast_ref::<TimestampMillisecondVector>()
-                        .unwrap()
-                        .get_data(i)
-                        .unwrap();
-                    row.push(val.into());
-                }
-                _ => unreachable!(),
-            }
-        }
-    }
-}
-
-struct ProjectionTester<S: LogStore> {
-    region: RegionImpl<S>,
-    write_ctx: WriteContext,
-    read_ctx: ReadContext,
-}
-
-impl<S: LogStore> ProjectionTester<S> {
-    fn with_region(region: RegionImpl<S>) -> ProjectionTester<S> {
-        ProjectionTester {
-            region,
-            write_ctx: WriteContext::default(),
-            read_ctx: ReadContext::default(),
-        }
-    }
-
-    async fn put(&self, len: usize, key_start: i64, ts_start: i64, initial_value: i64) {
-        let mut batch = new_write_batch_for_test();
-        let put_data = new_put_data(len, key_start, ts_start, initial_value);
-        batch.put(put_data).unwrap();
-
-        let _ = self.region.write(&self.write_ctx, batch).await.unwrap();
-    }
-
-    async fn scan(&self, projection: Option<Vec<usize>>) -> Vec<Vec<i64>> {
-        let snapshot = self.region.snapshot(&self.read_ctx).unwrap();
-
-        let request = ScanRequest {
-            projection,
-            ..Default::default()
-        };
-        let resp = snapshot.scan(&self.read_ctx, request).await.unwrap();
-        let mut reader = resp.reader;
-
-        let mut dst = Vec::new();
-        while let Some(chunk) = reader.next_chunk().await.unwrap() {
-            let chunk = reader.project_chunk(chunk);
-            append_chunk_to(&chunk, &mut dst);
-        }
-
-        dst
-    }
-}
-
-const REGION_NAME: &str = "region-projection-0";
-
-async fn new_tester(store_dir: &str) -> ProjectionTester<RaftEngineLogStore> {
-    let metadata = new_metadata(REGION_NAME);
-
-    let store_config =
-        config_util::new_store_config(REGION_NAME, store_dir, EngineConfig::default()).await;
-    let region = RegionImpl::create(metadata, store_config).await.unwrap();
-
-    ProjectionTester::with_region(region)
-}
-
-#[tokio::test]
-async fn test_projection_ordered() {
-    let dir = create_temp_dir("projection-ordered");
-    let store_dir = dir.path().to_str().unwrap();
-
-    let tester = new_tester(store_dir).await;
-    tester.put(4, 1, 10, 100).await;
-
-    // timestamp, v1
-    let output = tester.scan(Some(vec![1, 3])).await;
-    let expect = vec![vec![10, 100], vec![11, 101], vec![12, 102], vec![13, 103]];
-    assert_eq!(expect, output);
-}
-
-#[tokio::test]
-async fn test_projection_unordered() {
-    let dir = create_temp_dir("projection-unordered");
-    let store_dir = dir.path().to_str().unwrap();
-
-    let tester = new_tester(store_dir).await;
-    tester.put(4, 1, 10, 100).await;
-
-    // v1, k0
-    let output = tester.scan(Some(vec![3, 0])).await;
-    let expect = vec![vec![100, 1], vec![101, 2], vec![102, 3], vec![103, 4]];
-    assert_eq!(expect, output);
-}
--- a/src/storage/src/region/tests/truncate.rs
+++ b/src/storage/src/region/tests/truncate.rs
@@ -1,242 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-//! Region truncate tests.
-
-use std::sync::Arc;
-
-use common_test_util::temp_dir::create_temp_dir;
-use log_store::raft_engine::log_store::RaftEngineLogStore;
-use store_api::manifest::{Manifest, MetaAction};
-use store_api::storage::{FlushContext, OpenOptions, Region};
-
-use crate::config::EngineConfig;
-use crate::engine;
-use crate::flush::FlushStrategyRef;
-use crate::manifest::action::{RegionMetaAction, RegionMetaActionList, RegionTruncate};
-use crate::region::tests::{self, FileTesterBase};
-use crate::region::RegionImpl;
-use crate::test_util::config_util;
-use crate::test_util::flush_switch::{has_parquet_file, FlushSwitch};
-
-const REGION_NAME: &str = "region-truncate-0";
-
-/// Create a new region for truncate tests.
-async fn create_region_for_truncate(
-    store_dir: &str,
-    flush_strategy: FlushStrategyRef,
-) -> RegionImpl<RaftEngineLogStore> {
-    let metadata = tests::new_metadata(REGION_NAME);
-
-    let mut store_config =
-        config_util::new_store_config(REGION_NAME, store_dir, EngineConfig::default()).await;
-    store_config.flush_strategy = flush_strategy;
-
-    RegionImpl::create(metadata, store_config).await.unwrap()
-}
-
-/// Tester for truncate tests.
-struct TruncateTester {
-    store_dir: String,
-    base: Option<FileTesterBase>,
-}
-
-impl TruncateTester {
-    async fn new(store_dir: &str, flush_strategy: FlushStrategyRef) -> TruncateTester {
-        let region = create_region_for_truncate(store_dir, flush_strategy).await;
-        TruncateTester {
-            store_dir: store_dir.to_string(),
-            base: Some(FileTesterBase::with_region(region)),
-        }
-    }
-
-    #[inline]
-    fn base(&self) -> &FileTesterBase {
-        self.base.as_ref().unwrap()
-    }
-
-    async fn flush(&self) {
-        let ctx = FlushContext::default();
-        self.base().region.flush(&ctx).await.unwrap();
-    }
-
-    async fn truncate(&self) {
-        self.base().region.truncate().await.unwrap();
-    }
-
-    async fn reopen(&mut self) {
-        // Close the old region.
-        if let Some(base) = self.base.as_ref() {
-            base.close().await;
-        }
-        self.base = None;
-        // Reopen the region.
-        let store_config = config_util::new_store_config(
-            REGION_NAME,
-            &self.store_dir,
-            EngineConfig {
-                max_files_in_l0: usize::MAX,
-                ..Default::default()
-            },
-        )
-        .await;
-
-        let opts = OpenOptions::default();
-        let region = RegionImpl::open(REGION_NAME.to_string(), store_config, &opts)
-            .await
-            .unwrap()
-            .unwrap();
-
-        self.base = Some(FileTesterBase::with_region(region));
-    }
-}
-
-#[tokio::test]
-async fn test_truncate_basic() {
-    let dir = create_temp_dir("truncate-basic");
-    common_telemetry::init_default_ut_logging();
-    let store_dir = dir.path().to_str().unwrap();
-
-    let flush_switch = Arc::new(FlushSwitch::default());
-    let tester = TruncateTester::new(store_dir, flush_switch.clone()).await;
-
-    let data = [
-        (1000, Some("1000".to_string())),
-        (1001, Some("1001".to_string())),
-        (1002, Some("1002".to_string())),
-        (1003, Some("1003".to_string())),
-    ];
-
-    // Data in Memtable
-    tester.base().put(&data).await;
-    let res = tester.base().full_scan().await;
-    assert_eq!(4, res.len());
-
-    // Truncate region.
-    tester.truncate().await;
-
-    let res = tester.base().full_scan().await;
-    assert_eq!(0, res.len());
-}
-
-#[tokio::test]
-async fn test_put_data_after_truncate() {
-    let dir = create_temp_dir("put_data_after_truncate");
-    common_telemetry::init_default_ut_logging();
-    let store_dir = dir.path().to_str().unwrap();
-
-    let sst_dir = format!("{}/{}", store_dir, engine::region_sst_dir("", REGION_NAME));
-    let flush_switch = Arc::new(FlushSwitch::default());
-    let tester = TruncateTester::new(store_dir, flush_switch.clone()).await;
-
-    let data = [
-        (1000, Some("1000".to_string())),
-        (1001, Some("1001".to_string())),
-        (1002, None),
-        (1003, Some("1003".to_string())),
-    ];
-
-    tester.base().put(&data).await;
-
-    // Manually trigger flush.
-    tester.flush().await;
-    assert!(has_parquet_file(&sst_dir));
-
-    let data = [
-        (1002, Some("1002".to_string())),
-        (1004, Some("1004".to_string())),
-        (1005, Some("1005".to_string())),
-    ];
-    tester.base().put(&data).await;
-
-    // Truncate region.
-    tester.truncate().await;
-    let res = tester.base().full_scan().await;
-    assert_eq!(0, res.len());
-
-    let new_data = [
-        (1010, Some("0".to_string())),
-        (1011, Some("1".to_string())),
-        (1012, Some("2".to_string())),
-        (1013, Some("3".to_string())),
-    ];
-    tester.base().put(&new_data).await;
-
-    let res = tester.base().full_scan().await;
-    assert_eq!(new_data, res.as_slice());
-}
-
-#[tokio::test]
-async fn test_truncate_reopen() {
-    let dir = create_temp_dir("put_data_after_truncate");
-    common_telemetry::init_default_ut_logging();
-    let store_dir = dir.path().to_str().unwrap();
-
-    let flush_switch = Arc::new(FlushSwitch::default());
-    let mut tester = TruncateTester::new(store_dir, flush_switch.clone()).await;
-
-    let data = [
-        (1000, Some("1000".to_string())),
-        (1001, Some("1001".to_string())),
-        (1002, None),
-        (1003, Some("1003".to_string())),
-    ];
-
-    tester.base().put(&data).await;
-
-    // Manually trigger flush.
-    tester.flush().await;
-
-    let data = [
-        (1002, Some("1002".to_string())),
-        (1004, Some("1004".to_string())),
-        (1005, Some("1005".to_string())),
-    ];
-    tester.base().put(&data).await;
-
-    let manifest = &tester.base().region.inner.manifest;
-    let manifest_version = tester
-        .base()
-        .region
-        .version_control()
-        .current_manifest_version();
-
-    let committed_sequence = tester.base().committed_sequence();
-    let mut action_list =
-        RegionMetaActionList::with_action(RegionMetaAction::Truncate(RegionTruncate {
-            region_id: 0.into(),
-            committed_sequence,
-        }));
-
-    // Persist the meta action.
-    let prev_version = manifest_version;
-    action_list.set_prev_version(prev_version);
-    manifest.update(action_list).await.unwrap();
-
-    // Reopen and put data.
-    tester.reopen().await;
-    let res = tester.base().full_scan().await;
-    assert_eq!(0, res.len());
-
-    let new_data = [
-        (0, Some("0".to_string())),
-        (1, Some("1".to_string())),
-        (2, Some("2".to_string())),
-        (3, Some("3".to_string())),
-    ];
-
-    tester.base().put(&new_data).await;
-    let res = tester.base().full_scan().await;
-    assert_eq!(new_data, res.as_slice());
-}
--- a/src/storage/src/region/writer.rs
+++ b/src/storage/src/region/writer.rs
@@ -1,984 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-use std::sync::Arc;
-use std::time::Duration;
-
-use common_base::readable_size::ReadableSize;
-use common_telemetry::logging;
-use futures::TryStreamExt;
-use snafu::{ensure, ResultExt};
-use store_api::logstore::LogStore;
-use store_api::manifest::{Manifest, ManifestLogStorage, ManifestVersion, MetaAction};
-use store_api::storage::{
-    AlterRequest, FlushContext, FlushReason, SequenceNumber, WriteContext, WriteResponse,
-};
-use tokio::sync::{oneshot, Mutex};
-
-use crate::compaction::{CompactionPickerRef, CompactionRequestImpl, CompactionSchedulerRef};
-use crate::config::EngineConfig;
-use crate::error::{self, Result};
-use crate::flush::{
-    FlushHandle, FlushRegionRequest, FlushSchedulerRef, FlushStrategyRef, FlushType, RegionStatus,
-};
-use crate::manifest::action::{
-    RawRegionMetadata, RegionChange, RegionEdit, RegionMetaAction, RegionMetaActionList,
-    RegionRemove, RegionTruncate,
-};
-use crate::memtable::{Inserter, MemtableBuilderRef, MemtableId, MemtableRef, MemtableVersion};
-use crate::metadata::RegionMetadataRef;
-use crate::metrics::{FLUSH_REQUESTS_TOTAL, PREPROCESS_ELAPSED};
-use crate::proto::wal::WalHeader;
-use crate::region::{
-    CompactContext, RecoveredMetadata, RecoveredMetadataMap, RegionManifest, SharedDataRef,
-};
-use crate::schema::compat::CompatWrite;
-use crate::sst::{AccessLayerRef, LevelMetas};
-use crate::version::{VersionControl, VersionControlRef, VersionEdit};
-use crate::wal::Wal;
-use crate::write_batch::WriteBatch;
-
-pub type RegionWriterRef<S> = Arc<RegionWriter<S>>;
-
-// TODO(yingwen): Add benches for write and support group commit to improve write throughput.
-
-/// Region writer manages all write operations to the region.
-#[derive(Debug)]
-pub struct RegionWriter<S: LogStore> {
-    // To avoid dead lock, we need to ensure the lock order is: inner -> version_mutex.
-    /// Inner writer guarded by write lock, the write lock is used to ensure
-    /// all write operations are serialized.
-    inner: Mutex<WriterInner>,
-    /// Version lock, protects read-write-update to region `Version`.
-    ///
-    /// Increasing committed sequence should be guarded by this lock.
-    version_mutex: Mutex<()>,
-
-    compaction_scheduler: CompactionSchedulerRef<S>,
-    compaction_picker: CompactionPickerRef<S>,
-}
-
-impl<S> RegionWriter<S>
-where
-    S: LogStore,
-{
-    pub fn new(
-        memtable_builder: MemtableBuilderRef,
-        config: Arc<EngineConfig>,
-        ttl: Option<Duration>,
-        write_buffer_size: usize,
-        compaction_scheduler: CompactionSchedulerRef<S>,
-        compaction_picker: CompactionPickerRef<S>,
-    ) -> RegionWriter<S> {
-        RegionWriter {
-            inner: Mutex::new(WriterInner::new(
-                memtable_builder,
-                config,
-                ttl,
-                write_buffer_size,
-            )),
-            version_mutex: Mutex::new(()),
-            compaction_scheduler,
-            compaction_picker,
-        }
-    }
-
-    /// Write to region in the write lock.
-    pub async fn write(
-        &self,
-        ctx: &WriteContext,
-        request: WriteBatch,
-        writer_ctx: WriterContext<'_, S>,
-    ) -> Result<WriteResponse> {
-        let mut inner = self.inner.lock().await;
-
-        ensure!(!inner.is_closed(), error::ClosedRegionSnafu);
-
-        inner
-            .write(&self.version_mutex, ctx, request, writer_ctx)
-            .await
-    }
-
-    /// Replay data to memtables.
-    pub async fn replay(
-        &self,
-        recovered_metadata: RecoveredMetadataMap,
-        writer_ctx: WriterContext<'_, S>,
-    ) -> Result<()> {
-        let mut inner = self.inner.lock().await;
-        inner
-            .replay(&self.version_mutex, recovered_metadata, writer_ctx)
-            .await
-    }
-
-    /// Write and apply the region edit.
-    pub(crate) async fn write_edit_and_apply(
-        &self,
-        wal: &Wal<S>,
-        shared: &SharedDataRef,
-        manifest: &RegionManifest,
-        edit: RegionEdit,
-        max_memtable_id: Option<MemtableId>,
-    ) -> Result<()> {
-        let _lock = self.version_mutex.lock().await;
-        // HACK: We won't acquire the write lock here because write stall would hold
-        // write lock thus we have no chance to get the lock and apply the version edit.
-        // So we add a version lock to ensure modification to `VersionControl` is
-        // serialized.
-        let version_control = &shared.version_control;
-        let prev_version = version_control.current_manifest_version();
-
-        logging::debug!(
-            "Write region edit: {:?} to manifest, prev_version: {}.",
-            edit,
-            prev_version,
-        );
-
-        let files_to_add = edit.files_to_add.clone();
-        let files_to_remove = edit.files_to_remove.clone();
-        let flushed_sequence = edit.flushed_sequence;
-        let compaction_time_window = edit.compaction_time_window;
-        // Persist the meta action.
-        let mut action_list = RegionMetaActionList::with_action(RegionMetaAction::Edit(edit));
-        action_list.set_prev_version(prev_version);
-        let manifest_version = manifest.update(action_list).await?;
-
-        // Notify checkpointer the flushed manifest version after flushing memtable
-        if flushed_sequence.is_some() {
-            manifest.set_flushed_manifest_version(manifest_version);
-        }
-
-        let version_edit = VersionEdit {
-            files_to_add,
-            files_to_remove,
-            flushed_sequence,
-            manifest_version,
-            max_memtable_id,
-            compaction_time_window,
-        };
-
-        // We could tolerate failure during persisting manifest version to the WAL, since it won't
-        // affect how we applying the edit to the version.
-        version_control.apply_edit(version_edit);
-        // TODO(yingwen): We should set the flush handle to `None`, but we can't acquire
-        // write lock here.
-
-        // Persist the manifest version to notify subscriber of the wal that the manifest has been
-        // updated. This should be done at the end of the method.
-        self.persist_manifest_version(wal, version_control, manifest_version)
-            .await
-    }
-
-    /// Alter schema of the region.
-    pub async fn alter(&self, alter_ctx: AlterContext<'_, S>, request: AlterRequest) -> Result<()> {
-        // To alter the schema, we need to acquire the write lock first, so we could
-        // avoid other writers write to the region and switch the memtable safely.
-        // Another potential benefit is that the write lock also protect against concurrent
-        // alter request to the region.
-        let inner = self.inner.lock().await;
-
-        ensure!(!inner.is_closed(), error::ClosedRegionSnafu);
-
-        let version_control = alter_ctx.version_control();
-
-        let old_metadata = version_control.metadata();
-        old_metadata
-            .validate_alter(&request)
-            .context(error::InvalidAlterRequestSnafu)?;
-
-        // The write lock protects us against other alter request, so we could build the new
-        // metadata struct outside of the version mutex.
-        let new_metadata = old_metadata
-            .alter(&request)
-            .context(error::AlterMetadataSnafu)?;
-
-        let raw = RawRegionMetadata::from(&new_metadata);
-
-        // Acquire the version lock before altering the metadata.
-        let _lock = self.version_mutex.lock().await;
-
-        let committed_sequence = version_control.committed_sequence();
-        let mut action_list =
-            RegionMetaActionList::with_action(RegionMetaAction::Change(RegionChange {
-                metadata: raw,
-                committed_sequence,
-            }));
-        let new_metadata = Arc::new(new_metadata);
-
-        // Persist the meta action.
-        let prev_version = version_control.current_manifest_version();
-        action_list.set_prev_version(prev_version);
-
-        logging::debug!(
-            "Try to alter schema of region {}, region_id: {}, action_list: {:?}",
-            new_metadata.name(),
-            new_metadata.id(),
-            action_list
-        );
-
-        let manifest_version = alter_ctx.manifest.update(action_list).await?;
-
-        // Now we could switch memtables and apply the new metadata to the version.
-        let new_mutable = inner.memtable_builder.build(new_metadata.schema().clone());
-        version_control.freeze_mutable_and_apply_metadata(
-            new_metadata,
-            manifest_version,
-            new_mutable,
-        );
-
-        self.persist_manifest_version(alter_ctx.wal, version_control, manifest_version)
-            .await
-    }
-
-    /// Allocate a sequence and persist the manifest version using that sequence to the wal.
-    ///
-    /// This method should be protected by the `version_mutex`.
-    async fn persist_manifest_version(
-        &self,
-        wal: &Wal<S>,
-        version_control: &VersionControlRef,
-        manifest_version: ManifestVersion,
-    ) -> Result<()> {
-        // We always bump the committed sequence regardless whether persisting the manifest version
-        // to wal is success, to avoid RegionMetaAction use same committed sequence in accident.
-        let next_sequence = version_control.committed_sequence() + 1;
-        version_control.set_committed_sequence(next_sequence);
-
-        let header = WalHeader::with_last_manifest_version(manifest_version);
-        let _ = wal.write_to_wal(next_sequence, header, None).await?;
-
-        Ok(())
-    }
-
-    pub async fn close(&self) -> Result<()> {
-        // In order to close a writer
-        // 1. Acquires the write lock.
-        // 2. Sets a memory flag to reject any potential writing.
-        // 3. Waits for the pending flush task.
-        {
-            let mut inner = self.inner.lock().await;
-
-            if inner.is_closed() {
-                return Ok(());
-            }
-
-            inner.mark_closed();
-        }
-        // we release the writer lock once for rejecting any following potential writing requests immediately.
-
-        self.wait_flush().await?;
-
-        // TODO: cancel the compaction task
-
-        Ok(())
-    }
-
-    pub async fn on_drop(&self, drop_ctx: DropContext<'_, S>) -> Result<()> {
-        // 1. Acquires the write lock.
-        // 2. Close writer reject any potential writing.
-        // 3. Waits or cancels the flush job.
-        // 4. Add `RegionMetaAction::Remove` to recover from manifest in case of failure.
-        //    The main task is to restore the cleaning of sst files. If there is a failure
-        //    in the previous stops, it can be restored through the `Procedure` framework.
-        // 5. Mark all data obsolete in the WAL.
-        // 6. Delete the namespace of the region from the WAL.
-        // 7. Mark all SSTs deleted.
-        // 8. Remove all manifests.
-        let mut inner = self.inner.lock().await;
-        inner.mark_closed();
-
-        if let Some(handle) = inner.flush_handle.take() {
-            handle.wait().await?;
-        }
-
-        let version_control = drop_ctx.version_control();
-
-        let _lock = self.version_mutex.lock().await;
-        let committed_sequence = version_control.committed_sequence();
-        let current_version = version_control.current();
-
-        let mut action_list =
-            RegionMetaActionList::with_action(RegionMetaAction::Remove(RegionRemove {
-                region_id: drop_ctx.shared.id,
-            }));
-
-        // Persist the meta action.
-        let prev_version = version_control.current_manifest_version();
-        action_list.set_prev_version(prev_version);
-
-        logging::info!(
-            "Try to remove region {}, action_list: {:?}",
-            drop_ctx.shared.id(),
-            action_list
-        );
-
-        let remove_action_version = drop_ctx.manifest.update(action_list).await?;
-
-        // Mark all data obsolete and delete the namespace in the WAL
-        drop_ctx.wal.obsolete(committed_sequence).await?;
-        drop_ctx.wal.delete_namespace().await?;
-        logging::info!(
-            "Remove WAL entries in region: {}, committed sequence: {}",
-            drop_ctx.shared.id(),
-            committed_sequence
-        );
-
-        // Mark all SSTs deleted
-        let files = current_version.ssts().mark_all_files_deleted();
-        logging::info!(
-            "Try to remove all SSTs, region: {}, files: {:?}",
-            drop_ctx.shared.id(),
-            files
-        );
-
-        drop_ctx
-            .manifest
-            .manifest_store()
-            .delete_all(remove_action_version)
-            .await?;
-        Ok(())
-    }
-
-    /// Flush task manually
-    pub async fn flush(&self, writer_ctx: WriterContext<'_, S>, ctx: &FlushContext) -> Result<()> {
-        let mut inner = self.inner.lock().await;
-
-        if !ctx.force {
-            ensure!(!inner.is_closed(), error::ClosedRegionSnafu);
-        }
-
-        inner.manual_flush(writer_ctx, ctx.reason).await?;
-
-        if ctx.wait {
-            if let Some(handle) = inner.flush_handle.take() {
-                handle.wait().await?;
-            }
-        }
-
-        Ok(())
-    }
-
-    /// Compact manually.
-    pub async fn compact(&self, request: WriterCompactRequest<S>) -> Result<()> {
-        let mut inner = self.inner.lock().await;
-
-        ensure!(!inner.is_closed(), error::ClosedRegionSnafu);
-        let sst_write_buffer_size = ReadableSize::mb(8); // deprecated usage
-
-        inner
-            .manual_compact(
-                request,
-                self.compaction_picker.clone(),
-                self.compaction_scheduler.clone(),
-                sst_write_buffer_size,
-            )
-            .await
-    }
-
-    /// Wait flush task if any
-    async fn wait_flush(&self) -> Result<()> {
-        let mut inner = self.inner.lock().await;
-
-        if let Some(handle) = inner.flush_handle.take() {
-            handle.wait().await?;
-        }
-
-        Ok(())
-    }
-
-    pub async fn truncate(&self, ctx: &TruncateContext<'_, S>) -> Result<()> {
-        // Acquires the write lock.
-        let mut inner = self.inner.lock().await;
-        ensure!(!inner.is_closed(), error::ClosedRegionSnafu);
-
-        if let Some(handle) = inner.flush_handle.take() {
-            handle.wait().await?;
-        }
-
-        let version_control = ctx.version_control();
-        let _lock = self.version_mutex.lock().await;
-        let committed_sequence = version_control.committed_sequence();
-
-        // Add `RegionMetaAction::Truncate` to recover from manifest in case of failure.
-        let mut action_list =
-            RegionMetaActionList::with_action(RegionMetaAction::Truncate(RegionTruncate {
-                region_id: ctx.shared.id,
-                committed_sequence,
-            }));
-
-        // Persist the meta action.
-        let current_version = version_control.current();
-        let manifest_version = version_control.current_manifest_version();
-        let prev_version = manifest_version;
-        action_list.set_prev_version(prev_version);
-        ctx.manifest.update(action_list).await?;
-
-        // Mark all data obsolete
-        ctx.wal.obsolete(committed_sequence).await?;
-
-        // Mark all SSTs deleted
-        let files = current_version.ssts().mark_all_files_deleted();
-        logging::info!(
-            "Try to remove all SSTs, region: {}, files: {:?}",
-            ctx.shared.id(),
-            files
-        );
-
-        // Reset version
-        let memtables = Arc::new(MemtableVersion::new(inner.alloc_memtable(version_control)));
-        let ssts = Arc::new(LevelMetas::new(
-            ctx.sst_layer.clone(),
-            current_version.ssts().file_purger(),
-        ));
-        version_control.reset_version(manifest_version + 1, memtables, ssts);
-
-        Ok(())
-    }
-}
-
-// Methods for tests.
-#[cfg(test)]
-impl<S> RegionWriter<S>
-where
-    S: LogStore,
-{
-    pub(crate) async fn write_buffer_size(&self) -> usize {
-        self.inner.lock().await.write_buffer_size
-    }
-}
-
-/// Structs needed by triggering a compaction.
-pub struct WriterCompactRequest<S: LogStore> {
-    pub shared_data: SharedDataRef,
-    pub sst_layer: AccessLayerRef,
-    pub manifest: RegionManifest,
-    pub wal: Wal<S>,
-    pub region_writer: RegionWriterRef<S>,
-    pub compact_ctx: CompactContext,
-}
-
-pub struct WriterContext<'a, S: LogStore> {
-    pub shared: &'a SharedDataRef,
-    pub flush_strategy: &'a FlushStrategyRef,
-    pub flush_scheduler: &'a FlushSchedulerRef<S>,
-    pub compaction_scheduler: &'a CompactionSchedulerRef<S>,
-    pub sst_layer: &'a AccessLayerRef,
-    pub wal: &'a Wal<S>,
-    pub writer: &'a RegionWriterRef<S>,
-    pub manifest: &'a RegionManifest,
-    pub compaction_picker: CompactionPickerRef<S>,
-}
-
-impl<'a, S: LogStore> WriterContext<'a, S> {
-    #[inline]
-    fn version_control(&self) -> &VersionControlRef {
-        &self.shared.version_control
-    }
-}
-
-pub struct AlterContext<'a, S: LogStore> {
-    pub shared: &'a SharedDataRef,
-    pub wal: &'a Wal<S>,
-    pub manifest: &'a RegionManifest,
-}
-
-impl<'a, S: LogStore> AlterContext<'a, S> {
-    #[inline]
-    fn version_control(&self) -> &VersionControlRef {
-        &self.shared.version_control
-    }
-}
-
-pub struct DropContext<'a, S: LogStore> {
-    pub shared: &'a SharedDataRef,
-    pub wal: &'a Wal<S>,
-    pub manifest: &'a RegionManifest,
-    pub flush_scheduler: &'a FlushSchedulerRef<S>,
-    pub compaction_scheduler: &'a CompactionSchedulerRef<S>,
-    pub sst_layer: &'a AccessLayerRef,
-}
-
-impl<'a, S: LogStore> DropContext<'a, S> {
-    #[inline]
-    fn version_control(&self) -> &VersionControlRef {
-        &self.shared.version_control
-    }
-}
-
-pub struct TruncateContext<'a, S: LogStore> {
-    pub shared: &'a SharedDataRef,
-    pub wal: &'a Wal<S>,
-    pub manifest: &'a RegionManifest,
-    pub sst_layer: &'a AccessLayerRef,
-}
-
-impl<'a, S: LogStore> TruncateContext<'a, S> {
-    #[inline]
-    fn version_control(&self) -> &VersionControlRef {
-        &self.shared.version_control
-    }
-}
-
-#[derive(Debug)]
-struct WriterInner {
-    memtable_builder: MemtableBuilderRef,
-    flush_handle: Option<FlushHandle>,
-
-    /// `WriterInner` will reject any future writing, if the closed flag is set.
-    ///
-    /// It should protected by upper mutex
-    closed: bool,
-    engine_config: Arc<EngineConfig>,
-    ttl: Option<Duration>,
-    /// Size in bytes to freeze the mutable memtable.
-    write_buffer_size: usize,
-}
-
-impl WriterInner {
-    fn new(
-        memtable_builder: MemtableBuilderRef,
-        engine_config: Arc<EngineConfig>,
-        ttl: Option<Duration>,
-        write_buffer_size: usize,
-    ) -> WriterInner {
-        WriterInner {
-            memtable_builder,
-            flush_handle: None,
-            engine_config,
-            closed: false,
-            ttl,
-            write_buffer_size,
-        }
-    }
-
-    /// Write `WriteBatch` to region, now the schema of batch needs to be validated outside.
-    ///
-    /// Mutable reference of writer ensure no other reference of this writer can modify the
-    /// version control (write is exclusive).
-    async fn write<S: LogStore>(
-        &mut self,
-        version_mutex: &Mutex<()>,
-        _ctx: &WriteContext,
-        mut request: WriteBatch,
-        writer_ctx: WriterContext<'_, S>,
-    ) -> Result<WriteResponse> {
-        self.preprocess_write(&writer_ctx).await?;
-        let version_control = writer_ctx.version_control();
-
-        let _lock = version_mutex.lock().await;
-
-        let metadata = version_control.metadata();
-        // We need to check the schema again since it might has been altered. We need
-        // to compat request's schema before writing it into the WAL otherwise some
-        // default constraint like `current_timestamp()` would yield different value
-        // during replay.
-        request.compat_write(metadata.schema().user_schema())?;
-
-        let committed_sequence = version_control.committed_sequence();
-        // Sequence for current write batch.
-        let next_sequence = committed_sequence + 1;
-
-        let version = version_control.current();
-        let wal_header = WalHeader::with_last_manifest_version(version.manifest_version());
-        let _ = writer_ctx
-            .wal
-            .write_to_wal(next_sequence, wal_header, Some(request.payload()))
-            .await?;
-
-        // Insert batch into memtable.
-        let mut inserter = Inserter::new(next_sequence);
-        inserter.insert_memtable(request.payload(), version.mutable_memtable())?;
-
-        // Update committed_sequence to make current batch visible. The `&mut self` of WriterInner
-        // guarantees the writer is exclusive.
-        version_control.set_committed_sequence(next_sequence);
-
-        Ok(WriteResponse {})
-    }
-
-    async fn replay<S: LogStore>(
-        &mut self,
-        version_mutex: &Mutex<()>,
-        mut recovered_metadata: RecoveredMetadataMap,
-        writer_ctx: WriterContext<'_, S>,
-    ) -> Result<()> {
-        let version_control = writer_ctx.version_control();
-
-        let (flushed_sequence, mut last_sequence);
-        let mut num_requests = 0;
-        let mut num_recovered_metadata = 0;
-        let mut next_apply_metadata = recovered_metadata.pop_first();
-        {
-            let _lock = version_mutex.lock().await;
-
-            // Data after flushed sequence need to be recovered.
-            flushed_sequence = version_control.current().flushed_sequence();
-            last_sequence = flushed_sequence;
-            // Read starts from the first entry after last flushed entry, so the start sequence
-            // should be flushed_sequence + 1.
-            let mut stream = writer_ctx.wal.read_from_wal(flushed_sequence + 1).await?;
-            while let Some((req_sequence, _header, payload)) = stream.try_next().await? {
-                while let Some((sequence_before_alter, _)) = next_apply_metadata {
-                    // There might be multiple metadata changes to be applied, so a loop is necessary.
-                    if req_sequence > sequence_before_alter {
-                        // This is the first request that use the new metadata.
-                        self.apply_metadata(
-                            &writer_ctx,
-                            sequence_before_alter,
-                            next_apply_metadata,
-                            version_control,
-                        )?;
-
-                        num_recovered_metadata += 1;
-                        next_apply_metadata = recovered_metadata.pop_first();
-                    } else {
-                        // Keep the next_apply_metadata until req_sequence > sequence_before_alter
-                        break;
-                    }
-                }
-
-                if req_sequence > last_sequence {
-                    last_sequence = req_sequence;
-                } else {
-                    logging::error!(
-                            "Sequence should not decrease during replay, found {} <= {}, \
-                             region_id: {}, region_name: {}, flushed_sequence: {}, num_requests: {}",
-                            req_sequence,
-                            last_sequence,
-                            writer_ctx.shared.id,
-                            writer_ctx.shared.name,
-                            flushed_sequence,
-                            num_requests,
-                        );
-
-                    error::SequenceNotMonotonicSnafu {
-                        prev: last_sequence,
-                        given: req_sequence,
-                    }
-                    .fail()?;
-                }
-
-                if let Some(payload) = payload {
-                    num_requests += 1;
-                    // Note that memtables of `Version` may be updated during replay.
-                    let version = version_control.current();
-                    // TODO(yingwen): Trigger flush if the size of memtables reach the flush threshold to avoid
-                    // out of memory during replay, but we need to do it carefully to avoid dead lock.
-                    let mut inserter = Inserter::new(last_sequence);
-                    inserter.insert_memtable(&payload, version.mutable_memtable())?;
-                }
-            }
-
-            // Apply metadata after last WAL entry
-            while let Some((sequence_before_alter, _)) = next_apply_metadata {
-                assert!(
-                    sequence_before_alter >= last_sequence,
-                    "The sequence in metadata after last WAL entry is less than last sequence, \
-                         metadata sequence: {}, last_sequence: {}, region_id: {}, region_name: {}",
-                    sequence_before_alter,
-                    last_sequence,
-                    writer_ctx.shared.id,
-                    writer_ctx.shared.name
-                );
-
-                self.apply_metadata(
-                    &writer_ctx,
-                    sequence_before_alter,
-                    next_apply_metadata,
-                    version_control,
-                )?;
-
-                num_recovered_metadata += 1;
-                next_apply_metadata = recovered_metadata.pop_first();
-            }
-
-            version_control.set_committed_sequence(last_sequence);
-        }
-
-        logging::info!(
-            "Region replay finished, region_id: {}, region_name: {}, flushed_sequence: {}, last_sequence: {}, num_requests: {}, num_recovered_metadata: {}",
-            writer_ctx.shared.id,
-            writer_ctx.shared.name,
-            flushed_sequence,
-            last_sequence,
-            num_requests,
-            num_recovered_metadata,
-        );
-
-        Ok(())
-    }
-
-    fn apply_metadata<S: LogStore>(
-        &self,
-        writer_ctx: &WriterContext<'_, S>,
-        sequence: SequenceNumber,
-        mut metadata: Option<RecoveredMetadata>,
-        version_control: &VersionControl,
-    ) -> Result<()> {
-        // It's safe to unwrap here, it's checked outside.
-        // Move out metadata to avoid cloning it.
-
-        let (_, (manifest_version, metadata)) = metadata.take().unwrap();
-        let region_metadata: RegionMetadataRef =
-            Arc::new(metadata.try_into().context(error::InvalidRawRegionSnafu {
-                region: &writer_ctx.shared.name,
-            })?);
-        let new_mutable = self
-            .memtable_builder
-            .build(region_metadata.schema().clone());
-        version_control.freeze_mutable_and_apply_metadata(
-            region_metadata,
-            manifest_version,
-            new_mutable,
-        );
-        logging::debug!(
-            "Applied metadata to region: {} when replaying WAL: sequence={} manifest={} ",
-            writer_ctx.shared.name,
-            sequence,
-            manifest_version
-        );
-
-        Ok(())
-    }
-
-    /// Preprocess before write.
-    ///
-    /// Creates needed mutable memtables, ensures there is enough capacity in memtable and trigger
-    /// flush if necessary. Returns time ranges of the input write batch.
-    async fn preprocess_write<S: LogStore>(
-        &mut self,
-        writer_ctx: &WriterContext<'_, S>,
-    ) -> Result<()> {
-        let _timer = PREPROCESS_ELAPSED.start_timer();
-
-        let version_control = writer_ctx.version_control();
-        // Check whether memtable is full or flush should be triggered. We need to do this first since
-        // switching memtables will clear all mutable memtables.
-        if let Some(flush_type) = self.should_flush(
-            writer_ctx.shared,
-            version_control,
-            writer_ctx.flush_strategy,
-        ) {
-            // Trigger flush according to the flush type.
-            match flush_type {
-                FlushType::Region => {
-                    // Trigger flush for current region.
-                    self.trigger_flush(writer_ctx, FlushReason::MemtableFull)
-                        .await?;
-                }
-                FlushType::Engine => {
-                    // Trigger engine level flush. This wakeup the flush handler
-                    // to pick region to flush.
-                    writer_ctx.flush_scheduler.schedule_engine_flush()?;
-                }
-            }
-        }
-
-        Ok(())
-    }
-
-    /// Create a new mutable memtable.
-    fn alloc_memtable(&self, version_control: &VersionControlRef) -> MemtableRef {
-        let memtable_schema = version_control.current().schema().clone();
-        self.memtable_builder.build(memtable_schema)
-    }
-
-    fn should_flush(
-        &self,
-        shared: &SharedDataRef,
-        version_control: &VersionControlRef,
-        flush_strategy: &FlushStrategyRef,
-    ) -> Option<FlushType> {
-        let current = version_control.current();
-        let memtables = current.memtables();
-        let status = RegionStatus {
-            region_id: shared.id(),
-            bytes_mutable: memtables.mutable_bytes_allocated(),
-            write_buffer_size: self.write_buffer_size,
-        };
-        flush_strategy.should_flush(status)
-    }
-
-    async fn trigger_flush<S: LogStore>(
-        &mut self,
-        ctx: &WriterContext<'_, S>,
-        reason: FlushReason,
-    ) -> Result<()> {
-        let version_control = &ctx.shared.version_control;
-        let new_mutable = self.alloc_memtable(version_control);
-        // Freeze all mutable memtables so we can flush them later.
-        version_control.freeze_mutable(new_mutable);
-
-        FLUSH_REQUESTS_TOTAL
-            .with_label_values(&[reason.as_str()])
-            .inc();
-
-        if let Some(flush_handle) = self.flush_handle.take() {
-            // Previous flush job is incomplete, wait util it is finished.
-            // However the last flush job may fail, in which case, we just return error
-            // and abort current write request. The flush handle is left empty, so the next
-            // time we still have chance to trigger a new flush.
-            // TODO(yingwen): We should release the write lock during waiting flush done, which
-            // needs something like async condvar.
-            flush_handle.wait().await.map_err(|e| {
-                logging::error!(e; "Previous flush job failed, region: {}", ctx.shared.name);
-                e
-            })?;
-        }
-
-        let current_version = version_control.current();
-        let (max_memtable_id, mem_to_flush) = current_version.memtables().memtables_to_flush();
-
-        if max_memtable_id.is_none() {
-            // We still update the flush time to avoid the picker picks this region again.
-            ctx.shared.update_flush_millis();
-
-            logging::info!("No memtables to flush in region: {}", ctx.shared.name);
-            return Ok(());
-        }
-
-        let flush_req = FlushRegionRequest {
-            max_memtable_id: max_memtable_id.unwrap(),
-            memtables: mem_to_flush,
-            // In write thread, safe to use current committed sequence.
-            flush_sequence: version_control.committed_sequence(),
-            shared: ctx.shared.clone(),
-            sst_layer: ctx.sst_layer.clone(),
-            writer: ctx.writer.clone(),
-            wal: ctx.wal.clone(),
-            manifest: ctx.manifest.clone(),
-            engine_config: self.engine_config.clone(),
-            ttl: self.ttl,
-            compaction_time_window: current_version.ssts().compaction_time_window(),
-            compaction_picker: ctx.compaction_picker.clone(),
-        };
-
-        let flush_handle = ctx
-            .flush_scheduler
-            .schedule_region_flush(flush_req)
-            .map_err(|e| {
-                logging::error!(e; "Failed to schedule flush request");
-                e
-            })?;
-        self.flush_handle = Some(flush_handle);
-
-        Ok(())
-    }
-
-    async fn manual_compact<S: LogStore>(
-        &mut self,
-        request: WriterCompactRequest<S>,
-        compaction_picker: CompactionPickerRef<S>,
-        compaction_scheduler: CompactionSchedulerRef<S>,
-        sst_write_buffer_size: ReadableSize,
-    ) -> Result<()> {
-        let region_id = request.shared_data.id();
-        let compaction_time_window = request
-            .shared_data
-            .version_control
-            .current()
-            .ssts()
-            .compaction_time_window();
-        let mut compaction_request = CompactionRequestImpl {
-            region_id,
-            sst_layer: request.sst_layer,
-            writer: request.region_writer,
-            shared: request.shared_data.clone(),
-            manifest: request.manifest,
-            wal: request.wal,
-            ttl: self.ttl,
-            compaction_time_window,
-            sender: None,
-            picker: compaction_picker,
-            sst_write_buffer_size,
-            // manual compaction does not reschedule itself.
-            reschedule_on_finish: false,
-        };
-
-        let compaction_scheduler = compaction_scheduler.clone();
-        logging::info!(
-            "Manual compact, region_id: {}, compact_ctx: {:?}",
-            region_id,
-            request.compact_ctx
-        );
-
-        if request.compact_ctx.wait {
-            let (sender, receiver) = oneshot::channel();
-            compaction_request.sender = Some(sender);
-
-            if schedule_compaction(
-                request.shared_data,
-                compaction_scheduler,
-                compaction_request,
-            ) {
-                receiver
-                    .await
-                    .context(error::CompactTaskCancelSnafu { region_id })??;
-            }
-        } else {
-            let _ = schedule_compaction(
-                request.shared_data,
-                compaction_scheduler,
-                compaction_request,
-            );
-        }
-
-        Ok(())
-    }
-
-    async fn manual_flush<S: LogStore>(
-        &mut self,
-        writer_ctx: WriterContext<'_, S>,
-        reason: FlushReason,
-    ) -> Result<()> {
-        self.trigger_flush(&writer_ctx, reason).await?;
-        Ok(())
-    }
-
-    #[inline]
-    fn is_closed(&self) -> bool {
-        self.closed
-    }
-
-    #[inline]
-    fn mark_closed(&mut self) {
-        self.closed = true;
-    }
-}
-
-/// Schedule compaction task, returns whether the task is scheduled.
-pub(crate) fn schedule_compaction<S: LogStore>(
-    shared_data: SharedDataRef,
-    compaction_scheduler: CompactionSchedulerRef<S>,
-    compaction_request: CompactionRequestImpl<S>,
-) -> bool {
-    let region_id = shared_data.id();
-
-    match compaction_scheduler.schedule(compaction_request) {
-        Ok(scheduled) => {
-            logging::info!(
-                "Schedule region {} compaction request result: {}",
-                region_id,
-                scheduled
-            );
-
-            scheduled
-        }
-        Err(e) => {
-            logging::error!(e;"Failed to schedule region compaction request {}", region_id);
-
-            false
-        }
-    }
-}
--- a/src/storage/src/scheduler.rs
+++ b/src/storage/src/scheduler.rs
@@ -1,652 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-use std::fmt::{Debug, Formatter};
-use std::hash::Hash;
-use std::sync::atomic::{AtomicU8, Ordering};
-use std::sync::{Arc, Mutex, RwLock};
-
-use async_trait::async_trait;
-use common_telemetry::{debug, error, info};
-use snafu::{ensure, ResultExt};
-use tokio::sync::Notify;
-use tokio::task::JoinHandle;
-use tokio_util::sync::CancellationToken;
-
-use crate::error::{IllegalSchedulerStateSnafu, Result, StopSchedulerSnafu};
-use crate::scheduler::dedup_deque::DedupDeque;
-use crate::scheduler::rate_limit::{
-    BoxedRateLimitToken, CascadeRateLimiter, MaxInflightTaskLimiter, RateLimiter,
-};
-
-pub mod dedup_deque;
-pub mod rate_limit;
-
-/// Request that can be scheduled.
-/// It must contain a key for deduplication.
-pub trait Request: Send + Sync + 'static {
-    /// Type of request key.
-    type Key: Eq + Hash + Clone + Debug + Send + Sync;
-
-    /// Returns the request key.
-    fn key(&self) -> Self::Key;
-
-    /// Notify the request result.
-    fn complete(self, result: Result<()>);
-}
-
-#[async_trait::async_trait]
-pub trait Handler {
-    type Request;
-
-    async fn handle_request(
-        &self,
-        req: Self::Request,
-        token: BoxedRateLimitToken,
-        finish_notifier: Arc<Notify>,
-    ) -> Result<()>;
-}
-
-/// [Scheduler] defines a set of API to schedule requests.
-#[async_trait]
-pub trait Scheduler: Debug {
-    type Request;
-
-    /// Schedules a request.
-    /// Returns true if request is scheduled. Returns false if task queue already
-    /// contains the request with same key.
-    fn schedule(&self, request: Self::Request) -> Result<bool>;
-
-    /// Stops scheduler. If `await_termination` is set to true, the scheduler will
-    /// wait until all queued requests are processed.
-    async fn stop(&self, await_termination: bool) -> Result<()>;
-}
-
-/// Scheduler config.
-#[derive(Debug)]
-pub struct SchedulerConfig {
-    pub max_inflight_tasks: usize,
-}
-
-impl Default for SchedulerConfig {
-    fn default() -> Self {
-        Self {
-            max_inflight_tasks: 4,
-        }
-    }
-}
-
-const STATE_RUNNING: u8 = 0;
-const STATE_STOP: u8 = 1;
-const STATE_AWAIT_TERMINATION: u8 = 2;
-
-/// Request scheduler based on local state.
-pub struct LocalScheduler<R: Request> {
-    /// Request FIFO with key deduplication.
-    request_queue: Arc<RwLock<DedupDeque<R::Key, R>>>,
-    /// Token used to halt the scheduler.
-    cancel_token: CancellationToken,
-    /// Tasks use a cooperative manner to notify scheduler that another request can be scheduled.
-    task_notifier: Arc<Notify>,
-    /// Join handle of spawned request handling loop.
-    join_handle: Mutex<Option<JoinHandle<()>>>,
-    /// State of scheduler.
-    state: Arc<AtomicU8>,
-}
-
-impl<R> Debug for LocalScheduler<R>
-where
-    R: Request + Send + Sync,
-{
-    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("LocalScheduler")
-            .field("state", &self.state)
-            .finish()
-    }
-}
-
-impl<R> Drop for LocalScheduler<R>
-where
-    R: Request,
-{
-    fn drop(&mut self) {
-        self.state.store(STATE_STOP, Ordering::Relaxed);
-
-        self.cancel_token.cancel();
-
-        // Clear all requests
-        self.request_queue.write().unwrap().clear();
-    }
-}
-
-#[async_trait]
-impl<R> Scheduler for LocalScheduler<R>
-where
-    R: Request + Send,
-{
-    type Request = R;
-
-    fn schedule(&self, request: Self::Request) -> Result<bool> {
-        ensure!(self.running(), IllegalSchedulerStateSnafu);
-        debug!(
-            "Schedule request: {:?}, queue size: {}",
-            request.key(),
-            self.remaining_requests()
-        );
-        let mut queue = self.request_queue.write().unwrap();
-        let res = queue.push_back(request.key(), request);
-        self.task_notifier.notify_one();
-        Ok(res)
-    }
-
-    async fn stop(&self, await_termination: bool) -> Result<()> {
-        let state = if await_termination {
-            STATE_AWAIT_TERMINATION
-        } else {
-            STATE_STOP
-        };
-        self.state.store(state, Ordering::Relaxed);
-
-        self.cancel_token.cancel();
-        let handle = { self.join_handle.lock().unwrap().take() };
-        if let Some(handle) = handle {
-            handle.await.context(StopSchedulerSnafu)?;
-        }
-        Ok(())
-    }
-}
-
-impl<R> LocalScheduler<R>
-where
-    R: Request,
-{
-    /// Creates a new scheduler instance with given config and request handler.
-    pub fn new<H>(config: SchedulerConfig, handler: H) -> Self
-    where
-        H: Handler<Request = R> + Send + Sync + 'static,
-    {
-        let request_queue = Arc::new(RwLock::new(DedupDeque::default()));
-        let cancel_token = CancellationToken::new();
-        let task_notifier = Arc::new(Notify::new());
-        let state = Arc::new(AtomicU8::new(STATE_RUNNING));
-        let handle_loop = HandlerLoop {
-            task_notifier: task_notifier.clone(),
-            req_queue: request_queue.clone(),
-            cancel_token: cancel_token.child_token(),
-            limiter: Arc::new(CascadeRateLimiter::new(vec![Box::new(
-                MaxInflightTaskLimiter::new(config.max_inflight_tasks),
-            )])),
-            request_handler: handler,
-            state: state.clone(),
-        };
-        let join_handle = common_runtime::spawn_bg(async move {
-            debug!("Task handler loop spawned");
-            handle_loop.run().await;
-        });
-        Self {
-            join_handle: Mutex::new(Some(join_handle)),
-            request_queue,
-            cancel_token,
-            task_notifier,
-            state,
-        }
-    }
-
-    /// Returns remaining requests number.
-    #[inline]
-    fn remaining_requests(&self) -> usize {
-        self.request_queue.read().unwrap().len()
-    }
-
-    #[inline]
-    fn running(&self) -> bool {
-        self.state.load(Ordering::Relaxed) == STATE_RUNNING
-    }
-}
-
-pub struct HandlerLoop<R: Request, H: Handler> {
-    pub req_queue: Arc<RwLock<DedupDeque<R::Key, R>>>,
-    pub cancel_token: CancellationToken,
-    pub task_notifier: Arc<Notify>,
-    pub request_handler: H,
-    pub limiter: Arc<CascadeRateLimiter<R>>,
-    pub state: Arc<AtomicU8>,
-}
-
-impl<R, H> HandlerLoop<R, H>
-where
-    R: Request,
-    H: Handler<Request = R>,
-{
-    /// Runs scheduled requests dispatch loop.
-    pub async fn run(&self) {
-        let limiter = self.limiter.clone();
-        while self.running() {
-            tokio::select! {
-                _ = self.task_notifier.notified() => {
-                    debug!("Notified, queue size: {:?}",self.req_queue.read().unwrap().len());
-                    self.poll_and_execute(&limiter).await;
-                }
-                _ = self.cancel_token.cancelled() => {
-                    info!("Task scheduler cancelled.");
-                    break;
-                }
-            }
-        }
-        // For correctness, we need to poll requests from fifo again.
-        if self.state.load(Ordering::Relaxed) == STATE_AWAIT_TERMINATION {
-            info!("Waiting for all pending tasks to finish.");
-            self.poll_and_execute(&limiter).await;
-            self.state.store(STATE_STOP, Ordering::Relaxed);
-        }
-        info!("Task scheduler stopped");
-    }
-
-    /// Polls and executes requests as many as possible until rate limited.
-    async fn poll_and_execute(&self, limiter: &Arc<CascadeRateLimiter<R>>) {
-        while let Some((task_key, req)) = self.poll_task().await {
-            if let Ok(token) = limiter.acquire_token(&req) {
-                debug!("Executing request: {:?}", task_key);
-                if let Err(e) = self
-                    .handle_request(req, token, self.task_notifier.clone())
-                    .await
-                {
-                    error!(e; "Failed to submit request: {:?}", task_key);
-                } else {
-                    info!("Submitted task: {:?}", task_key);
-                }
-            } else {
-                // rate limited, put back to req queue to wait for next schedule
-                debug!(
-                    "Put back request {:?}, queue size: {}",
-                    task_key,
-                    self.req_queue.read().unwrap().len()
-                );
-                self.put_back_req(task_key, req).await;
-                break;
-            }
-        }
-    }
-
-    #[inline]
-    async fn poll_task(&self) -> Option<(R::Key, R)> {
-        let mut queue = self.req_queue.write().unwrap();
-        queue.pop_front()
-    }
-
-    /// Puts request back to the front of request queue.
-    #[inline]
-    async fn put_back_req(&self, key: R::Key, req: R) {
-        let mut queue = self.req_queue.write().unwrap();
-        let _ = queue.push_front(key, req);
-    }
-
-    // Handles request, submit task to bg runtime.
-    async fn handle_request(
-        &self,
-        req: R,
-        token: BoxedRateLimitToken,
-        finish_notifier: Arc<Notify>,
-    ) -> Result<()> {
-        self.request_handler
-            .handle_request(req, token, finish_notifier)
-            .await
-    }
-
-    #[inline]
-    fn running(&self) -> bool {
-        self.state.load(Ordering::Relaxed) == STATE_RUNNING
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use std::sync::atomic::{AtomicBool, AtomicI32};
-    use std::time::Duration;
-
-    use futures_util::future::BoxFuture;
-    use store_api::storage::RegionId;
-
-    use super::*;
-    use crate::scheduler::dedup_deque::DedupDeque;
-    use crate::scheduler::rate_limit::{
-        BoxedRateLimitToken, CascadeRateLimiter, MaxInflightTaskLimiter,
-    };
-    use crate::scheduler::{HandlerLoop, LocalScheduler, Scheduler, SchedulerConfig};
-
-    struct CountdownLatch {
-        counter: std::sync::Mutex<usize>,
-        notify: Notify,
-    }
-
-    impl CountdownLatch {
-        fn new(size: usize) -> Self {
-            Self {
-                counter: std::sync::Mutex::new(size),
-                notify: Notify::new(),
-            }
-        }
-
-        fn countdown(&self) {
-            let mut counter = self.counter.lock().unwrap();
-            if *counter >= 1 {
-                *counter -= 1;
-                if *counter == 0 {
-                    self.notify.notify_one();
-                }
-            }
-        }
-
-        /// Users should only call this once.
-        async fn wait(&self) {
-            self.notify.notified().await
-        }
-    }
-
-    #[tokio::test]
-    async fn test_schedule_handler() {
-        common_telemetry::init_default_ut_logging();
-        let queue = Arc::new(std::sync::RwLock::new(DedupDeque::default()));
-        let latch = Arc::new(CountdownLatch::new(2));
-        let latch_cloned = latch.clone();
-        let handler = Arc::new(HandlerLoop {
-            req_queue: queue.clone(),
-            cancel_token: Default::default(),
-            task_notifier: Arc::new(Default::default()),
-            request_handler: MockHandler {
-                cb: move || {
-                    latch_cloned.countdown();
-                },
-            },
-            limiter: Arc::new(CascadeRateLimiter::new(vec![Box::new(
-                MaxInflightTaskLimiter::new(3),
-            )])),
-            state: Arc::new(AtomicU8::default()),
-        });
-
-        let handler_cloned = handler.clone();
-        let _handle = common_runtime::spawn_bg(async move { handler_cloned.run().await });
-
-        let _ = queue
-            .write()
-            .unwrap()
-            .push_back(1.into(), MockRequest::default());
-        handler.task_notifier.notify_one();
-        let _ = queue
-            .write()
-            .unwrap()
-            .push_back(2.into(), MockRequest::default());
-        handler.task_notifier.notify_one();
-
-        tokio::time::timeout(Duration::from_secs(1), latch.wait())
-            .await
-            .unwrap();
-    }
-
-    #[derive(Default, Debug)]
-    struct MockRequest {
-        region_id: RegionId,
-    }
-
-    struct MockHandler<F> {
-        cb: F,
-    }
-
-    #[async_trait::async_trait]
-    impl<F> Handler for MockHandler<F>
-    where
-        F: Fn() + Send + Sync,
-    {
-        type Request = MockRequest;
-
-        async fn handle_request(
-            &self,
-            _req: Self::Request,
-            token: BoxedRateLimitToken,
-            finish_notifier: Arc<Notify>,
-        ) -> Result<()> {
-            (self.cb)();
-            token.try_release();
-            finish_notifier.notify_one();
-            Ok(())
-        }
-    }
-
-    impl Request for MockRequest {
-        type Key = RegionId;
-
-        fn key(&self) -> Self::Key {
-            self.region_id
-        }
-
-        fn complete(self, _result: Result<()>) {}
-    }
-
-    #[tokio::test]
-    async fn test_scheduler() {
-        let latch = Arc::new(CountdownLatch::new(2));
-        let latch_cloned = latch.clone();
-
-        let handler = MockHandler {
-            cb: move || {
-                latch_cloned.countdown();
-            },
-        };
-        let scheduler: LocalScheduler<MockRequest> = LocalScheduler::new(
-            SchedulerConfig {
-                max_inflight_tasks: 3,
-            },
-            handler,
-        );
-
-        let _ = scheduler
-            .schedule(MockRequest {
-                region_id: 1.into(),
-            })
-            .unwrap();
-        let _ = scheduler
-            .schedule(MockRequest {
-                region_id: 2.into(),
-            })
-            .unwrap();
-
-        tokio::time::timeout(Duration::from_secs(1), latch.wait())
-            .await
-            .unwrap();
-    }
-
-    #[tokio::test]
-    async fn test_scheduler_many() {
-        common_telemetry::init_default_ut_logging();
-        let task_size = 100;
-
-        let latch = Arc::new(CountdownLatch::new(task_size));
-        let latch_clone = latch.clone();
-
-        let handler = MockHandler {
-            cb: move || {
-                latch_clone.countdown();
-            },
-        };
-
-        let config = SchedulerConfig {
-            max_inflight_tasks: 3,
-        };
-        let scheduler = LocalScheduler::new(config, handler);
-
-        for i in 0..task_size {
-            assert!(scheduler
-                .schedule(MockRequest {
-                    region_id: RegionId::from(i as u64),
-                })
-                .is_ok());
-        }
-
-        tokio::time::timeout(Duration::from_secs(3), latch.wait())
-            .await
-            .unwrap();
-    }
-
-    #[tokio::test]
-    async fn test_scheduler_interval() {
-        common_telemetry::init_default_ut_logging();
-        let task_size = 100;
-        let latch = Arc::new(CountdownLatch::new(task_size));
-        let latch_clone = latch.clone();
-
-        let handler = MockHandler {
-            cb: move || {
-                latch_clone.countdown();
-            },
-        };
-
-        let config = SchedulerConfig {
-            max_inflight_tasks: 3,
-        };
-        let scheduler = LocalScheduler::new(config, handler);
-
-        for i in 0..task_size / 2 {
-            assert!(scheduler
-                .schedule(MockRequest {
-                    region_id: RegionId::from(i as u64),
-                })
-                .is_ok());
-        }
-
-        tokio::time::sleep(Duration::from_millis(100)).await;
-        for i in task_size / 2..task_size {
-            assert!(scheduler
-                .schedule(MockRequest {
-                    region_id: RegionId::from(i as u64),
-                })
-                .is_ok());
-        }
-
-        tokio::time::timeout(Duration::from_secs(6), latch.wait())
-            .await
-            .unwrap();
-    }
-
-    struct MockAsyncHandler<F> {
-        cb: F,
-    }
-
-    #[async_trait::async_trait]
-    impl<F> Handler for MockAsyncHandler<F>
-    where
-        F: Fn() -> BoxFuture<'static, ()> + Send + Sync,
-    {
-        type Request = MockRequest;
-
-        async fn handle_request(
-            &self,
-            _req: Self::Request,
-            token: BoxedRateLimitToken,
-            finish_notifier: Arc<Notify>,
-        ) -> Result<()> {
-            let fut = (self.cb)();
-            fut.await;
-            token.try_release();
-            finish_notifier.notify_one();
-            Ok(())
-        }
-    }
-
-    #[tokio::test]
-    async fn test_schedule_duplicate_tasks() {
-        common_telemetry::init_default_ut_logging();
-        let (tx, rx) = tokio::sync::watch::channel(false);
-        let handler = MockAsyncHandler {
-            cb: move || {
-                let mut rx = rx.clone();
-                Box::pin(async move {
-                    // Block the handler so it can't handle more requests.
-                    loop {
-                        rx.changed().await.unwrap();
-                        if *rx.borrow() {
-                            break;
-                        }
-                    }
-                }) as _ // Casts the Pin<Box<async block>> to Pin<Box<dyn Future>>
-            },
-        };
-        let config = SchedulerConfig {
-            max_inflight_tasks: 30,
-        };
-        let scheduler = LocalScheduler::new(config, handler);
-
-        let mut scheduled_task = 0;
-        for _ in 0..10 {
-            if scheduler
-                .schedule(MockRequest {
-                    region_id: 1.into(),
-                })
-                .unwrap()
-            {
-                scheduled_task += 1;
-            }
-        }
-        tx.send(true).unwrap();
-        scheduler.stop(true).await.unwrap();
-        debug!("Schedule tasks: {}", scheduled_task);
-        assert!(scheduled_task < 10);
-    }
-
-    #[tokio::test]
-    async fn test_await_termination() {
-        common_telemetry::init_default_ut_logging();
-
-        let finished = Arc::new(AtomicI32::new(0));
-        let finished_clone = finished.clone();
-        let handler = MockHandler {
-            cb: move || {
-                let _ = finished_clone.fetch_add(1, Ordering::Relaxed);
-            },
-        };
-
-        let config = SchedulerConfig {
-            max_inflight_tasks: 3,
-        };
-        let scheduler = Arc::new(LocalScheduler::new(config, handler));
-        let scheduler_cloned = scheduler.clone();
-        let task_scheduled = Arc::new(AtomicI32::new(0));
-        let task_scheduled_cloned = task_scheduled.clone();
-
-        let scheduling = Arc::new(AtomicBool::new(true));
-        let scheduling_clone = scheduling.clone();
-        let handle = common_runtime::spawn_write(async move {
-            for i in 0..10000 {
-                if let Ok(res) = scheduler_cloned.schedule(MockRequest {
-                    region_id: RegionId::from(i as u64),
-                }) {
-                    if res {
-                        let _ = task_scheduled_cloned.fetch_add(1, Ordering::Relaxed);
-                    }
-                }
-
-                if !scheduling_clone.load(Ordering::Relaxed) {
-                    break;
-                }
-            }
-        });
-
-        scheduler.stop(true).await.unwrap();
-        scheduling.store(false, Ordering::Relaxed);
-
-        let finished = finished.load(Ordering::Relaxed);
-        handle.await.unwrap();
-
-        assert_eq!(finished, task_scheduled.load(Ordering::Relaxed));
-    }
-}
--- a/src/storage/src/scheduler/dedup_deque.rs
+++ b/src/storage/src/scheduler/dedup_deque.rs
@@ -1,124 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-use std::collections::hash_map::Entry;
-use std::collections::{HashMap, VecDeque};
-use std::fmt::{Debug, Formatter};
-use std::hash::Hash;
-
-/// Deque with key deduplication.
-pub struct DedupDeque<K, V> {
-    deque: VecDeque<K>,
-    existing: HashMap<K, V>,
-}
-
-impl<K, V> Default for DedupDeque<K, V> {
-    fn default() -> Self {
-        Self {
-            deque: VecDeque::new(),
-            existing: HashMap::new(),
-        }
-    }
-}
-
-impl<K: Eq + Hash + Clone, V> DedupDeque<K, V> {
-    /// Pushes a key value to the back of deque.
-    /// Returns true if the deque does not already contain value with the same key, otherwise
-    /// returns false.
-    pub fn push_back(&mut self, key: K, value: V) -> bool {
-        debug_assert_eq!(self.deque.len(), self.existing.len());
-        if let Entry::Vacant(entry) = self.existing.entry(key.clone()) {
-            let _ = entry.insert(value);
-            self.deque.push_back(key);
-            return true;
-        }
-        false
-    }
-
-    /// Pushes a key value to the front of deque.
-    /// Returns true if the deque does not already contain value with the same key, otherwise
-    /// returns false.
-    pub fn push_front(&mut self, key: K, value: V) -> bool {
-        if let Entry::Vacant(entry) = self.existing.entry(key.clone()) {
-            let _ = entry.insert(value);
-            self.deque.push_front(key);
-            return true;
-        }
-        false
-    }
-
-    /// Pops a pair from the back of deque. Returns [None] if the deque is empty.
-    pub fn pop_front(&mut self) -> Option<(K, V)> {
-        debug_assert_eq!(self.deque.len(), self.existing.len());
-        let key = self.deque.pop_front()?;
-        let value = self.existing.remove(&key)?;
-        Some((key, value))
-    }
-
-    #[inline]
-    pub fn len(&self) -> usize {
-        debug_assert_eq!(self.deque.len(), self.existing.len());
-        self.deque.len()
-    }
-
-    #[inline]
-    pub fn is_empty(&self) -> bool {
-        self.deque.is_empty()
-    }
-
-    #[inline]
-    pub fn clear(&mut self) {
-        self.deque.clear();
-        self.existing.clear();
-    }
-}
-
-impl<K, V> Debug for DedupDeque<K, V>
-where
-    K: Debug,
-    V: Debug,
-{
-    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("DedupDeque")
-            .field("deque", &self.deque)
-            .field("existing", &self.existing)
-            .finish()
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_dedup_deque() {
-        let mut deque = DedupDeque::default();
-        assert!(deque.push_back(1, "hello".to_string()));
-        assert_eq!(1, deque.len());
-        assert!(deque.push_back(2, "world".to_string()));
-        assert_eq!(2, deque.len());
-        assert_eq!((1, "hello".to_string()), deque.pop_front().unwrap());
-        assert_eq!(1, deque.len());
-        assert_eq!((2, "world".to_string()), deque.pop_front().unwrap());
-        assert_eq!(0, deque.len());
-
-        // insert duplicated item
-        assert!(deque.push_back(1, "hello".to_string()));
-        assert!(!deque.push_back(1, "world".to_string()));
-        assert_eq!((1, "hello".to_string()), deque.pop_front().unwrap());
-
-        deque.clear();
-        assert!(deque.is_empty());
-    }
-}
--- a/src/storage/src/scheduler/rate_limit.rs
+++ b/src/storage/src/scheduler/rate_limit.rs
@@ -1,185 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-use std::marker::PhantomData;
-use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
-use std::sync::Arc;
-
-use crate::error::{RateLimitedSnafu, Result};
-
-pub trait RateLimitToken {
-    /// Releases the token.
-    /// ### Note
-    /// Implementation should guarantee the idempotency.
-    fn try_release(&self);
-}
-
-pub type BoxedRateLimitToken = Box<dyn RateLimitToken + Send + Sync>;
-
-impl<T: RateLimitToken + ?Sized> RateLimitToken for Box<T> {
-    fn try_release(&self) {
-        (**self).try_release()
-    }
-}
-
-/// Rate limiter
-pub trait RateLimiter {
-    type Request;
-
-    /// Acquires a token from rate limiter. Returns `Err` on failure.  
-    fn acquire_token(&self, req: &Self::Request) -> Result<BoxedRateLimitToken>;
-}
-
-pub type BoxedRateLimiter<R> = Box<dyn RateLimiter<Request = R> + Send + Sync>;
-
-/// Limits max inflight tasks number.
-pub struct MaxInflightTaskLimiter<R> {
-    max_inflight_tasks: usize,
-    inflight_tasks: Arc<AtomicUsize>,
-    _phantom_data: PhantomData<R>,
-}
-
-impl<R> MaxInflightTaskLimiter<R> {
-    pub fn new(max_inflight_tasks: usize) -> Self {
-        Self {
-            max_inflight_tasks,
-            inflight_tasks: Arc::new(AtomicUsize::new(0)),
-            _phantom_data: Default::default(),
-        }
-    }
-}
-
-impl<R> RateLimiter for MaxInflightTaskLimiter<R> {
-    type Request = R;
-
-    fn acquire_token(&self, _: &Self::Request) -> Result<BoxedRateLimitToken> {
-        if self.inflight_tasks.fetch_add(1, Ordering::Relaxed) >= self.max_inflight_tasks {
-            let _ = self.inflight_tasks.fetch_sub(1, Ordering::Relaxed);
-            return RateLimitedSnafu {
-                msg: format!(
-                    "Max inflight task num exceeds, current: {}, max: {}",
-                    self.inflight_tasks.load(Ordering::Relaxed),
-                    self.max_inflight_tasks
-                ),
-            }
-            .fail();
-        }
-
-        Ok(Box::new(MaxInflightLimiterToken::new(
-            self.inflight_tasks.clone(),
-        )))
-    }
-}
-
-pub struct MaxInflightLimiterToken {
-    counter: Arc<AtomicUsize>,
-    released: AtomicBool,
-}
-
-impl MaxInflightLimiterToken {
-    pub fn new(counter: Arc<AtomicUsize>) -> Self {
-        Self {
-            counter,
-            released: AtomicBool::new(false),
-        }
-    }
-}
-
-impl RateLimitToken for MaxInflightLimiterToken {
-    fn try_release(&self) {
-        if self
-            .released
-            .compare_exchange(false, true, Ordering::Acquire, Ordering::Relaxed)
-            .is_ok()
-        {
-            let _ = self.counter.fetch_sub(1, Ordering::Relaxed);
-        }
-    }
-}
-
-/// A composite rate limiter that allows token acquisition only when all internal limiters allow.
-pub struct CascadeRateLimiter<T> {
-    limits: Vec<BoxedRateLimiter<T>>,
-}
-
-impl<T> CascadeRateLimiter<T> {
-    pub fn new(limits: Vec<BoxedRateLimiter<T>>) -> Self {
-        Self { limits }
-    }
-}
-
-impl<T> RateLimiter for CascadeRateLimiter<T> {
-    type Request = T;
-
-    fn acquire_token(&self, req: &Self::Request) -> Result<BoxedRateLimitToken> {
-        let mut res = vec![];
-        for limit in &self.limits {
-            match limit.acquire_token(req) {
-                Ok(token) => {
-                    res.push(token);
-                }
-                Err(e) => {
-                    res.iter().for_each(RateLimitToken::try_release);
-                    return Err(e);
-                }
-            }
-        }
-        Ok(Box::new(CompositeToken { tokens: res }))
-    }
-}
-
-/// Composite token that releases all acquired token when released.
-pub struct CompositeToken {
-    tokens: Vec<BoxedRateLimitToken>,
-}
-
-impl RateLimitToken for CompositeToken {
-    fn try_release(&self) {
-        for token in &self.tokens {
-            token.try_release();
-        }
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_max_inflight_limiter() {
-        let limiter = MaxInflightTaskLimiter::new(3);
-        let t1 = limiter.acquire_token(&1).unwrap();
-        assert_eq!(1, limiter.inflight_tasks.load(Ordering::Relaxed));
-        let _t2 = limiter.acquire_token(&1).unwrap();
-        assert_eq!(2, limiter.inflight_tasks.load(Ordering::Relaxed));
-        let _t3 = limiter.acquire_token(&1).unwrap();
-        assert_eq!(3, limiter.inflight_tasks.load(Ordering::Relaxed));
-        assert!(limiter.acquire_token(&1).is_err());
-        t1.try_release();
-        assert_eq!(2, limiter.inflight_tasks.load(Ordering::Relaxed));
-        let _t4 = limiter.acquire_token(&1).unwrap();
-    }
-
-    #[test]
-    fn test_cascade_limiter() {
-        let limiter: CascadeRateLimiter<usize> =
-            CascadeRateLimiter::new(vec![Box::new(MaxInflightTaskLimiter::new(3))]);
-        let t1 = limiter.acquire_token(&1).unwrap();
-        let _t2 = limiter.acquire_token(&1).unwrap();
-        let _t3 = limiter.acquire_token(&1).unwrap();
-        assert!(limiter.acquire_token(&1).is_err());
-        t1.try_release();
-        let _t4 = limiter.acquire_token(&1).unwrap();
-    }
-}
--- a/src/storage/src/schema.rs
+++ b/src/storage/src/schema.rs
@@ -1,59 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-pub mod compat;
-mod projected;
-mod region;
-mod store;
-
-pub use crate::schema::projected::{ProjectedSchema, ProjectedSchemaRef};
-pub use crate::schema::region::{RegionSchema, RegionSchemaRef};
-pub use crate::schema::store::{StoreSchema, StoreSchemaRef};
-
-#[cfg(test)]
-mod tests {
-    use std::sync::Arc;
-
-    use datatypes::vectors::{
-        Int64Vector, TimestampMillisecondVector, UInt64Vector, UInt8Vector, VectorRef,
-    };
-
-    use crate::read::Batch;
-
-    pub const REGION_NAME: &str = "test";
-
-    pub(crate) fn new_batch() -> Batch {
-        new_batch_with_num_values(1)
-    }
-
-    pub(crate) fn new_batch_with_num_values(num_field_columns: usize) -> Batch {
-        let k0 = Int64Vector::from_slice([1, 2, 3]);
-        let timestamp = TimestampMillisecondVector::from_vec(vec![4, 5, 6]);
-
-        let mut columns: Vec<VectorRef> = vec![Arc::new(k0), Arc::new(timestamp)];
-
-        for i in 0..num_field_columns {
-            let vi = Int64Vector::from_slice([i as i64, i as i64, i as i64]);
-            columns.push(Arc::new(vi));
-        }
-
-        let sequences = UInt64Vector::from_slice([100, 100, 100]);
-        let op_types = UInt8Vector::from_slice([0, 0, 0]);
-
-        columns.push(Arc::new(sequences));
-        columns.push(Arc::new(op_types));
-
-        Batch::new(columns)
-    }
-}
--- a/src/storage/src/schema/compat.rs
+++ b/src/storage/src/schema/compat.rs
@@ -1,611 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-//! Utilities for resolving schema compatibility problems.
-
-use datatypes::arrow::record_batch::RecordBatch;
-use datatypes::schema::SchemaRef;
-use datatypes::vectors::{Helper, VectorRef};
-use snafu::{ensure, OptionExt, ResultExt};
-
-use crate::error::{self, Result};
-use crate::metadata::ColumnMetadata;
-use crate::read::Batch;
-use crate::schema::{ProjectedSchemaRef, StoreSchemaRef};
-
-/// Make schema compatible to write to target with another schema.
-pub trait CompatWrite {
-    /// Makes the schema of `self` compatible with `dest_schema`.
-    ///
-    /// For column in `dest_schema` but not in `self`, this method would insert a
-    /// vector with default value.
-    ///
-    /// If there are columns not in `dest_schema`, an error would be returned.
-    fn compat_write(&mut self, dest_schema: &SchemaRef) -> Result<()>;
-}
-
-/// Checks whether column with `source_column` could be read as a column with `dest_column`.
-///
-/// Returns
-/// - `Ok(true)` if `source_column` is compatible to read using `dest_column` as schema.
-/// - `Ok(false)` if they are considered different columns.
-/// - `Err` if there is incompatible issue that could not be resolved.
-fn is_source_column_compatible(
-    source_column: &ColumnMetadata,
-    dest_column: &ColumnMetadata,
-) -> Result<bool> {
-    ensure!(
-        source_column.name() == dest_column.name(),
-        error::CompatReadSnafu {
-            reason: format!(
-                "try to use column in {} for column {}",
-                source_column.name(),
-                dest_column.name()
-            ),
-        }
-    );
-
-    if source_column.id() != dest_column.id() {
-        return Ok(false);
-    }
-
-    ensure!(
-        source_column.desc.data_type == dest_column.desc.data_type,
-        error::CompatReadSnafu {
-            reason: format!(
-                "could not read column {} from {:?} type as {:?} type",
-                dest_column.name(),
-                source_column.desc.data_type,
-                dest_column.desc.data_type
-            ),
-        }
-    );
-
-    ensure!(
-        dest_column.desc.is_nullable() || !source_column.desc.is_nullable(),
-        error::CompatReadSnafu {
-            reason: format!(
-                "unable to read nullable data for non null column {}",
-                dest_column.name()
-            ),
-        }
-    );
-
-    Ok(true)
-}
-
-/// Adapter to help reading data with source schema as data with dest schema.
-#[derive(Debug)]
-pub struct ReadAdapter {
-    /// Schema of data source.
-    source_schema: StoreSchemaRef,
-    /// Schema user expects to read.
-    dest_schema: ProjectedSchemaRef,
-    /// For each column in dest schema, stores the index in read result for
-    /// this column, or None if the column is not in result.
-    ///
-    /// This vec would be left empty if `source_version == dest_version`.
-    indices_in_result: Vec<Option<usize>>,
-    /// For each column in source schema, stores whether we need to read that column. All
-    /// columns are needed by default.
-    is_source_needed: Vec<bool>,
-}
-
-impl ReadAdapter {
-    /// Creates a new [ReadAdapter] that could convert data with `source_schema` into data
-    /// with `dest_schema`.
-    pub fn new(
-        source_schema: StoreSchemaRef,
-        dest_schema: ProjectedSchemaRef,
-    ) -> Result<ReadAdapter> {
-        if source_schema.version() == dest_schema.schema_to_read().version() {
-            ReadAdapter::from_same_version(source_schema, dest_schema)
-        } else {
-            ReadAdapter::from_different_version(source_schema, dest_schema)
-        }
-    }
-
-    fn from_same_version(
-        source_schema: StoreSchemaRef,
-        dest_schema: ProjectedSchemaRef,
-    ) -> Result<ReadAdapter> {
-        let mut is_source_needed = vec![true; source_schema.num_columns()];
-        if source_schema.num_columns() != dest_schema.schema_to_read().num_columns() {
-            // `dest_schema` might be projected, so we need to find out value columns that not be read
-            // by the `dest_schema`.
-
-            for (offset, field_column) in source_schema.field_columns().iter().enumerate() {
-                // Iterate value columns in source and mark those not in destination as unneeded.
-                if !dest_schema.is_needed(field_column.id()) {
-                    is_source_needed[source_schema.field_column_index_by_offset(offset)] = false;
-                }
-            }
-        }
-
-        Ok(ReadAdapter {
-            source_schema,
-            dest_schema,
-            indices_in_result: Vec::new(),
-            is_source_needed,
-        })
-    }
-
-    fn from_different_version(
-        source_schema: StoreSchemaRef,
-        dest_schema: ProjectedSchemaRef,
-    ) -> Result<ReadAdapter> {
-        let schema_to_read = dest_schema.schema_to_read();
-        let mut indices_in_result = vec![None; schema_to_read.num_columns()];
-        let mut is_source_needed = vec![true; source_schema.num_columns()];
-        // Number of columns in result from source data.
-        let mut num_columns_in_result = 0;
-
-        for (idx, source_column) in source_schema.columns().iter().enumerate() {
-            // For each column in source schema, check whether we need to read it.
-            if let Some(dest_idx) = schema_to_read
-                .schema()
-                .column_index_by_name(source_column.name())
-            {
-                let dest_column = &schema_to_read.columns()[dest_idx];
-                // Check whether we could read this column.
-                if is_source_column_compatible(source_column, dest_column)? {
-                    // Mark that this column could be read from source data, since some
-                    // columns in source schema would be skipped, we should not use
-                    // the source column's index directly.
-                    indices_in_result[dest_idx] = Some(num_columns_in_result);
-                    num_columns_in_result += 1;
-                } else {
-                    // This column is not the same column in dest schema, should be fill by default value
-                    // instead of reading from source data.
-                    is_source_needed[idx] = false;
-                }
-            } else {
-                // The column is not in `dest_schema`, we don't need to read it.
-                is_source_needed[idx] = false;
-            }
-        }
-
-        Ok(ReadAdapter {
-            source_schema,
-            dest_schema,
-            indices_in_result,
-            is_source_needed,
-        })
-    }
-
-    /// Returns a bool slice to denote which key column in source is needed.
-    #[inline]
-    pub fn source_key_needed(&self) -> &[bool] {
-        &self.is_source_needed[..self.source_schema.row_key_end()]
-    }
-
-    /// Returns a bool slice to denote which value column in source is needed.
-    #[inline]
-    pub fn source_value_needed(&self) -> &[bool] {
-        &self.is_source_needed
-            [self.source_schema.row_key_end()..self.source_schema.user_column_end()]
-    }
-
-    /// Construct a new [Batch] from row key, value, sequence and op_type.
-    ///
-    /// # Panics
-    /// Panics if input `VectorRef` is empty.
-    pub fn batch_from_parts(
-        &self,
-        row_key_columns: Vec<VectorRef>,
-        mut field_columns: Vec<VectorRef>,
-        sequences: VectorRef,
-        op_types: VectorRef,
-    ) -> Result<Batch> {
-        // Each vector should has same length, so here we just use the length of `sequence`.
-        let num_rows = sequences.len();
-
-        let mut source = row_key_columns;
-        // Reserve space for value, sequence and op_type
-        source.reserve(field_columns.len() + 2);
-        source.append(&mut field_columns);
-        // Internal columns are push in sequence, op_type order.
-        source.push(sequences);
-        source.push(op_types);
-
-        if !self.need_compat() {
-            return Ok(Batch::new(source));
-        }
-
-        self.source_columns_to_batch(source, num_rows)
-    }
-
-    /// Returns list of fields indices need to read from the parquet file.
-    pub fn fields_to_read(&self) -> Vec<usize> {
-        self.is_source_needed
-            .iter()
-            .enumerate()
-            .filter_map(|(idx, needed)| if *needed { Some(idx) } else { None })
-            .collect::<Vec<_>>()
-    }
-
-    /// Convert [RecordBatch] read from the parquet file into [Batch].
-    ///
-    /// The [RecordBatch] should have the same schema as [`ReadAdapter::fields_to_read()`].
-    pub fn arrow_record_batch_to_batch(&self, record_batch: &RecordBatch) -> Result<Batch> {
-        let names = self
-            .source_schema
-            .schema()
-            .column_schemas()
-            .iter()
-            .zip(self.is_source_needed.iter())
-            .filter_map(|(column_schema, is_needed)| {
-                if *is_needed {
-                    Some(&column_schema.name)
-                } else {
-                    None
-                }
-            });
-        let source = record_batch
-            .columns()
-            .iter()
-            .zip(names)
-            .map(|(column, name)| {
-                Helper::try_into_vector(column.clone()).context(error::ConvertChunkSnafu { name })
-            })
-            .collect::<Result<_>>()?;
-
-        if !self.need_compat() || record_batch.num_rows() == 0 {
-            return Ok(Batch::new(source));
-        }
-
-        let num_rows = record_batch.num_rows();
-        self.source_columns_to_batch(source, num_rows)
-    }
-
-    #[inline]
-    fn need_compat(&self) -> bool {
-        self.source_schema.version() != self.dest_schema.schema_to_read().version()
-    }
-
-    fn source_columns_to_batch(&self, source: Vec<VectorRef>, num_rows: usize) -> Result<Batch> {
-        let column_schemas = self.dest_schema.schema_to_read().schema().column_schemas();
-        let columns = self
-            .indices_in_result
-            .iter()
-            .zip(column_schemas)
-            .map(|(index_opt, column_schema)| {
-                if let Some(idx) = index_opt {
-                    Ok(source[*idx].clone())
-                } else {
-                    let vector = column_schema
-                        .create_default_vector(num_rows)
-                        .context(error::CreateDefaultToReadSnafu {
-                            column: &column_schema.name,
-                        })?
-                        .context(error::NoDefaultToReadSnafu {
-                            column: &column_schema.name,
-                        })?;
-                    Ok(vector)
-                }
-            })
-            .collect::<Result<Vec<_>>>()?;
-
-        Ok(Batch::new(columns))
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use std::sync::Arc;
-
-    use datatypes::data_type::ConcreteDataType;
-    use datatypes::schema::Schema;
-    use store_api::storage::ColumnDescriptorBuilder;
-
-    use super::*;
-    use crate::error::Error;
-    use crate::metadata::RegionMetadata;
-    use crate::schema::{tests, ProjectedSchema, RegionSchema};
-    use crate::test_util::{descriptor_util, schema_util};
-
-    fn call_batch_from_parts(
-        adapter: &ReadAdapter,
-        batch: &Batch,
-        num_field_columns: usize,
-    ) -> Batch {
-        let key = batch.columns()[0..2].to_vec();
-        let value = batch.columns()[2..2 + num_field_columns].to_vec();
-        let sequence = batch.column(2 + num_field_columns).clone();
-        let op_type = batch.column(2 + num_field_columns + 1).clone();
-
-        adapter
-            .batch_from_parts(key, value, sequence, op_type)
-            .unwrap()
-    }
-
-    fn check_batch_from_parts_without_padding(
-        adapter: &ReadAdapter,
-        batch: &Batch,
-        num_field_columns: usize,
-    ) {
-        let new_batch = call_batch_from_parts(adapter, batch, num_field_columns);
-        assert_eq!(*batch, new_batch);
-    }
-
-    fn call_arrow_chunk_to_batch(adapter: &ReadAdapter, batch: &Batch) -> Batch {
-        let columns_schema = adapter
-            .source_schema
-            .columns()
-            .iter()
-            .zip(adapter.is_source_needed.iter())
-            .filter_map(|(field, is_needed)| {
-                if *is_needed {
-                    Some(field.to_column_schema().unwrap())
-                } else {
-                    None
-                }
-            })
-            .collect::<Vec<_>>();
-        let arrow_schema = Schema::try_new(columns_schema)
-            .unwrap()
-            .arrow_schema()
-            .clone();
-        let arrays = batch.columns().iter().map(|v| v.to_arrow_array()).collect();
-        let chunk = RecordBatch::try_new(arrow_schema, arrays).unwrap();
-        adapter.arrow_record_batch_to_batch(&chunk).unwrap()
-    }
-
-    fn check_arrow_chunk_to_batch_without_padding(adapter: &ReadAdapter, batch: &Batch) {
-        let new_batch = call_arrow_chunk_to_batch(adapter, batch);
-        assert_eq!(*batch, new_batch);
-    }
-
-    fn check_batch_with_null_padding(batch: &Batch, new_batch: &Batch, null_columns: &[usize]) {
-        assert_eq!(
-            batch.num_columns() + null_columns.len(),
-            new_batch.num_columns()
-        );
-
-        let columns_from_source = new_batch
-            .columns()
-            .iter()
-            .enumerate()
-            .filter_map(|(i, v)| {
-                if null_columns.contains(&i) {
-                    None
-                } else {
-                    Some(v.clone())
-                }
-            })
-            .collect::<Vec<_>>();
-
-        assert_eq!(batch.columns(), &columns_from_source);
-
-        for idx in null_columns {
-            assert!(new_batch.column(*idx).only_null());
-        }
-    }
-
-    #[test]
-    fn test_compat_same_schema() {
-        // (k0, timestamp, v0, v1) with version 0.
-        let region_schema = Arc::new(schema_util::new_region_schema(0, 2));
-        let projected_schema = Arc::new(ProjectedSchema::no_projection(region_schema.clone()));
-        let source_schema = region_schema.store_schema().clone();
-        let adapter = ReadAdapter::new(source_schema, projected_schema).unwrap();
-
-        assert_eq!(&[true, true], adapter.source_key_needed());
-        assert_eq!(&[true, true], adapter.source_value_needed());
-
-        let batch = tests::new_batch_with_num_values(2);
-        check_batch_from_parts_without_padding(&adapter, &batch, 2);
-
-        assert_eq!(&adapter.fields_to_read(), &[0, 1, 2, 3, 4, 5],);
-
-        check_arrow_chunk_to_batch_without_padding(&adapter, &batch);
-    }
-
-    #[test]
-    fn test_compat_same_version_with_projection() {
-        // (k0, timestamp, v0, v1) with version 0.
-        let region_schema = Arc::new(schema_util::new_region_schema(0, 2));
-        // Just read v0, k0.
-        let projected_schema =
-            Arc::new(ProjectedSchema::new(region_schema.clone(), Some(vec![2, 0])).unwrap());
-
-        let source_schema = region_schema.store_schema().clone();
-        let adapter = ReadAdapter::new(source_schema, projected_schema).unwrap();
-
-        assert_eq!(&[true, true], adapter.source_key_needed());
-        assert_eq!(&[true, false], adapter.source_value_needed());
-
-        // One value column has been filtered out, so the result batch should only contains one value column.
-        let batch = tests::new_batch_with_num_values(1);
-        check_batch_from_parts_without_padding(&adapter, &batch, 1);
-
-        assert_eq!(&adapter.fields_to_read(), &[0, 1, 2, 4, 5]);
-
-        check_arrow_chunk_to_batch_without_padding(&adapter, &batch);
-    }
-
-    #[test]
-    fn test_compat_old_column() {
-        // (k0, timestamp, v0) with version 0.
-        let region_schema_old = Arc::new(schema_util::new_region_schema(0, 1));
-        // (k0, timestamp, v0, v1) with version 1.
-        let region_schema_new = Arc::new(schema_util::new_region_schema(1, 1));
-
-        // Just read v0, k0
-        let projected_schema =
-            Arc::new(ProjectedSchema::new(region_schema_new, Some(vec![2, 0])).unwrap());
-
-        let source_schema = region_schema_old.store_schema().clone();
-        let adapter = ReadAdapter::new(source_schema, projected_schema).unwrap();
-
-        assert_eq!(&[true, true], adapter.source_key_needed());
-        assert_eq!(&[true], adapter.source_value_needed());
-
-        let batch = tests::new_batch_with_num_values(1);
-        check_batch_from_parts_without_padding(&adapter, &batch, 1);
-
-        assert_eq!(&adapter.fields_to_read(), &[0, 1, 2, 3, 4],);
-
-        check_arrow_chunk_to_batch_without_padding(&adapter, &batch);
-    }
-
-    #[test]
-    fn test_compat_new_column() {
-        // (k0, timestamp, v0, v1) with version 0.
-        let region_schema_old = Arc::new(schema_util::new_region_schema(0, 2));
-        // (k0, timestamp, v0, v1, v2) with version 1.
-        let region_schema_new = Arc::new(schema_util::new_region_schema(1, 3));
-
-        // Just read v2, v0, k0
-        let projected_schema =
-            Arc::new(ProjectedSchema::new(region_schema_new, Some(vec![4, 2, 0])).unwrap());
-
-        let source_schema = region_schema_old.store_schema().clone();
-        let adapter = ReadAdapter::new(source_schema, projected_schema).unwrap();
-
-        assert_eq!(&[true, true], adapter.source_key_needed());
-        assert_eq!(&[true, false], adapter.source_value_needed());
-
-        // Only read one value column from source.
-        let batch = tests::new_batch_with_num_values(1);
-        // New batch should contains k0, timestamp, v0, sequence, op_type.
-        let new_batch = call_batch_from_parts(&adapter, &batch, 1);
-        // v2 is filled by null.
-        check_batch_with_null_padding(&batch, &new_batch, &[3]);
-
-        assert_eq!(&adapter.fields_to_read(), &[0, 1, 2, 4, 5],);
-
-        let new_batch = call_arrow_chunk_to_batch(&adapter, &batch);
-        check_batch_with_null_padding(&batch, &new_batch, &[3]);
-    }
-
-    #[test]
-    fn test_compat_different_column() {
-        // (k0, timestamp, v0, v1) with version 0.
-        let region_schema_old = Arc::new(schema_util::new_region_schema(0, 2));
-
-        let mut descriptor = descriptor_util::desc_with_field_columns(tests::REGION_NAME, 2);
-        // Assign a much larger column id to v0.
-        descriptor.default_cf.columns[0].id = descriptor.default_cf.columns.last().unwrap().id + 10;
-        let metadata: RegionMetadata = descriptor.try_into().unwrap();
-        let columns = metadata.columns;
-        // (k0, timestamp, v0, v1) with version 2, and v0 has different column id.
-        let region_schema_new = Arc::new(RegionSchema::new(columns, 2).unwrap());
-
-        let projected_schema = Arc::new(ProjectedSchema::no_projection(region_schema_new));
-        let source_schema = region_schema_old.store_schema().clone();
-        let adapter = ReadAdapter::new(source_schema, projected_schema).unwrap();
-
-        assert_eq!(&[true, true], adapter.source_key_needed());
-        // v0 is discarded as it has different column id than new schema's.
-        assert_eq!(&[false, true], adapter.source_value_needed());
-
-        // New batch should contains k0, timestamp, v1, sequence, op_type, so we need to remove v0
-        // from the created batch.
-        let batch = tests::new_batch_with_num_values(2);
-        let mut columns = batch.columns().to_vec();
-        // Remove v0.
-        let _ = columns.remove(2);
-        let batch = Batch::new(columns);
-
-        let new_batch = call_batch_from_parts(&adapter, &batch, 1);
-        // v0 is filled by null.
-        check_batch_with_null_padding(&batch, &new_batch, &[2]);
-
-        assert_eq!(&adapter.fields_to_read(), &[0, 1, 3, 4, 5],);
-
-        let new_batch = call_arrow_chunk_to_batch(&adapter, &batch);
-        check_batch_with_null_padding(&batch, &new_batch, &[2]);
-    }
-
-    #[inline]
-    fn new_column_desc_builder() -> ColumnDescriptorBuilder {
-        ColumnDescriptorBuilder::new(10, "test", ConcreteDataType::int32_datatype())
-    }
-
-    #[test]
-    fn test_is_source_column_compatible() {
-        let desc = new_column_desc_builder().build().unwrap();
-        let source = ColumnMetadata { cf_id: 1, desc };
-
-        // Same column is always compatible, also tests read nullable column
-        // as a nullable column.
-        assert!(is_source_column_compatible(&source, &source).unwrap());
-
-        // Different id.
-        let desc = new_column_desc_builder()
-            .id(source.desc.id + 1)
-            .build()
-            .unwrap();
-        let dest = ColumnMetadata { cf_id: 1, desc };
-        assert!(!is_source_column_compatible(&source, &dest).unwrap());
-    }
-
-    #[test]
-    fn test_nullable_column_read_by_not_null() {
-        let desc = new_column_desc_builder().build().unwrap();
-        assert!(desc.is_nullable());
-        let source = ColumnMetadata { cf_id: 1, desc };
-
-        let desc = new_column_desc_builder()
-            .is_nullable(false)
-            .build()
-            .unwrap();
-        let dest = ColumnMetadata { cf_id: 1, desc };
-
-        let err = is_source_column_compatible(&source, &dest).unwrap_err();
-        assert!(
-            matches!(err, Error::CompatRead { .. }),
-            "{err:?} is not CompatRead",
-        );
-    }
-
-    #[test]
-    fn test_read_not_null_column() {
-        let desc = new_column_desc_builder()
-            .is_nullable(false)
-            .build()
-            .unwrap();
-        let source = ColumnMetadata { cf_id: 1, desc };
-
-        let desc = new_column_desc_builder()
-            .is_nullable(false)
-            .build()
-            .unwrap();
-        let not_null_dest = ColumnMetadata { cf_id: 1, desc };
-        assert!(is_source_column_compatible(&source, &not_null_dest).unwrap());
-
-        let desc = new_column_desc_builder().build().unwrap();
-        let null_dest = ColumnMetadata { cf_id: 1, desc };
-        assert!(is_source_column_compatible(&source, &null_dest).unwrap());
-    }
-
-    #[test]
-    fn test_read_column_with_different_name() {
-        let desc = new_column_desc_builder().build().unwrap();
-        let source = ColumnMetadata { cf_id: 1, desc };
-
-        let desc = new_column_desc_builder()
-            .name(format!("{}_other", source.desc.name))
-            .build()
-            .unwrap();
-        let dest = ColumnMetadata { cf_id: 1, desc };
-
-        let err = is_source_column_compatible(&source, &dest).unwrap_err();
-        assert!(
-            matches!(err, Error::CompatRead { .. }),
-            "{err:?} is not CompatRead",
-        );
-    }
-}
--- a/src/storage/src/schema/projected.rs
+++ b/src/storage/src/schema/projected.rs
@@ -1,590 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-use std::cmp::Ordering;
-use std::collections::{BTreeSet, HashMap};
-use std::sync::Arc;
-
-use api::v1::OpType;
-use common_base::BitVec;
-use datatypes::prelude::ScalarVector;
-use datatypes::schema::{SchemaBuilder, SchemaRef};
-use datatypes::vectors::{BooleanVector, UInt8Vector};
-use snafu::{ensure, ResultExt};
-use store_api::storage::{Chunk, ColumnId};
-
-use crate::error;
-use crate::metadata::{self, Result};
-use crate::read::{Batch, BatchOp};
-use crate::schema::{RegionSchema, RegionSchemaRef, StoreSchema, StoreSchemaRef};
-
-/// Metadata about projection.
-#[derive(Debug, Default)]
-struct Projection {
-    /// Column indices of projection.
-    projected_columns: Vec<usize>,
-    /// Sorted and deduplicated indices of columns to read, includes all row key columns
-    /// and internal columns.
-    ///
-    /// We use these indices to read from data sources.
-    columns_to_read: Vec<usize>,
-    /// Maps column id to its index in `columns_to_read`.
-    ///
-    /// Used to ask whether the column with given column id is needed in projection.
-    id_to_read_idx: HashMap<ColumnId, usize>,
-    /// Maps index of `projected_columns` to index of the column in `columns_to_read`.
-    ///
-    /// Invariant:
-    /// - `projected_idx_to_read_idx.len() == projected_columns.len()`
-    projected_idx_to_read_idx: Vec<usize>,
-    /// Number of user columns to read.
-    num_user_columns: usize,
-}
-
-impl Projection {
-    fn new(region_schema: &RegionSchema, projected_columns: Vec<usize>) -> Projection {
-        // Get a sorted list of column indices to read.
-        let mut column_indices: BTreeSet<_> = projected_columns.iter().cloned().collect();
-        column_indices.extend(region_schema.row_key_indices());
-        let num_user_columns = column_indices.len();
-        // Now insert internal columns.
-        column_indices.extend([
-            region_schema.sequence_index(),
-            region_schema.op_type_index(),
-        ]);
-        let columns_to_read: Vec<_> = column_indices.into_iter().collect();
-
-        // The region schema ensure that last two column must be internal columns.
-        assert_eq!(
-            region_schema.sequence_index(),
-            columns_to_read[num_user_columns]
-        );
-        assert_eq!(
-            region_schema.op_type_index(),
-            columns_to_read[num_user_columns + 1]
-        );
-
-        // Mapping: <column id> => <index in `columns_to_read`>
-        let id_to_read_idx: HashMap<_, _> = columns_to_read
-            .iter()
-            .enumerate()
-            .map(|(idx, col_idx)| (region_schema.column_metadata(*col_idx).id(), idx))
-            .collect();
-        // Use column id to find index in `columns_to_read` of a column in `projected_columns`.
-        let projected_idx_to_read_idx = projected_columns
-            .iter()
-            .map(|col_idx| {
-                let column_id = region_schema.column_metadata(*col_idx).id();
-                // This unwrap() should be safe since `columns_to_read` must contains all columns in `projected_columns`.
-                let read_idx = id_to_read_idx.get(&column_id).unwrap();
-                *read_idx
-            })
-            .collect();
-
-        Projection {
-            projected_columns,
-            columns_to_read,
-            id_to_read_idx,
-            projected_idx_to_read_idx,
-            num_user_columns,
-        }
-    }
-}
-
-/// Schema with projection info.
-#[derive(Debug)]
-pub struct ProjectedSchema {
-    /// Projection info, `None` means don't need to do projection.
-    projection: Option<Projection>,
-    /// Schema used to read from data sources.
-    schema_to_read: StoreSchemaRef,
-    /// User schema after projection.
-    projected_user_schema: SchemaRef,
-}
-
-pub type ProjectedSchemaRef = Arc<ProjectedSchema>;
-
-impl ProjectedSchema {
-    /// Create a new `ProjectedSchema` with given `projected_columns`.
-    ///
-    /// If `projected_columns` is None, then all columns would be read. If `projected_columns` is
-    /// `Some`, then the `Vec` in it contains the indices of columns need to be read.
-    ///
-    /// If the `Vec` is empty or contains invalid index, `Err` would be returned.
-    pub fn new(
-        region_schema: RegionSchemaRef,
-        projected_columns: Option<Vec<usize>>,
-    ) -> Result<ProjectedSchema> {
-        match projected_columns {
-            Some(indices) => {
-                Self::validate_projection(&region_schema, &indices)?;
-
-                let projection = Projection::new(&region_schema, indices);
-
-                let schema_to_read = Self::build_schema_to_read(&region_schema, &projection)?;
-                let projected_user_schema =
-                    Self::build_projected_user_schema(&region_schema, &projection)?;
-
-                Ok(ProjectedSchema {
-                    projection: Some(projection),
-                    schema_to_read,
-                    projected_user_schema,
-                })
-            }
-            None => Ok(ProjectedSchema::no_projection(region_schema)),
-        }
-    }
-
-    /// Create a `ProjectedSchema` that read all columns.
-    pub fn no_projection(region_schema: RegionSchemaRef) -> ProjectedSchema {
-        // We could just reuse the StoreSchema and user schema.
-        ProjectedSchema {
-            projection: None,
-            schema_to_read: region_schema.store_schema().clone(),
-            projected_user_schema: region_schema.user_schema().clone(),
-        }
-    }
-
-    #[inline]
-    pub fn projected_user_schema(&self) -> &SchemaRef {
-        &self.projected_user_schema
-    }
-
-    #[inline]
-    pub fn schema_to_read(&self) -> &StoreSchemaRef {
-        &self.schema_to_read
-    }
-
-    /// Convert [Batch] into [Chunk].
-    ///
-    /// This will remove all internal columns. The input `batch` should has the
-    /// same schema as [`self.schema_to_read()`](ProjectedSchema::schema_to_read).
-    /// The output [Chunk] has the same schema as
-    /// [`self.projected_user_schema()`](ProjectedSchema::projected_user_schema).
-    pub fn batch_to_chunk(&self, batch: &Batch) -> Chunk {
-        let columns = match &self.projection {
-            Some(projection) => projection
-                .projected_idx_to_read_idx
-                .iter()
-                .map(|col_idx| batch.column(*col_idx))
-                .cloned()
-                .collect(),
-            None => {
-                let num_user_columns = self.projected_user_schema.num_columns();
-                batch
-                    .columns()
-                    .iter()
-                    .take(num_user_columns)
-                    .cloned()
-                    .collect()
-            }
-        };
-        Chunk::new(columns)
-    }
-
-    /// Returns true if column with given `column_id` is needed (in projection).
-    pub fn is_needed(&self, column_id: ColumnId) -> bool {
-        self.projection
-            .as_ref()
-            .map(|p| p.id_to_read_idx.contains_key(&column_id))
-            .unwrap_or(true)
-    }
-
-    fn build_schema_to_read(
-        region_schema: &RegionSchema,
-        projection: &Projection,
-    ) -> Result<StoreSchemaRef> {
-        // Reorder columns according to the projection.
-        let columns: Vec<_> = projection
-            .columns_to_read
-            .iter()
-            .map(|col_idx| region_schema.column_metadata(*col_idx))
-            .cloned()
-            .collect();
-        // All row key columns are reserved in this schema, so we can use the row_key_end
-        // and timestamp_key_index from region schema.
-        let store_schema = StoreSchema::new(
-            columns,
-            region_schema.version(),
-            region_schema.row_key_end(),
-            projection.num_user_columns,
-        )?;
-
-        Ok(Arc::new(store_schema))
-    }
-
-    fn build_projected_user_schema(
-        region_schema: &RegionSchema,
-        projection: &Projection,
-    ) -> Result<SchemaRef> {
-        let column_schemas: Vec<_> = projection
-            .projected_columns
-            .iter()
-            .map(|col_idx| {
-                region_schema
-                    .column_metadata(*col_idx)
-                    .desc
-                    .to_column_schema()
-            })
-            .collect();
-
-        let schema = SchemaBuilder::try_from(column_schemas)
-            .context(metadata::ConvertSchemaSnafu)?
-            .version(region_schema.version())
-            .build()
-            .context(metadata::InvalidSchemaSnafu)?;
-
-        Ok(Arc::new(schema))
-    }
-
-    fn validate_projection(region_schema: &RegionSchema, indices: &[usize]) -> Result<()> {
-        // The projection indices should not be empty, at least the timestamp column
-        // should be always read, and the `StoreSchema` also requires the timestamp column.
-        ensure!(
-            !indices.is_empty(),
-            metadata::InvalidProjectionSnafu {
-                msg: "at least one column should be read",
-            }
-        );
-
-        // Now only allowed to read user columns.
-        let user_schema = region_schema.user_schema();
-        for i in indices {
-            ensure!(
-                *i < user_schema.num_columns(),
-                metadata::InvalidProjectionSnafu {
-                    msg: format!(
-                        "index {} out of bound, only contains {} columns",
-                        i,
-                        user_schema.num_columns()
-                    ),
-                }
-            );
-        }
-
-        Ok(())
-    }
-}
-
-impl BatchOp for ProjectedSchema {
-    fn compare_row(&self, left: &Batch, i: usize, right: &Batch, j: usize) -> Ordering {
-        // Ordered by (row_key asc, sequence desc, op_type desc).
-        let indices = self.schema_to_read.row_key_indices();
-        for idx in indices {
-            let (left_col, right_col) = (left.column(idx), right.column(idx));
-            // Comparison of vector is done by virtual method calls currently. Consider using
-            // enum dispatch if this becomes bottleneck.
-            let order = left_col.get_ref(i).cmp(&right_col.get_ref(j));
-            if order != Ordering::Equal {
-                return order;
-            }
-        }
-        let (sequence_index, op_type_index) = (
-            self.schema_to_read.sequence_index(),
-            self.schema_to_read.op_type_index(),
-        );
-        right
-            .column(sequence_index)
-            .get_ref(j)
-            .cmp(&left.column(sequence_index).get_ref(i))
-            .then_with(|| {
-                right
-                    .column(op_type_index)
-                    .get_ref(j)
-                    .cmp(&left.column(op_type_index).get_ref(i))
-            })
-    }
-
-    fn find_unique(&self, batch: &Batch, selected: &mut BitVec, prev: Option<&Batch>) {
-        if let Some(prev) = prev {
-            assert_eq!(batch.num_columns(), prev.num_columns());
-        }
-        let indices = self.schema_to_read.row_key_indices();
-        for idx in indices {
-            let (current, prev_col) = (
-                batch.column(idx),
-                prev.map(|prev| prev.column(idx).as_ref()),
-            );
-            current.find_unique(selected, prev_col);
-        }
-    }
-
-    fn filter(&self, batch: &Batch, filter: &BooleanVector) -> error::Result<Batch> {
-        let columns = batch
-            .columns()
-            .iter()
-            .enumerate()
-            .map(|(i, v)| {
-                v.filter(filter).context(error::FilterColumnSnafu {
-                    name: self.schema_to_read.column_name(i),
-                })
-            })
-            .collect::<error::Result<Vec<_>>>()?;
-
-        Ok(Batch::new(columns))
-    }
-
-    fn unselect_deleted(&self, batch: &Batch, selected: &mut BitVec) {
-        let op_types = batch.column(self.schema_to_read.op_type_index());
-        // Safety: We expect the batch has the same schema as `self.schema_to_read`. The
-        // read procedure should guarantee this, otherwise this is a critical bug and it
-        // should be fine to panic.
-        let op_types = op_types
-            .as_any()
-            .downcast_ref::<UInt8Vector>()
-            .unwrap_or_else(|| {
-                panic!(
-                    "Expect op_type (UInt8) column at index {}, given {:?}",
-                    self.schema_to_read.op_type_index(),
-                    op_types.data_type()
-                );
-            });
-
-        for (i, op_type) in op_types.iter_data().enumerate() {
-            if op_type == Some(OpType::Delete as u8) {
-                selected.set(i, false);
-            }
-        }
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use api::v1::OpType;
-    use datatypes::prelude::ScalarVector;
-    use datatypes::type_id::LogicalTypeId;
-    use datatypes::vectors::{TimestampMillisecondVector, VectorRef};
-
-    use super::*;
-    use crate::metadata::Error;
-    use crate::schema::tests;
-    use crate::test_util::{read_util, schema_util};
-
-    #[test]
-    fn test_projection() {
-        // Build a region schema with 2 value columns. So the final user schema is
-        // (k0, timestamp, v0, v1)
-        let region_schema = schema_util::new_region_schema(0, 2);
-
-        // Projection, but still keep column order.
-        // After projection: (timestamp, v0)
-        let projected_columns = vec![1, 2];
-        let projection = Projection::new(&region_schema, projected_columns.clone());
-        assert_eq!(projected_columns, projection.projected_columns);
-        // Need to read (k0, timestamp, v0, sequence, op_type)
-        assert_eq!(&[0, 1, 2, 4, 5], &projection.columns_to_read[..]);
-        assert_eq!(5, projection.id_to_read_idx.len());
-        // Index of timestamp, v0 in `columns_to_read`
-        assert_eq!(&[1, 2], &projection.projected_idx_to_read_idx[..]);
-        // 3 columns: k0, timestamp, v0
-        assert_eq!(3, projection.num_user_columns);
-
-        // Projection, unordered.
-        // After projection: (timestamp, v1, k0)
-        let projected_columns = vec![1, 3, 0];
-        let projection = Projection::new(&region_schema, projected_columns.clone());
-        assert_eq!(projected_columns, projection.projected_columns);
-        // Need to read (k0, timestamp, v1, sequence, op_type)
-        assert_eq!(&[0, 1, 3, 4, 5], &projection.columns_to_read[..]);
-        assert_eq!(5, projection.id_to_read_idx.len());
-        // Index of timestamp, v1, k0 in `columns_to_read`
-        assert_eq!(&[1, 2, 0], &projection.projected_idx_to_read_idx[..]);
-        // 3 columns: k0, timestamp, v1
-        assert_eq!(3, projection.num_user_columns);
-
-        // Empty projection.
-        let projection = Projection::new(&region_schema, Vec::new());
-        assert!(projection.projected_columns.is_empty());
-        // Still need to read row keys.
-        assert_eq!(&[0, 1, 4, 5], &projection.columns_to_read[..]);
-        assert_eq!(4, projection.id_to_read_idx.len());
-        assert!(projection.projected_idx_to_read_idx.is_empty());
-        assert_eq!(2, projection.num_user_columns);
-    }
-
-    #[test]
-    fn test_projected_schema_with_projection() {
-        // (k0, timestamp, v0, v1, v2)
-        let region_schema = Arc::new(schema_util::new_region_schema(123, 3));
-
-        // After projection: (v1, timestamp)
-        let projected_schema =
-            ProjectedSchema::new(region_schema.clone(), Some(vec![3, 1])).unwrap();
-        let expect_user = schema_util::new_schema_with_version(
-            &[
-                ("v1", LogicalTypeId::Int64, true),
-                ("timestamp", LogicalTypeId::TimestampMillisecond, false),
-            ],
-            Some(1),
-            123,
-        );
-        assert_eq!(expect_user, **projected_schema.projected_user_schema());
-
-        // Test is_needed
-        let needed: Vec<_> = region_schema
-            .columns()
-            .iter()
-            .enumerate()
-            .filter_map(|(idx, column_meta)| {
-                if projected_schema.is_needed(column_meta.id()) {
-                    Some(idx)
-                } else {
-                    None
-                }
-            })
-            .collect();
-        // (k0, timestamp, v1, sequence, op_type)
-        assert_eq!(&[0, 1, 3, 5, 6], &needed[..]);
-
-        // Use another projection.
-        // After projection: (v0, timestamp)
-        let projected_schema = ProjectedSchema::new(region_schema, Some(vec![2, 1])).unwrap();
-
-        // The schema to read should be same as region schema with (k0, timestamp, v0).
-        // We can't use `new_schema_with_version()` because the StoreSchema also store other
-        // metadata that `new_schema_with_version()` can't store.
-        let expect_schema = schema_util::new_region_schema(123, 1);
-        assert_eq!(
-            expect_schema.store_schema(),
-            projected_schema.schema_to_read()
-        );
-
-        // (k0, timestamp, v0, sequence, op_type)
-        let batch = tests::new_batch();
-        // Test Batch to our Chunk.
-        // (v0, timestamp)
-        let chunk = projected_schema.batch_to_chunk(&batch);
-        assert_eq!(2, chunk.columns.len());
-        assert_eq!(&chunk.columns[0], batch.column(2));
-        assert_eq!(&chunk.columns[1], batch.column(1));
-    }
-
-    #[test]
-    fn test_projected_schema_no_projection() {
-        // (k0, timestamp, v0)
-        let region_schema = Arc::new(schema_util::new_region_schema(123, 1));
-
-        let projected_schema = ProjectedSchema::no_projection(region_schema.clone());
-
-        assert_eq!(
-            region_schema.user_schema(),
-            projected_schema.projected_user_schema()
-        );
-        assert_eq!(
-            region_schema.store_schema(),
-            projected_schema.schema_to_read()
-        );
-
-        for column in region_schema.columns() {
-            assert!(projected_schema.is_needed(column.id()));
-        }
-
-        // (k0, timestamp, v0, sequence, op_type)
-        let batch = tests::new_batch();
-        // Test Batch to our Chunk.
-        // (k0, timestamp, v0)
-        let chunk = projected_schema.batch_to_chunk(&batch);
-        assert_eq!(3, chunk.columns.len());
-    }
-
-    #[test]
-    fn test_projected_schema_empty_projection() {
-        // (k0, timestamp, v0)
-        let region_schema = Arc::new(schema_util::new_region_schema(123, 1));
-
-        let err = ProjectedSchema::new(region_schema, Some(Vec::new()))
-            .err()
-            .unwrap();
-        assert!(matches!(err, Error::InvalidProjection { .. }));
-    }
-
-    #[test]
-    fn test_compare_batch() {
-        let schema = read_util::new_projected_schema();
-        let left = read_util::new_full_kv_batch(&[(1000, 1, 1000, OpType::Put)]);
-        let right = read_util::new_full_kv_batch(&[
-            (999, 1, 1000, OpType::Put),
-            (1000, 1, 999, OpType::Put),
-            (1000, 1, 1000, OpType::Put),
-        ]);
-
-        assert_eq!(Ordering::Greater, schema.compare_row(&left, 0, &right, 0));
-        assert_eq!(Ordering::Less, schema.compare_row(&left, 0, &right, 1));
-        assert_eq!(Ordering::Equal, schema.compare_row(&left, 0, &right, 2));
-    }
-
-    #[test]
-    fn test_batch_find_unique() {
-        let schema = read_util::new_projected_schema();
-        let batch = read_util::new_kv_batch(&[(1000, Some(1)), (2000, Some(2)), (2000, Some(2))]);
-
-        let mut selected = BitVec::repeat(false, 3);
-        schema.find_unique(&batch, &mut selected, None);
-        assert!(selected[0]);
-        assert!(selected[1]);
-        assert!(!selected[2]);
-
-        let mut selected = BitVec::repeat(false, 3);
-        let prev = read_util::new_kv_batch(&[(1000, Some(1))]);
-        schema.find_unique(&batch, &mut selected, Some(&prev));
-        assert!(!selected[0]);
-        assert!(selected[1]);
-        assert!(!selected[2]);
-    }
-
-    #[test]
-    fn test_find_unique_with_op() {
-        let schema = read_util::new_projected_schema();
-        let mut selected = BitVec::repeat(false, 3);
-        let batch = read_util::new_full_kv_batch(&[
-            (1001, 1, 3, OpType::Put),
-            (1000, 1, 2, OpType::Delete),
-            (1000, 1, 1, OpType::Put),
-        ]);
-        schema.find_unique(&batch, &mut selected, None);
-        assert!(selected[0]);
-        assert!(selected[1]);
-        assert!(!selected[2]);
-    }
-
-    #[test]
-    fn test_filter_batch() {
-        let schema = read_util::new_projected_schema();
-        let batch = read_util::new_kv_batch(&[(1000, Some(1)), (2000, Some(2)), (3000, Some(3))]);
-        let filter = BooleanVector::from_slice(&[true, false, true]);
-
-        let res = schema.filter(&batch, &filter).unwrap();
-        let expect: VectorRef = Arc::new(TimestampMillisecondVector::from_values([1000, 3000]));
-        assert_eq!(expect, *res.column(0));
-    }
-
-    #[test]
-    fn test_unselect_deleted() {
-        let schema = read_util::new_projected_schema();
-        let batch = read_util::new_full_kv_batch(&[
-            (100, 1, 1000, OpType::Put),
-            (101, 1, 999, OpType::Delete),
-            (102, 1, 1000, OpType::Put),
-            (103, 1, 999, OpType::Put),
-            (104, 1, 1000, OpType::Delete),
-        ]);
-
-        let mut selected = BitVec::repeat(true, batch.num_rows());
-        schema.unselect_deleted(&batch, &mut selected);
-        assert_eq!(
-            BitVec::from_iter([true, false, true, true, false]),
-            selected
-        );
-    }
-}
--- a/src/storage/src/schema/region.rs
+++ b/src/storage/src/schema/region.rs
@@ -1,214 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-use std::fmt;
-use std::sync::Arc;
-
-use datatypes::schema::{Schema, SchemaBuilder, SchemaRef};
-use snafu::ResultExt;
-
-use crate::metadata::{self, ColumnMetadata, ColumnsMetadata, ColumnsMetadataRef, Result};
-use crate::schema::{StoreSchema, StoreSchemaRef};
-
-/// Schema of region.
-///
-/// The `RegionSchema` has the knowledge of reserved and internal columns.
-/// Reserved columns are columns that their names, ids are reserved by the storage
-/// engine, and could not be used by the user. Reserved columns usually have
-/// special usage. Reserved columns expect the version columns are also
-/// called internal columns (though the version could also be thought as a
-/// special kind of internal column), are not visible to user, such as our
-/// internal sequence, op_type columns.
-///
-/// The user schema is the schema that only contains columns that user could visit,
-/// as well as what the schema user created.
-#[derive(PartialEq, Eq)]
-pub struct RegionSchema {
-    /// Schema that only contains columns that user defined, excluding internal columns
-    /// that are reserved and used by the storage engine.
-    ///
-    /// Holding a [SchemaRef] to allow converting into `SchemaRef`/`arrow::SchemaRef`
-    /// conveniently. The fields order in `SchemaRef` **must** be consistent with
-    /// columns order in [ColumnsMetadata] to ensure the projection index of a field
-    /// is correct.
-    user_schema: SchemaRef,
-    /// store schema contains all columns of the region, including all internal columns.
-    store_schema: StoreSchemaRef,
-    /// Metadata of columns.
-    columns: ColumnsMetadataRef,
-}
-
-impl fmt::Debug for RegionSchema {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        f.debug_struct("RegionSchema")
-            .field("columns", &self.columns)
-            .finish()
-    }
-}
-
-impl RegionSchema {
-    pub fn new(columns: ColumnsMetadataRef, version: u32) -> Result<RegionSchema> {
-        let user_schema = Arc::new(build_user_schema(&columns, version)?);
-        let store_schema = Arc::new(StoreSchema::from_columns_metadata(&columns, version)?);
-
-        debug_assert_eq!(user_schema.version(), store_schema.version());
-        debug_assert_eq!(version, user_schema.version());
-
-        Ok(RegionSchema {
-            user_schema,
-            store_schema,
-            columns,
-        })
-    }
-
-    /// Returns the schema of the region, excluding internal columns that used by
-    /// the storage engine.
-    #[inline]
-    pub fn user_schema(&self) -> &SchemaRef {
-        &self.user_schema
-    }
-
-    /// Returns the schema actually stores, which would also contains all internal columns.
-    #[inline]
-    pub fn store_schema(&self) -> &StoreSchemaRef {
-        &self.store_schema
-    }
-
-    #[inline]
-    pub fn row_key_columns(&self) -> impl Iterator<Item = &ColumnMetadata> {
-        self.columns.iter_row_key_columns()
-    }
-
-    #[inline]
-    pub fn field_columns(&self) -> impl Iterator<Item = &ColumnMetadata> {
-        self.columns.iter_field_columns()
-    }
-
-    #[inline]
-    pub fn num_row_key_columns(&self) -> usize {
-        self.columns.num_row_key_columns()
-    }
-
-    #[inline]
-    pub fn num_field_columns(&self) -> usize {
-        self.columns.num_field_columns()
-    }
-
-    #[inline]
-    pub fn version(&self) -> u32 {
-        self.user_schema.version()
-    }
-
-    #[inline]
-    pub(crate) fn row_key_end(&self) -> usize {
-        self.columns.row_key_end()
-    }
-
-    #[inline]
-    pub(crate) fn sequence_index(&self) -> usize {
-        self.store_schema.sequence_index()
-    }
-
-    #[inline]
-    pub(crate) fn op_type_index(&self) -> usize {
-        self.store_schema.op_type_index()
-    }
-
-    #[inline]
-    pub(crate) fn row_key_indices(&self) -> impl Iterator<Item = usize> {
-        self.store_schema.row_key_indices()
-    }
-
-    #[inline]
-    pub fn timestamp_index(&self) -> usize {
-        self.store_schema.timestamp_index()
-    }
-
-    #[inline]
-    pub(crate) fn timestamp_column_name(&self) -> &str {
-        self.store_schema.column_name(self.timestamp_index())
-    }
-
-    #[inline]
-    pub(crate) fn value_indices(&self) -> impl Iterator<Item = usize> {
-        self.store_schema.value_indices()
-    }
-
-    #[inline]
-    pub fn column_metadata(&self, idx: usize) -> &ColumnMetadata {
-        self.columns.column_metadata(idx)
-    }
-
-    #[cfg(test)]
-    pub(crate) fn columns(&self) -> &[ColumnMetadata] {
-        self.columns.columns()
-    }
-}
-
-pub type RegionSchemaRef = Arc<RegionSchema>;
-
-// Now user schema don't have extra metadata like store schema.
-fn build_user_schema(columns: &ColumnsMetadata, version: u32) -> Result<Schema> {
-    let column_schemas: Vec<_> = columns
-        .iter_user_columns()
-        .map(|col| col.desc.to_column_schema())
-        .collect();
-
-    SchemaBuilder::try_from(column_schemas)
-        .context(metadata::ConvertSchemaSnafu)?
-        .version(version)
-        .build()
-        .context(metadata::InvalidSchemaSnafu)
-}
-
-#[cfg(test)]
-mod tests {
-    use datatypes::type_id::LogicalTypeId;
-
-    use super::*;
-    use crate::test_util::schema_util;
-
-    #[test]
-    fn test_region_schema() {
-        let region_schema = Arc::new(schema_util::new_region_schema(123, 1));
-
-        let expect_schema = schema_util::new_schema_with_version(
-            &[
-                ("k0", LogicalTypeId::Int64, false),
-                ("timestamp", LogicalTypeId::TimestampMillisecond, false),
-                ("v0", LogicalTypeId::Int64, true),
-            ],
-            Some(1),
-            123,
-        );
-
-        assert_eq!(expect_schema, **region_schema.user_schema());
-
-        // Checks row key column.
-        let mut row_keys = region_schema.row_key_columns();
-        assert_eq!("k0", row_keys.next().unwrap().desc.name);
-        assert_eq!("timestamp", row_keys.next().unwrap().desc.name);
-        assert_eq!(None, row_keys.next());
-        assert_eq!(2, region_schema.num_row_key_columns());
-
-        // Checks value column.
-        let mut values = region_schema.field_columns();
-        assert_eq!("v0", values.next().unwrap().desc.name);
-        assert_eq!(None, values.next());
-        assert_eq!(1, region_schema.num_field_columns());
-
-        // Checks version.
-        assert_eq!(123, region_schema.version());
-    }
-}
--- a/src/storage/src/schema/store.rs
+++ b/src/storage/src/schema/store.rs
@@ -1,323 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-use std::collections::HashMap;
-use std::sync::Arc;
-
-use datatypes::arrow::datatypes::Schema as ArrowSchema;
-use datatypes::arrow::record_batch::RecordBatch;
-use datatypes::schema::{Schema, SchemaBuilder, SchemaRef};
-use snafu::{ensure, OptionExt, ResultExt};
-use store_api::storage::consts;
-
-use crate::error::NewRecordBatchSnafu;
-use crate::metadata::{self, ColumnMetadata, ColumnsMetadata, Error, Result};
-use crate::read::Batch;
-
-const ROW_KEY_END_KEY: &str = "greptime:storage:row_key_end";
-const USER_COLUMN_END_KEY: &str = "greptime:storage:user_column_end";
-
-/// Schema that contains storage engine specific metadata, such as internal columns.
-///
-/// Used internally, contains all row key columns, internal columns and a sub set of
-/// value columns in a region. The columns are organized in `key, value, internal` order.
-#[derive(Debug, PartialEq, Eq)]
-pub struct StoreSchema {
-    columns: Vec<ColumnMetadata>,
-    schema: SchemaRef,
-    row_key_end: usize,
-    user_column_end: usize,
-}
-
-pub type StoreSchemaRef = Arc<StoreSchema>;
-
-impl StoreSchema {
-    #[inline]
-    pub fn version(&self) -> u32 {
-        self.schema.version()
-    }
-
-    #[inline]
-    pub fn schema(&self) -> &SchemaRef {
-        &self.schema
-    }
-
-    #[inline]
-    pub fn arrow_schema(&self) -> &Arc<ArrowSchema> {
-        self.schema.arrow_schema()
-    }
-
-    // TODO(yingwen): Remove this method.
-    pub fn batch_to_arrow_record_batch(
-        &self,
-        batch: &Batch,
-    ) -> std::result::Result<RecordBatch, crate::error::Error> {
-        assert_eq!(self.schema.num_columns(), batch.num_columns(),);
-        RecordBatch::try_new(
-            self.schema.arrow_schema().clone(),
-            batch.columns().iter().map(|v| v.to_arrow_array()).collect(),
-        )
-        .context(NewRecordBatchSnafu)
-    }
-
-    /// Returns the ending index of row key columns.
-    ///
-    /// The ending index has the same value as the number of the row key columns.
-    #[inline]
-    pub fn row_key_end(&self) -> usize {
-        self.row_key_end
-    }
-
-    /// Returns the index of timestamp column.
-    /// We always assume that timestamp is the last column in [StoreSchema].
-    #[inline]
-    pub fn timestamp_index(&self) -> usize {
-        self.row_key_end - 1
-    }
-
-    pub(crate) fn contains_column(&self, name: &str) -> bool {
-        self.schema.column_schema_by_name(name).is_some()
-    }
-
-    pub(crate) fn is_key_column(&self, name: &str) -> bool {
-        self.schema
-            .column_index_by_name(name)
-            .map(|idx| idx < self.row_key_end)
-            .unwrap_or(false)
-    }
-
-    pub(crate) fn is_user_column(&self, name: &str) -> bool {
-        self.schema
-            .column_index_by_name(name)
-            .map(|idx| idx < self.user_column_end)
-            .unwrap_or(false)
-    }
-
-    pub(crate) fn from_columns_metadata(
-        columns: &ColumnsMetadata,
-        version: u32,
-    ) -> Result<StoreSchema> {
-        StoreSchema::new(
-            columns.columns().to_vec(),
-            version,
-            columns.row_key_end(),
-            columns.user_column_end(),
-        )
-    }
-
-    pub(crate) fn new(
-        columns: Vec<ColumnMetadata>,
-        version: u32,
-        row_key_end: usize,
-        user_column_end: usize,
-    ) -> Result<StoreSchema> {
-        let column_schemas = columns
-            .iter()
-            .map(|meta| meta.to_column_schema())
-            .collect::<Result<Vec<_>>>()?;
-
-        let schema = SchemaBuilder::try_from(column_schemas)
-            .context(metadata::ConvertSchemaSnafu)?
-            .version(version)
-            .add_metadata(ROW_KEY_END_KEY, row_key_end.to_string())
-            .add_metadata(USER_COLUMN_END_KEY, user_column_end.to_string())
-            .build()
-            .context(metadata::InvalidSchemaSnafu)?;
-
-        assert_eq!(
-            consts::SEQUENCE_COLUMN_NAME,
-            schema.column_schemas()[user_column_end].name
-        );
-        assert_eq!(
-            consts::OP_TYPE_COLUMN_NAME,
-            schema.column_schemas()[user_column_end + 1].name
-        );
-
-        Ok(StoreSchema {
-            columns,
-            schema: Arc::new(schema),
-            row_key_end,
-            user_column_end,
-        })
-    }
-
-    #[inline]
-    pub(crate) fn sequence_index(&self) -> usize {
-        self.user_column_end
-    }
-
-    #[inline]
-    pub(crate) fn op_type_index(&self) -> usize {
-        self.user_column_end + 1
-    }
-
-    #[inline]
-    pub(crate) fn row_key_indices(&self) -> impl Iterator<Item = usize> {
-        0..self.row_key_end
-    }
-
-    #[inline]
-    pub(crate) fn value_indices(&self) -> impl Iterator<Item = usize> {
-        self.row_key_end..self.user_column_end
-    }
-
-    #[inline]
-    pub(crate) fn column_name(&self, idx: usize) -> &str {
-        &self.schema.column_schemas()[idx].name
-    }
-
-    /// # Panic
-    /// Panics if `name` is not a valid column name.
-    #[inline]
-    pub(crate) fn column_index(&self, name: &str) -> usize {
-        self.schema.column_index_by_name(name).unwrap()
-    }
-
-    #[inline]
-    pub(crate) fn num_columns(&self) -> usize {
-        self.schema.num_columns()
-    }
-
-    #[inline]
-    pub(crate) fn user_column_end(&self) -> usize {
-        self.user_column_end
-    }
-
-    #[inline]
-    pub(crate) fn field_columns(&self) -> &[ColumnMetadata] {
-        &self.columns[self.row_key_end..self.user_column_end]
-    }
-
-    /// Returns the index of the value column according its `offset`.
-    #[inline]
-    pub(crate) fn field_column_index_by_offset(&self, offset: usize) -> usize {
-        self.row_key_end + offset
-    }
-
-    #[inline]
-    pub(crate) fn columns(&self) -> &[ColumnMetadata] {
-        &self.columns
-    }
-}
-
-impl TryFrom<Arc<ArrowSchema>> for StoreSchema {
-    type Error = Error;
-
-    fn try_from(arrow_schema: Arc<ArrowSchema>) -> std::result::Result<Self, Self::Error> {
-        let schema = Schema::try_from(arrow_schema).context(metadata::ConvertArrowSchemaSnafu)?;
-        // Recover other metadata from schema.
-        let row_key_end = parse_index_from_metadata(schema.metadata(), ROW_KEY_END_KEY)?;
-        let user_column_end = parse_index_from_metadata(schema.metadata(), USER_COLUMN_END_KEY)?;
-
-        // There should be sequence and op_type columns.
-        ensure!(
-            consts::SEQUENCE_COLUMN_NAME == schema.column_schemas()[user_column_end].name,
-            metadata::InvalidIndexSnafu
-        );
-        ensure!(
-            consts::OP_TYPE_COLUMN_NAME == schema.column_schemas()[user_column_end + 1].name,
-            metadata::InvalidIndexSnafu
-        );
-
-        // Recover ColumnMetadata from schema.
-        let columns = schema
-            .column_schemas()
-            .iter()
-            .map(ColumnMetadata::from_column_schema)
-            .collect::<Result<_>>()?;
-
-        Ok(StoreSchema {
-            columns,
-            schema: Arc::new(schema),
-            row_key_end,
-            user_column_end,
-        })
-    }
-}
-
-impl TryFrom<ArrowSchema> for StoreSchema {
-    type Error = Error;
-
-    fn try_from(arrow_schema: ArrowSchema) -> std::result::Result<StoreSchema, Self::Error> {
-        StoreSchema::try_from(Arc::new(arrow_schema))
-    }
-}
-
-fn parse_index_from_metadata(metadata: &HashMap<String, String>, key: &str) -> Result<usize> {
-    let value = metadata
-        .get(key)
-        .context(metadata::MetaNotFoundSnafu { key })?;
-    value.parse().with_context(|_| metadata::ParseMetaIntSnafu {
-        key_value: format!("{key}={value}"),
-    })
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use crate::read::Batch;
-    use crate::schema::tests;
-    use crate::test_util::schema_util;
-
-    fn check_chunk_batch(record_batch: &RecordBatch, batch: &Batch) {
-        assert_eq!(5, record_batch.num_columns());
-        assert_eq!(3, record_batch.num_rows());
-
-        for i in 0..5 {
-            assert_eq!(record_batch.column(i), &batch.column(i).to_arrow_array());
-        }
-    }
-
-    #[test]
-    fn test_store_schema() {
-        let region_schema = Arc::new(schema_util::new_region_schema(123, 1));
-
-        // Checks StoreSchema.
-        let store_schema = region_schema.store_schema();
-        assert_eq!(123, store_schema.version());
-        let sst_arrow_schema = store_schema.arrow_schema();
-        let converted_store_schema = StoreSchema::try_from((**sst_arrow_schema).clone()).unwrap();
-
-        assert_eq!(**store_schema, converted_store_schema);
-
-        let column_schemas: Vec<_> = region_schema
-            .columns()
-            .iter()
-            .map(|meta| meta.to_column_schema().unwrap())
-            .collect();
-        let expect_schema = SchemaBuilder::try_from(column_schemas)
-            .unwrap()
-            .version(123)
-            .build()
-            .unwrap();
-        // Only compare column schemas since SchemaRef in StoreSchema also contains other metadata that only used
-        // by StoreSchema.
-        assert_eq!(
-            expect_schema.column_schemas(),
-            store_schema.schema().column_schemas(),
-        );
-        assert_eq!(3, store_schema.sequence_index());
-        assert_eq!(4, store_schema.op_type_index());
-        let row_key_indices: Vec<_> = store_schema.row_key_indices().collect();
-        assert_eq!([0, 1], &row_key_indices[..]);
-        let value_indices: Vec<_> = store_schema.value_indices().collect();
-        assert_eq!([2], &value_indices[..]);
-
-        // Test batch and chunk conversion.
-        let batch = tests::new_batch();
-        // Convert batch to chunk.
-        let chunk = store_schema.batch_to_arrow_record_batch(&batch).unwrap();
-        check_chunk_batch(&chunk, &batch);
-    }
-}
--- a/src/storage/src/snapshot.rs
+++ b/src/storage/src/snapshot.rs
@@ -1,103 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-use std::cmp;
-
-use async_trait::async_trait;
-use store_api::storage::{
-    GetRequest, GetResponse, ReadContext, ScanRequest, ScanResponse, SchemaRef, SequenceNumber,
-    Snapshot,
-};
-
-use crate::chunk::{ChunkReaderBuilder, ChunkReaderImpl};
-use crate::error::{Error, Result};
-use crate::sst::AccessLayerRef;
-use crate::version::VersionRef;
-
-/// [Snapshot] implementation.
-pub struct SnapshotImpl {
-    version: VersionRef,
-    /// Max sequence number (inclusive) visible to user.
-    visible_sequence: SequenceNumber,
-    sst_layer: AccessLayerRef,
-}
-
-#[async_trait]
-impl Snapshot for SnapshotImpl {
-    type Error = Error;
-    type Reader = ChunkReaderImpl;
-
-    fn schema(&self) -> &SchemaRef {
-        self.version.user_schema()
-    }
-
-    async fn scan(
-        &self,
-        ctx: &ReadContext,
-        request: ScanRequest,
-    ) -> Result<ScanResponse<ChunkReaderImpl>> {
-        let visible_sequence = self.sequence_to_read(request.sequence);
-        let memtable_version = self.version.memtables();
-
-        let mutables = memtable_version.mutable_memtable();
-        let immutables = memtable_version.immutable_memtables();
-
-        let mut builder = ChunkReaderBuilder::new(
-            self.version.metadata().id(),
-            self.version.schema().clone(),
-            self.sst_layer.clone(),
-        )
-        .reserve_num_memtables(memtable_version.num_memtables())
-        .projection(request.projection)
-        .filters(request.filters)
-        .batch_size(ctx.batch_size)
-        .output_ordering(request.output_ordering)
-        .visible_sequence(visible_sequence)
-        .pick_memtables(mutables.clone())
-        .use_chain_reader(true);
-
-        for memtable in immutables {
-            builder = builder.pick_memtables(memtable.clone());
-        }
-
-        let reader = builder.pick_all_ssts(self.version.ssts())?.build().await?;
-
-        Ok(ScanResponse { reader })
-    }
-
-    async fn get(&self, _ctx: &ReadContext, _request: GetRequest) -> Result<GetResponse> {
-        unimplemented!()
-    }
-}
-
-impl SnapshotImpl {
-    pub fn new(
-        version: VersionRef,
-        visible_sequence: SequenceNumber,
-        sst_layer: AccessLayerRef,
-    ) -> SnapshotImpl {
-        SnapshotImpl {
-            version,
-            visible_sequence,
-            sst_layer,
-        }
-    }
-
-    #[inline]
-    fn sequence_to_read(&self, request_sequence: Option<SequenceNumber>) -> SequenceNumber {
-        request_sequence
-            .map(|s| cmp::min(s, self.visible_sequence))
-            .unwrap_or(self.visible_sequence)
-    }
-}
--- a/src/storage/src/sst.rs
+++ b/src/storage/src/sst.rs
@@ -1,830 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-pub(crate) mod parquet;
-mod pruning;
-mod stream_writer;
-
-use std::collections::HashMap;
-use std::fmt;
-use std::fmt::{Debug, Formatter};
-use std::str::FromStr;
-use std::sync::atomic::{AtomicBool, Ordering};
-use std::sync::Arc;
-
-use async_trait::async_trait;
-use common_base::readable_size::ReadableSize;
-use common_recordbatch::SendableRecordBatchStream;
-use common_telemetry::{debug, error};
-use common_time::range::TimestampRange;
-use common_time::Timestamp;
-use datatypes::schema::SchemaRef;
-use futures_util::StreamExt;
-use object_store::{util, ObjectStore};
-use serde::{Deserialize, Deserializer, Serialize};
-use snafu::{ResultExt, Snafu};
-use store_api::storage::{ChunkReader, RegionId};
-use table::predicate::Predicate;
-use uuid::Uuid;
-
-use crate::chunk::ChunkReaderImpl;
-use crate::error;
-use crate::error::{DeleteSstSnafu, Result};
-use crate::file_purger::{FilePurgeRequest, FilePurgerRef};
-use crate::memtable::BoxedBatchIterator;
-use crate::read::{Batch, BatchReader, BoxedBatchReader};
-use crate::scheduler::Scheduler;
-use crate::schema::ProjectedSchemaRef;
-use crate::sst::parquet::{ChunkStream, ParquetReader, ParquetWriter};
-
-/// Maximum level of SSTs.
-pub const MAX_LEVEL: u8 = 2;
-
-pub type Level = u8;
-
-pub use crate::sst::stream_writer::BufferedWriter;
-
-// We only has fixed number of level, so we use array to hold elements. This implementation
-// detail of LevelMetaVec should not be exposed to the user of [LevelMetas].
-type LevelMetaVec = [LevelMeta; MAX_LEVEL as usize];
-
-/// Metadata of all SSTs under a region.
-///
-/// Files are organized into multiple level, though there may be only one level.
-#[derive(Clone)]
-pub struct LevelMetas {
-    levels: LevelMetaVec,
-    sst_layer: AccessLayerRef,
-    file_purger: FilePurgerRef,
-    /// Compaction time window in seconds
-    compaction_time_window: Option<i64>,
-}
-
-impl std::fmt::Debug for LevelMetas {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("LevelMetas")
-            .field("levels", &self.levels)
-            .field("compaction_time_window", &self.compaction_time_window)
-            .finish()
-    }
-}
-
-impl LevelMetas {
-    /// Create a new LevelMetas and initialized each level.
-    pub fn new(sst_layer: AccessLayerRef, file_purger: FilePurgerRef) -> LevelMetas {
-        LevelMetas {
-            levels: new_level_meta_vec(),
-            sst_layer,
-            file_purger,
-            compaction_time_window: Default::default(),
-        }
-    }
-
-    /// Returns total level number.
-    #[inline]
-    pub fn level_num(&self) -> usize {
-        self.levels.len()
-    }
-
-    pub fn compaction_time_window(&self) -> Option<i64> {
-        self.compaction_time_window
-    }
-
-    #[inline]
-    pub fn level(&self, level: Level) -> &LevelMeta {
-        &self.levels[level as usize]
-    }
-
-    /// Merge `self` with files to add/remove to create a new [LevelMetas].
-    ///
-    /// # Panics
-    /// Panics if level of [FileHandle] is greater than [MAX_LEVEL].
-    pub fn merge(
-        &self,
-        files_to_add: impl Iterator<Item = FileMeta>,
-        files_to_remove: impl Iterator<Item = FileMeta>,
-        compaction_time_window: Option<i64>,
-    ) -> LevelMetas {
-        let mut merged = self.clone();
-        for file in files_to_add {
-            let level = file.level;
-            let handle = FileHandle::new(file, self.sst_layer.clone(), self.file_purger.clone());
-            merged.levels[level as usize].add_file(handle);
-        }
-
-        for file in files_to_remove {
-            let level = file.level;
-            if let Some(removed_file) = merged.levels[level as usize].remove_file(file.file_id) {
-                removed_file.mark_deleted();
-            }
-        }
-        // we only update region's compaction time window iff region's window is not set and VersionEdit's
-        // compaction time window is present.
-        if let Some(window) = compaction_time_window {
-            let _ = merged.compaction_time_window.get_or_insert(window);
-        }
-        merged
-    }
-
-    pub fn mark_all_files_deleted(&self) -> Vec<FileId> {
-        self.levels().iter().fold(vec![], |mut files, level| {
-            files.extend(level.files().map(|f| {
-                f.mark_deleted();
-                f.file_id()
-            }));
-            files
-        })
-    }
-
-    pub fn levels(&self) -> &[LevelMeta] {
-        &self.levels
-    }
-
-    pub fn file_purger(&self) -> FilePurgerRef {
-        self.file_purger.clone()
-    }
-}
-
-/// Metadata of files in same SST level.
-#[derive(Default, Clone)]
-pub struct LevelMeta {
-    level: Level,
-    /// Handles to the files in this level.
-    // TODO(yingwen): Now for simplicity, files are unordered, maybe sort the files by time range
-    // or use another structure to hold them.
-    files: HashMap<FileId, FileHandle>,
-}
-
-impl std::fmt::Debug for LevelMeta {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("LevelMeta")
-            .field("level", &self.level)
-            .field("files", &self.files.keys())
-            .finish()
-    }
-}
-
-impl LevelMeta {
-    pub fn new(level: Level) -> Self {
-        Self {
-            level,
-            files: HashMap::new(),
-        }
-    }
-
-    fn add_file(&mut self, file: FileHandle) {
-        let _ = self.files.insert(file.file_id(), file);
-    }
-
-    fn remove_file(&mut self, file_to_remove: FileId) -> Option<FileHandle> {
-        self.files.remove(&file_to_remove)
-    }
-
-    /// Returns the level of level meta.
-    #[inline]
-    pub fn level(&self) -> Level {
-        self.level
-    }
-
-    /// Returns number of SST files in level.
-    #[inline]
-    pub fn file_num(&self) -> usize {
-        self.files.len()
-    }
-
-    /// Returns expired SSTs from current level.
-    pub fn get_expired_files(&self, expire_time: &Timestamp) -> Vec<FileHandle> {
-        self.files
-            .iter()
-            .filter_map(|(_, v)| {
-                let Some((_, end)) = v.time_range() else {
-                    return None;
-                };
-                if end < expire_time {
-                    Some(v.clone())
-                } else {
-                    None
-                }
-            })
-            .collect()
-    }
-
-    pub fn files(&self) -> impl Iterator<Item = &FileHandle> {
-        self.files.values()
-    }
-}
-
-fn new_level_meta_vec() -> LevelMetaVec {
-    (0u8..MAX_LEVEL)
-        .map(LevelMeta::new)
-        .collect::<Vec<_>>()
-        .try_into()
-        .unwrap() // safety: LevelMetaVec is a fixed length array with length MAX_LEVEL
-}
-
-#[derive(Clone)]
-pub struct FileHandle {
-    inner: Arc<FileHandleInner>,
-}
-
-impl Debug for FileHandle {
-    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
-        f.debug_struct("FileHandle")
-            .field("file_id", &self.inner.meta.file_id)
-            .field("region_id", &self.inner.meta.region_id)
-            .field("time_range", &self.inner.meta.time_range)
-            .field("size", &self.inner.meta.file_size)
-            .field("level", &self.inner.meta.level)
-            .field("compacting", &self.inner.compacting)
-            .field("deleted", &self.inner.deleted)
-            .finish()
-    }
-}
-
-impl FileHandle {
-    pub fn new(
-        meta: FileMeta,
-        sst_layer: AccessLayerRef,
-        file_purger: FilePurgerRef,
-    ) -> FileHandle {
-        FileHandle {
-            inner: Arc::new(FileHandleInner::new(meta, sst_layer, file_purger)),
-        }
-    }
-
-    /// Returns level as usize so it can be used as index.
-    #[inline]
-    pub fn level(&self) -> Level {
-        self.inner.meta.level
-    }
-
-    #[inline]
-    pub fn file_name(&self) -> String {
-        self.inner.meta.file_id.as_parquet()
-    }
-
-    #[inline]
-    pub fn file_path(&self) -> String {
-        self.inner
-            .sst_layer
-            .sst_file_path(&self.inner.meta.file_id.as_parquet())
-    }
-
-    #[inline]
-    pub fn file_id(&self) -> FileId {
-        self.inner.meta.file_id
-    }
-
-    #[inline]
-    pub fn time_range(&self) -> &Option<(Timestamp, Timestamp)> {
-        &self.inner.meta.time_range
-    }
-
-    /// Returns true if current file is under compaction.
-    #[inline]
-    pub fn compacting(&self) -> bool {
-        self.inner.compacting.load(Ordering::Relaxed)
-    }
-
-    /// Sets the compacting flag.
-    #[inline]
-    pub fn mark_compacting(&self, compacting: bool) {
-        self.inner.compacting.store(compacting, Ordering::Relaxed);
-    }
-
-    #[inline]
-    pub fn deleted(&self) -> bool {
-        self.inner.deleted.load(Ordering::Relaxed)
-    }
-
-    #[inline]
-    pub fn mark_deleted(&self) {
-        self.inner.deleted.store(true, Ordering::Relaxed);
-    }
-
-    #[inline]
-    pub fn meta(&self) -> FileMeta {
-        self.inner.meta.clone()
-    }
-
-    #[inline]
-    pub fn file_size(&self) -> u64 {
-        self.inner.meta.file_size
-    }
-}
-
-/// Actually data of [FileHandle].
-///
-/// Contains meta of the file, and other mutable info like metrics.
-struct FileHandleInner {
-    meta: FileMeta,
-    compacting: AtomicBool,
-    deleted: AtomicBool,
-    sst_layer: AccessLayerRef,
-    file_purger: FilePurgerRef,
-}
-
-impl fmt::Debug for FileHandleInner {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        f.debug_struct("FileHandleInner")
-            .field("meta", &self.meta)
-            .field("compacting", &self.compacting)
-            .field("deleted", &self.deleted)
-            .finish()
-    }
-}
-
-impl Drop for FileHandleInner {
-    fn drop(&mut self) {
-        if self.deleted.load(Ordering::Relaxed) {
-            let request = FilePurgeRequest {
-                sst_layer: self.sst_layer.clone(),
-                file_id: self.meta.file_id,
-                region_id: self.meta.region_id,
-            };
-            match self.file_purger.schedule(request) {
-                Ok(res) => {
-                    debug!(
-                        "Scheduled SST purge task, region: {}, name: {}, res: {}",
-                        self.meta.region_id,
-                        self.meta.file_id.as_parquet(),
-                        res
-                    );
-                }
-                Err(e) => {
-                    error!(e; "Failed to schedule SST purge task, region: {}, name: {}",
-                           self.meta.region_id, self.meta.file_id.as_parquet());
-                }
-            }
-        }
-    }
-}
-
-impl FileHandleInner {
-    fn new(
-        meta: FileMeta,
-        sst_layer: AccessLayerRef,
-        file_purger: FilePurgerRef,
-    ) -> FileHandleInner {
-        FileHandleInner {
-            meta,
-            compacting: AtomicBool::new(false),
-            deleted: AtomicBool::new(false),
-            sst_layer,
-            file_purger,
-        }
-    }
-}
-
-#[derive(Debug, Snafu, PartialEq)]
-pub struct ParseIdError {
-    source: uuid::Error,
-}
-
-/// Unique id for [SST File].
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
-pub struct FileId(Uuid);
-
-impl FileId {
-    /// Returns a new unique [FileId] randomly.
-    pub fn random() -> FileId {
-        FileId(Uuid::new_v4())
-    }
-
-    /// Parses id from string.
-    pub fn parse_str(input: &str) -> std::result::Result<FileId, ParseIdError> {
-        Uuid::parse_str(input).map(FileId).context(ParseIdSnafu)
-    }
-
-    /// Append `.parquet` to file id to make a complete file name
-    pub fn as_parquet(&self) -> String {
-        format!("{}{}", self.0.hyphenated(), ".parquet")
-    }
-}
-
-impl fmt::Display for FileId {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        write!(f, "{}", self.0)
-    }
-}
-
-impl FromStr for FileId {
-    type Err = ParseIdError;
-
-    fn from_str(s: &str) -> std::result::Result<FileId, ParseIdError> {
-        FileId::parse_str(s)
-    }
-}
-
-/// Immutable metadata of a sst file.
-#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
-#[serde(default)]
-pub struct FileMeta {
-    /// Region of file.
-    pub region_id: RegionId,
-    /// Compared to normal file names, FileId ignore the extension
-    #[serde(deserialize_with = "deserialize_from_string")]
-    #[serde(alias = "file_name")]
-    pub file_id: FileId,
-    /// Timestamp range of file.
-    pub time_range: Option<(Timestamp, Timestamp)>,
-    /// SST level of the file.
-    pub level: Level,
-    /// Size of the file.
-    pub file_size: u64,
-}
-
-fn deserialize_from_string<'de, D>(deserializer: D) -> std::result::Result<FileId, D::Error>
-where
-    D: Deserializer<'de>,
-{
-    let s: &str = Deserialize::deserialize(deserializer)?;
-    let stripped = s.strip_suffix(".parquet").unwrap_or(s); // strip parquet suffix if needed.
-    FileId::from_str(stripped).map_err(<D::Error as serde::de::Error>::custom)
-}
-
-#[derive(Debug)]
-pub struct WriteOptions {
-    // TODO(yingwen): [flush] row group size.
-    pub sst_write_buffer_size: ReadableSize,
-}
-
-impl Default for WriteOptions {
-    fn default() -> Self {
-        Self {
-            sst_write_buffer_size: ReadableSize::mb(8),
-        }
-    }
-}
-
-pub struct ReadOptions {
-    /// Suggested size of each batch.
-    pub batch_size: usize,
-    /// The schema that user expected to read, might not the same as the
-    /// schema of the SST file.
-    pub projected_schema: ProjectedSchemaRef,
-
-    pub predicate: Predicate,
-    pub time_range: TimestampRange,
-}
-
-#[derive(Debug, PartialEq)]
-pub struct SstInfo {
-    pub time_range: Option<(Timestamp, Timestamp)>,
-    pub file_size: u64,
-    pub num_rows: usize,
-}
-
-/// SST access layer.
-#[async_trait]
-pub trait AccessLayer: Send + Sync + std::fmt::Debug {
-    /// Returns the sst file path.
-    fn sst_file_path(&self, file_name: &str) -> String;
-
-    /// Writes SST file with given `file_id` and returns the SST info.
-    /// If source does not contain any data, `write_sst` will return `Ok(None)`.
-    async fn write_sst(
-        &self,
-        file_id: FileId,
-        source: Source,
-        opts: &WriteOptions,
-    ) -> Result<Option<SstInfo>>;
-
-    /// Read SST file with given `file_handle` and schema.
-    async fn read_sst(
-        &self,
-        file_handle: FileHandle,
-        opts: &ReadOptions,
-    ) -> Result<BoxedBatchReader>;
-
-    /// Deletes a SST file with given name.
-    async fn delete_sst(&self, file_id: FileId) -> Result<()>;
-}
-
-pub type AccessLayerRef = Arc<dyn AccessLayer>;
-
-/// Parquet writer data source.
-pub enum Source {
-    /// Writes rows from memtable to parquet
-    Iter(BoxedBatchIterator),
-    /// Writes row from ChunkReaderImpl (maybe a set of SSTs) to parquet.
-    Reader(ChunkReaderImpl),
-    /// Record batch stream yielded by table scan
-    Stream(SendableRecordBatchStream),
-}
-
-impl Source {
-    async fn next_batch(&mut self) -> Result<Option<Batch>> {
-        match self {
-            Source::Iter(iter) => iter.next().transpose(),
-            Source::Reader(reader) => reader
-                .next_chunk()
-                .await
-                .map(|p| p.map(|chunk| Batch::new(chunk.columns))),
-            Source::Stream(stream) => stream
-                .next()
-                .await
-                .transpose()
-                .map(|r| r.map(|r| Batch::new(r.columns().to_vec())))
-                .context(error::CreateRecordBatchSnafu),
-        }
-    }
-
-    fn schema(&self) -> SchemaRef {
-        match self {
-            Source::Iter(iter) => {
-                let projected_schema = iter.schema();
-                projected_schema.schema_to_read().schema().clone()
-            }
-            Source::Reader(reader) => reader.projected_schema().schema_to_read().schema().clone(),
-            Source::Stream(stream) => stream.schema(),
-        }
-    }
-}
-
-/// Sst access layer.
-pub struct FsAccessLayer {
-    sst_dir: String,
-    object_store: ObjectStore,
-}
-
-impl fmt::Debug for FsAccessLayer {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        f.debug_struct("FsAccessLayer")
-            .field("sst_dir", &self.sst_dir)
-            .finish()
-    }
-}
-
-impl FsAccessLayer {
-    pub fn new(sst_dir: &str, object_store: ObjectStore) -> FsAccessLayer {
-        FsAccessLayer {
-            sst_dir: util::normalize_dir(sst_dir),
-            object_store,
-        }
-    }
-}
-
-#[async_trait]
-impl AccessLayer for FsAccessLayer {
-    fn sst_file_path(&self, file_name: &str) -> String {
-        format!("{}{}", self.sst_dir, file_name)
-    }
-
-    /// Writes SST file with given `file_id`.
-    async fn write_sst(
-        &self,
-        file_id: FileId,
-        source: Source,
-        opts: &WriteOptions,
-    ) -> Result<Option<SstInfo>> {
-        // Now we only supports parquet format. We may allow caller to specific SST format in
-        // WriteOptions in the future.
-        let file_path = self.sst_file_path(&file_id.as_parquet());
-        let writer = ParquetWriter::new(&file_path, source, self.object_store.clone());
-        writer.write_sst(opts).await
-    }
-
-    /// Read SST file with given `file_handle` and schema.
-    async fn read_sst(
-        &self,
-        file_handle: FileHandle,
-        opts: &ReadOptions,
-    ) -> Result<BoxedBatchReader> {
-        let reader = ParquetReader::new(
-            file_handle,
-            self.object_store.clone(),
-            opts.projected_schema.clone(),
-            opts.predicate.clone(),
-            opts.time_range,
-        );
-
-        Ok(Box::new(LazyParquetBatchReader::new(reader)))
-    }
-
-    /// Deletes a SST file with given file id.
-    async fn delete_sst(&self, file_id: FileId) -> Result<()> {
-        let path = self.sst_file_path(&file_id.as_parquet());
-        self.object_store
-            .delete(&path)
-            .await
-            .context(DeleteSstSnafu)
-    }
-}
-
-struct LazyParquetBatchReader {
-    inner: ParquetReader,
-    stream: Option<ChunkStream>,
-}
-
-impl LazyParquetBatchReader {
-    fn new(inner: ParquetReader) -> Self {
-        Self {
-            inner,
-            stream: None,
-        }
-    }
-}
-
-#[async_trait]
-impl BatchReader for LazyParquetBatchReader {
-    async fn next_batch(&mut self) -> Result<Option<Batch>> {
-        if let Some(s) = &mut self.stream {
-            s.next_batch().await
-        } else {
-            let mut stream = self.inner.chunk_stream().await?;
-            let res = stream.next_batch().await;
-            self.stream = Some(stream);
-            res
-        }
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use std::collections::HashSet;
-
-    use super::*;
-    use crate::file_purger::noop::NoopFilePurgeHandler;
-    use crate::scheduler::{LocalScheduler, SchedulerConfig};
-
-    #[test]
-    fn test_file_id() {
-        let id = FileId::random();
-        let uuid_str = id.to_string();
-        assert_eq!(id.0.to_string(), uuid_str);
-
-        let parsed = FileId::parse_str(&uuid_str).unwrap();
-        assert_eq!(id, parsed);
-        let parsed = uuid_str.parse().unwrap();
-        assert_eq!(id, parsed);
-    }
-
-    #[test]
-    fn test_file_id_serialization() {
-        let id = FileId::random();
-        let json = serde_json::to_string(&id).unwrap();
-        assert_eq!(format!("\"{id}\""), json);
-
-        let parsed = serde_json::from_str(&json).unwrap();
-        assert_eq!(id, parsed);
-    }
-
-    #[test]
-    fn test_deserialize_file_meta() {
-        let file_meta = create_file_meta(FileId::random(), 0);
-        let serialized_file_meta = serde_json::to_string(&file_meta).unwrap();
-        let deserialized_file_meta = serde_json::from_str(&serialized_file_meta);
-        assert_eq!(file_meta, deserialized_file_meta.unwrap());
-    }
-
-    #[test]
-    fn test_deserialize_from_string() {
-        let json_file_meta = "{\"region_id\":0,\"file_id\":\"bc5896ec-e4d8-4017-a80d-f2de73188d55\",\"time_range\":null,\"level\":0}";
-        let file_meta = create_file_meta(
-            FileId::from_str("bc5896ec-e4d8-4017-a80d-f2de73188d55").unwrap(),
-            0,
-        );
-        let deserialized_file_meta: FileMeta = serde_json::from_str(json_file_meta).unwrap();
-        assert_eq!(file_meta, deserialized_file_meta);
-    }
-    #[test]
-    fn test_deserialize_from_string_parquet() {
-        let json_file_meta = "{\"region_id\":0,\"file_id\":\"bc5896ec-e4d8-4017-a80d-f2de73188d55.parquet\",\"time_range\":null,\"level\":0}";
-        let file_meta = create_file_meta(
-            FileId::from_str("bc5896ec-e4d8-4017-a80d-f2de73188d55").unwrap(),
-            0,
-        );
-        let deserialized_file_meta: FileMeta = serde_json::from_str(json_file_meta).unwrap();
-        assert_eq!(file_meta, deserialized_file_meta);
-    }
-
-    #[test]
-    fn test_deserialize_from_string_parquet_file_name() {
-        let json_file_meta = "{\"region_id\":0,\"file_name\":\"bc5896ec-e4d8-4017-a80d-f2de73188d55.parquet\",\"time_range\":null,\"level\":0}";
-        let file_meta = create_file_meta(
-            FileId::from_str("bc5896ec-e4d8-4017-a80d-f2de73188d55").unwrap(),
-            0,
-        );
-        let deserialized_file_meta: FileMeta = serde_json::from_str(json_file_meta).unwrap();
-        assert_eq!(file_meta, deserialized_file_meta);
-    }
-
-    #[test]
-    fn test_file_id_as_parquet() {
-        let id = FileId::from_str("67e55044-10b1-426f-9247-bb680e5fe0c8").unwrap();
-        assert_eq!(
-            "67e55044-10b1-426f-9247-bb680e5fe0c8.parquet",
-            id.as_parquet()
-        );
-    }
-
-    fn create_file_meta(file_id: FileId, level: Level) -> FileMeta {
-        FileMeta {
-            region_id: 0.into(),
-            file_id,
-            time_range: None,
-            level,
-            file_size: 0,
-        }
-    }
-
-    #[test]
-    fn test_level_metas_add_and_remove() {
-        let layer = Arc::new(crate::test_util::access_layer_util::MockAccessLayer {});
-        let purger = Arc::new(LocalScheduler::new(
-            SchedulerConfig::default(),
-            NoopFilePurgeHandler,
-        ));
-        let file_ids = [
-            FileId::random(),
-            FileId::random(),
-            FileId::random(),
-            FileId::random(),
-        ];
-
-        let metas = LevelMetas::new(layer, purger);
-        let merged = metas.merge(
-            vec![
-                create_file_meta(file_ids[0], 0),
-                create_file_meta(file_ids[1], 0),
-            ]
-            .into_iter(),
-            vec![].into_iter(),
-            None,
-        );
-
-        assert_eq!(
-            HashSet::from([file_ids[0], file_ids[1]]),
-            merged.level(0).files().map(|f| f.file_id()).collect()
-        );
-
-        let merged1 = merged.merge(
-            vec![
-                create_file_meta(file_ids[2], 1),
-                create_file_meta(file_ids[3], 1),
-            ]
-            .into_iter(),
-            vec![].into_iter(),
-            None,
-        );
-        assert_eq!(
-            HashSet::from([file_ids[0], file_ids[1]]),
-            merged1.level(0).files().map(|f| f.file_id()).collect()
-        );
-
-        assert_eq!(
-            HashSet::from([file_ids[2], file_ids[3]]),
-            merged1.level(1).files().map(|f| f.file_id()).collect()
-        );
-
-        let removed1 = merged1.merge(
-            vec![].into_iter(),
-            vec![
-                create_file_meta(file_ids[0], 0),
-                create_file_meta(file_ids[2], 0),
-            ]
-            .into_iter(),
-            None,
-        );
-        assert_eq!(
-            HashSet::from([file_ids[1]]),
-            removed1.level(0).files().map(|f| f.file_id()).collect()
-        );
-
-        assert_eq!(
-            HashSet::from([file_ids[2], file_ids[3]]),
-            removed1.level(1).files().map(|f| f.file_id()).collect()
-        );
-
-        let removed2 = removed1.merge(
-            vec![].into_iter(),
-            vec![
-                create_file_meta(file_ids[2], 1),
-                create_file_meta(file_ids[3], 1),
-            ]
-            .into_iter(),
-            None,
-        );
-        assert_eq!(
-            HashSet::from([file_ids[1]]),
-            removed2.level(0).files().map(|f| f.file_id()).collect()
-        );
-
-        assert_eq!(
-            HashSet::new(),
-            removed2.level(1).files().map(|f| f.file_id()).collect()
-        );
-    }
-}
--- a/src/storage/src/sst/parquet.rs
+++ b/src/storage/src/sst/parquet.rs
@@ -1,819 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-//! Parquet sst format.
-
-use std::collections::HashMap;
-use std::pin::Pin;
-use std::sync::Arc;
-
-use async_compat::CompatExt;
-use async_stream::try_stream;
-use async_trait::async_trait;
-use common_telemetry::{debug, error};
-use common_time::range::TimestampRange;
-use common_time::Timestamp;
-use datatypes::arrow::record_batch::RecordBatch;
-use datatypes::prelude::ConcreteDataType;
-use futures_util::{Stream, StreamExt, TryStreamExt};
-use object_store::ObjectStore;
-use parquet::arrow::{ParquetRecordBatchStreamBuilder, ProjectionMask};
-use parquet::basic::{Compression, Encoding, ZstdLevel};
-use parquet::file::metadata::KeyValue;
-use parquet::file::properties::WriterProperties;
-use parquet::format::FileMetaData;
-use parquet::schema::types::ColumnPath;
-use snafu::{OptionExt, ResultExt};
-use store_api::storage::consts::SEQUENCE_COLUMN_NAME;
-use table::predicate::Predicate;
-use tokio::io::BufReader;
-
-use crate::error::{self, DecodeParquetTimeRangeSnafu, ReadObjectSnafu, ReadParquetSnafu, Result};
-use crate::read::{Batch, BatchReader};
-use crate::schema::compat::ReadAdapter;
-use crate::schema::{ProjectedSchemaRef, StoreSchema};
-use crate::sst;
-use crate::sst::pruning::build_row_filter;
-use crate::sst::stream_writer::BufferedWriter;
-use crate::sst::{FileHandle, Source, SstInfo};
-
-/// Parquet sst writer.
-pub struct ParquetWriter<'a> {
-    file_path: &'a str,
-    source: Source,
-    object_store: ObjectStore,
-    max_row_group_size: usize,
-}
-
-impl<'a> ParquetWriter<'a> {
-    pub fn new(file_path: &'a str, source: Source, object_store: ObjectStore) -> ParquetWriter {
-        ParquetWriter {
-            file_path,
-            source,
-            object_store,
-            max_row_group_size: 4096, // TODO(hl): make this configurable
-        }
-    }
-
-    pub async fn write_sst(self, opts: &sst::WriteOptions) -> Result<Option<SstInfo>> {
-        self.write_rows(None, opts).await
-    }
-
-    /// Iterates memtable and writes rows to Parquet file.
-    /// A chunk of records yielded from each iteration with a size given
-    /// in config will be written to a single row group.
-    async fn write_rows(
-        mut self,
-        extra_meta: Option<HashMap<String, String>>,
-        opts: &sst::WriteOptions,
-    ) -> Result<Option<SstInfo>> {
-        let schema = self.source.schema();
-
-        let mut props_builder = WriterProperties::builder()
-            .set_compression(Compression::ZSTD(ZstdLevel::default()))
-            .set_encoding(Encoding::PLAIN)
-            .set_max_row_group_size(self.max_row_group_size)
-            .set_key_value_metadata(extra_meta.map(|map| {
-                map.iter()
-                    .map(|(k, v)| KeyValue::new(k.clone(), v.clone()))
-                    .collect::<Vec<_>>()
-            }))
-            .set_column_encoding(
-                ColumnPath::new(vec![SEQUENCE_COLUMN_NAME.to_string()]),
-                Encoding::DELTA_BINARY_PACKED,
-            )
-            .set_column_dictionary_enabled(
-                ColumnPath::new(vec![SEQUENCE_COLUMN_NAME.to_string()]),
-                false,
-            );
-
-        if let Some(ts_col) = schema.timestamp_column() {
-            props_builder = props_builder.set_column_encoding(
-                ColumnPath::new(vec![ts_col.name.clone()]),
-                Encoding::DELTA_BINARY_PACKED,
-            );
-        }
-
-        let writer_props = props_builder.build();
-
-        let mut buffered_writer = BufferedWriter::try_new(
-            self.file_path.to_string(),
-            self.object_store.clone(),
-            &schema,
-            Some(writer_props),
-            opts.sst_write_buffer_size.as_bytes() as usize,
-        )
-        .await?;
-        let mut rows_written = 0;
-
-        while let Some(batch) = self.source.next_batch().await? {
-            buffered_writer.write(&batch).await?;
-            rows_written += batch.num_rows();
-        }
-
-        if rows_written == 0 {
-            debug!("No data written, try abort writer: {}", self.file_path);
-            let _ = buffered_writer.close().await?;
-            return Ok(None);
-        }
-
-        let (file_meta, file_size) = buffered_writer.close().await?;
-        let time_range = decode_timestamp_range(&file_meta, &schema).ok().flatten();
-
-        // object_store.write will make sure all bytes are written or an error is raised.
-        Ok(Some(SstInfo {
-            time_range,
-            file_size,
-            num_rows: rows_written,
-        }))
-    }
-}
-
-fn decode_timestamp_range(
-    file_meta: &FileMetaData,
-    schema: &datatypes::schema::SchemaRef,
-) -> Result<Option<(Timestamp, Timestamp)>> {
-    let (Some(ts_col_idx), Some(ts_col)) = (schema.timestamp_index(), schema.timestamp_column())
-    else {
-        return Ok(None);
-    };
-    let ts_datatype = &ts_col.data_type;
-    decode_timestamp_range_inner(file_meta, ts_col_idx, ts_datatype)
-}
-
-fn decode_timestamp_range_inner(
-    file_meta: &FileMetaData,
-    ts_index: usize,
-    ts_datatype: &ConcreteDataType,
-) -> Result<Option<(Timestamp, Timestamp)>> {
-    let mut start = i64::MAX;
-    let mut end = i64::MIN;
-
-    let unit = match ts_datatype {
-        ConcreteDataType::Timestamp(type_) => type_.unit(),
-        _ => {
-            return DecodeParquetTimeRangeSnafu {
-                msg: format!("Unexpected timestamp column datatype: {ts_datatype:?}"),
-            }
-            .fail();
-        }
-    };
-
-    for rg in &file_meta.row_groups {
-        let Some(ref metadata) = rg
-            .columns
-            .get(ts_index)
-            .context(DecodeParquetTimeRangeSnafu {
-                msg: format!("Cannot find ts column by index: {ts_index}"),
-            })?
-            .meta_data
-        else {
-            return Ok(None);
-        };
-        let Some(stats) = &metadata.statistics else {
-            return Ok(None);
-        };
-        let (Some(min_value), Some(max_value)) = (&stats.min_value, &stats.max_value) else {
-            return Ok(None);
-        };
-
-        // according to [parquet's spec](https://parquet.apache.org/docs/file-format/data-pages/encodings/), min/max value in stats uses plain encoding with little endian.
-        // also see https://github.com/apache/arrow-rs/blob/5fb337db04a1a19f7d40da46f19b7b5fd4051593/parquet/src/file/statistics.rs#L172
-        let min = i64::from_le_bytes(min_value[..8].try_into().map_err(|e| {
-            error!(
-                "Failed to decode min value from stats, bytes: {:?}, source: {:?}",
-                min_value, e
-            );
-            DecodeParquetTimeRangeSnafu {
-                msg: "decode min value",
-            }
-            .build()
-        })?);
-        let max = i64::from_le_bytes(max_value[..8].try_into().map_err(|e| {
-            error!(
-                "Failed to decode max value from stats, bytes: {:?}, source: {:?}",
-                max_value, e
-            );
-            DecodeParquetTimeRangeSnafu {
-                msg: "decode max value",
-            }
-            .build()
-        })?);
-        start = start.min(min);
-        end = end.max(max);
-    }
-
-    assert!(
-        start <= end,
-        "Illegal timestamp range decoded from SST file {:?}, start: {}, end: {}",
-        file_meta,
-        start,
-        end
-    );
-    Ok(Some((
-        Timestamp::new(start, unit),
-        Timestamp::new(end, unit),
-    )))
-}
-
-pub struct ParquetReader {
-    // Holds the file handle to avoid the file purge purge it.
-    file_handle: FileHandle,
-    object_store: ObjectStore,
-    projected_schema: ProjectedSchemaRef,
-    predicate: Predicate,
-    time_range: TimestampRange,
-}
-
-impl ParquetReader {
-    pub fn new(
-        file_handle: FileHandle,
-        object_store: ObjectStore,
-        projected_schema: ProjectedSchemaRef,
-        predicate: Predicate,
-        time_range: TimestampRange,
-    ) -> ParquetReader {
-        ParquetReader {
-            file_handle,
-            object_store,
-            projected_schema,
-            predicate,
-            time_range,
-        }
-    }
-
-    pub async fn chunk_stream(&self) -> Result<ChunkStream> {
-        let file_path = self.file_handle.file_path();
-        let operator = self.object_store.clone();
-
-        let reader = operator
-            .reader(&file_path)
-            .await
-            .context(ReadObjectSnafu { path: &file_path })?
-            .compat();
-        let buf_reader = BufReader::new(reader);
-        let builder = ParquetRecordBatchStreamBuilder::new(buf_reader)
-            .await
-            .context(ReadParquetSnafu { file: &file_path })?;
-        let arrow_schema = builder.schema().clone();
-
-        let store_schema = Arc::new(
-            StoreSchema::try_from(arrow_schema)
-                .context(error::ConvertStoreSchemaSnafu { file: &file_path })?,
-        );
-
-        let adapter = ReadAdapter::new(store_schema.clone(), self.projected_schema.clone())?;
-
-        let pruned_row_groups = self
-            .predicate
-            .prune_row_groups(
-                builder.metadata().row_groups(),
-                store_schema.schema().clone(),
-            )
-            .into_iter()
-            .enumerate()
-            .filter_map(|(idx, valid)| if valid { Some(idx) } else { None })
-            .collect::<Vec<_>>();
-
-        let parquet_schema_desc = builder.metadata().file_metadata().schema_descr_ptr();
-
-        let projection_mask = ProjectionMask::roots(&parquet_schema_desc, adapter.fields_to_read());
-        let mut builder = builder
-            .with_projection(projection_mask.clone())
-            .with_row_groups(pruned_row_groups);
-
-        if let Some(row_filter) = build_row_filter(
-            self.time_range,
-            &self.predicate,
-            &store_schema,
-            &parquet_schema_desc,
-            projection_mask,
-        ) {
-            builder = builder.with_row_filter(row_filter);
-        }
-
-        let mut stream = builder
-            .build()
-            .context(ReadParquetSnafu { file: &file_path })?;
-
-        let chunk_stream = try_stream!({
-            while let Some(res) = stream.next().await {
-                yield res.context(ReadParquetSnafu { file: &file_path })?
-            }
-        });
-
-        ChunkStream::new(self.file_handle.clone(), adapter, Box::pin(chunk_stream))
-    }
-}
-
-pub type SendableChunkStream = Pin<Box<dyn Stream<Item = Result<RecordBatch>> + Send>>;
-
-pub struct ChunkStream {
-    // Holds the file handle in the stream to avoid the purger purge it.
-    _file_handle: FileHandle,
-    adapter: ReadAdapter,
-    stream: SendableChunkStream,
-}
-
-impl ChunkStream {
-    pub fn new(
-        file_handle: FileHandle,
-        adapter: ReadAdapter,
-        stream: SendableChunkStream,
-    ) -> Result<Self> {
-        Ok(Self {
-            _file_handle: file_handle,
-            adapter,
-            stream,
-        })
-    }
-}
-
-#[async_trait]
-impl BatchReader for ChunkStream {
-    async fn next_batch(&mut self) -> Result<Option<Batch>> {
-        self.stream
-            .try_next()
-            .await?
-            .map(|rb| self.adapter.arrow_record_batch_to_batch(&rb))
-            .transpose()
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use std::ops::Range;
-    use std::sync::Arc;
-
-    use api::v1::OpType;
-    use common_base::readable_size::ReadableSize;
-    use common_test_util::temp_dir::create_temp_dir;
-    use common_time::timestamp::TimeUnit;
-    use datatypes::arrow::array::{Array, UInt64Array, UInt8Array};
-    use datatypes::prelude::{ScalarVector, Vector};
-    use datatypes::types::{TimestampMillisecondType, TimestampType};
-    use datatypes::vectors::TimestampMillisecondVector;
-    use object_store::services::Fs;
-
-    use super::*;
-    use crate::file_purger::noop::new_noop_file_purger;
-    use crate::memtable::{
-        tests as memtable_tests, DefaultMemtableBuilder, IterContext, MemtableBuilder,
-    };
-    use crate::schema::ProjectedSchema;
-    use crate::sst::{FileId, FileMeta};
-
-    fn create_object_store(root: &str) -> ObjectStore {
-        let mut builder = Fs::default();
-        let _ = builder.root(root);
-        ObjectStore::new(builder).unwrap().finish()
-    }
-
-    #[tokio::test]
-    async fn test_parquet_writer() {
-        common_telemetry::init_default_ut_logging();
-        let schema = memtable_tests::schema_for_test();
-        let memtable = DefaultMemtableBuilder::default().build(schema);
-
-        memtable_tests::write_kvs(
-            &*memtable,
-            10, // sequence
-            OpType::Put,
-            &[1000, 1002, 2002, 2003, 2003, 1001], // keys
-            &[
-                (Some(1), Some(1234)),
-                (Some(2), Some(1234)),
-                (Some(7), Some(1234)),
-                (Some(8), Some(1234)),
-                (Some(9), Some(1234)),
-                (Some(3), Some(1234)),
-            ], // values
-        );
-
-        let dir = create_temp_dir("write_parquet");
-        let path = dir.path().to_str().unwrap();
-
-        let object_store = create_object_store(path);
-        let sst_file_name = "test-flush.parquet";
-        let iter = memtable.iter(IterContext::default()).unwrap();
-        let writer = ParquetWriter::new(sst_file_name, Source::Iter(iter), object_store.clone());
-
-        assert!(writer
-            .write_sst(&sst::WriteOptions::default())
-            .await
-            .is_ok());
-
-        // verify parquet file
-        let reader = BufReader::new(object_store.reader(sst_file_name).await.unwrap().compat());
-
-        let builder = ParquetRecordBatchStreamBuilder::new(reader).await.unwrap();
-
-        let mut stream = builder.build().unwrap();
-        // chunk schema: timestamp, v1, __sequence, __op_type
-        let chunk = stream.next().await.unwrap().unwrap();
-        assert_eq!(5, chunk.columns().len());
-
-        // timestamp
-        assert_eq!(
-            &TimestampMillisecondVector::from_slice([
-                1000.into(),
-                1001.into(),
-                1002.into(),
-                2002.into(),
-                2003.into(),
-            ])
-            .to_arrow_array(),
-            chunk.column(0)
-        );
-
-        // v0
-        assert_eq!(
-            &(Arc::new(UInt64Array::from(vec![1, 3, 2, 7, 9])) as Arc<dyn Array>),
-            chunk.column(1)
-        );
-
-        // v1
-        assert_eq!(
-            &(Arc::new(UInt64Array::from(vec![1234; 5])) as Arc<dyn Array>),
-            chunk.column(2)
-        );
-
-        // sequence
-        assert_eq!(
-            &(Arc::new(UInt64Array::from(vec![10; 5])) as Arc<dyn Array>),
-            chunk.column(3)
-        );
-
-        // op_type
-        assert_eq!(
-            &(Arc::new(UInt8Array::from(vec![1; 5])) as Arc<dyn Array>),
-            chunk.column(4)
-        );
-    }
-
-    #[tokio::test]
-    async fn test_write_large_data() {
-        common_telemetry::init_default_ut_logging();
-        let schema = memtable_tests::schema_for_test();
-        let memtable = DefaultMemtableBuilder::default().build(schema);
-
-        let mut rows_written = 0;
-        for i in 0..16 {
-            let range: Range<i64> = i * 1024..(i + 1) * 1024;
-            let keys = range.clone().collect::<Vec<_>>();
-            let values = range
-                .map(|idx| (Some(idx as u64), Some(idx as u64)))
-                .collect::<Vec<_>>();
-            memtable_tests::write_kvs(&*memtable, i as u64, OpType::Put, &keys, &values);
-            rows_written += keys.len();
-        }
-
-        let dir = create_temp_dir("write_large_parquet");
-        let path = dir.path().to_str().unwrap();
-
-        let object_store = create_object_store(path);
-        let sst_file_name = "test-large.parquet";
-        let iter = memtable.iter(IterContext::default()).unwrap();
-        let writer = ParquetWriter::new(sst_file_name, Source::Iter(iter), object_store.clone());
-
-        let sst_info = writer
-            .write_sst(&sst::WriteOptions {
-                sst_write_buffer_size: ReadableSize::kb(4),
-            })
-            .await
-            .unwrap()
-            .unwrap();
-        let file_meta = object_store.stat(sst_file_name).await.unwrap();
-        assert!(file_meta.is_file());
-        assert_eq!(sst_info.file_size, file_meta.content_length());
-        assert_eq!(rows_written, sst_info.num_rows);
-    }
-
-    #[tokio::test]
-    async fn test_parquet_read_large_batch() {
-        common_telemetry::init_default_ut_logging();
-        let schema = memtable_tests::schema_for_test();
-        let memtable = DefaultMemtableBuilder::default().build(schema.clone());
-
-        let rows_total = 4096 * 4;
-        let mut keys_vec = Vec::with_capacity(rows_total);
-        let mut values_vec = Vec::with_capacity(rows_total);
-
-        for i in 0..rows_total {
-            keys_vec.push(i as i64);
-            values_vec.push((Some(i as u64), Some(i as u64)));
-        }
-
-        memtable_tests::write_kvs(
-            &*memtable,
-            10, // sequence
-            OpType::Put,
-            &keys_vec,   // keys
-            &values_vec, // values
-        );
-
-        let dir = create_temp_dir("write_parquet");
-        let path = dir.path().to_str().unwrap();
-        let object_store = create_object_store(path);
-        let sst_file_handle = new_file_handle(FileId::random());
-        let sst_file_name = sst_file_handle.file_name();
-        let iter = memtable.iter(IterContext::default()).unwrap();
-        let writer = ParquetWriter::new(&sst_file_name, Source::Iter(iter), object_store.clone());
-
-        let SstInfo {
-            time_range,
-            file_size,
-            ..
-        } = writer
-            .write_sst(&sst::WriteOptions::default())
-            .await
-            .unwrap()
-            .unwrap();
-
-        assert_eq!(
-            Some((
-                Timestamp::new_millisecond(0),
-                Timestamp::new_millisecond((rows_total - 1) as i64)
-            )),
-            time_range
-        );
-        assert_ne!(file_size, 0);
-        let operator = create_object_store(dir.path().to_str().unwrap());
-
-        let projected_schema = Arc::new(ProjectedSchema::new(schema, Some(vec![1])).unwrap());
-        let reader = ParquetReader::new(
-            sst_file_handle,
-            operator,
-            projected_schema,
-            Predicate::empty(),
-            TimestampRange::min_to_max(),
-        );
-
-        let mut rows_fetched = 0;
-        let mut stream = reader.chunk_stream().await.unwrap();
-        while let Some(res) = stream.next_batch().await.unwrap() {
-            rows_fetched += res.num_rows();
-        }
-        assert_eq!(rows_total, rows_fetched);
-    }
-
-    fn new_file_handle(file_id: FileId) -> FileHandle {
-        let file_purger = new_noop_file_purger();
-        let layer = Arc::new(crate::test_util::access_layer_util::MockAccessLayer {});
-        FileHandle::new(
-            FileMeta {
-                region_id: 0.into(),
-                file_id,
-                time_range: Some((
-                    Timestamp::new_millisecond(0),
-                    Timestamp::new_millisecond(1000),
-                )),
-                level: 0,
-                file_size: 0,
-            },
-            layer,
-            file_purger,
-        )
-    }
-
-    #[tokio::test]
-    async fn test_parquet_reader() {
-        common_telemetry::init_default_ut_logging();
-        let schema = memtable_tests::schema_for_test();
-        let memtable = DefaultMemtableBuilder::default().build(schema.clone());
-
-        memtable_tests::write_kvs(
-            &*memtable,
-            10, // sequence
-            OpType::Put,
-            &[1000, 1002, 2002, 2003, 2003, 1001], // keys
-            &[
-                (Some(1), Some(1234)),
-                (Some(2), Some(1234)),
-                (Some(7), Some(1234)),
-                (Some(8), Some(1234)),
-                (Some(9), Some(1234)),
-                (Some(3), Some(1234)),
-            ], // values
-        );
-
-        let dir = create_temp_dir("write_parquet");
-        let path = dir.path().to_str().unwrap();
-
-        let object_store = create_object_store(path);
-        let file_handle = new_file_handle(FileId::random());
-        let sst_file_name = file_handle.file_name();
-        let iter = memtable.iter(IterContext::default()).unwrap();
-        let writer = ParquetWriter::new(&sst_file_name, Source::Iter(iter), object_store.clone());
-
-        let SstInfo {
-            time_range,
-            file_size,
-            ..
-        } = writer
-            .write_sst(&sst::WriteOptions::default())
-            .await
-            .unwrap()
-            .unwrap();
-
-        assert_eq!(
-            Some((
-                Timestamp::new_millisecond(1000),
-                Timestamp::new_millisecond(2003)
-            )),
-            time_range
-        );
-        assert_ne!(file_size, 0);
-        let operator = create_object_store(dir.path().to_str().unwrap());
-
-        let projected_schema = Arc::new(ProjectedSchema::new(schema, Some(vec![1])).unwrap());
-        let reader = ParquetReader::new(
-            file_handle,
-            operator,
-            projected_schema,
-            Predicate::empty(),
-            TimestampRange::min_to_max(),
-        );
-
-        let mut stream = reader.chunk_stream().await.unwrap();
-        assert_eq!(
-            5,
-            stream
-                .next_batch()
-                .await
-                .transpose()
-                .unwrap()
-                .unwrap()
-                .num_rows()
-        );
-    }
-
-    async fn check_range_read(
-        file_handle: FileHandle,
-        object_store: ObjectStore,
-        schema: ProjectedSchemaRef,
-        range: TimestampRange,
-        expect: Vec<i64>,
-    ) {
-        let reader =
-            ParquetReader::new(file_handle, object_store, schema, Predicate::empty(), range);
-        let mut stream = reader.chunk_stream().await.unwrap();
-        let result = stream.next_batch().await;
-
-        let Some(batch) = result.unwrap() else {
-            // if batch does not contain any row
-            assert!(expect.is_empty());
-            return;
-        };
-
-        assert_eq!(
-            ConcreteDataType::Timestamp(TimestampType::Millisecond(TimestampMillisecondType)),
-            batch.column(0).data_type()
-        );
-
-        let ts = batch
-            .column(0)
-            .as_any()
-            .downcast_ref::<TimestampMillisecondVector>()
-            .unwrap()
-            .iter_data()
-            .map(|t| t.unwrap().0.value())
-            .collect::<Vec<_>>();
-        assert_eq!(expect, ts);
-    }
-
-    #[tokio::test]
-    async fn test_parquet_reader_with_time_range_filter() {
-        common_telemetry::init_default_ut_logging();
-        let schema = memtable_tests::schema_for_test();
-        let memtable = DefaultMemtableBuilder::default().build(schema.clone());
-
-        memtable_tests::write_kvs(
-            &*memtable,
-            10, // sequence
-            OpType::Put,
-            &[1000, 1002, 2002, 2003, 2003, 1001, 3001], // keys
-            &[
-                (Some(1), Some(1234)),
-                (Some(2), Some(1234)),
-                (Some(7), Some(1234)),
-                (Some(8), Some(1234)),
-                (Some(9), Some(1234)),
-                (Some(3), Some(1234)),
-                (Some(7), Some(1234)),
-            ], // values
-        );
-
-        let dir = create_temp_dir("read-parquet-by-range");
-        let path = dir.path().to_str().unwrap();
-        let object_store = create_object_store(path);
-        let sst_file_handle = new_file_handle(FileId::random());
-        let sst_file_name = sst_file_handle.file_name();
-        let iter = memtable.iter(IterContext::default()).unwrap();
-        let writer = ParquetWriter::new(&sst_file_name, Source::Iter(iter), object_store.clone());
-
-        let SstInfo {
-            time_range,
-            file_size,
-            ..
-        } = writer
-            .write_sst(&sst::WriteOptions::default())
-            .await
-            .unwrap()
-            .unwrap();
-
-        assert_eq!(
-            Some((
-                Timestamp::new_millisecond(1000),
-                Timestamp::new_millisecond(3001)
-            )),
-            time_range
-        );
-        assert_ne!(file_size, 0);
-
-        let projected_schema = Arc::new(ProjectedSchema::new(schema, Some(vec![1, 0, 2])).unwrap());
-
-        check_range_read(
-            sst_file_handle.clone(),
-            object_store.clone(),
-            projected_schema.clone(),
-            TimestampRange::with_unit(1000, 2003, TimeUnit::Millisecond).unwrap(),
-            vec![1000, 1001, 1002, 2002],
-        )
-        .await;
-
-        check_range_read(
-            sst_file_handle.clone(),
-            object_store.clone(),
-            projected_schema.clone(),
-            TimestampRange::with_unit(2002, 3001, TimeUnit::Millisecond).unwrap(),
-            vec![2002, 2003],
-        )
-        .await;
-
-        // read a range without any rows.
-        check_range_read(
-            sst_file_handle.clone(),
-            object_store.clone(),
-            projected_schema.clone(),
-            TimestampRange::with_unit(3002, 3003, TimeUnit::Millisecond).unwrap(),
-            vec![],
-        )
-        .await;
-
-        //
-        check_range_read(
-            sst_file_handle.clone(),
-            object_store.clone(),
-            projected_schema.clone(),
-            TimestampRange::with_unit(1000, 3000, TimeUnit::Millisecond).unwrap(),
-            vec![1000, 1001, 1002, 2002, 2003],
-        )
-        .await;
-
-        // read full range
-        check_range_read(
-            sst_file_handle,
-            object_store,
-            projected_schema,
-            TimestampRange::min_to_max(),
-            vec![1000, 1001, 1002, 2002, 2003, 3001],
-        )
-        .await;
-    }
-
-    #[tokio::test]
-    async fn test_write_empty_file() {
-        common_telemetry::init_default_ut_logging();
-        let schema = memtable_tests::schema_for_test();
-        let memtable = DefaultMemtableBuilder::default().build(schema.clone());
-
-        let dir = create_temp_dir("write-empty-file");
-        let path = dir.path().to_str().unwrap();
-        let mut builder = Fs::default();
-        let _ = builder.root(path);
-        let object_store = ObjectStore::new(builder).unwrap().finish();
-        let sst_file_name = "test-empty.parquet";
-        let iter = memtable.iter(IterContext::default()).unwrap();
-        let writer = ParquetWriter::new(sst_file_name, Source::Iter(iter), object_store.clone());
-
-        let sst_info_opt = writer
-            .write_sst(&sst::WriteOptions::default())
-            .await
-            .unwrap();
-        assert!(sst_info_opt.is_none());
-        // The file should not exist when no row has been written.
-        assert!(!object_store.is_exist(sst_file_name).await.unwrap());
-    }
-}
--- a/Show More
+++ b/Show More