mirror of
https://github.com/GreptimeTeam/greptimedb.git
synced 2026-05-26 09:50:40 +00:00
refactor: Remove usages of the old storage crate (#2777)
* chore: remove storage from some crate * feat: remove storage config * feat: remove storage from cmd * feat: impl stream_to_parquet * feat: remove storage from operator * feat: remove stream writer from mito2 * feat: remove storage from project toml * test: fix config api test * docs: remove outdated configs * refactor: remove storage directory
This commit is contained in:
119
Cargo.lock
generated
119
Cargo.lock
generated
@@ -641,12 +641,6 @@ version = "0.5.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c59bdb34bc650a32731b31bd8f0829cc15d24a708ee31559e0bb34f2bc320cba"
|
||||
|
||||
[[package]]
|
||||
name = "atomic_float"
|
||||
version = "0.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "62af46d040ba9df09edc6528dae9d8e49f5f3e82f55b7d2ec31a733c38dbc49d"
|
||||
|
||||
[[package]]
|
||||
name = "atty"
|
||||
version = "0.2.14"
|
||||
@@ -1205,7 +1199,6 @@ dependencies = [
|
||||
"serde_json",
|
||||
"session",
|
||||
"snafu",
|
||||
"storage",
|
||||
"store-api",
|
||||
"table",
|
||||
"tokio",
|
||||
@@ -1628,11 +1621,13 @@ dependencies = [
|
||||
"common-runtime",
|
||||
"common-test-util",
|
||||
"datafusion",
|
||||
"datatypes",
|
||||
"derive_builder 0.12.0",
|
||||
"futures",
|
||||
"lazy_static",
|
||||
"object-store",
|
||||
"orc-rust",
|
||||
"parquet",
|
||||
"paste",
|
||||
"regex",
|
||||
"serde",
|
||||
@@ -1722,7 +1717,7 @@ dependencies = [
|
||||
"common-runtime",
|
||||
"common-telemetry",
|
||||
"common-time",
|
||||
"criterion 0.4.0",
|
||||
"criterion",
|
||||
"dashmap",
|
||||
"datafusion",
|
||||
"datatypes",
|
||||
@@ -2142,32 +2137,6 @@ dependencies = [
|
||||
"cfg-if 1.0.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "criterion"
|
||||
version = "0.3.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b01d6de93b2b6c65e17c634a26653a29d107b3c98c607c765bf38d041531cd8f"
|
||||
dependencies = [
|
||||
"atty",
|
||||
"cast",
|
||||
"clap 2.34.0",
|
||||
"criterion-plot 0.4.5",
|
||||
"csv",
|
||||
"itertools 0.10.5",
|
||||
"lazy_static",
|
||||
"num-traits",
|
||||
"oorandom",
|
||||
"plotters",
|
||||
"rayon",
|
||||
"regex",
|
||||
"serde",
|
||||
"serde_cbor",
|
||||
"serde_derive",
|
||||
"serde_json",
|
||||
"tinytemplate",
|
||||
"walkdir",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "criterion"
|
||||
version = "0.4.0"
|
||||
@@ -2179,7 +2148,7 @@ dependencies = [
|
||||
"cast",
|
||||
"ciborium",
|
||||
"clap 3.2.25",
|
||||
"criterion-plot 0.5.0",
|
||||
"criterion-plot",
|
||||
"futures",
|
||||
"itertools 0.10.5",
|
||||
"lazy_static",
|
||||
@@ -2196,16 +2165,6 @@ dependencies = [
|
||||
"walkdir",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "criterion-plot"
|
||||
version = "0.4.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2673cc8207403546f45f5fd319a974b1e6983ad1a3ee7e6041650013be041876"
|
||||
dependencies = [
|
||||
"cast",
|
||||
"itertools 0.10.5",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "criterion-plot"
|
||||
version = "0.5.0"
|
||||
@@ -2681,7 +2640,6 @@ dependencies = [
|
||||
"session",
|
||||
"snafu",
|
||||
"sql",
|
||||
"storage",
|
||||
"store-api",
|
||||
"substrait 0.4.3",
|
||||
"table",
|
||||
@@ -3313,7 +3271,6 @@ dependencies = [
|
||||
"snafu",
|
||||
"sql",
|
||||
"sqlparser 0.38.0 (git+https://github.com/GreptimeTeam/sqlparser-rs.git?rev=0fbae07d0c46dc18e3381c406d8b9b8abef6b1fd)",
|
||||
"storage",
|
||||
"store-api",
|
||||
"strfmt",
|
||||
"substrait 0.4.3",
|
||||
@@ -5569,7 +5526,6 @@ dependencies = [
|
||||
"snafu",
|
||||
"sql",
|
||||
"sqlparser 0.38.0 (git+https://github.com/GreptimeTeam/sqlparser-rs.git?rev=0fbae07d0c46dc18e3381c406d8b9b8abef6b1fd)",
|
||||
"storage",
|
||||
"store-api",
|
||||
"substrait 0.4.3",
|
||||
"table",
|
||||
@@ -7966,7 +7922,7 @@ dependencies = [
|
||||
"common-test-util",
|
||||
"common-time",
|
||||
"console",
|
||||
"criterion 0.4.0",
|
||||
"criterion",
|
||||
"crossbeam-utils",
|
||||
"datafusion",
|
||||
"datafusion-common",
|
||||
@@ -7998,7 +7954,6 @@ dependencies = [
|
||||
"session",
|
||||
"snafu",
|
||||
"sql",
|
||||
"storage",
|
||||
"store-api",
|
||||
"table",
|
||||
"tokio",
|
||||
@@ -8078,16 +8033,6 @@ dependencies = [
|
||||
"serde_derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_cbor"
|
||||
version = "0.11.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2bef2ebfde456fb76bbcf9f59315333decc4fda0b2b44b420243c11e0f5ec1f5"
|
||||
dependencies = [
|
||||
"half 1.8.2",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_derive"
|
||||
version = "1.0.190"
|
||||
@@ -8829,60 +8774,6 @@ dependencies = [
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "storage"
|
||||
version = "0.4.3"
|
||||
dependencies = [
|
||||
"api",
|
||||
"arc-swap",
|
||||
"arrow",
|
||||
"arrow-array",
|
||||
"async-compat",
|
||||
"async-stream",
|
||||
"async-trait",
|
||||
"atomic_float",
|
||||
"bytes",
|
||||
"common-base",
|
||||
"common-config",
|
||||
"common-datasource",
|
||||
"common-error",
|
||||
"common-macro",
|
||||
"common-query",
|
||||
"common-recordbatch",
|
||||
"common-runtime",
|
||||
"common-telemetry",
|
||||
"common-test-util",
|
||||
"common-time",
|
||||
"criterion 0.3.6",
|
||||
"datafusion",
|
||||
"datafusion-common",
|
||||
"datafusion-expr",
|
||||
"datafusion-physical-expr",
|
||||
"datatypes",
|
||||
"futures",
|
||||
"futures-util",
|
||||
"itertools 0.10.5",
|
||||
"lazy_static",
|
||||
"log-store",
|
||||
"object-store",
|
||||
"parquet",
|
||||
"paste",
|
||||
"prometheus",
|
||||
"prost 0.12.1",
|
||||
"rand",
|
||||
"regex",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"snafu",
|
||||
"store-api",
|
||||
"table",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
"tonic 0.10.2",
|
||||
"tonic-build 0.9.2",
|
||||
"uuid",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "store-api"
|
||||
version = "0.4.3"
|
||||
|
||||
@@ -49,7 +49,6 @@ members = [
|
||||
"src/servers",
|
||||
"src/session",
|
||||
"src/sql",
|
||||
"src/storage",
|
||||
"src/store-api",
|
||||
"src/table",
|
||||
"tests-integration",
|
||||
@@ -176,7 +175,6 @@ script = { path = "src/script" }
|
||||
servers = { path = "src/servers" }
|
||||
session = { path = "src/session" }
|
||||
sql = { path = "src/sql" }
|
||||
storage = { path = "src/storage" }
|
||||
store-api = { path = "src/store-api" }
|
||||
substrait = { path = "src/common/substrait" }
|
||||
table = { path = "src/table" }
|
||||
|
||||
@@ -53,33 +53,6 @@ type = "File"
|
||||
# The local file cache capacity in bytes.
|
||||
# cache_capacity = "256MB"
|
||||
|
||||
# Compaction options, see `standalone.example.toml`.
|
||||
[storage.compaction]
|
||||
max_inflight_tasks = 4
|
||||
max_files_in_level0 = 8
|
||||
max_purge_tasks = 32
|
||||
|
||||
# Storage manifest options
|
||||
[storage.manifest]
|
||||
# Region checkpoint actions margin.
|
||||
# Create a checkpoint every <checkpoint_margin> actions.
|
||||
checkpoint_margin = 10
|
||||
# Region manifest logs and checkpoints gc execution duration
|
||||
gc_duration = '10m'
|
||||
|
||||
# Storage flush options
|
||||
[storage.flush]
|
||||
# Max inflight flush tasks.
|
||||
max_flush_tasks = 8
|
||||
# Default write buffer size for a region.
|
||||
region_write_buffer_size = "32MB"
|
||||
# Interval to check whether a region needs flush.
|
||||
picker_schedule_interval = "5m"
|
||||
# Interval to auto flush a region if it has not flushed yet.
|
||||
auto_flush_interval = "1h"
|
||||
# Global write buffer size for all regions.
|
||||
global_write_buffer_size = "1GB"
|
||||
|
||||
# Mito engine options
|
||||
[[region_engine]]
|
||||
[region_engine.mito]
|
||||
|
||||
@@ -122,36 +122,6 @@ type = "File"
|
||||
# The local file cache capacity in bytes.
|
||||
# cache_capacity = "256MB"
|
||||
|
||||
# Compaction options.
|
||||
[storage.compaction]
|
||||
# Max task number that can concurrently run.
|
||||
max_inflight_tasks = 4
|
||||
# Max files in level 0 to trigger compaction.
|
||||
max_files_in_level0 = 8
|
||||
# Max task number for SST purge task after compaction.
|
||||
max_purge_tasks = 32
|
||||
|
||||
# Storage manifest options
|
||||
[storage.manifest]
|
||||
# Region checkpoint actions margin.
|
||||
# Create a checkpoint every <checkpoint_margin> actions.
|
||||
checkpoint_margin = 10
|
||||
# Region manifest logs and checkpoints gc execution duration
|
||||
gc_duration = '10m'
|
||||
|
||||
# Storage flush options
|
||||
[storage.flush]
|
||||
# Max inflight flush tasks.
|
||||
max_flush_tasks = 8
|
||||
# Default write buffer size for a region.
|
||||
region_write_buffer_size = "32MB"
|
||||
# Interval to check whether a region needs flush.
|
||||
picker_schedule_interval = "5m"
|
||||
# Interval to auto flush a region if it has not flushed yet.
|
||||
auto_flush_interval = "1h"
|
||||
# Global write buffer size for all regions.
|
||||
global_write_buffer_size = "1GB"
|
||||
|
||||
# Mito engine options
|
||||
[[region_engine]]
|
||||
[region_engine.mito]
|
||||
|
||||
@@ -49,5 +49,4 @@ chrono.workspace = true
|
||||
common-test-util.workspace = true
|
||||
log-store.workspace = true
|
||||
object-store.workspace = true
|
||||
storage.workspace = true
|
||||
tokio.workspace = true
|
||||
|
||||
@@ -192,7 +192,7 @@ mod tests {
|
||||
use std::time::Duration;
|
||||
|
||||
use common_test_util::temp_dir::create_named_temp_file;
|
||||
use datanode::config::{CompactionConfig, FileConfig, ObjectStoreConfig, RegionManifestConfig};
|
||||
use datanode::config::{FileConfig, ObjectStoreConfig};
|
||||
use servers::heartbeat_options::HeartbeatOptions;
|
||||
use servers::Mode;
|
||||
|
||||
@@ -232,16 +232,6 @@ mod tests {
|
||||
type = "File"
|
||||
data_home = "/tmp/greptimedb/"
|
||||
|
||||
[storage.compaction]
|
||||
max_inflight_tasks = 3
|
||||
max_files_in_level0 = 7
|
||||
max_purge_tasks = 32
|
||||
|
||||
[storage.manifest]
|
||||
checkpoint_margin = 9
|
||||
gc_duration = '7s'
|
||||
compress = true
|
||||
|
||||
[logging]
|
||||
level = "debug"
|
||||
dir = "/tmp/greptimedb/test/logs"
|
||||
@@ -294,23 +284,6 @@ mod tests {
|
||||
ObjectStoreConfig::File(FileConfig { .. })
|
||||
));
|
||||
|
||||
assert_eq!(
|
||||
CompactionConfig {
|
||||
max_inflight_tasks: 3,
|
||||
max_files_in_level0: 7,
|
||||
max_purge_tasks: 32,
|
||||
},
|
||||
options.storage.compaction,
|
||||
);
|
||||
assert_eq!(
|
||||
RegionManifestConfig {
|
||||
checkpoint_margin: Some(9),
|
||||
gc_duration: Some(Duration::from_secs(7)),
|
||||
compress: true
|
||||
},
|
||||
options.storage.manifest,
|
||||
);
|
||||
|
||||
assert_eq!("debug", options.logging.level.unwrap());
|
||||
assert_eq!("/tmp/greptimedb/test/logs".to_string(), options.logging.dir);
|
||||
}
|
||||
@@ -387,18 +360,12 @@ mod tests {
|
||||
file_size = "1GB"
|
||||
purge_threshold = "50GB"
|
||||
purge_interval = "10m"
|
||||
read_batch_size = 128
|
||||
sync_write = false
|
||||
|
||||
[storage]
|
||||
type = "File"
|
||||
data_home = "/tmp/greptimedb/"
|
||||
|
||||
[storage.compaction]
|
||||
max_inflight_tasks = 3
|
||||
max_files_in_level0 = 7
|
||||
max_purge_tasks = 32
|
||||
|
||||
[logging]
|
||||
level = "debug"
|
||||
dir = "/tmp/greptimedb/test/logs"
|
||||
@@ -409,26 +376,24 @@ mod tests {
|
||||
temp_env::with_vars(
|
||||
[
|
||||
(
|
||||
// storage.manifest.gc_duration = 9s
|
||||
// wal.purge_interval = 1m
|
||||
[
|
||||
env_prefix.to_string(),
|
||||
"storage".to_uppercase(),
|
||||
"manifest".to_uppercase(),
|
||||
"gc_duration".to_uppercase(),
|
||||
"wal".to_uppercase(),
|
||||
"purge_interval".to_uppercase(),
|
||||
]
|
||||
.join(ENV_VAR_SEP),
|
||||
Some("9s"),
|
||||
Some("1m"),
|
||||
),
|
||||
(
|
||||
// storage.compaction.max_purge_tasks = 99
|
||||
// wal.read_batch_size = 100
|
||||
[
|
||||
env_prefix.to_string(),
|
||||
"storage".to_uppercase(),
|
||||
"compaction".to_uppercase(),
|
||||
"max_purge_tasks".to_uppercase(),
|
||||
"wal".to_uppercase(),
|
||||
"read_batch_size".to_uppercase(),
|
||||
]
|
||||
.join(ENV_VAR_SEP),
|
||||
Some("99"),
|
||||
Some("100"),
|
||||
),
|
||||
(
|
||||
// meta_client.metasrv_addrs = 127.0.0.1:3001,127.0.0.1:3002,127.0.0.1:3003
|
||||
@@ -456,10 +421,7 @@ mod tests {
|
||||
};
|
||||
|
||||
// Should be read from env, env > default values.
|
||||
assert_eq!(
|
||||
opts.storage.manifest.gc_duration,
|
||||
Some(Duration::from_secs(9))
|
||||
);
|
||||
assert_eq!(opts.wal.read_batch_size, 100,);
|
||||
assert_eq!(
|
||||
opts.meta_client.unwrap().metasrv_addrs,
|
||||
vec![
|
||||
@@ -470,19 +432,13 @@ mod tests {
|
||||
);
|
||||
|
||||
// Should be read from config file, config file > env > default values.
|
||||
assert_eq!(opts.storage.compaction.max_purge_tasks, 32);
|
||||
assert_eq!(opts.wal.purge_interval, Duration::from_secs(60 * 10));
|
||||
|
||||
// Should be read from cli, cli > config file > env > default values.
|
||||
assert_eq!(opts.wal.dir.unwrap(), "/other/wal/dir");
|
||||
|
||||
// Should be default value.
|
||||
assert_eq!(
|
||||
opts.storage.manifest.checkpoint_margin,
|
||||
DatanodeOptions::default()
|
||||
.storage
|
||||
.manifest
|
||||
.checkpoint_margin
|
||||
);
|
||||
assert_eq!(opts.http.addr, DatanodeOptions::default().http.addr);
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
@@ -147,7 +147,6 @@ impl Options {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::io::Write;
|
||||
use std::time::Duration;
|
||||
|
||||
use common_test_util::temp_dir::create_named_temp_file;
|
||||
use datanode::config::{DatanodeOptions, ObjectStoreConfig};
|
||||
@@ -179,11 +178,6 @@ mod tests {
|
||||
read_batch_size = 128
|
||||
sync_write = false
|
||||
|
||||
[storage.compaction]
|
||||
max_inflight_tasks = 3
|
||||
max_files_in_level0 = 7
|
||||
max_purge_tasks = 32
|
||||
|
||||
[logging]
|
||||
level = "debug"
|
||||
dir = "/tmp/greptimedb/test/logs"
|
||||
@@ -194,17 +188,6 @@ mod tests {
|
||||
temp_env::with_vars(
|
||||
// The following environment variables will be used to override the values in the config file.
|
||||
[
|
||||
(
|
||||
// storage.manifest.checkpoint_margin = 99
|
||||
[
|
||||
env_prefix.to_string(),
|
||||
"storage".to_uppercase(),
|
||||
"manifest".to_uppercase(),
|
||||
"checkpoint_margin".to_uppercase(),
|
||||
]
|
||||
.join(ENV_VAR_SEP),
|
||||
Some("99"),
|
||||
),
|
||||
(
|
||||
// storage.type = S3
|
||||
[
|
||||
@@ -225,17 +208,6 @@ mod tests {
|
||||
.join(ENV_VAR_SEP),
|
||||
Some("mybucket"),
|
||||
),
|
||||
(
|
||||
// storage.manifest.gc_duration = 42s
|
||||
[
|
||||
env_prefix.to_string(),
|
||||
"storage".to_uppercase(),
|
||||
"manifest".to_uppercase(),
|
||||
"gc_duration".to_uppercase(),
|
||||
]
|
||||
.join(ENV_VAR_SEP),
|
||||
Some("42s"),
|
||||
),
|
||||
(
|
||||
// wal.dir = /other/wal/dir
|
||||
[
|
||||
@@ -266,17 +238,12 @@ mod tests {
|
||||
.unwrap();
|
||||
|
||||
// Check the configs from environment variables.
|
||||
assert_eq!(opts.storage.manifest.checkpoint_margin, Some(99));
|
||||
match opts.storage.store {
|
||||
ObjectStoreConfig::S3(s3_config) => {
|
||||
assert_eq!(s3_config.bucket, "mybucket".to_string());
|
||||
}
|
||||
_ => panic!("unexpected store type"),
|
||||
}
|
||||
assert_eq!(
|
||||
opts.storage.manifest.gc_duration,
|
||||
Some(Duration::from_secs(42))
|
||||
);
|
||||
assert_eq!(
|
||||
opts.meta_client.unwrap().metasrv_addrs,
|
||||
vec![
|
||||
|
||||
@@ -21,11 +21,13 @@ common-error.workspace = true
|
||||
common-macro.workspace = true
|
||||
common-runtime.workspace = true
|
||||
datafusion.workspace = true
|
||||
datatypes.workspace = true
|
||||
derive_builder.workspace = true
|
||||
futures.workspace = true
|
||||
lazy_static.workspace = true
|
||||
object-store.workspace = true
|
||||
orc-rust = "0.2"
|
||||
parquet.workspace = true
|
||||
paste = "1.0"
|
||||
regex = "1.7"
|
||||
serde.workspace = true
|
||||
|
||||
@@ -166,6 +166,14 @@ pub enum Error {
|
||||
|
||||
#[snafu(display("Buffered writer closed"))]
|
||||
BufferedWriterClosed { location: Location },
|
||||
|
||||
#[snafu(display("Failed to write parquet file, path: {}", path))]
|
||||
WriteParquet {
|
||||
path: String,
|
||||
location: Location,
|
||||
#[snafu(source)]
|
||||
error: parquet::errors::ParquetError,
|
||||
},
|
||||
}
|
||||
|
||||
pub type Result<T> = std::result::Result<T, Error>;
|
||||
@@ -178,7 +186,8 @@ impl ErrorExt for Error {
|
||||
| ListObjects { .. }
|
||||
| ReadObject { .. }
|
||||
| WriteObject { .. }
|
||||
| AsyncWrite { .. } => StatusCode::StorageUnavailable,
|
||||
| AsyncWrite { .. }
|
||||
| WriteParquet { .. } => StatusCode::StorageUnavailable,
|
||||
|
||||
UnsupportedBackendProtocol { .. }
|
||||
| UnsupportedCompressionType { .. }
|
||||
@@ -231,6 +240,7 @@ impl ErrorExt for Error {
|
||||
InvalidConnection { location, .. } => Some(*location),
|
||||
UnsupportedCompressionType { location, .. } => Some(*location),
|
||||
UnsupportedFormat { location, .. } => Some(*location),
|
||||
WriteParquet { location, .. } => Some(*location),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -12,11 +12,13 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::future::Future;
|
||||
use std::pin::Pin;
|
||||
use std::result;
|
||||
use std::sync::Arc;
|
||||
|
||||
use arrow::record_batch::RecordBatch;
|
||||
use arrow_schema::Schema;
|
||||
use arrow_schema::{Schema, SchemaRef};
|
||||
use async_trait::async_trait;
|
||||
use datafusion::datasource::physical_plan::{FileMeta, ParquetFileReaderFactory};
|
||||
use datafusion::error::Result as DatafusionResult;
|
||||
@@ -26,11 +28,15 @@ use datafusion::parquet::errors::{ParquetError, Result as ParquetResult};
|
||||
use datafusion::parquet::file::metadata::ParquetMetaData;
|
||||
use datafusion::parquet::format::FileMetaData;
|
||||
use datafusion::physical_plan::metrics::ExecutionPlanMetricsSet;
|
||||
use datafusion::physical_plan::SendableRecordBatchStream;
|
||||
use futures::future::BoxFuture;
|
||||
use futures::StreamExt;
|
||||
use object_store::{ObjectStore, Reader};
|
||||
use parquet::basic::{Compression, ZstdLevel};
|
||||
use parquet::file::properties::WriterProperties;
|
||||
use snafu::ResultExt;
|
||||
|
||||
use crate::buffered_writer::{ArrowWriterCloser, DfRecordBatchEncoder};
|
||||
use crate::buffered_writer::{ArrowWriterCloser, DfRecordBatchEncoder, LazyBufferedWriter};
|
||||
use crate::error::{self, Result};
|
||||
use crate::file_format::FileFormat;
|
||||
use crate::share_buffer::SharedBuffer;
|
||||
@@ -156,6 +162,103 @@ impl ArrowWriterCloser for ArrowWriter<SharedBuffer> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Parquet writer that buffers row groups in memory and writes buffered data to an underlying
|
||||
/// storage by chunks to reduce memory consumption.
|
||||
pub struct BufferedWriter {
|
||||
inner: InnerBufferedWriter,
|
||||
}
|
||||
|
||||
type InnerBufferedWriter = LazyBufferedWriter<
|
||||
object_store::Writer,
|
||||
ArrowWriter<SharedBuffer>,
|
||||
Box<
|
||||
dyn FnMut(
|
||||
String,
|
||||
)
|
||||
-> Pin<Box<dyn Future<Output = error::Result<object_store::Writer>> + Send>>
|
||||
+ Send,
|
||||
>,
|
||||
>;
|
||||
|
||||
impl BufferedWriter {
|
||||
pub async fn try_new(
|
||||
path: String,
|
||||
store: ObjectStore,
|
||||
arrow_schema: SchemaRef,
|
||||
props: Option<WriterProperties>,
|
||||
buffer_threshold: usize,
|
||||
) -> error::Result<Self> {
|
||||
let buffer = SharedBuffer::with_capacity(buffer_threshold);
|
||||
|
||||
let arrow_writer = ArrowWriter::try_new(buffer.clone(), arrow_schema.clone(), props)
|
||||
.context(error::WriteParquetSnafu { path: &path })?;
|
||||
|
||||
Ok(Self {
|
||||
inner: LazyBufferedWriter::new(
|
||||
buffer_threshold,
|
||||
buffer,
|
||||
arrow_writer,
|
||||
&path,
|
||||
Box::new(move |path| {
|
||||
let store = store.clone();
|
||||
Box::pin(async move {
|
||||
store
|
||||
.writer(&path)
|
||||
.await
|
||||
.context(error::WriteObjectSnafu { path })
|
||||
})
|
||||
}),
|
||||
),
|
||||
})
|
||||
}
|
||||
|
||||
/// Write a record batch to stream writer.
|
||||
pub async fn write(&mut self, arrow_batch: &RecordBatch) -> error::Result<()> {
|
||||
self.inner.write(arrow_batch).await?;
|
||||
self.inner.try_flush(false).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Close parquet writer.
|
||||
///
|
||||
/// Return file metadata and bytes written.
|
||||
pub async fn close(self) -> error::Result<(FileMetaData, u64)> {
|
||||
self.inner.close_with_arrow_writer().await
|
||||
}
|
||||
}
|
||||
|
||||
/// Output the stream to a parquet file.
|
||||
///
|
||||
/// Returns number of rows written.
|
||||
pub async fn stream_to_parquet(
|
||||
mut stream: SendableRecordBatchStream,
|
||||
store: ObjectStore,
|
||||
path: &str,
|
||||
threshold: usize,
|
||||
) -> Result<usize> {
|
||||
let write_props = WriterProperties::builder()
|
||||
.set_compression(Compression::ZSTD(ZstdLevel::default()))
|
||||
.build();
|
||||
let schema = stream.schema();
|
||||
let mut buffered_writer = BufferedWriter::try_new(
|
||||
path.to_string(),
|
||||
store,
|
||||
schema,
|
||||
Some(write_props),
|
||||
threshold,
|
||||
)
|
||||
.await?;
|
||||
let mut rows_written = 0;
|
||||
while let Some(batch) = stream.next().await {
|
||||
let batch = batch.context(error::ReadRecordBatchSnafu)?;
|
||||
buffered_writer.write(&batch).await?;
|
||||
rows_written += batch.num_rows();
|
||||
}
|
||||
buffered_writer.close().await?;
|
||||
Ok(rows_written)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use common_test_util::find_workspace_path;
|
||||
|
||||
@@ -61,7 +61,6 @@ servers.workspace = true
|
||||
session.workspace = true
|
||||
snafu.workspace = true
|
||||
sql.workspace = true
|
||||
storage.workspace = true
|
||||
store-api.workspace = true
|
||||
substrait.workspace = true
|
||||
table.workspace = true
|
||||
|
||||
@@ -31,11 +31,6 @@ use serde::{Deserialize, Serialize};
|
||||
use servers::heartbeat_options::HeartbeatOptions;
|
||||
use servers::http::HttpOptions;
|
||||
use servers::Mode;
|
||||
use storage::config::{
|
||||
EngineConfig as StorageEngineConfig, DEFAULT_AUTO_FLUSH_INTERVAL, DEFAULT_MAX_FLUSH_TASKS,
|
||||
DEFAULT_PICKER_SCHEDULE_INTERVAL, DEFAULT_REGION_WRITE_BUFFER_SIZE,
|
||||
};
|
||||
use storage::scheduler::SchedulerConfig;
|
||||
|
||||
pub const DEFAULT_OBJECT_STORE_CACHE_SIZE: ReadableSize = ReadableSize::mb(256);
|
||||
|
||||
@@ -68,9 +63,6 @@ pub struct StorageConfig {
|
||||
pub data_home: String,
|
||||
#[serde(flatten)]
|
||||
pub store: ObjectStoreConfig,
|
||||
pub compaction: CompactionConfig,
|
||||
pub manifest: RegionManifestConfig,
|
||||
pub flush: FlushConfig,
|
||||
}
|
||||
|
||||
impl Default for StorageConfig {
|
||||
@@ -79,9 +71,6 @@ impl Default for StorageConfig {
|
||||
global_ttl: None,
|
||||
data_home: DEFAULT_DATA_HOME.to_string(),
|
||||
store: ObjectStoreConfig::default(),
|
||||
compaction: CompactionConfig::default(),
|
||||
manifest: RegionManifestConfig::default(),
|
||||
flush: FlushConfig::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -216,109 +205,6 @@ impl Default for ObjectStoreConfig {
|
||||
}
|
||||
}
|
||||
|
||||
/// Options for region manifest
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
|
||||
#[serde(default)]
|
||||
pub struct RegionManifestConfig {
|
||||
/// Region manifest checkpoint actions margin.
|
||||
/// Manifest service create a checkpoint every `checkpoint_margin` actions.
|
||||
pub checkpoint_margin: Option<u16>,
|
||||
/// Region manifest logs and checkpoints gc task execution duration.
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub gc_duration: Option<Duration>,
|
||||
/// Whether to compress manifest and checkpoint file by gzip
|
||||
pub compress: bool,
|
||||
}
|
||||
|
||||
impl Default for RegionManifestConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
checkpoint_margin: Some(10u16),
|
||||
gc_duration: Some(Duration::from_secs(600)),
|
||||
compress: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Options for table compaction
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
|
||||
#[serde(default)]
|
||||
pub struct CompactionConfig {
|
||||
/// Max task number that can concurrently run.
|
||||
pub max_inflight_tasks: usize,
|
||||
/// Max files in level 0 to trigger compaction.
|
||||
pub max_files_in_level0: usize,
|
||||
/// Max task number for SST purge task after compaction.
|
||||
pub max_purge_tasks: usize,
|
||||
}
|
||||
|
||||
impl Default for CompactionConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
max_inflight_tasks: 4,
|
||||
max_files_in_level0: 8,
|
||||
max_purge_tasks: 32,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
|
||||
#[serde(default)]
|
||||
pub struct FlushConfig {
|
||||
/// Max inflight flush tasks.
|
||||
pub max_flush_tasks: usize,
|
||||
/// Default write buffer size for a region.
|
||||
pub region_write_buffer_size: ReadableSize,
|
||||
/// Interval to schedule auto flush picker to find region to flush.
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub picker_schedule_interval: Duration,
|
||||
/// Interval to auto flush a region if it has not flushed yet.
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub auto_flush_interval: Duration,
|
||||
/// Global write buffer size for all regions.
|
||||
pub global_write_buffer_size: Option<ReadableSize>,
|
||||
}
|
||||
|
||||
impl Default for FlushConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
max_flush_tasks: DEFAULT_MAX_FLUSH_TASKS,
|
||||
region_write_buffer_size: DEFAULT_REGION_WRITE_BUFFER_SIZE,
|
||||
picker_schedule_interval: Duration::from_millis(
|
||||
DEFAULT_PICKER_SCHEDULE_INTERVAL.into(),
|
||||
),
|
||||
auto_flush_interval: Duration::from_millis(DEFAULT_AUTO_FLUSH_INTERVAL.into()),
|
||||
global_write_buffer_size: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<&DatanodeOptions> for SchedulerConfig {
|
||||
fn from(value: &DatanodeOptions) -> Self {
|
||||
Self {
|
||||
max_inflight_tasks: value.storage.compaction.max_inflight_tasks,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<&DatanodeOptions> for StorageEngineConfig {
|
||||
fn from(value: &DatanodeOptions) -> Self {
|
||||
Self {
|
||||
compress_manifest: value.storage.manifest.compress,
|
||||
manifest_checkpoint_margin: value.storage.manifest.checkpoint_margin,
|
||||
manifest_gc_duration: value.storage.manifest.gc_duration,
|
||||
max_files_in_l0: value.storage.compaction.max_files_in_level0,
|
||||
max_purge_tasks: value.storage.compaction.max_purge_tasks,
|
||||
max_flush_tasks: value.storage.flush.max_flush_tasks,
|
||||
region_write_buffer_size: value.storage.flush.region_write_buffer_size,
|
||||
picker_schedule_interval: value.storage.flush.picker_schedule_interval,
|
||||
auto_flush_interval: value.storage.flush.auto_flush_interval,
|
||||
global_write_buffer_size: value.storage.flush.global_write_buffer_size,
|
||||
global_ttl: value.storage.global_ttl,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
#[serde(default)]
|
||||
pub struct DatanodeOptions {
|
||||
|
||||
@@ -68,7 +68,6 @@ session.workspace = true
|
||||
snafu.workspace = true
|
||||
sql.workspace = true
|
||||
sqlparser.workspace = true
|
||||
storage.workspace = true
|
||||
store-api.workspace = true
|
||||
substrait.workspace = true
|
||||
table.workspace = true
|
||||
|
||||
@@ -121,14 +121,6 @@ pub enum Error {
|
||||
source: common_datasource::error::Error,
|
||||
},
|
||||
|
||||
#[snafu(display("Failed to write parquet file, path: {}", path))]
|
||||
WriteParquet {
|
||||
path: String,
|
||||
location: Location,
|
||||
#[snafu(source)]
|
||||
error: parquet::errors::ParquetError,
|
||||
},
|
||||
|
||||
#[snafu(display("Failed to read parquet file, path: {}", path))]
|
||||
ReadParquet {
|
||||
path: String,
|
||||
@@ -428,7 +420,6 @@ impl ErrorExt for Error {
|
||||
|
||||
match self {
|
||||
OpenDal { .. }
|
||||
| WriteParquet { .. }
|
||||
| ReadParquet { .. }
|
||||
| WriteWal { .. }
|
||||
| ReadWal { .. }
|
||||
|
||||
@@ -17,5 +17,4 @@
|
||||
pub mod file;
|
||||
pub mod file_purger;
|
||||
pub mod parquet;
|
||||
mod stream_writer;
|
||||
pub(crate) mod version;
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
|
||||
//! Parquet writer.
|
||||
|
||||
use common_datasource::file_format::parquet::BufferedWriter;
|
||||
use common_telemetry::debug;
|
||||
use common_time::Timestamp;
|
||||
use object_store::ObjectStore;
|
||||
@@ -25,11 +26,10 @@ use snafu::ResultExt;
|
||||
use store_api::metadata::RegionMetadataRef;
|
||||
use store_api::storage::consts::SEQUENCE_COLUMN_NAME;
|
||||
|
||||
use crate::error::{InvalidMetadataSnafu, Result};
|
||||
use crate::error::{InvalidMetadataSnafu, Result, WriteBufferSnafu};
|
||||
use crate::read::{Batch, Source};
|
||||
use crate::sst::parquet::format::WriteFormat;
|
||||
use crate::sst::parquet::{SstInfo, WriteOptions, PARQUET_METADATA_KEY};
|
||||
use crate::sst::stream_writer::BufferedWriter;
|
||||
|
||||
/// Parquet SST writer.
|
||||
pub struct ParquetWriter {
|
||||
@@ -83,14 +83,18 @@ impl ParquetWriter {
|
||||
Some(writer_props),
|
||||
opts.write_buffer_size.as_bytes() as usize,
|
||||
)
|
||||
.await?;
|
||||
.await
|
||||
.context(WriteBufferSnafu)?;
|
||||
|
||||
let mut stats = SourceStats::default();
|
||||
while let Some(batch) = self.source.next_batch().await? {
|
||||
stats.update(&batch);
|
||||
let arrow_batch = write_format.convert_batch(&batch)?;
|
||||
|
||||
buffered_writer.write(&arrow_batch).await?;
|
||||
buffered_writer
|
||||
.write(&arrow_batch)
|
||||
.await
|
||||
.context(WriteBufferSnafu)?;
|
||||
}
|
||||
|
||||
if stats.num_rows == 0 {
|
||||
@@ -99,11 +103,11 @@ impl ParquetWriter {
|
||||
self.file_path
|
||||
);
|
||||
|
||||
buffered_writer.close().await?;
|
||||
buffered_writer.close().await.context(WriteBufferSnafu)?;
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let (_file_meta, file_size) = buffered_writer.close().await?;
|
||||
let (_file_meta, file_size) = buffered_writer.close().await.context(WriteBufferSnafu)?;
|
||||
// Safety: num rows > 0 so we must have min/max.
|
||||
let time_range = stats.time_range.unwrap();
|
||||
|
||||
|
||||
@@ -1,105 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::future::Future;
|
||||
use std::pin::Pin;
|
||||
|
||||
use common_datasource::buffered_writer::LazyBufferedWriter;
|
||||
use common_datasource::share_buffer::SharedBuffer;
|
||||
use datatypes::arrow::datatypes::SchemaRef;
|
||||
use datatypes::arrow::record_batch::RecordBatch;
|
||||
use object_store::ObjectStore;
|
||||
use parquet::arrow::ArrowWriter;
|
||||
use parquet::file::properties::WriterProperties;
|
||||
use parquet::format::FileMetaData;
|
||||
use snafu::ResultExt;
|
||||
|
||||
use crate::error;
|
||||
use crate::error::WriteParquetSnafu;
|
||||
|
||||
/// Parquet writer that buffers row groups in memory and writes buffered data to an underlying
|
||||
/// storage by chunks to reduce memory consumption.
|
||||
pub struct BufferedWriter {
|
||||
inner: InnerBufferedWriter,
|
||||
}
|
||||
|
||||
type InnerBufferedWriter = LazyBufferedWriter<
|
||||
object_store::Writer,
|
||||
ArrowWriter<SharedBuffer>,
|
||||
Box<
|
||||
dyn FnMut(
|
||||
String,
|
||||
) -> Pin<
|
||||
Box<
|
||||
dyn Future<Output = common_datasource::error::Result<object_store::Writer>>
|
||||
+ Send,
|
||||
>,
|
||||
> + Send,
|
||||
>,
|
||||
>;
|
||||
|
||||
impl BufferedWriter {
|
||||
pub async fn try_new(
|
||||
path: String,
|
||||
store: ObjectStore,
|
||||
arrow_schema: SchemaRef,
|
||||
props: Option<WriterProperties>,
|
||||
buffer_threshold: usize,
|
||||
) -> error::Result<Self> {
|
||||
let buffer = SharedBuffer::with_capacity(buffer_threshold);
|
||||
|
||||
let arrow_writer = ArrowWriter::try_new(buffer.clone(), arrow_schema.clone(), props)
|
||||
.context(WriteParquetSnafu { path: &path })?;
|
||||
|
||||
Ok(Self {
|
||||
inner: LazyBufferedWriter::new(
|
||||
buffer_threshold,
|
||||
buffer,
|
||||
arrow_writer,
|
||||
&path,
|
||||
Box::new(move |path| {
|
||||
let store = store.clone();
|
||||
Box::pin(async move {
|
||||
store
|
||||
.writer(&path)
|
||||
.await
|
||||
.context(common_datasource::error::WriteObjectSnafu { path })
|
||||
})
|
||||
}),
|
||||
),
|
||||
})
|
||||
}
|
||||
|
||||
/// Write a record batch to stream writer.
|
||||
pub async fn write(&mut self, arrow_batch: &RecordBatch) -> error::Result<()> {
|
||||
self.inner
|
||||
.write(arrow_batch)
|
||||
.await
|
||||
.context(error::WriteBufferSnafu)?;
|
||||
self.inner
|
||||
.try_flush(false)
|
||||
.await
|
||||
.context(error::WriteBufferSnafu)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Close parquet writer.
|
||||
pub async fn close(self) -> error::Result<(FileMetaData, u64)> {
|
||||
self.inner
|
||||
.close_with_arrow_writer()
|
||||
.await
|
||||
.context(error::WriteBufferSnafu)
|
||||
}
|
||||
}
|
||||
@@ -50,7 +50,6 @@ session.workspace = true
|
||||
snafu.workspace = true
|
||||
sql.workspace = true
|
||||
sqlparser.workspace = true
|
||||
storage.workspace = true
|
||||
store-api.workspace = true
|
||||
substrait.workspace = true
|
||||
table.workspace = true
|
||||
|
||||
@@ -378,12 +378,6 @@ pub enum Error {
|
||||
error: datafusion::error::DataFusionError,
|
||||
},
|
||||
|
||||
#[snafu(display("Failed to write parquet file"))]
|
||||
WriteParquet {
|
||||
location: Location,
|
||||
source: storage::error::Error,
|
||||
},
|
||||
|
||||
#[snafu(display(
|
||||
"Schema datatypes not match at index {}, expected table schema: {}, actual file schema: {}",
|
||||
index,
|
||||
@@ -594,7 +588,6 @@ impl ErrorExt for Error {
|
||||
| Error::ParseUrl { source, .. }
|
||||
| Error::BuildBackend { source, .. } => source.status_code(),
|
||||
|
||||
Error::WriteParquet { source, .. } => source.status_code(),
|
||||
Error::ExecuteDdl { source, .. } => source.status_code(),
|
||||
Error::InvalidCopyParameter { .. } => StatusCode::InvalidArguments,
|
||||
|
||||
|
||||
@@ -17,6 +17,7 @@ use std::sync::Arc;
|
||||
use common_base::readable_size::ReadableSize;
|
||||
use common_datasource::file_format::csv::stream_to_csv;
|
||||
use common_datasource::file_format::json::stream_to_json;
|
||||
use common_datasource::file_format::parquet::stream_to_parquet;
|
||||
use common_datasource::file_format::Format;
|
||||
use common_datasource::object_store::{build_backend, parse_url};
|
||||
use common_datasource::util::find_dir_and_filename;
|
||||
@@ -31,17 +32,17 @@ use object_store::ObjectStore;
|
||||
use query::plan::LogicalPlan;
|
||||
use session::context::QueryContextRef;
|
||||
use snafu::{OptionExt, ResultExt};
|
||||
use storage::sst::SstInfo;
|
||||
use storage::{ParquetWriter, Source};
|
||||
use table::engine::TableReference;
|
||||
use table::requests::CopyTableRequest;
|
||||
use table::table::adapter::DfTableProviderAdapter;
|
||||
|
||||
use crate::error::{
|
||||
self, BuildDfLogicalPlanSnafu, ExecLogicalPlanSnafu, Result, WriteParquetSnafu,
|
||||
};
|
||||
use crate::error::{self, BuildDfLogicalPlanSnafu, ExecLogicalPlanSnafu, Result};
|
||||
use crate::statement::StatementExecutor;
|
||||
|
||||
// The buffer size should be greater than 5MB (minimum multipart upload size).
|
||||
/// Buffer size to flush data to object stores.
|
||||
const WRITE_BUFFER_THRESHOLD: ReadableSize = ReadableSize::mb(8);
|
||||
|
||||
impl StatementExecutor {
|
||||
async fn stream_to_file(
|
||||
&self,
|
||||
@@ -50,7 +51,7 @@ impl StatementExecutor {
|
||||
object_store: ObjectStore,
|
||||
path: &str,
|
||||
) -> Result<usize> {
|
||||
let threshold = ReadableSize::mb(4).as_bytes() as usize;
|
||||
let threshold = WRITE_BUFFER_THRESHOLD.as_bytes() as usize;
|
||||
|
||||
match format {
|
||||
Format::Csv(_) => stream_to_csv(
|
||||
@@ -69,17 +70,14 @@ impl StatementExecutor {
|
||||
)
|
||||
.await
|
||||
.context(error::WriteStreamToFileSnafu { path }),
|
||||
Format::Parquet(_) => {
|
||||
let writer = ParquetWriter::new(path, Source::Stream(stream), object_store);
|
||||
let rows_copied = writer
|
||||
.write_sst(&storage::sst::WriteOptions::default())
|
||||
.await
|
||||
.context(WriteParquetSnafu)?
|
||||
.map(|SstInfo { num_rows, .. }| num_rows)
|
||||
.unwrap_or(0);
|
||||
|
||||
Ok(rows_copied)
|
||||
}
|
||||
Format::Parquet(_) => stream_to_parquet(
|
||||
Box::pin(DfRecordBatchStreamAdapter::new(stream)),
|
||||
object_store,
|
||||
path,
|
||||
threshold,
|
||||
)
|
||||
.await
|
||||
.context(error::WriteStreamToFileSnafu { path }),
|
||||
_ => error::UnsupportedFormatSnafu { format: *format }.fail(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -85,7 +85,6 @@ rayon = "1.0"
|
||||
ron = "0.7"
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
session = { workspace = true, features = ["testing"] }
|
||||
storage.workspace = true
|
||||
tokio-test = "0.4"
|
||||
|
||||
[[bench]]
|
||||
|
||||
@@ -1,64 +0,0 @@
|
||||
[package]
|
||||
name = "storage"
|
||||
version.workspace = true
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
api.workspace = true
|
||||
arc-swap = "1.0"
|
||||
arrow-array.workspace = true
|
||||
arrow.workspace = true
|
||||
async-compat = "0.2"
|
||||
async-stream.workspace = true
|
||||
async-trait = "0.1"
|
||||
bytes = "1.1"
|
||||
common-base.workspace = true
|
||||
common-datasource.workspace = true
|
||||
common-error.workspace = true
|
||||
common-macro.workspace = true
|
||||
common-query.workspace = true
|
||||
common-recordbatch.workspace = true
|
||||
common-runtime.workspace = true
|
||||
common-telemetry.workspace = true
|
||||
common-time.workspace = true
|
||||
datafusion-common.workspace = true
|
||||
datafusion-expr.workspace = true
|
||||
datafusion-physical-expr.workspace = true
|
||||
datafusion.workspace = true
|
||||
datatypes.workspace = true
|
||||
futures-util.workspace = true
|
||||
futures.workspace = true
|
||||
itertools.workspace = true
|
||||
lazy_static.workspace = true
|
||||
object-store.workspace = true
|
||||
parquet = { workspace = true, features = ["async"] }
|
||||
paste.workspace = true
|
||||
prometheus.workspace = true
|
||||
prost.workspace = true
|
||||
regex = "1.5"
|
||||
serde.workspace = true
|
||||
serde_json = "1.0"
|
||||
snafu.workspace = true
|
||||
store-api.workspace = true
|
||||
table.workspace = true
|
||||
tokio-util.workspace = true
|
||||
tokio.workspace = true
|
||||
tonic.workspace = true
|
||||
uuid.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
atomic_float = "0.1"
|
||||
common-config.workspace = true
|
||||
common-test-util.workspace = true
|
||||
criterion = "0.3"
|
||||
datatypes = { workspace = true, features = ["test"] }
|
||||
log-store.workspace = true
|
||||
rand.workspace = true
|
||||
|
||||
[build-dependencies]
|
||||
tonic-build = "0.9"
|
||||
|
||||
[[bench]]
|
||||
name = "bench_main"
|
||||
harness = false
|
||||
@@ -1,27 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use criterion::criterion_main;
|
||||
|
||||
mod memtable;
|
||||
mod wal;
|
||||
|
||||
criterion_main! {
|
||||
memtable::bench_memtable_read::benches,
|
||||
memtable::bench_memtable_write::benches,
|
||||
memtable::bench_memtable_read_write_ratio::benches,
|
||||
wal::bench_wal::benches,
|
||||
wal::bench_decode::benches,
|
||||
wal::bench_encode::benches,
|
||||
}
|
||||
@@ -1,33 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use criterion::{criterion_group, criterion_main, Criterion, Throughput};
|
||||
|
||||
use crate::memtable::generate_kvs;
|
||||
use crate::memtable::util::bench_context::BenchContext;
|
||||
|
||||
fn bench_memtable_read(c: &mut Criterion) {
|
||||
// the length of string in value is 20
|
||||
let kvs = generate_kvs(10, 10000, 20);
|
||||
let ctx = BenchContext::new();
|
||||
kvs.iter().for_each(|kv| ctx.write(kv));
|
||||
let mut group = c.benchmark_group("memtable_read");
|
||||
let _ = group
|
||||
.throughput(Throughput::Elements(10 * 10000))
|
||||
.bench_function("read", |b| b.iter(|| ctx.read(100)));
|
||||
group.finish();
|
||||
}
|
||||
|
||||
criterion_group!(benches, bench_memtable_read);
|
||||
criterion_main!(benches);
|
||||
@@ -1,151 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
|
||||
use std::sync::Arc;
|
||||
use std::thread;
|
||||
use std::time::Instant;
|
||||
|
||||
use atomic_float::AtomicF64;
|
||||
use criterion::{
|
||||
criterion_group, criterion_main, BatchSize, Bencher, BenchmarkId, Criterion, Throughput,
|
||||
};
|
||||
use rand::Rng;
|
||||
|
||||
use crate::memtable::generate_kvs;
|
||||
use crate::memtable::util::bench_context::BenchContext;
|
||||
|
||||
static READ_NUM: AtomicUsize = AtomicUsize::new(0);
|
||||
static WRITE_NUM: AtomicUsize = AtomicUsize::new(0);
|
||||
static READ_SECS: AtomicF64 = AtomicF64::new(0.0);
|
||||
static WRITE_SECS: AtomicF64 = AtomicF64::new(0.0);
|
||||
|
||||
/// Parameters for a single round executed by `memtable_round`.
struct Input {
    /// `true` selects a read round, `false` a write round.
    ratio: bool,
    /// Passed to `generate_kvs` as `kv_size` on write rounds.
    kv_size: usize,
    /// Read batch size on read rounds; number of generated batches on write rounds.
    batch_size: usize,
}
|
||||
|
||||
fn memtable_round(ctx: &BenchContext, input: &Input) {
|
||||
if input.ratio {
|
||||
let now = Instant::now();
|
||||
let read_count = ctx.read(input.batch_size);
|
||||
let d = now.elapsed();
|
||||
let _ = READ_SECS.fetch_add(
|
||||
d.as_secs() as f64 + d.subsec_nanos() as f64 * 1e-9,
|
||||
Ordering::Relaxed,
|
||||
);
|
||||
let _ = READ_NUM.fetch_add(read_count, Ordering::Relaxed);
|
||||
} else {
|
||||
generate_kvs(input.kv_size, input.batch_size, 20)
|
||||
.iter()
|
||||
.for_each(|kv| {
|
||||
let now = Instant::now();
|
||||
ctx.write(kv);
|
||||
let d = now.elapsed();
|
||||
let _ = WRITE_SECS.fetch_add(
|
||||
d.as_secs() as f64 + d.subsec_nanos() as f64 * 1e-9,
|
||||
Ordering::Relaxed,
|
||||
);
|
||||
let _ = WRITE_NUM.fetch_add(kv.len(), Ordering::Relaxed);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
fn bench_read_write_ctx_frac(b: &mut Bencher<'_>, frac: &usize) {
|
||||
let frac = *frac;
|
||||
let ctx = Arc::new(BenchContext::default());
|
||||
let thread_ctx = ctx.clone();
|
||||
let stop = Arc::new(AtomicBool::new(false));
|
||||
let thread_stop = stop.clone();
|
||||
|
||||
let handle = thread::spawn(move || {
|
||||
let mut rng = rand::thread_rng();
|
||||
while !thread_stop.load(Ordering::Relaxed) {
|
||||
let f = rng.gen_range(0..=10);
|
||||
let input = Input {
|
||||
ratio: f < frac,
|
||||
kv_size: 100,
|
||||
batch_size: 1000,
|
||||
};
|
||||
memtable_round(&thread_ctx, &input);
|
||||
}
|
||||
});
|
||||
|
||||
let mut rng = rand::thread_rng();
|
||||
b.iter_batched_ref(
|
||||
|| {
|
||||
let f = rng.gen_range(0..=10);
|
||||
Input {
|
||||
ratio: f < frac,
|
||||
kv_size: 100,
|
||||
batch_size: 1000,
|
||||
}
|
||||
},
|
||||
|input| {
|
||||
memtable_round(&ctx, input);
|
||||
},
|
||||
BatchSize::SmallInput,
|
||||
);
|
||||
stop.store(true, Ordering::Relaxed);
|
||||
handle.join().unwrap();
|
||||
}
|
||||
|
||||
#[allow(clippy::print_stdout)]
|
||||
fn bench_memtable_read_write_ratio(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("memtable_read_write_ratio");
|
||||
for i in 0..=10 {
|
||||
READ_NUM.store(0, Ordering::Relaxed);
|
||||
WRITE_NUM.store(0, Ordering::Relaxed);
|
||||
READ_SECS.store(0.0, Ordering::Relaxed);
|
||||
WRITE_SECS.store(0.0, Ordering::Relaxed);
|
||||
|
||||
let _ = group
|
||||
.bench_with_input(
|
||||
BenchmarkId::from_parameter(format!(
|
||||
"read ratio: {:.2}% , write ratio: {:.2}%",
|
||||
i as f64 / 10_f64 * 100.0,
|
||||
(10 - i) as f64 / 10_f64 * 100.0,
|
||||
)),
|
||||
&i,
|
||||
bench_read_write_ctx_frac,
|
||||
)
|
||||
.throughput(Throughput::Elements(100 * 1000));
|
||||
|
||||
// the time is a little different the real time
|
||||
let read_num = READ_NUM.load(Ordering::Relaxed);
|
||||
let read_time = READ_SECS.load(Ordering::Relaxed);
|
||||
let read_tps = if read_time != 0.0 {
|
||||
read_num as f64 / read_time
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
let write_num = WRITE_NUM.load(Ordering::Relaxed);
|
||||
let write_time = WRITE_SECS.load(Ordering::Relaxed);
|
||||
let write_tps = if write_time != 0.0 {
|
||||
write_num as f64 / write_time
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
if read_num != 0 || write_num != 0 {
|
||||
println!(
|
||||
"\nread numbers: {read_num}, read thrpt: {read_tps}\nwrite numbers: {write_num}, write thrpt {write_tps}\n",
|
||||
);
|
||||
}
|
||||
}
|
||||
group.finish();
|
||||
}
|
||||
|
||||
criterion_group!(benches, bench_memtable_read_write_ratio);
|
||||
criterion_main!(benches);
|
||||
@@ -1,34 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use criterion::{criterion_group, criterion_main, Criterion, Throughput};
|
||||
|
||||
use crate::memtable::generate_kvs;
|
||||
use crate::memtable::util::bench_context::BenchContext;
|
||||
|
||||
pub fn bench_memtable_write(c: &mut Criterion) {
|
||||
// the length of string in value is 20
|
||||
let kvs = generate_kvs(10, 1000, 20);
|
||||
let mut group = c.benchmark_group("memtable_write");
|
||||
let _ = group
|
||||
.throughput(Throughput::Elements(10 * 1000))
|
||||
.bench_function("write", |b| {
|
||||
let ctx = BenchContext::new();
|
||||
b.iter(|| kvs.iter().for_each(|kv| ctx.write(kv)))
|
||||
});
|
||||
group.finish();
|
||||
}
|
||||
|
||||
criterion_group!(benches, bench_memtable_write);
|
||||
criterion_main!(benches);
|
||||
@@ -1,121 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
pub mod bench_memtable_read;
|
||||
pub mod bench_memtable_read_write_ratio;
|
||||
pub mod bench_memtable_write;
|
||||
pub mod util;
|
||||
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
use std::sync::Arc;
|
||||
|
||||
use api::v1::OpType;
|
||||
use datatypes::prelude::ScalarVectorBuilder;
|
||||
use datatypes::timestamp::TimestampMillisecond;
|
||||
use datatypes::vectors::{
|
||||
StringVectorBuilder, TimestampMillisecondVectorBuilder, UInt64VectorBuilder,
|
||||
};
|
||||
use rand::distributions::Alphanumeric;
|
||||
use rand::prelude::ThreadRng;
|
||||
use rand::Rng;
|
||||
use storage::memtable::KeyValues;
|
||||
use store_api::storage::SequenceNumber;
|
||||
|
||||
static NEXT_SEQUENCE: AtomicU64 = AtomicU64::new(0);
|
||||
|
||||
fn get_sequence() -> SequenceNumber {
|
||||
NEXT_SEQUENCE.fetch_add(1, Ordering::Relaxed)
|
||||
}
|
||||
|
||||
fn random_kv(rng: &mut ThreadRng, value_size: usize) -> ((i64, u64), (Option<u64>, String)) {
|
||||
let key0 = rng.gen_range(0..10000);
|
||||
let key1 = rng.gen::<u64>();
|
||||
let value1 = Some(rng.gen::<u64>());
|
||||
let value2 = rand::thread_rng()
|
||||
.sample_iter(&Alphanumeric)
|
||||
.take(value_size)
|
||||
.map(char::from)
|
||||
.collect();
|
||||
((key0, key1), (value1, value2))
|
||||
}
|
||||
type KeyTuple = (i64, u64);
|
||||
type ValueTuple = (Option<u64>, String);
|
||||
|
||||
fn random_kvs(len: usize, value_size: usize) -> (Vec<KeyTuple>, Vec<ValueTuple>) {
|
||||
let mut keys = Vec::with_capacity(len);
|
||||
let mut values = Vec::with_capacity(len);
|
||||
for _ in 0..len {
|
||||
let mut rng = rand::thread_rng();
|
||||
let (key, value) = random_kv(&mut rng, value_size);
|
||||
keys.push(key);
|
||||
values.push(value);
|
||||
}
|
||||
(keys, values)
|
||||
}
|
||||
|
||||
fn kvs_with_index(
|
||||
sequence: SequenceNumber,
|
||||
op_type: OpType,
|
||||
start_index_in_batch: usize,
|
||||
keys: &[(i64, u64)],
|
||||
values: &[(Option<u64>, String)],
|
||||
) -> KeyValues {
|
||||
let mut key_builders = (
|
||||
TimestampMillisecondVectorBuilder::with_capacity(keys.len()),
|
||||
UInt64VectorBuilder::with_capacity(keys.len()),
|
||||
);
|
||||
for key in keys {
|
||||
key_builders.0.push(Some(TimestampMillisecond::from(key.0)));
|
||||
key_builders.1.push(Some(key.1));
|
||||
}
|
||||
let row_keys = vec![Arc::new(key_builders.1.finish()) as _];
|
||||
|
||||
let mut value_builders = (
|
||||
UInt64VectorBuilder::with_capacity(values.len()),
|
||||
StringVectorBuilder::with_capacity(values.len()),
|
||||
);
|
||||
for value in values {
|
||||
value_builders.0.push(value.0);
|
||||
value_builders.1.push(Some(&value.1));
|
||||
}
|
||||
let row_values = vec![
|
||||
Arc::new(value_builders.0.finish()) as _,
|
||||
Arc::new(value_builders.1.finish()) as _,
|
||||
];
|
||||
KeyValues {
|
||||
sequence,
|
||||
op_type,
|
||||
start_index_in_batch,
|
||||
keys: row_keys,
|
||||
values: row_values,
|
||||
timestamp: Some(Arc::new(key_builders.0.finish()) as _),
|
||||
}
|
||||
}
|
||||
|
||||
fn generate_kv(kv_size: usize, start_index_in_batch: usize, value_size: usize) -> KeyValues {
|
||||
let (keys, values) = random_kvs(kv_size, value_size);
|
||||
kvs_with_index(
|
||||
get_sequence(),
|
||||
OpType::Put,
|
||||
start_index_in_batch,
|
||||
&keys,
|
||||
&values,
|
||||
)
|
||||
}
|
||||
|
||||
fn generate_kvs(kv_size: usize, size: usize, value_size: usize) -> Vec<KeyValues> {
|
||||
(0..size)
|
||||
.map(|i| generate_kv(kv_size, i, value_size))
|
||||
.collect()
|
||||
}
|
||||
@@ -1,51 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use storage::memtable::{IterContext, KeyValues, MemtableRef};
|
||||
|
||||
use crate::memtable::util::new_memtable;
|
||||
|
||||
pub struct BenchContext {
|
||||
memtable: MemtableRef,
|
||||
}
|
||||
impl Default for BenchContext {
|
||||
fn default() -> Self {
|
||||
BenchContext::new()
|
||||
}
|
||||
}
|
||||
impl BenchContext {
|
||||
pub fn new() -> BenchContext {
|
||||
BenchContext {
|
||||
memtable: new_memtable(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn write(&self, kvs: &KeyValues) {
|
||||
self.memtable.write(kvs).unwrap();
|
||||
}
|
||||
|
||||
pub fn read(&self, batch_size: usize) -> usize {
|
||||
let mut read_count = 0;
|
||||
let iter_ctx = IterContext {
|
||||
batch_size,
|
||||
..Default::default()
|
||||
};
|
||||
let iter = self.memtable.iter(iter_ctx).unwrap();
|
||||
for batch in iter {
|
||||
let _ = batch.unwrap();
|
||||
read_count += batch_size;
|
||||
}
|
||||
read_count
|
||||
}
|
||||
}
|
||||
@@ -1,40 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
pub mod bench_context;
|
||||
pub mod regiondesc_util;
|
||||
pub mod schema_util;
|
||||
|
||||
use datatypes::type_id::LogicalTypeId;
|
||||
use storage::memtable::{DefaultMemtableBuilder, MemtableBuilder, MemtableRef};
|
||||
use storage::metadata::RegionMetadata;
|
||||
use storage::schema::RegionSchemaRef;
|
||||
|
||||
use crate::memtable::util::regiondesc_util::RegionDescBuilder;
|
||||
|
||||
pub const TIMESTAMP_NAME: &str = "timestamp";
|
||||
|
||||
pub fn schema_for_test() -> RegionSchemaRef {
|
||||
let desc = RegionDescBuilder::new("bench")
|
||||
.push_field_column(("v1", LogicalTypeId::UInt64, true))
|
||||
.push_field_column(("v2", LogicalTypeId::String, true))
|
||||
.build();
|
||||
let metadata: RegionMetadata = desc.try_into().unwrap();
|
||||
|
||||
metadata.schema().clone()
|
||||
}
|
||||
|
||||
pub fn new_memtable() -> MemtableRef {
|
||||
DefaultMemtableBuilder::default().build(schema_for_test())
|
||||
}
|
||||
@@ -1,80 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use datatypes::prelude::ConcreteDataType;
|
||||
use store_api::storage::{
|
||||
ColumnDescriptor, ColumnDescriptorBuilder, ColumnFamilyDescriptorBuilder, ColumnId,
|
||||
RegionDescriptor, RowKeyDescriptorBuilder,
|
||||
};
|
||||
|
||||
use super::schema_util::ColumnDef;
|
||||
use super::TIMESTAMP_NAME;
|
||||
|
||||
pub struct RegionDescBuilder {
|
||||
name: String,
|
||||
last_column_id: ColumnId,
|
||||
key_builder: RowKeyDescriptorBuilder,
|
||||
default_cf_builder: ColumnFamilyDescriptorBuilder,
|
||||
}
|
||||
|
||||
impl RegionDescBuilder {
|
||||
pub fn new<T: Into<String>>(name: T) -> Self {
|
||||
let key_builder = RowKeyDescriptorBuilder::new(
|
||||
ColumnDescriptorBuilder::new(
|
||||
1,
|
||||
TIMESTAMP_NAME,
|
||||
ConcreteDataType::timestamp_millisecond_datatype(),
|
||||
)
|
||||
.is_nullable(false)
|
||||
.build()
|
||||
.unwrap(),
|
||||
);
|
||||
|
||||
Self {
|
||||
name: name.into(),
|
||||
last_column_id: 1,
|
||||
key_builder,
|
||||
default_cf_builder: ColumnFamilyDescriptorBuilder::default(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn push_field_column(mut self, column_def: ColumnDef) -> Self {
|
||||
let column = self.new_column(column_def);
|
||||
self.default_cf_builder = self.default_cf_builder.push_column(column);
|
||||
self
|
||||
}
|
||||
|
||||
pub fn build(self) -> RegionDescriptor {
|
||||
RegionDescriptor {
|
||||
id: 0.into(),
|
||||
name: self.name,
|
||||
row_key: self.key_builder.build().unwrap(),
|
||||
default_cf: self.default_cf_builder.build().unwrap(),
|
||||
extra_cfs: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
fn alloc_column_id(&mut self) -> ColumnId {
|
||||
self.last_column_id += 1;
|
||||
self.last_column_id
|
||||
}
|
||||
|
||||
fn new_column(&mut self, column_def: ColumnDef) -> ColumnDescriptor {
|
||||
let datatype = column_def.1.data_type();
|
||||
ColumnDescriptorBuilder::new(self.alloc_column_id(), column_def.0, datatype)
|
||||
.is_nullable(column_def.2)
|
||||
.build()
|
||||
.unwrap()
|
||||
}
|
||||
}
|
||||
@@ -1,46 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use datatypes::prelude::*;
|
||||
use datatypes::schema::{ColumnSchema, Schema, SchemaBuilder, SchemaRef};
|
||||
|
||||
/// Column definition: (name, datatype, is_nullable)
|
||||
pub type ColumnDef<'a> = (&'a str, LogicalTypeId, bool);
|
||||
|
||||
pub fn new_schema(column_defs: &[ColumnDef], timestamp_index: Option<usize>) -> Schema {
|
||||
let column_schemas: Vec<_> = column_defs
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(index, column_def)| {
|
||||
let datatype = column_def.1.data_type();
|
||||
if let Some(timestamp_index) = timestamp_index {
|
||||
ColumnSchema::new(column_def.0, datatype, column_def.2)
|
||||
.with_time_index(index == timestamp_index)
|
||||
} else {
|
||||
ColumnSchema::new(column_def.0, datatype, column_def.2)
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
|
||||
SchemaBuilder::try_from(column_schemas)
|
||||
.unwrap()
|
||||
.build()
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
pub fn new_schema_ref(column_defs: &[ColumnDef], timestamp_index: Option<usize>) -> SchemaRef {
|
||||
Arc::new(new_schema(column_defs, timestamp_index))
|
||||
}
|
||||
@@ -1,73 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use criterion::{criterion_group, criterion_main, Criterion};
|
||||
use storage::codec::{Decoder, Encoder};
|
||||
use storage::write_batch::{codec, WriteBatch};
|
||||
|
||||
use crate::wal::util::gen_new_batch_and_types;
|
||||
|
||||
/*
|
||||
-------------------------------------
|
||||
decode |
|
||||
-------------------------------------
|
||||
rows | protobuf | arrow |
|
||||
------------------------------------
|
||||
10 | 8.6485 us | 8.8028 us |
|
||||
------------------------------------
|
||||
100 | 63.850 us | 46.174 us |
|
||||
------------------------------------
|
||||
10000| 654.46 us | 433.58 us |
|
||||
------------------------------------
|
||||
*/
|
||||
|
||||
fn encode_arrow(batch: &WriteBatch, dst: &mut Vec<u8>) {
|
||||
let encoder = codec::PayloadEncoder::new();
|
||||
encoder.encode(batch.payload(), dst).unwrap();
|
||||
}
|
||||
|
||||
fn decode_arrow(dst: &[u8], mutation_types: &[i32]) {
|
||||
let decoder = codec::PayloadDecoder::new(mutation_types);
|
||||
let _ = decoder.decode(dst).unwrap();
|
||||
}
|
||||
|
||||
fn bench_wal_decode(c: &mut Criterion) {
|
||||
let (batch_10, types_10) = gen_new_batch_and_types(1);
|
||||
let (batch_100, types_100) = gen_new_batch_and_types(10);
|
||||
let (batch_10000, types_10000) = gen_new_batch_and_types(100);
|
||||
|
||||
let mut dst_arrow_10 = vec![];
|
||||
let mut dst_arrow_100 = vec![];
|
||||
let mut dst_arrow_10000 = vec![];
|
||||
|
||||
encode_arrow(&batch_10, &mut dst_arrow_10);
|
||||
encode_arrow(&batch_100, &mut dst_arrow_100);
|
||||
encode_arrow(&batch_10000, &mut dst_arrow_10000);
|
||||
|
||||
let mut group = c.benchmark_group("wal_decode");
|
||||
let _ = group
|
||||
.bench_function("arrow_decode_with_10_num_rows", |b| {
|
||||
b.iter(|| decode_arrow(&dst_arrow_10, &types_10))
|
||||
})
|
||||
.bench_function("arrow_decode_with_100_num_rows", |b| {
|
||||
b.iter(|| decode_arrow(&dst_arrow_100, &types_100))
|
||||
})
|
||||
.bench_function("arrow_decode_with_10000_num_rows", |b| {
|
||||
b.iter(|| decode_arrow(&dst_arrow_10000, &types_10000))
|
||||
});
|
||||
group.finish();
|
||||
}
|
||||
|
||||
criterion_group!(benches, bench_wal_decode);
|
||||
criterion_main!(benches);
|
||||
@@ -1,61 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use criterion::{criterion_group, criterion_main, Criterion};
|
||||
use storage::codec::Encoder;
|
||||
use storage::write_batch::{codec, WriteBatch};
|
||||
|
||||
use crate::wal::util::gen_new_batch_and_types;
|
||||
|
||||
/*
|
||||
-------------------------------------
|
||||
encode |
|
||||
-------------------------------------
|
||||
rows | protobuf | arrow |
|
||||
------------------------------------
|
||||
10 | 4.8732 us | 5.7388 us |
|
||||
------------------------------------
|
||||
100 | 40.928 us | 24.988 us |
|
||||
------------------------------------
|
||||
10000| 425.69 us | 229.74 us |
|
||||
------------------------------------
|
||||
*/
|
||||
|
||||
fn encode_arrow(batch: &WriteBatch) {
|
||||
let encoder = codec::PayloadEncoder::new();
|
||||
let mut dst = vec![];
|
||||
encoder.encode(batch.payload(), &mut dst).unwrap();
|
||||
}
|
||||
|
||||
fn bench_wal_encode(c: &mut Criterion) {
|
||||
let (batch_10, _) = gen_new_batch_and_types(1);
|
||||
let (batch_100, _) = gen_new_batch_and_types(10);
|
||||
let (batch_10000, _) = gen_new_batch_and_types(100);
|
||||
|
||||
let mut group = c.benchmark_group("wal_encode");
|
||||
let _ = group
|
||||
.bench_function("arrow_encode_with_10_num_rows", |b| {
|
||||
b.iter(|| encode_arrow(&batch_10))
|
||||
})
|
||||
.bench_function("arrow_encode_with_100_num_rows", |b| {
|
||||
b.iter(|| encode_arrow(&batch_100))
|
||||
})
|
||||
.bench_function("arrow_encode_with_10000_num_rows", |b| {
|
||||
b.iter(|| encode_arrow(&batch_10000))
|
||||
});
|
||||
group.finish();
|
||||
}
|
||||
|
||||
criterion_group!(benches, bench_wal_encode);
|
||||
criterion_main!(benches);
|
||||
@@ -1,64 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use criterion::{criterion_group, criterion_main, Criterion};
|
||||
use storage::codec::{Decoder, Encoder};
|
||||
use storage::write_batch::{codec, WriteBatch};
|
||||
|
||||
use crate::wal::util::gen_new_batch_and_types;
|
||||
|
||||
/*
|
||||
-------------------------------------
|
||||
encode & decode |
|
||||
-------------------------------------
|
||||
rows | protobuf | arrow |
|
||||
------------------------------------
|
||||
10 | 13.845 us | 15.093 us |
|
||||
------------------------------------
|
||||
100 | 106.70 us | 73.895 us |
|
||||
------------------------------------
|
||||
10000| 1.0860 ms | 680.12 us |
|
||||
------------------------------------
|
||||
*/
|
||||
|
||||
fn codec_arrow(batch: &WriteBatch, mutation_types: &[i32]) {
|
||||
let encoder = codec::PayloadEncoder::new();
|
||||
let mut dst = vec![];
|
||||
encoder.encode(batch.payload(), &mut dst).unwrap();
|
||||
|
||||
let decoder = codec::PayloadDecoder::new(mutation_types);
|
||||
let _ = decoder.decode(&dst).unwrap();
|
||||
}
|
||||
|
||||
fn bench_wal_encode_decode(c: &mut Criterion) {
|
||||
let (batch_10, types_10) = gen_new_batch_and_types(1);
|
||||
let (batch_100, types_100) = gen_new_batch_and_types(10);
|
||||
let (batch_10000, types_10000) = gen_new_batch_and_types(100);
|
||||
|
||||
let mut group = c.benchmark_group("wal_encode_decode");
|
||||
let _ = group
|
||||
.bench_function("arrow_encode_decode_with_10_num_rows", |b| {
|
||||
b.iter(|| codec_arrow(&batch_10, &types_10))
|
||||
})
|
||||
.bench_function("arrow_encode_decode_with_100_num_rows", |b| {
|
||||
b.iter(|| codec_arrow(&batch_100, &types_100))
|
||||
})
|
||||
.bench_function("arrow_encode_decode_with_10000_num_rows", |b| {
|
||||
b.iter(|| codec_arrow(&batch_10000, &types_10000))
|
||||
});
|
||||
group.finish();
|
||||
}
|
||||
|
||||
criterion_group!(benches, bench_wal_encode_decode);
|
||||
criterion_main!(benches);
|
||||
@@ -1,18 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
pub mod bench_decode;
|
||||
pub mod bench_encode;
|
||||
pub mod bench_wal;
|
||||
pub mod util;
|
||||
@@ -1,94 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
pub mod write_batch_util;
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
|
||||
use datatypes::prelude::ScalarVector;
|
||||
use datatypes::type_id::LogicalTypeId;
|
||||
use datatypes::vectors::{
|
||||
BooleanVector, Float64Vector, StringVector, TimestampMillisecondVector, UInt64Vector, VectorRef,
|
||||
};
|
||||
use rand::Rng;
|
||||
use storage::proto;
|
||||
use storage::write_batch::WriteBatch;
|
||||
use store_api::storage::WriteRequest;
|
||||
|
||||
pub fn new_test_batch() -> WriteBatch {
|
||||
write_batch_util::new_write_batch(
|
||||
&[
|
||||
("k1", LogicalTypeId::UInt64, false),
|
||||
("ts", LogicalTypeId::TimestampMillisecond, false),
|
||||
("v1", LogicalTypeId::Boolean, true),
|
||||
("4", LogicalTypeId::Float64, false),
|
||||
("5", LogicalTypeId::Float64, false),
|
||||
("6", LogicalTypeId::Float64, false),
|
||||
("7", LogicalTypeId::Float64, false),
|
||||
("8", LogicalTypeId::Float64, false),
|
||||
("9", LogicalTypeId::Float64, false),
|
||||
("10", LogicalTypeId::String, false),
|
||||
],
|
||||
Some(2),
|
||||
3,
|
||||
)
|
||||
}
|
||||
|
||||
pub fn gen_new_batch_and_types(putdate_nums: usize) -> (WriteBatch, Vec<i32>) {
|
||||
let mut batch = new_test_batch();
|
||||
let mut rng = rand::thread_rng();
|
||||
for _ in 0..putdate_nums {
|
||||
let mut intvs = [0u64; 10];
|
||||
let mut boolvs = [true; 10];
|
||||
let mut tsvs = [0i64; 10];
|
||||
let mut fvs = [0.0_f64; 10];
|
||||
let svs = [
|
||||
"value1_string",
|
||||
"value2_string",
|
||||
"value3_string",
|
||||
"value4_string",
|
||||
"value5_string",
|
||||
"value6_string",
|
||||
"value7_string",
|
||||
"value8_string",
|
||||
"value9_string",
|
||||
"value10_string",
|
||||
];
|
||||
rng.fill(&mut intvs[..]);
|
||||
rng.fill(&mut boolvs[..]);
|
||||
rng.fill(&mut tsvs[..]);
|
||||
rng.fill(&mut fvs[..]);
|
||||
let intv = Arc::new(UInt64Vector::from_slice(intvs)) as VectorRef;
|
||||
let boolv = Arc::new(BooleanVector::from(boolvs.to_vec())) as VectorRef;
|
||||
let tsv = Arc::new(TimestampMillisecondVector::from_values(tsvs)) as VectorRef;
|
||||
let fvs = Arc::new(Float64Vector::from_slice(fvs)) as VectorRef;
|
||||
let svs = Arc::new(StringVector::from_slice(&svs)) as VectorRef;
|
||||
let put_data = HashMap::from([
|
||||
("k1".to_string(), intv.clone()),
|
||||
("v1".to_string(), boolv),
|
||||
("ts".to_string(), tsv.clone()),
|
||||
("4".to_string(), fvs.clone()),
|
||||
("5".to_string(), fvs.clone()),
|
||||
("6".to_string(), fvs.clone()),
|
||||
("7".to_string(), fvs.clone()),
|
||||
("8".to_string(), fvs.clone()),
|
||||
("9".to_string(), fvs),
|
||||
("10".to_string(), svs),
|
||||
]);
|
||||
batch.put(put_data).unwrap();
|
||||
}
|
||||
let types = proto::wal::gen_mutation_types(batch.payload());
|
||||
(batch, types)
|
||||
}
|
||||
@@ -1,27 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use storage::write_batch::WriteBatch;
|
||||
|
||||
use crate::memtable::util::schema_util::{self, ColumnDef};
|
||||
|
||||
pub fn new_write_batch(
|
||||
column_defs: &[ColumnDef],
|
||||
timestamp_index: Option<usize>,
|
||||
row_key_end: usize,
|
||||
) -> WriteBatch {
|
||||
let schema = schema_util::new_schema_ref(column_defs, timestamp_index);
|
||||
|
||||
WriteBatch::new(schema, row_key_end)
|
||||
}
|
||||
@@ -1,19 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
fn main() {
|
||||
tonic_build::configure()
|
||||
.compile(&["proto/wal.proto"], &["."])
|
||||
.expect("compile proto");
|
||||
}
|
||||
@@ -1,14 +0,0 @@
|
||||
syntax = "proto3";
|
||||
|
||||
package greptime.storage.wal.v1;
|
||||
|
||||
// Header message of the `greptime.storage.wal.v1` package.
message WalHeader {
|
||||
uint64 last_manifest_version = 1;
|
||||
// Type of each mutation in the payload. Currently only the arrow payload uses this field.
|
||||
repeated MutationType mutation_types = 2;
|
||||
}
|
||||
|
||||
// Kind of a single mutation; used by `WalHeader.mutation_types`.
enum MutationType {
|
||||
DELETE = 0;
|
||||
PUT = 1;
|
||||
}
|
||||
@@ -1,451 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use async_trait::async_trait;
|
||||
use common_query::logical_plan::Expr;
|
||||
use common_recordbatch::OrderOption;
|
||||
use common_telemetry::logging;
|
||||
use common_time::range::TimestampRange;
|
||||
use snafu::ResultExt;
|
||||
use store_api::storage::{Chunk, ChunkReader, RegionId, SchemaRef, SequenceNumber};
|
||||
use table::predicate::{Predicate, TimeRangePredicateBuilder};
|
||||
|
||||
use crate::error::{self, Error, Result};
|
||||
use crate::memtable::{IterContext, MemtableRef};
|
||||
use crate::read::{
|
||||
Batch, BoxedBatchReader, ChainReader, DedupReader, MergeReaderBuilder, WindowedReader,
|
||||
};
|
||||
use crate::schema::{ProjectedSchema, ProjectedSchemaRef, RegionSchemaRef};
|
||||
use crate::sst::{AccessLayerRef, FileHandle, LevelMetas, ReadOptions};
|
||||
use crate::window_infer::{PlainWindowInference, WindowInfer};
|
||||
|
||||
/// Chunk reader implementation.
|
||||
// Now we use async-trait to implement the chunk reader, which is easier to implement than
|
||||
// using `Stream`, maybe change to `Stream` if we find out it is more efficient and have
|
||||
// necessary to do so.
|
||||
pub struct ChunkReaderImpl {
|
||||
schema: ProjectedSchemaRef,
|
||||
batch_reader: BoxedBatchReader,
|
||||
output_ordering: Option<Vec<OrderOption>>,
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl ChunkReader for ChunkReaderImpl {
|
||||
type Error = Error;
|
||||
|
||||
fn user_schema(&self) -> &SchemaRef {
|
||||
self.schema.projected_user_schema()
|
||||
}
|
||||
|
||||
async fn next_chunk(&mut self) -> Result<Option<Chunk>> {
|
||||
let batch = match self.batch_reader.next_batch().await? {
|
||||
Some(b) => b,
|
||||
None => return Ok(None),
|
||||
};
|
||||
Ok(Some(Chunk::new(batch.columns)))
|
||||
}
|
||||
|
||||
fn project_chunk(&self, chunk: Chunk) -> Chunk {
|
||||
let batch = Batch {
|
||||
columns: chunk.columns,
|
||||
};
|
||||
self.schema.batch_to_chunk(&batch)
|
||||
}
|
||||
|
||||
fn output_ordering(&self) -> Option<Vec<OrderOption>> {
|
||||
self.output_ordering.clone()
|
||||
}
|
||||
}
|
||||
|
||||
impl ChunkReaderImpl {
|
||||
pub fn new(
|
||||
schema: ProjectedSchemaRef,
|
||||
batch_reader: BoxedBatchReader,
|
||||
output_ordering: Option<Vec<OrderOption>>,
|
||||
) -> ChunkReaderImpl {
|
||||
ChunkReaderImpl {
|
||||
schema,
|
||||
batch_reader,
|
||||
output_ordering,
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn projected_schema(&self) -> &ProjectedSchemaRef {
|
||||
&self.schema
|
||||
}
|
||||
}
|
||||
|
||||
/// Builder to create a new [ChunkReaderImpl] from scan request.
|
||||
pub struct ChunkReaderBuilder {
|
||||
region_id: RegionId,
|
||||
schema: RegionSchemaRef,
|
||||
projection: Option<Vec<usize>>,
|
||||
filters: Vec<Expr>,
|
||||
sst_layer: AccessLayerRef,
|
||||
iter_ctx: IterContext,
|
||||
memtables: Vec<MemtableRef>,
|
||||
files_to_read: Vec<FileHandle>,
|
||||
output_ordering: Option<Vec<OrderOption>>,
|
||||
use_chain_reader: bool,
|
||||
}
|
||||
|
||||
impl ChunkReaderBuilder {
|
||||
pub fn new(region_id: RegionId, schema: RegionSchemaRef, sst_layer: AccessLayerRef) -> Self {
|
||||
ChunkReaderBuilder {
|
||||
region_id,
|
||||
schema,
|
||||
projection: None,
|
||||
filters: vec![],
|
||||
sst_layer,
|
||||
iter_ctx: IterContext::default(),
|
||||
memtables: Vec::new(),
|
||||
files_to_read: Vec::new(),
|
||||
output_ordering: None,
|
||||
use_chain_reader: false,
|
||||
}
|
||||
}
|
||||
|
||||
/// Reserve space for iterating `num` memtables.
|
||||
pub fn reserve_num_memtables(mut self, num: usize) -> Self {
|
||||
self.memtables.reserve(num);
|
||||
self
|
||||
}
|
||||
|
||||
pub fn projection(mut self, projection: Option<Vec<usize>>) -> Self {
|
||||
self.projection = projection;
|
||||
self
|
||||
}
|
||||
|
||||
pub fn filters(mut self, filters: Vec<Expr>) -> Self {
|
||||
self.filters = filters;
|
||||
self
|
||||
}
|
||||
|
||||
pub fn output_ordering(mut self, ordering: Option<Vec<OrderOption>>) -> Self {
|
||||
self.output_ordering = ordering;
|
||||
self
|
||||
}
|
||||
|
||||
/// Sets the batch size stored in the iteration context.
pub fn batch_size(self, batch_size: usize) -> Self {
    let mut builder = self;
    builder.iter_ctx.batch_size = batch_size;
    builder
}
|
||||
|
||||
/// Sets the visible sequence stored in the iteration context.
pub fn visible_sequence(self, sequence: SequenceNumber) -> Self {
    let mut builder = self;
    builder.iter_ctx.visible_sequence = sequence;
    builder
}
|
||||
|
||||
/// Adds one memtable to the list of memtables to read.
pub fn pick_memtables(self, memtables: MemtableRef) -> Self {
    let mut builder = self;
    builder.memtables.push(memtables);
    builder
}
|
||||
|
||||
/// Partition files and memtables according to their time windows and scan time windows
|
||||
/// one by one.
|
||||
///
|
||||
/// Note that compaction should not enable this.
|
||||
pub fn use_chain_reader(mut self, use_chain_reader: bool) -> Self {
|
||||
self.use_chain_reader = use_chain_reader;
|
||||
self
|
||||
}
|
||||
|
||||
/// Picks all SSTs in all levels
|
||||
pub fn pick_all_ssts(mut self, ssts: &LevelMetas) -> Result<Self> {
|
||||
let files = ssts.levels().iter().flat_map(|level| level.files());
|
||||
// Now we read all files, so just reserve enough space to hold all files.
|
||||
self.files_to_read.reserve(files.size_hint().0);
|
||||
for file in files {
|
||||
// We can't invoke async functions here, so we collects all files first, and
|
||||
// create the batch reader later in `ChunkReaderBuilder`.
|
||||
self.files_to_read.push(file.clone());
|
||||
}
|
||||
Ok(self)
|
||||
}
|
||||
|
||||
/// Picks given SSTs to read.
|
||||
pub fn pick_ssts(mut self, ssts: &[FileHandle]) -> Self {
|
||||
for file in ssts {
|
||||
self.files_to_read.push(file.clone());
|
||||
}
|
||||
self
|
||||
}
|
||||
|
||||
/// Try to infer time window from output ordering. If the result
|
||||
/// is `None` means the output ordering is not obeyed, otherwise
|
||||
/// means the output ordering is obeyed and is same with request.
|
||||
fn infer_time_windows(&self, output_ordering: &[OrderOption]) -> Option<Vec<TimestampRange>> {
|
||||
if output_ordering.is_empty() {
|
||||
return None;
|
||||
}
|
||||
let OrderOption { name, options } = &output_ordering[0];
|
||||
|
||||
if name != self.schema.timestamp_column_name() {
|
||||
return None;
|
||||
}
|
||||
let memtable_stats = self
|
||||
.memtables
|
||||
.iter()
|
||||
.filter(|m| m.num_rows() > 0) // Skip empty memtables.
|
||||
.map(|m| m.stats())
|
||||
.collect::<Vec<_>>();
|
||||
let files = self
|
||||
.files_to_read
|
||||
.iter()
|
||||
.map(FileHandle::meta)
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
Some(PlainWindowInference {}.infer_window(&files, &memtable_stats, options.descending))
|
||||
}
|
||||
|
||||
async fn build_windowed(
|
||||
self,
|
||||
schema: &ProjectedSchemaRef,
|
||||
time_range_predicate: &TimestampRange,
|
||||
windows: Vec<TimestampRange>,
|
||||
order_options: Vec<OrderOption>,
|
||||
) -> Result<BoxedBatchReader> {
|
||||
let mut readers = Vec::with_capacity(windows.len());
|
||||
for window in windows {
|
||||
let time_range_predicate = time_range_predicate.and(&window);
|
||||
let reader = self.build_reader(schema, &time_range_predicate).await?;
|
||||
readers.push(reader);
|
||||
}
|
||||
let windowed_reader = WindowedReader::new(schema.clone(), readers, order_options);
|
||||
Ok(Box::new(windowed_reader) as Box<_>)
|
||||
}
|
||||
|
||||
async fn build_reader(
|
||||
&self,
|
||||
schema: &ProjectedSchemaRef,
|
||||
time_range: &TimestampRange,
|
||||
) -> Result<BoxedBatchReader> {
|
||||
let num_sources = self.memtables.len() + self.files_to_read.len();
|
||||
let mut reader_builder = MergeReaderBuilder::with_capacity(schema.clone(), num_sources)
|
||||
.batch_size(self.iter_ctx.batch_size);
|
||||
|
||||
for mem in &self.memtables {
|
||||
let mut iter_ctx = self.iter_ctx.clone();
|
||||
iter_ctx.time_range = Some(*time_range);
|
||||
let iter = mem.iter(iter_ctx)?;
|
||||
reader_builder = reader_builder.push_batch_iter(iter);
|
||||
}
|
||||
|
||||
let predicate = Predicate::new(self.filters.clone());
|
||||
|
||||
let read_opts = ReadOptions {
|
||||
batch_size: self.iter_ctx.batch_size,
|
||||
projected_schema: schema.clone(),
|
||||
predicate,
|
||||
time_range: *time_range,
|
||||
};
|
||||
|
||||
let mut num_read_files = 0;
|
||||
for file in &self.files_to_read {
|
||||
if !Self::file_in_range(file, time_range) {
|
||||
logging::debug!(
|
||||
"Skip region {} file {:?}, predicate: {:?}",
|
||||
self.region_id,
|
||||
file,
|
||||
time_range
|
||||
);
|
||||
continue;
|
||||
}
|
||||
|
||||
let reader = self.sst_layer.read_sst(file.clone(), &read_opts).await?;
|
||||
reader_builder = reader_builder.push_batch_reader(reader);
|
||||
num_read_files += 1;
|
||||
}
|
||||
|
||||
logging::debug!(
|
||||
"build reader done, region_id: {}, time_range: {:?}, total_files: {}, num_read_files: {}",
|
||||
self.region_id,
|
||||
time_range,
|
||||
self.files_to_read.len(),
|
||||
num_read_files,
|
||||
);
|
||||
|
||||
let reader = reader_builder.build();
|
||||
let reader = DedupReader::new(schema.clone(), reader);
|
||||
Ok(Box::new(reader) as Box<_>)
|
||||
}
|
||||
|
||||
pub async fn build(mut self) -> Result<ChunkReaderImpl> {
|
||||
let time_range_predicate = self.build_time_range_predicate();
|
||||
let schema = Arc::new(
|
||||
ProjectedSchema::new(self.schema.clone(), self.projection.clone())
|
||||
.context(error::InvalidProjectionSnafu)?,
|
||||
);
|
||||
self.iter_ctx.projected_schema = Some(schema.clone());
|
||||
|
||||
let mut output_ordering = None;
|
||||
let reader = if let Some(ordering) = self.output_ordering.take() &&
|
||||
let Some(windows) = self.infer_time_windows(&ordering) {
|
||||
output_ordering = Some(ordering.clone());
|
||||
self.build_windowed(&schema, &time_range_predicate, windows, ordering)
|
||||
.await?
|
||||
} else if self.use_chain_reader {
|
||||
self.build_chained(&schema, &time_range_predicate).await?
|
||||
} else {
|
||||
self.build_reader(&schema, &time_range_predicate).await?
|
||||
};
|
||||
|
||||
Ok(ChunkReaderImpl::new(schema, reader, output_ordering))
|
||||
}
|
||||
|
||||
async fn build_chained(
|
||||
&self,
|
||||
schema: &ProjectedSchemaRef,
|
||||
time_range: &TimestampRange,
|
||||
) -> Result<BoxedBatchReader> {
|
||||
let windows = self.infer_window_for_chain_reader(time_range);
|
||||
|
||||
logging::debug!(
|
||||
"Infer window for chain reader, region_id: {}, memtables: {}, files: {}, num_windows: {}",
|
||||
self.region_id,
|
||||
self.memtables.len(),
|
||||
self.files_to_read.len(),
|
||||
windows.len(),
|
||||
);
|
||||
|
||||
let mut readers = Vec::with_capacity(windows.len());
|
||||
for window in &windows {
|
||||
let time_range = time_range.and(window);
|
||||
let reader = self.build_reader(schema, &time_range).await?;
|
||||
readers.push(reader);
|
||||
}
|
||||
|
||||
logging::debug!(
|
||||
"Build chain reader, region_id: {}, time_range: {:?}, num_readers: {}",
|
||||
self.region_id,
|
||||
time_range,
|
||||
readers.len(),
|
||||
);
|
||||
|
||||
let chain_reader = ChainReader::new(schema.clone(), readers);
|
||||
Ok(Box::new(chain_reader) as Box<_>)
|
||||
}
|
||||
|
||||
/// Build time range predicate from schema and filters.
|
||||
fn build_time_range_predicate(&self) -> TimestampRange {
|
||||
let Some(ts_col) = self.schema.user_schema().timestamp_column() else {
|
||||
return TimestampRange::min_to_max();
|
||||
};
|
||||
let unit = ts_col
|
||||
.data_type
|
||||
.as_timestamp()
|
||||
.expect("Timestamp column must have timestamp-compatible type")
|
||||
.unit();
|
||||
TimeRangePredicateBuilder::new(&ts_col.name, unit, &self.filters).build()
|
||||
}
|
||||
|
||||
/// Check if SST file's time range matches predicate.
|
||||
fn file_in_range(file: &FileHandle, predicate: &TimestampRange) -> bool {
|
||||
if predicate == &TimestampRange::min_to_max() {
|
||||
return true;
|
||||
}
|
||||
// end_timestamp of sst file is inclusive.
|
||||
let Some((start, end)) = *file.time_range() else {
|
||||
return true;
|
||||
};
|
||||
let file_ts_range = TimestampRange::new_inclusive(Some(start), Some(end));
|
||||
file_ts_range.intersects(predicate)
|
||||
}
|
||||
|
||||
/// Returns the time range of memtables to read.
|
||||
fn compute_memtable_range(&self) -> Option<TimestampRange> {
|
||||
let (min_timestamp, max_timestamp) = self
|
||||
.memtables
|
||||
.iter()
|
||||
.filter(|m| m.num_rows() > 0) // Skip empty memtables.
|
||||
.map(|m| {
|
||||
let stats = m.stats();
|
||||
(stats.min_timestamp, stats.max_timestamp)
|
||||
})
|
||||
.reduce(|acc, e| (acc.0.min(e.0), acc.1.max(e.1)))?;
|
||||
|
||||
logging::debug!(
|
||||
"Compute memtable range, region_id: {}, min: {:?}, max: {:?}",
|
||||
self.region_id,
|
||||
min_timestamp,
|
||||
max_timestamp,
|
||||
);
|
||||
|
||||
Some(TimestampRange::new_inclusive(
|
||||
Some(min_timestamp),
|
||||
Some(max_timestamp),
|
||||
))
|
||||
}
|
||||
|
||||
/// Infer time window for chain reader according to the time range of memtables and files.
|
||||
fn infer_window_for_chain_reader(&self, time_range: &TimestampRange) -> Vec<TimestampRange> {
|
||||
let mut memtable_range = self.compute_memtable_range();
|
||||
// file ranges: (start, end)
|
||||
let mut file_ranges = Vec::with_capacity(self.files_to_read.len());
|
||||
for file in &self.files_to_read {
|
||||
if !Self::file_in_range(file, time_range) || file.time_range().is_none() {
|
||||
continue;
|
||||
}
|
||||
// Safety: we have skip files whose range is `None`.
|
||||
let range = file.time_range().unwrap();
|
||||
|
||||
// Filter by memtable's time range.
|
||||
if let Some(mem_range) = &mut memtable_range {
|
||||
let file_range = TimestampRange::new_inclusive(Some(range.0), Some(range.1));
|
||||
if mem_range.intersects(&file_range) {
|
||||
// If the range of the SST intersects with the range of the
|
||||
// memtable, we merge it into the memtable's range.
|
||||
*mem_range = mem_range.or(&file_range);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
file_ranges.push((range.0, range.1));
|
||||
}
|
||||
|
||||
if file_ranges.is_empty() {
|
||||
return memtable_range.map(|range| vec![range]).unwrap_or_default();
|
||||
}
|
||||
|
||||
// Sort by start times.
|
||||
file_ranges.sort_unstable_by(|left, right| left.0.cmp(&right.0));
|
||||
|
||||
// Compute ranges for all SSTs.
|
||||
let mut time_ranges = Vec::with_capacity(file_ranges.len() + 1);
|
||||
// Safety: file_ranges is not empty.
|
||||
let mut prev =
|
||||
TimestampRange::new_inclusive(Some(file_ranges[0].0), Some(file_ranges[0].1));
|
||||
for file_range in &file_ranges[1..] {
|
||||
let current = TimestampRange::new_inclusive(Some(file_range.0), Some(file_range.1));
|
||||
if prev.intersects(¤t) {
|
||||
prev = prev.or(¤t);
|
||||
} else {
|
||||
time_ranges.push(prev);
|
||||
prev = current;
|
||||
}
|
||||
}
|
||||
time_ranges.push(prev);
|
||||
|
||||
if let Some(mem_range) = memtable_range {
|
||||
time_ranges.push(mem_range);
|
||||
// We have pushed the memtable range, resort the array.
|
||||
time_ranges.sort_unstable_by(|left, right| left.start().cmp(right.start()));
|
||||
}
|
||||
|
||||
time_ranges
|
||||
}
|
||||
}
|
||||
@@ -1,33 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use common_error::ext::ErrorExt;
|
||||
|
||||
pub trait Encoder {
|
||||
/// The type that is decoded.
|
||||
type Item;
|
||||
type Error: ErrorExt;
|
||||
|
||||
/// Encodes a message into the bytes buffer.
|
||||
fn encode(&self, item: &Self::Item, dst: &mut Vec<u8>) -> Result<(), Self::Error>;
|
||||
}
|
||||
|
||||
pub trait Decoder {
|
||||
/// The type that is decoded.
|
||||
type Item;
|
||||
type Error: ErrorExt;
|
||||
|
||||
/// Decodes a message from the bytes buffer.
|
||||
fn decode(&self, src: &[u8]) -> Result<Self::Item, Self::Error>;
|
||||
}
|
||||
@@ -1,193 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
pub mod noop;
|
||||
mod picker;
|
||||
mod scheduler;
|
||||
mod task;
|
||||
mod twcs;
|
||||
mod writer;
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use common_telemetry::warn;
|
||||
use common_time::timestamp::TimeUnit;
|
||||
use common_time::Timestamp;
|
||||
pub use picker::{LeveledTimeWindowPicker, Picker, PickerContext};
|
||||
pub use scheduler::{CompactionHandler, CompactionRequestImpl};
|
||||
use store_api::logstore::LogStore;
|
||||
use store_api::storage::CompactionStrategy;
|
||||
pub use task::{CompactionTask, CompactionTaskImpl};
|
||||
pub use twcs::TwcsPicker;
|
||||
|
||||
use crate::scheduler::Scheduler;
|
||||
use crate::sst::FileHandle;
|
||||
|
||||
pub type CompactionPickerRef<S> =
|
||||
Arc<dyn Picker<Request = CompactionRequestImpl<S>, Task = CompactionTaskImpl<S>> + Send + Sync>;
|
||||
|
||||
pub type CompactionSchedulerRef<S> =
|
||||
Arc<dyn Scheduler<Request = CompactionRequestImpl<S>> + Send + Sync>;
|
||||
|
||||
/// Infers the suitable time bucket duration.
|
||||
/// Now it simply find the max and min timestamp across all SSTs in level and fit the time span
|
||||
/// into time bucket.
|
||||
pub(crate) fn infer_time_bucket<'a>(files: impl Iterator<Item = &'a FileHandle>) -> i64 {
|
||||
let mut max_ts = Timestamp::new(i64::MIN, TimeUnit::Second);
|
||||
let mut min_ts = Timestamp::new(i64::MAX, TimeUnit::Second);
|
||||
|
||||
for f in files {
|
||||
if let Some((start, end)) = f.time_range() {
|
||||
min_ts = min_ts.min(*start);
|
||||
max_ts = max_ts.max(*end);
|
||||
} else {
|
||||
// we don't expect an SST file without time range,
|
||||
// it's either a bug or data corruption.
|
||||
warn!("Found SST file without time range metadata: {f:?}");
|
||||
}
|
||||
}
|
||||
|
||||
// safety: Convert whatever timestamp into seconds will not cause overflow.
|
||||
let min_sec = min_ts.convert_to(TimeUnit::Second).unwrap().value();
|
||||
let max_sec = max_ts.convert_to(TimeUnit::Second).unwrap().value();
|
||||
|
||||
max_sec
|
||||
.checked_sub(min_sec)
|
||||
.map(|span| TIME_BUCKETS.fit_time_bucket(span)) // return the max bucket on subtraction overflow.
|
||||
.unwrap_or_else(|| TIME_BUCKETS.max()) // safety: TIME_BUCKETS cannot be empty.
|
||||
}
|
||||
|
||||
/// A fixed list of seven candidate bucket sizes in seconds. The lookup in
/// `fit_time_bucket` relies on the list being sorted in ascending order.
pub(crate) struct TimeBuckets([i64; 7]);

impl TimeBuckets {
    /// Fits a time span into a bucket by picking the smallest bucket that
    /// covers `span_sec`. Returns the largest bucket if none covers it.
    ///
    /// # Panics
    ///
    /// Panics if `span_sec` is negative.
    fn fit_time_bucket(&self, span_sec: i64) -> i64 {
        assert!(span_sec >= 0);
        // An exact hit and the insertion point both name the first bucket that
        // is at least as large as the span.
        let (Ok(idx) | Err(idx)) = self.0.binary_search(&span_sec);
        self.0.get(idx).copied().unwrap_or_else(|| self.max())
    }

    /// Returns the bucket at `idx` (test helper).
    #[cfg(test)]
    fn get(&self, idx: usize) -> i64 {
        self.0[idx]
    }

    /// Returns the last bucket in the list.
    fn max(&self) -> i64 {
        self.0[self.0.len() - 1]
    }
}
|
||||
|
||||
/// A set of predefined time buckets.
|
||||
pub(crate) const TIME_BUCKETS: TimeBuckets = TimeBuckets([
|
||||
60 * 60, // one hour
|
||||
2 * 60 * 60, // two hours
|
||||
12 * 60 * 60, // twelve hours
|
||||
24 * 60 * 60, // one day
|
||||
7 * 24 * 60 * 60, // one week
|
||||
365 * 24 * 60 * 60, // one year
|
||||
10 * 365 * 24 * 60 * 60, // ten years
|
||||
]);
|
||||
|
||||
pub fn compaction_strategy_to_picker<S: LogStore>(
|
||||
strategy: &CompactionStrategy,
|
||||
) -> CompactionPickerRef<S> {
|
||||
match strategy {
|
||||
CompactionStrategy::Twcs(twcs_opts) => Arc::new(TwcsPicker::new(
|
||||
twcs_opts.max_active_window_files,
|
||||
twcs_opts.max_inactive_window_files,
|
||||
twcs_opts.time_window_seconds,
|
||||
)) as Arc<_>,
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use common_time::Timestamp;

    use super::*;
    use crate::file_purger::noop::new_noop_file_purger;
    use crate::sst::{FileHandle, FileId, FileMeta, Level};

    /// Test util to create file handles whose time range is given in milliseconds.
    pub fn new_file_handle(
        file_id: FileId,
        start_ts_millis: i64,
        end_ts_millis: i64,
        level: Level,
    ) -> FileHandle {
        let file_purger = new_noop_file_purger();
        let layer = Arc::new(crate::test_util::access_layer_util::MockAccessLayer {});
        let time_range = Some((
            Timestamp::new_millisecond(start_ts_millis),
            Timestamp::new_millisecond(end_ts_millis),
        ));
        let meta = FileMeta {
            region_id: 0.into(),
            file_id,
            time_range,
            level,
            file_size: 0,
        };
        FileHandle::new(meta, layer, file_purger)
    }

    #[test]
    fn test_time_bucket() {
        // (span in seconds, index of the expected bucket)
        let cases = [
            (1, 0),
            (60 * 60, 0),
            (60 * 60 + 1, 1),
            (TIME_BUCKETS.get(2) - 1, 2),
            (TIME_BUCKETS.get(2), 2),
            (TIME_BUCKETS.get(3) - 1, 3),
            (i64::MAX, 6),
        ];
        for &(span, idx) in cases.iter() {
            assert_eq!(TIME_BUCKETS.get(idx), TIME_BUCKETS.fit_time_bucket(span));
        }
    }

    #[test]
    fn test_infer_time_buckets() {
        let files = [
            new_file_handle(FileId::random(), 0, TIME_BUCKETS.get(0) * 1000 - 1, 0),
            new_file_handle(FileId::random(), 1, 10_000, 0),
        ];
        assert_eq!(TIME_BUCKETS.get(0), infer_time_bucket(files.iter()));
    }
}
|
||||
@@ -1,91 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::fmt::{Debug, Formatter};
|
||||
use std::marker::PhantomData;
|
||||
|
||||
use store_api::storage::RegionId;
|
||||
|
||||
use crate::compaction::{CompactionTask, Picker};
|
||||
use crate::error::Result;
|
||||
use crate::scheduler::{Request, Scheduler};
|
||||
|
||||
/// A stateless scheduler placeholder; the type parameter `R` only fixes the request type.
pub struct NoopCompactionScheduler<R> {
    _phantom_data: PhantomData<R>,
}

// Implemented by hand so that no `R: Default` bound is required.
impl<R> Default for NoopCompactionScheduler<R> {
    fn default() -> Self {
        Self {
            _phantom_data: PhantomData,
        }
    }
}

// Implemented by hand so that no `R: Debug` bound is required.
impl<R> Debug for NoopCompactionScheduler<R> {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        let mut builder = f.debug_struct("NoopCompactionScheduler<...>");
        builder.finish()
    }
}
|
||||
|
||||
/// Request type used together with the no-op compaction picker.
#[derive(Debug, Default)]
pub struct NoopCompactionRequest;
|
||||
|
||||
/// A picker that holds no state.
#[derive(Debug, Default)]
pub struct NoopCompactionPicker;
|
||||
|
||||
impl Picker for NoopCompactionPicker {
|
||||
type Request = NoopCompactionRequest;
|
||||
type Task = NoopCompactionTask;
|
||||
|
||||
fn pick(&self, _req: &Self::Request) -> Result<Option<Self::Task>> {
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
/// A compaction task without any state.
#[derive(Debug)]
pub struct NoopCompactionTask;
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl CompactionTask for NoopCompactionTask {
|
||||
async fn run(self) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl Request for NoopCompactionRequest {
|
||||
type Key = RegionId;
|
||||
|
||||
fn key(&self) -> Self::Key {
|
||||
RegionId::from(0)
|
||||
}
|
||||
|
||||
fn complete(self, _result: Result<()>) {}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl<R> Scheduler for NoopCompactionScheduler<R>
|
||||
where
|
||||
R: Request<Key = RegionId>,
|
||||
{
|
||||
type Request = R;
|
||||
|
||||
fn schedule(&self, _request: Self::Request) -> Result<bool> {
|
||||
Ok(true)
|
||||
}
|
||||
|
||||
async fn stop(&self, _await_termination: bool) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -1,432 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::fmt::{Debug, Formatter};
|
||||
use std::marker::PhantomData;
|
||||
use std::time::Duration;
|
||||
|
||||
use common_telemetry::{debug, error, info, warn};
|
||||
use common_time::timestamp::TimeUnit;
|
||||
use common_time::timestamp_millis::BucketAligned;
|
||||
use common_time::Timestamp;
|
||||
use snafu::ResultExt;
|
||||
use store_api::logstore::LogStore;
|
||||
|
||||
use crate::compaction::infer_time_bucket;
|
||||
use crate::compaction::scheduler::CompactionRequestImpl;
|
||||
use crate::compaction::task::{CompactionOutput, CompactionTask, CompactionTaskImpl};
|
||||
use crate::error::{Result, TtlCalculationSnafu};
|
||||
use crate::scheduler::Request;
|
||||
use crate::sst::{FileHandle, FileId, LevelMeta};
|
||||
|
||||
/// Picker picks input SST files and builds the compaction task.
|
||||
/// Different compaction strategy may implement different pickers.
|
||||
pub trait Picker: Debug + Send + 'static {
|
||||
type Request: Request;
|
||||
type Task: CompactionTask;
|
||||
|
||||
fn pick(&self, req: &Self::Request) -> Result<Option<Self::Task>>;
|
||||
}
|
||||
|
||||
pub(crate) fn get_expired_ssts(
|
||||
levels: &[LevelMeta],
|
||||
ttl: Option<Duration>,
|
||||
now: Timestamp,
|
||||
) -> Result<Vec<FileHandle>> {
|
||||
let Some(ttl) = ttl else {
|
||||
return Ok(vec![]);
|
||||
};
|
||||
|
||||
let expire_time = now.sub_duration(ttl).context(TtlCalculationSnafu)?;
|
||||
|
||||
let expired_ssts = levels
|
||||
.iter()
|
||||
.flat_map(|l| l.get_expired_files(&expire_time).into_iter())
|
||||
.collect();
|
||||
Ok(expired_ssts)
|
||||
}
|
||||
|
||||
/// Options passed to a picker while it selects files.
pub struct PickerContext {
    /// Explicitly configured compaction time window, if any.
    compaction_time_window: Option<i64>,
}

impl PickerContext {
    /// Creates a context with the given (optional) compaction time window.
    pub fn with(compaction_time_window: Option<i64>) -> Self {
        let window = compaction_time_window;
        PickerContext {
            compaction_time_window: window,
        }
    }

    /// Returns the configured compaction time window, if any.
    pub fn compaction_time_window(&self) -> Option<i64> {
        let PickerContext {
            compaction_time_window,
        } = self;
        *compaction_time_window
    }
}
|
||||
|
||||
/// `LeveledTimeWindowPicker` only handles level 0 to level 1 compaction in a time-window tiered
/// manner. It picks all SSTs in level 0 and writes rows in these SSTs to a new file partitioned
/// by an inferred time bucket in level 1.
pub struct LeveledTimeWindowPicker<S> {
    _phantom_data: PhantomData<S>,
}

// Implemented by hand so that no `S: Debug` bound is required.
impl<S> Debug for LeveledTimeWindowPicker<S> {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        write!(f, "LeveledTimeWindowPicker{{..}}")
    }
}

// Implemented by hand so that no `S: Default` bound is required.
impl<S> Default for LeveledTimeWindowPicker<S> {
    fn default() -> Self {
        Self {
            _phantom_data: PhantomData,
        }
    }
}

impl<S> LeveledTimeWindowPicker<S> {
    /// Creates a new picker; equivalent to [`Default::default`].
    pub fn new() -> Self {
        Self::default()
    }
}
|
||||
|
||||
impl<S: LogStore> Picker for LeveledTimeWindowPicker<S> {
|
||||
type Request = CompactionRequestImpl<S>;
|
||||
type Task = CompactionTaskImpl<S>;
|
||||
|
||||
fn pick(&self, req: &CompactionRequestImpl<S>) -> Result<Option<CompactionTaskImpl<S>>> {
|
||||
let levels = &req.levels();
|
||||
let expired_ssts = get_expired_ssts(levels.levels(), req.ttl, Timestamp::current_millis())
|
||||
.map_err(|e| {
|
||||
error!(e;"Failed to get region expired SST files, region: {}, ttl: {:?}", req.region_id, req.ttl);
|
||||
e
|
||||
})
|
||||
.unwrap_or_default();
|
||||
|
||||
if !expired_ssts.is_empty() {
|
||||
info!(
|
||||
"Expired SSTs in region {}: {:?}",
|
||||
req.region_id, expired_ssts
|
||||
);
|
||||
// here we mark expired SSTs as compacting to avoid them being picked.
|
||||
expired_ssts.iter().for_each(|f| f.mark_compacting(true));
|
||||
}
|
||||
|
||||
let ctx = &PickerContext::with(req.compaction_time_window);
|
||||
|
||||
let mut outputs = vec![];
|
||||
for level_num in 0..levels.level_num() {
|
||||
let level = levels.level(level_num as u8);
|
||||
let compaction_time_window = Self::pick_level(ctx, level, &mut outputs);
|
||||
|
||||
if outputs.is_empty() {
|
||||
debug!(
|
||||
"No SST file can be compacted at level {}, path: {:?}",
|
||||
level_num, req.sst_layer
|
||||
);
|
||||
continue;
|
||||
}
|
||||
|
||||
debug!(
|
||||
"Found SST files to compact {:?} on level: {}, compaction window: {:?}",
|
||||
outputs, level_num, compaction_time_window,
|
||||
);
|
||||
return Ok(Some(CompactionTaskImpl {
|
||||
schema: req.schema(),
|
||||
sst_layer: req.sst_layer.clone(),
|
||||
outputs,
|
||||
writer: req.writer.clone(),
|
||||
shared_data: req.shared.clone(),
|
||||
wal: req.wal.clone(),
|
||||
manifest: req.manifest.clone(),
|
||||
expired_ssts,
|
||||
sst_write_buffer_size: req.sst_write_buffer_size,
|
||||
compaction_time_window,
|
||||
reschedule_on_finish: req.reschedule_on_finish,
|
||||
}));
|
||||
}
|
||||
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
impl<S> LeveledTimeWindowPicker<S> {
|
||||
fn pick_level(
|
||||
ctx: &PickerContext,
|
||||
level: &LevelMeta,
|
||||
results: &mut Vec<CompactionOutput>,
|
||||
) -> Option<i64> {
|
||||
// SimpleTimeWindowStrategy only handles level 0 to level 1 compaction.
|
||||
if level.level() != 0 {
|
||||
return None;
|
||||
}
|
||||
let files = find_compactable_files(level);
|
||||
debug!("Compactable files found: {:?}", files);
|
||||
if files.is_empty() {
|
||||
return None;
|
||||
}
|
||||
let time_window = ctx.compaction_time_window().unwrap_or_else(|| {
|
||||
let inferred = infer_time_bucket(files.iter());
|
||||
debug!(
|
||||
"Compaction window is not present, inferring from files: {:?}",
|
||||
inferred
|
||||
);
|
||||
inferred
|
||||
});
|
||||
let buckets = calculate_time_buckets(time_window, &files);
|
||||
debug!("File bucket:{}, file groups: {:?}", time_window, buckets);
|
||||
|
||||
results.extend(buckets.into_iter().map(|(bound, files)| CompactionOutput {
|
||||
output_file_id: FileId::random(),
|
||||
output_level: 1,
|
||||
time_window_bound: bound,
|
||||
time_window_sec: time_window,
|
||||
inputs: files,
|
||||
// strict window is used in simple time window strategy in that rows in one file
|
||||
// may get compacted to multiple destinations.
|
||||
strict_window: true,
|
||||
}));
|
||||
Some(time_window)
|
||||
}
|
||||
}
|
||||
|
||||
/// Finds files that can be compacted in given level.
|
||||
/// Currently they're files that is not currently under compaction.
|
||||
#[inline]
|
||||
fn find_compactable_files(level: &LevelMeta) -> Vec<FileHandle> {
|
||||
level.files().filter(|f| !f.compacting()).cloned().collect()
|
||||
}
|
||||
|
||||
/// Calculates buckets for files. If file does not contain a time range in metadata, it will be
|
||||
/// assigned to a special bucket `i64::MAX` (normally no timestamp can be aligned to this bucket)
|
||||
/// so that all files without timestamp can be compacted together.
|
||||
fn calculate_time_buckets(bucket_sec: i64, files: &[FileHandle]) -> HashMap<i64, Vec<FileHandle>> {
|
||||
let mut buckets = HashMap::new();
|
||||
|
||||
for file in files {
|
||||
if let Some((start, end)) = file.time_range() {
|
||||
let bounds = file_time_bucket_span(
|
||||
start.convert_to(TimeUnit::Second).unwrap().value(),
|
||||
end.convert_to(TimeUnit::Second).unwrap().value(),
|
||||
bucket_sec,
|
||||
);
|
||||
for bound in bounds {
|
||||
buckets
|
||||
.entry(bound)
|
||||
.or_insert_with(Vec::new)
|
||||
.push(file.clone());
|
||||
}
|
||||
} else {
|
||||
warn!("Found corrupted SST without timestamp bounds: {:?}", file);
|
||||
}
|
||||
}
|
||||
buckets
|
||||
}
|
||||
|
||||
/// Calculates timestamp span between start and end timestamp.
|
||||
fn file_time_bucket_span(start_sec: i64, end_sec: i64, bucket_sec: i64) -> Vec<i64> {
|
||||
assert!(start_sec <= end_sec);
|
||||
|
||||
// if timestamp is between `[i64::MIN, i64::MIN.align_by_bucket(bucket)]`, which cannot
|
||||
// be aligned to a valid i64 bound, simply return `i64::MIN` rather than just underflow.
|
||||
let mut start_aligned = start_sec.align_by_bucket(bucket_sec).unwrap_or(i64::MIN);
|
||||
let end_aligned = end_sec.align_by_bucket(bucket_sec).unwrap_or(i64::MIN);
|
||||
|
||||
let mut res = Vec::with_capacity(((end_aligned - start_aligned) / bucket_sec + 1) as usize);
|
||||
while start_aligned < end_aligned {
|
||||
res.push(start_aligned);
|
||||
start_aligned += bucket_sec;
|
||||
}
|
||||
res.push(end_aligned);
|
||||
res
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use std::collections::{HashMap, HashSet};
    use std::sync::Arc;

    use super::*;
    use crate::compaction::tests::new_file_handle;
    use crate::compaction::TIME_BUCKETS;
    use crate::file_purger::noop::new_noop_file_purger;
    use crate::sst::{FileId, Level, LevelMetas};

    #[test]
    fn test_time_bucket_span() {
        // (start, end, bucket, expected bounds)
        let cases: [(i64, i64, i64, Vec<i64>); 4] = [
            (1, 9, 10, vec![0]),
            (1, 10, 10, vec![0, 10]),
            (-10, -1, 10, vec![-10]),
            (-10, 0, 10, vec![-10, 0]),
        ];
        for (start, end, bucket, expected) in cases.iter() {
            assert_eq!(*expected, file_time_bucket_span(*start, *end, *bucket));
        }
    }

    #[test]
    fn test_time_bucket_span_large() {
        let upper_bounds = vec![
            (i64::MAX - 10).align_by_bucket(10).unwrap(),
            i64::MAX.align_by_bucket(10).unwrap(),
        ];
        assert_eq!(
            upper_bounds,
            file_time_bucket_span(i64::MAX - 10, i64::MAX, 10)
        );

        // Near `i64::MIN` the start cannot be aligned, so the span begins at `i64::MIN`.
        for bucket in 1..100 {
            let end = i64::MIN + bucket;
            let lower_bounds = vec![i64::MIN, end.align_by_bucket(bucket).unwrap()];
            assert_eq!(lower_bounds, file_time_bucket_span(i64::MIN, end, bucket));
        }
    }

    /// Creates level-0 file handles from `(file id, start millis, end millis)` tuples.
    fn new_file_handles(input: &[(FileId, i64, i64)]) -> Vec<FileHandle> {
        let mut handles = Vec::with_capacity(input.len());
        for (file_id, start, end) in input {
            handles.push(new_file_handle(*file_id, *start, *end, 0));
        }
        handles
    }

    /// Asserts that each expected bucket contains exactly the expected file ids.
    fn check_bucket_calculation(
        bucket_sec: i64,
        files: Vec<FileHandle>,
        expected: &[(i64, &[FileId])],
    ) {
        let res = calculate_time_buckets(bucket_sec, &files);

        let mut expected_sets = HashMap::new();
        for (bucket, file_ids) in expected {
            expected_sets.insert(*bucket, file_ids.iter().copied().collect::<HashSet<_>>());
        }

        for (bucket, file_ids) in expected_sets {
            let actual: HashSet<_> = res
                .get(&bucket)
                .unwrap()
                .iter()
                .map(|f| f.file_id())
                .collect();
            assert_eq!(
                file_ids, actual,
                "bucket: {bucket}, expected: {file_ids:?}, actual: {actual:?}",
            );
        }
    }

    #[test]
    fn test_calculate_time_buckets() {
        let file_id_a = FileId::random();
        let file_id_b = FileId::random();

        // simple case, files with disjoint time ranges
        check_bucket_calculation(
            10,
            new_file_handles(&[(file_id_a, 0, 9000), (file_id_b, 10000, 19000)]),
            &[(0, &[file_id_a]), (10, &[file_id_b])],
        );

        // files across buckets
        check_bucket_calculation(
            10,
            new_file_handles(&[(file_id_a, 0, 10001), (file_id_b, 10000, 19000)]),
            &[(0, &[file_id_a]), (10, &[file_id_a, file_id_b])],
        );
        check_bucket_calculation(
            10,
            new_file_handles(&[(file_id_a, 0, 10000)]),
            &[(0, &[file_id_a]), (10, &[file_id_a])],
        );

        // file with a large time range
        let smallest_bucket = TIME_BUCKETS.get(0);
        let large_span = TIME_BUCKETS.get(4);
        let file_id_array = &[file_id_a];
        let expected = (0..(large_span / smallest_bucket))
            .map(|b| (b * smallest_bucket, file_id_array as _))
            .collect::<Vec<_>>();
        check_bucket_calculation(
            smallest_bucket,
            new_file_handles(&[(file_id_a, 0, large_span * 1000)]),
            &expected,
        );
    }

    /// Describes one TTL scenario: the files, the TTL, the current time and the indices (into
    /// `files`) of the files expected to be reported as expired.
    struct TtlTester {
        files: Vec<(FileId, i64, i64, Level)>,
        ttl: Option<Duration>,
        expired: Vec<usize>,
        now: Timestamp,
    }

    impl TtlTester {
        fn check(&self) {
            let mut expected_expired = HashSet::new();
            for idx in &self.expired {
                expected_expired.insert(self.files[*idx].0);
            }

            let file_purger = new_noop_file_purger();
            let layer = Arc::new(crate::test_util::access_layer_util::MockAccessLayer {});
            let file_metas = self
                .files
                .iter()
                .map(|(file_id, start_ts, end_ts, level)| {
                    new_file_handle(*file_id, *start_ts, *end_ts, *level).meta()
                })
                .collect::<Vec<_>>();
            let levels = LevelMetas::new(layer, file_purger).merge(
                file_metas.into_iter(),
                vec![].into_iter(),
                None,
            );

            let mut expired = HashSet::new();
            for file in get_expired_ssts(levels.levels(), self.ttl, self.now).unwrap() {
                expired.insert(file.file_id());
            }
            assert_eq!(expected_expired, expired);
        }
    }

    #[test]
    fn test_find_expired_ssts() {
        // The two scenarios only differ in the end timestamp of the first file.
        let check = |first_file_end: i64, expired: Vec<usize>| {
            TtlTester {
                files: vec![
                    (FileId::random(), 8000, first_file_end, 0),
                    (FileId::random(), 10000, 11000, 0),
                    (FileId::random(), 8000, 11000, 1),
                    (FileId::random(), 2000, 3000, 1),
                ],
                ttl: Some(Duration::from_secs(1)),
                expired,
                now: Timestamp::new_second(10),
            }
            .check();
        };

        check(9000, vec![3]);
        check(8999, vec![0, 3]);
    }
}
|
||||
@@ -1,157 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::marker::PhantomData;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use common_base::readable_size::ReadableSize;
|
||||
use common_telemetry::{debug, error, info};
|
||||
use store_api::logstore::LogStore;
|
||||
use store_api::storage::RegionId;
|
||||
use tokio::sync::oneshot::Sender;
|
||||
use tokio::sync::Notify;
|
||||
|
||||
use crate::compaction::task::CompactionTask;
|
||||
use crate::compaction::CompactionPickerRef;
|
||||
use crate::error::Result;
|
||||
use crate::manifest::region::RegionManifest;
|
||||
use crate::region::{RegionWriterRef, SharedDataRef};
|
||||
use crate::scheduler::rate_limit::BoxedRateLimitToken;
|
||||
use crate::scheduler::{Handler, Request};
|
||||
use crate::schema::RegionSchemaRef;
|
||||
use crate::sst::AccessLayerRef;
|
||||
use crate::version::LevelMetasRef;
|
||||
use crate::wal::Wal;
|
||||
|
||||
impl<S: LogStore> Request for CompactionRequestImpl<S> {
|
||||
type Key = RegionId;
|
||||
|
||||
#[inline]
|
||||
fn key(&self) -> RegionId {
|
||||
self.region_id
|
||||
}
|
||||
|
||||
fn complete(self, result: Result<()>) {
|
||||
if let Some(sender) = self.sender {
|
||||
// We don't care the send result as callers might not
|
||||
// wait the result.
|
||||
let _ = sender.send(result);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Region compaction request.
|
||||
pub struct CompactionRequestImpl<S: LogStore> {
|
||||
pub region_id: RegionId,
|
||||
pub sst_layer: AccessLayerRef,
|
||||
pub writer: RegionWriterRef<S>,
|
||||
pub shared: SharedDataRef,
|
||||
pub manifest: RegionManifest,
|
||||
pub wal: Wal<S>,
|
||||
pub ttl: Option<Duration>,
|
||||
pub compaction_time_window: Option<i64>,
|
||||
/// Compaction result sender.
|
||||
pub sender: Option<Sender<Result<()>>>,
|
||||
pub picker: CompactionPickerRef<S>,
|
||||
pub sst_write_buffer_size: ReadableSize,
|
||||
/// Whether to immediately reschedule another compaction when finished.
|
||||
pub reschedule_on_finish: bool,
|
||||
}
|
||||
|
||||
impl<S: LogStore> CompactionRequestImpl<S> {
|
||||
#[inline]
|
||||
pub(crate) fn schema(&self) -> RegionSchemaRef {
|
||||
self.shared.version_control.current().schema().clone()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub(crate) fn levels(&self) -> LevelMetasRef {
|
||||
self.shared.version_control.current().ssts().clone()
|
||||
}
|
||||
}
|
||||
|
||||
pub struct CompactionHandler<S: LogStore> {
|
||||
_phantom_data: PhantomData<S>,
|
||||
#[cfg(test)]
|
||||
pub pending_tasks: Arc<tokio::sync::RwLock<Vec<tokio::task::JoinHandle<()>>>>,
|
||||
}
|
||||
|
||||
impl<S: LogStore> Default for CompactionHandler<S> {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
_phantom_data: Default::default(),
|
||||
#[cfg(test)]
|
||||
pending_tasks: Arc::new(Default::default()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<S: LogStore> CompactionHandler<S> {
    /// Creates a handler that records the join handles of spawned tasks in `tasks`.
    #[cfg(test)]
    pub fn new_with_pending_tasks(
        tasks: Arc<tokio::sync::RwLock<Vec<tokio::task::JoinHandle<()>>>>,
    ) -> Self {
        Self {
            _phantom_data: PhantomData,
            pending_tasks: tasks,
        }
    }
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl<S> Handler for CompactionHandler<S>
|
||||
where
|
||||
S: LogStore,
|
||||
{
|
||||
type Request = CompactionRequestImpl<S>;
|
||||
|
||||
async fn handle_request(
|
||||
&self,
|
||||
req: Self::Request,
|
||||
token: BoxedRateLimitToken,
|
||||
finish_notifier: Arc<Notify>,
|
||||
) -> Result<()> {
|
||||
let region_id = req.key();
|
||||
let Some(task) = req.picker.pick(&req)? else {
|
||||
info!("No file needs compaction in region: {:?}", region_id);
|
||||
req.complete(Ok(()));
|
||||
return Ok(());
|
||||
};
|
||||
|
||||
debug!("Compaction task, region: {:?}, task: {:?}", region_id, task);
|
||||
// TODO(hl): we need to keep a track of task handle here to allow task cancellation.
|
||||
let _handle = common_runtime::spawn_bg(async move {
|
||||
if let Err(e) = task.run().await {
|
||||
// TODO(hl): maybe resubmit compaction task on failure?
|
||||
error!(e; "Failed to compact region: {:?}", region_id);
|
||||
|
||||
req.complete(Err(e));
|
||||
} else {
|
||||
info!("Successfully compacted region: {:?}", region_id);
|
||||
|
||||
req.complete(Ok(()));
|
||||
}
|
||||
// releases rate limit token
|
||||
token.try_release();
|
||||
// notify scheduler to schedule next task when current task finishes.
|
||||
finish_notifier.notify_one();
|
||||
});
|
||||
|
||||
#[cfg(test)]
|
||||
self.pending_tasks.write().await.push(_handle);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -1,309 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::collections::HashSet;
|
||||
use std::fmt::{Debug, Formatter};
|
||||
|
||||
use common_base::readable_size::ReadableSize;
|
||||
use common_telemetry::{debug, error, info};
|
||||
use itertools::Itertools;
|
||||
use snafu::ResultExt;
|
||||
use store_api::logstore::LogStore;
|
||||
use store_api::storage::{CompactContext, RegionId};
|
||||
|
||||
use crate::compaction::writer::build_sst_reader;
|
||||
use crate::error;
|
||||
use crate::error::Result;
|
||||
use crate::manifest::action::RegionEdit;
|
||||
use crate::manifest::region::RegionManifest;
|
||||
use crate::region::{RegionWriterRef, SharedDataRef, WriterCompactRequest};
|
||||
use crate::schema::RegionSchemaRef;
|
||||
use crate::sst::{
|
||||
AccessLayerRef, FileHandle, FileId, FileMeta, Level, Source, SstInfo, WriteOptions,
|
||||
};
|
||||
use crate::wal::Wal;
|
||||
|
||||
/// Maximum number of compaction output jobs that one task spawns at the same time.
const MAX_PARALLEL_COMPACTION: usize = 8;
|
||||
|
||||
#[async_trait::async_trait]
|
||||
pub trait CompactionTask: Debug + Send + Sync + 'static {
|
||||
async fn run(self) -> Result<()>;
|
||||
}
|
||||
|
||||
pub struct CompactionTaskImpl<S: LogStore> {
|
||||
pub schema: RegionSchemaRef,
|
||||
pub sst_layer: AccessLayerRef,
|
||||
pub outputs: Vec<CompactionOutput>,
|
||||
pub writer: RegionWriterRef<S>,
|
||||
pub shared_data: SharedDataRef,
|
||||
pub wal: Wal<S>,
|
||||
pub manifest: RegionManifest,
|
||||
pub expired_ssts: Vec<FileHandle>,
|
||||
pub sst_write_buffer_size: ReadableSize,
|
||||
pub compaction_time_window: Option<i64>,
|
||||
pub reschedule_on_finish: bool,
|
||||
}
|
||||
|
||||
impl<S: LogStore> Debug for CompactionTaskImpl<S> {
|
||||
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("CompactionTaskImpl")
|
||||
.field("region_name", &self.shared_data.name())
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
impl<S: LogStore> Drop for CompactionTaskImpl<S> {
    /// Clears the compacting flag of the inputs of all outputs still owned by the task.
    ///
    /// NOTE(review): `merge_ssts` drains `outputs`, so once a run has started merging there
    /// appears to be nothing left here to unmark — confirm.
    fn drop(&mut self) {
        let compacting = false;
        self.mark_files_compacting(compacting);
    }
}
|
||||
|
||||
impl<S: LogStore> CompactionTaskImpl<S> {
|
||||
/// Compacts inputs SSTs, returns `(output file, compacted input file)`.
|
||||
async fn merge_ssts(&mut self) -> Result<(HashSet<FileMeta>, HashSet<FileMeta>)> {
|
||||
let mut futs = Vec::with_capacity(self.outputs.len());
|
||||
let mut compacted_inputs = HashSet::new();
|
||||
let region_id = self.shared_data.id();
|
||||
for output in self.outputs.drain(..) {
|
||||
let schema = self.schema.clone();
|
||||
let sst_layer = self.sst_layer.clone();
|
||||
let sst_write_buffer_size = self.sst_write_buffer_size;
|
||||
compacted_inputs.extend(output.inputs.iter().map(FileHandle::meta));
|
||||
|
||||
info!(
|
||||
"Compaction output [{}]-> {}",
|
||||
output
|
||||
.inputs
|
||||
.iter()
|
||||
.map(|f| f.file_id().to_string())
|
||||
.join(","),
|
||||
output.output_file_id
|
||||
);
|
||||
|
||||
// TODO(hl): Maybe spawn to runtime to exploit in-job parallelism.
|
||||
futs.push(async move {
|
||||
output
|
||||
.build(region_id, schema, sst_layer, sst_write_buffer_size)
|
||||
.await
|
||||
});
|
||||
}
|
||||
|
||||
let mut outputs = HashSet::with_capacity(futs.len());
|
||||
while !futs.is_empty() {
|
||||
let mut task_chunk = Vec::with_capacity(MAX_PARALLEL_COMPACTION);
|
||||
for _ in 0..MAX_PARALLEL_COMPACTION {
|
||||
if let Some(task) = futs.pop() {
|
||||
task_chunk.push(common_runtime::spawn_bg(task));
|
||||
}
|
||||
}
|
||||
let metas = futures::future::try_join_all(task_chunk)
|
||||
.await
|
||||
.context(error::JoinSnafu)?
|
||||
.into_iter()
|
||||
.collect::<Result<Vec<_>>>()?;
|
||||
outputs.extend(metas.into_iter().flatten());
|
||||
}
|
||||
|
||||
let inputs = compacted_inputs.into_iter().collect();
|
||||
Ok((outputs, inputs))
|
||||
}
|
||||
|
||||
/// Writes updated SST info into manifest.
|
||||
async fn write_manifest_and_apply(
|
||||
&self,
|
||||
output: HashSet<FileMeta>,
|
||||
input: HashSet<FileMeta>,
|
||||
) -> Result<()> {
|
||||
let version = &self.shared_data.version_control;
|
||||
let region_version = version.metadata().version();
|
||||
|
||||
let edit = RegionEdit {
|
||||
region_version,
|
||||
flushed_sequence: None,
|
||||
files_to_add: Vec::from_iter(output),
|
||||
files_to_remove: Vec::from_iter(input),
|
||||
compaction_time_window: self.compaction_time_window,
|
||||
};
|
||||
debug!(
|
||||
"Compacted region: {}, region edit: {:?}",
|
||||
version.metadata().name(),
|
||||
edit
|
||||
);
|
||||
self.writer
|
||||
.write_edit_and_apply(&self.wal, &self.shared_data, &self.manifest, edit, None)
|
||||
.await
|
||||
}
|
||||
|
||||
/// Mark files are under compaction.
|
||||
fn mark_files_compacting(&self, compacting: bool) {
|
||||
for o in &self.outputs {
|
||||
for input in &o.inputs {
|
||||
input.mark_compacting(compacting);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl<S: LogStore> CompactionTask for CompactionTaskImpl<S> {
|
||||
async fn run(mut self) -> Result<()> {
|
||||
let _timer = crate::metrics::COMPACT_ELAPSED.start_timer();
|
||||
self.mark_files_compacting(true);
|
||||
|
||||
let (output, mut compacted) = self.merge_ssts().await.map_err(|e| {
|
||||
error!(e; "Failed to compact region: {}", self.shared_data.name());
|
||||
e
|
||||
})?;
|
||||
compacted.extend(self.expired_ssts.iter().map(FileHandle::meta));
|
||||
|
||||
let input_ids = compacted.iter().map(|f| f.file_id).collect::<Vec<_>>();
|
||||
let output_ids = output.iter().map(|f| f.file_id).collect::<Vec<_>>();
|
||||
info!(
|
||||
"Compacting SST files, input: {:?}, output: {:?}, window: {:?}",
|
||||
input_ids, output_ids, self.compaction_time_window
|
||||
);
|
||||
|
||||
let no_output = output.is_empty();
|
||||
let write_result = self
|
||||
.write_manifest_and_apply(output, compacted)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
error!(e; "Failed to update region manifest: {}", self.shared_data.name());
|
||||
e
|
||||
});
|
||||
|
||||
if !no_output && self.reschedule_on_finish {
|
||||
// only reschedule another compaction if current compaction has output and it's
|
||||
// triggered by flush.
|
||||
if let Err(e) = self
|
||||
.writer
|
||||
.compact(WriterCompactRequest {
|
||||
shared_data: self.shared_data.clone(),
|
||||
sst_layer: self.sst_layer.clone(),
|
||||
manifest: self.manifest.clone(),
|
||||
wal: self.wal.clone(),
|
||||
region_writer: self.writer.clone(),
|
||||
compact_ctx: CompactContext { wait: false },
|
||||
})
|
||||
.await
|
||||
{
|
||||
error!(e; "Failed to schedule a compaction after compaction, region id: {}", self.shared_data.id());
|
||||
} else {
|
||||
info!(
|
||||
"Immediately schedule another compaction for region: {}",
|
||||
self.shared_data.id()
|
||||
);
|
||||
}
|
||||
}
|
||||
write_result
|
||||
}
|
||||
}
|
||||
|
||||
/// Many-to-many compaction can be decomposed to a many-to-one compaction from level n to level n+1
|
||||
/// and a many-to-one compaction from level n+1 to level n+1.
|
||||
#[derive(Debug)]
|
||||
pub struct CompactionOutput {
|
||||
pub output_file_id: FileId,
|
||||
/// Compaction output file level.
|
||||
pub output_level: Level,
|
||||
/// The left bound of time window.
|
||||
pub time_window_bound: i64,
|
||||
/// Time window size in seconds.
|
||||
pub time_window_sec: i64,
|
||||
/// Compaction input files.
|
||||
pub inputs: Vec<FileHandle>,
|
||||
/// If the compaction output is strictly windowed.
|
||||
pub strict_window: bool,
|
||||
}
|
||||
|
||||
impl CompactionOutput {
|
||||
async fn build(
|
||||
&self,
|
||||
region_id: RegionId,
|
||||
schema: RegionSchemaRef,
|
||||
sst_layer: AccessLayerRef,
|
||||
sst_write_buffer_size: ReadableSize,
|
||||
) -> Result<Option<FileMeta>> {
|
||||
let time_range = if self.strict_window {
|
||||
(
|
||||
Some(self.time_window_bound),
|
||||
Some(self.time_window_bound + self.time_window_sec),
|
||||
)
|
||||
} else {
|
||||
(None, None)
|
||||
};
|
||||
|
||||
let reader = build_sst_reader(
|
||||
region_id,
|
||||
schema,
|
||||
sst_layer.clone(),
|
||||
&self.inputs,
|
||||
time_range,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let opts = WriteOptions {
|
||||
sst_write_buffer_size,
|
||||
};
|
||||
let _timer = crate::metrics::MERGE_ELAPSED.start_timer();
|
||||
let meta = sst_layer
|
||||
.write_sst(self.output_file_id, Source::Reader(reader), &opts)
|
||||
.await?
|
||||
.map(
|
||||
|SstInfo {
|
||||
time_range,
|
||||
file_size,
|
||||
..
|
||||
}| FileMeta {
|
||||
region_id,
|
||||
file_id: self.output_file_id,
|
||||
time_range,
|
||||
level: self.output_level,
|
||||
file_size,
|
||||
},
|
||||
);
|
||||
Ok(meta)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
pub mod tests {
    use std::sync::Arc;

    use super::*;
    use crate::compaction::task::CompactionTask;

    /// Shared callback invoked by [`NoopCompactionTask`] when it runs.
    pub type CallbackRef = Arc<dyn Fn() + Send + Sync>;

    /// A compaction task that performs no compaction; running it only calls
    /// the registered callbacks in order.
    pub struct NoopCompactionTask {
        pub cbs: Vec<CallbackRef>,
    }

    impl Debug for NoopCompactionTask {
        fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
            // Callbacks are not printable, so only the type name is emitted.
            let mut repr = f.debug_struct("storage::compaction::task::tests::NoopCompactionTask");
            repr.finish()
        }
    }

    #[async_trait::async_trait]
    impl CompactionTask for NoopCompactionTask {
        async fn run(self) -> Result<()> {
            self.cbs.iter().for_each(|callback| callback());
            Ok(())
        }
    }
}
|
||||
@@ -1,406 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//! Time-window compaction strategy
|
||||
|
||||
use std::collections::BTreeMap;
|
||||
use std::fmt::{Debug, Formatter};
|
||||
use std::marker::PhantomData;
|
||||
|
||||
use common_telemetry::{debug, info, warn};
|
||||
use common_time::timestamp::TimeUnit;
|
||||
use common_time::timestamp_millis::BucketAligned;
|
||||
use common_time::Timestamp;
|
||||
use store_api::logstore::LogStore;
|
||||
|
||||
use crate::compaction::picker::get_expired_ssts;
|
||||
use crate::compaction::task::CompactionOutput;
|
||||
use crate::compaction::{infer_time_bucket, CompactionRequestImpl, CompactionTaskImpl, Picker};
|
||||
use crate::sst::{FileHandle, FileId, LevelMeta};
|
||||
|
||||
/// Time-window compaction picker: groups files into time windows by their
/// max timestamp and picks the windows holding too many files as compaction
/// candidates.
pub struct TwcsPicker<S> {
    /// File count the active (latest) window may hold before it is compacted.
    max_active_window_files: usize,
    /// File count any other window may hold before it is compacted.
    max_inactive_window_files: usize,
    /// Window size in seconds, used when the request carries no window size.
    time_window_seconds: Option<i64>,
    /// Ties the picker to a log store type without storing one.
    _phantom_data: PhantomData<S>,
}
|
||||
|
||||
impl<S> Debug for TwcsPicker<S> {
|
||||
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("TwcsPicker")
|
||||
.field("max_active_window_files", &self.max_active_window_files)
|
||||
.field("max_inactive_window_files", &self.max_inactive_window_files)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
impl<S> TwcsPicker<S> {
|
||||
pub fn new(
|
||||
max_active_window_files: usize,
|
||||
max_inactive_window_files: usize,
|
||||
time_window_seconds: Option<i64>,
|
||||
) -> Self {
|
||||
Self {
|
||||
max_inactive_window_files,
|
||||
max_active_window_files,
|
||||
_phantom_data: Default::default(),
|
||||
time_window_seconds,
|
||||
}
|
||||
}
|
||||
|
||||
/// Builds compaction output from files.
|
||||
/// For active writing window, we allow for at most `max_active_window_files` files to alleviate
|
||||
/// fragmentation. For other windows, we allow at most 1 file at each window.
|
||||
fn build_output(
|
||||
&self,
|
||||
time_windows: &BTreeMap<i64, Vec<FileHandle>>,
|
||||
active_window: Option<i64>,
|
||||
window_size: i64,
|
||||
) -> Vec<CompactionOutput> {
|
||||
let mut output = vec![];
|
||||
for (window, files) in time_windows {
|
||||
if let Some(active_window) = active_window && *window == active_window {
|
||||
if files.len() > self.max_active_window_files {
|
||||
output.push(CompactionOutput {
|
||||
output_file_id: FileId::random(),
|
||||
output_level: 1, // we only have two levels and always compact to l1
|
||||
time_window_bound: *window,
|
||||
time_window_sec: window_size,
|
||||
inputs: files.clone(),
|
||||
// Strict window is not needed since we always compact many files to one
|
||||
// single file in TWCS.
|
||||
strict_window: false,
|
||||
});
|
||||
} else {
|
||||
debug!("Active window not present or no enough files in active window {:?}, window: {}", active_window, *window);
|
||||
}
|
||||
} else {
|
||||
// not active writing window
|
||||
if files.len() > self.max_inactive_window_files {
|
||||
output.push(CompactionOutput {
|
||||
output_file_id: FileId::random(),
|
||||
output_level: 1,
|
||||
time_window_bound: *window,
|
||||
time_window_sec: window_size,
|
||||
inputs: files.clone(),
|
||||
strict_window: false,
|
||||
});
|
||||
} else {
|
||||
debug!("No enough files, current: {}, max_inactive_window_files: {}", files.len(), self.max_inactive_window_files)
|
||||
}
|
||||
}
|
||||
}
|
||||
output
|
||||
}
|
||||
}
|
||||
|
||||
impl<S: LogStore> Picker for TwcsPicker<S> {
|
||||
type Request = CompactionRequestImpl<S>;
|
||||
type Task = CompactionTaskImpl<S>;
|
||||
|
||||
fn pick(&self, req: &Self::Request) -> crate::error::Result<Option<Self::Task>> {
|
||||
let levels = req.levels();
|
||||
let expired_ssts = get_expired_ssts(levels.levels(), req.ttl, Timestamp::current_millis())?;
|
||||
if !expired_ssts.is_empty() {
|
||||
info!(
|
||||
"Expired SSTs in region {}: {:?}",
|
||||
req.region_id, expired_ssts
|
||||
);
|
||||
// here we mark expired SSTs as compacting to avoid them being picked.
|
||||
expired_ssts.iter().for_each(|f| f.mark_compacting(true));
|
||||
}
|
||||
|
||||
let time_window_size = req
|
||||
.compaction_time_window
|
||||
.or(self.time_window_seconds)
|
||||
.unwrap_or_else(|| {
|
||||
let inferred = infer_time_bucket(req.levels().level(0).files());
|
||||
info!(
|
||||
"Compaction window for region {} is not present, inferring from files: {:?}",
|
||||
req.region_id, inferred
|
||||
);
|
||||
inferred
|
||||
});
|
||||
|
||||
// Find active window from files in level 0.
|
||||
let active_window =
|
||||
find_latest_window_in_seconds(levels.level(0).files(), time_window_size);
|
||||
|
||||
let windows = assign_to_windows(
|
||||
levels.levels().iter().flat_map(LevelMeta::files),
|
||||
time_window_size,
|
||||
);
|
||||
|
||||
let outputs = self.build_output(&windows, active_window, time_window_size);
|
||||
|
||||
if outputs.is_empty() && expired_ssts.is_empty() {
|
||||
return Ok(None);
|
||||
}
|
||||
let task = CompactionTaskImpl {
|
||||
schema: req.schema(),
|
||||
sst_layer: req.sst_layer.clone(),
|
||||
outputs,
|
||||
writer: req.writer.clone(),
|
||||
shared_data: req.shared.clone(),
|
||||
wal: req.wal.clone(),
|
||||
manifest: req.manifest.clone(),
|
||||
expired_ssts,
|
||||
sst_write_buffer_size: req.sst_write_buffer_size,
|
||||
compaction_time_window: Some(time_window_size),
|
||||
reschedule_on_finish: req.reschedule_on_finish,
|
||||
};
|
||||
Ok(Some(task))
|
||||
}
|
||||
}
|
||||
|
||||
/// Assigns files to windows with predefined window size (in seconds) by their max timestamps.
|
||||
fn assign_to_windows<'a>(
|
||||
files: impl Iterator<Item = &'a FileHandle>,
|
||||
time_window_size: i64,
|
||||
) -> BTreeMap<i64, Vec<FileHandle>> {
|
||||
let mut windows: BTreeMap<i64, Vec<FileHandle>> = BTreeMap::new();
|
||||
// Iterates all files and assign to time windows according to max timestamp
|
||||
for file in files {
|
||||
if let Some((_, end)) = file.time_range() {
|
||||
let time_window = end
|
||||
.convert_to(TimeUnit::Second)
|
||||
.unwrap()
|
||||
.value()
|
||||
.align_to_ceil_by_bucket(time_window_size)
|
||||
.unwrap_or(i64::MIN);
|
||||
windows.entry(time_window).or_default().push(file.clone());
|
||||
} else {
|
||||
warn!("Unexpected file w/o timestamp: {:?}", file.file_id());
|
||||
}
|
||||
}
|
||||
windows
|
||||
}
|
||||
|
||||
/// Finds the latest active writing window among all files.
|
||||
/// Returns `None` when there are no files or all files are corrupted.
|
||||
fn find_latest_window_in_seconds<'a>(
|
||||
files: impl Iterator<Item = &'a FileHandle>,
|
||||
time_window_size: i64,
|
||||
) -> Option<i64> {
|
||||
let mut latest_timestamp = None;
|
||||
for f in files {
|
||||
if let Some((_, end)) = f.time_range() {
|
||||
if let Some(latest) = latest_timestamp && end > latest {
|
||||
latest_timestamp = Some(end);
|
||||
} else {
|
||||
latest_timestamp = Some(end);
|
||||
}
|
||||
} else {
|
||||
warn!("Cannot find timestamp range of file: {}", f.file_id());
|
||||
}
|
||||
}
|
||||
latest_timestamp
|
||||
.and_then(|ts| ts.convert_to_ceil(TimeUnit::Second))
|
||||
.and_then(|ts| ts.value().align_to_ceil_by_bucket(time_window_size))
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use std::collections::HashSet;

    use log_store::NoopLogStore;

    use super::*;
    use crate::compaction::tests::new_file_handle;
    use crate::sst::{FileId, Level};

    /// Checks the active window computed from a single file for several
    /// timestamp ranges and window sizes.
    #[test]
    fn test_get_latest_window_in_seconds() {
        assert_eq!(
            Some(1),
            find_latest_window_in_seconds([new_file_handle(FileId::random(), 0, 999, 0)].iter(), 1)
        );
        assert_eq!(
            Some(1),
            find_latest_window_in_seconds(
                [new_file_handle(FileId::random(), 0, 1000, 0)].iter(),
                1
            )
        );

        assert_eq!(
            Some(-9223372036854000),
            find_latest_window_in_seconds(
                [new_file_handle(FileId::random(), i64::MIN, i64::MIN + 1, 0)].iter(),
                3600,
            )
        );

        assert_eq!(
            (i64::MAX / 10000000 + 1) * 10000,
            find_latest_window_in_seconds(
                [new_file_handle(FileId::random(), i64::MIN, i64::MAX, 0)].iter(),
                10000,
            )
            .unwrap()
        );
    }

    /// Checks that files are grouped by the window of their max timestamp.
    #[test]
    fn test_assign_to_windows() {
        let windows = assign_to_windows(
            [
                new_file_handle(FileId::random(), 0, 999, 0),
                new_file_handle(FileId::random(), 0, 999, 0),
                new_file_handle(FileId::random(), 0, 999, 0),
                new_file_handle(FileId::random(), 0, 999, 0),
                new_file_handle(FileId::random(), 0, 999, 0),
            ]
            .iter(),
            3,
        );
        assert_eq!(5, windows.get(&0).unwrap().len());

        // Each file needs its own id: `[FileId::random(); 3]` would copy one
        // id three times and make the per-window assertions below unable to
        // tell the files apart.
        let files = [FileId::random(), FileId::random(), FileId::random()];
        let windows = assign_to_windows(
            [
                new_file_handle(files[0], -2000, -3, 0),
                new_file_handle(files[1], 0, 2999, 0),
                new_file_handle(files[2], 50, 10001, 0),
            ]
            .iter(),
            3,
        );
        assert_eq!(files[0], windows.get(&0).unwrap().get(0).unwrap().file_id());
        assert_eq!(files[1], windows.get(&3).unwrap().get(0).unwrap().file_id());
        assert_eq!(
            files[2],
            windows.get(&12).unwrap().get(0).unwrap().file_id()
        );
    }

    /// One scenario for `TwcsPicker::build_output`: the input files, the
    /// window size in seconds and the outputs the picker should produce.
    struct CompactionPickerTestCase {
        window_size: i64,
        input_files: Vec<FileHandle>,
        expected_outputs: Vec<ExpectedOutput>,
    }

    impl CompactionPickerTestCase {
        /// Runs the picker (active threshold 4, inactive threshold 1) on the
        /// input files and compares the outputs with the expectation. Output
        /// file ids are random and therefore not compared.
        fn check(&self) {
            let windows = assign_to_windows(self.input_files.iter(), self.window_size);
            let active_window =
                find_latest_window_in_seconds(self.input_files.iter(), self.window_size);
            let output = TwcsPicker::<NoopLogStore>::new(4, 1, None).build_output(
                &windows,
                active_window,
                self.window_size,
            );

            let output = output
                .iter()
                .map(|o| {
                    let input_file_ids =
                        o.inputs.iter().map(|f| f.file_id()).collect::<HashSet<_>>();
                    (
                        input_file_ids,
                        o.output_level,
                        o.time_window_sec,
                        o.time_window_bound,
                        o.strict_window,
                    )
                })
                .collect::<Vec<_>>();

            let expected = self
                .expected_outputs
                .iter()
                .map(|o| {
                    // Expected inputs are given as indices into `input_files`.
                    let input_file_ids = o
                        .input_files
                        .iter()
                        .map(|idx| self.input_files[*idx].file_id())
                        .collect::<HashSet<_>>();
                    (
                        input_file_ids,
                        o.output_level,
                        o.time_window_sec,
                        o.time_window_bound,
                        o.strict_window,
                    )
                })
                .collect::<Vec<_>>();
            assert_eq!(expected, output);
        }
    }

    /// Expected compaction output; `input_files` holds indices into the test
    /// case's `input_files`.
    struct ExpectedOutput {
        input_files: Vec<usize>,
        output_level: Level,
        time_window_sec: i64,
        time_window_bound: i64,
        strict_window: bool,
    }

    #[test]
    fn test_build_twcs_output() {
        let file_ids = (0..4).map(|_| FileId::random()).collect::<Vec<_>>();

        // The active window holds two files (threshold 4), so only the
        // inactive window 0 is compacted.
        CompactionPickerTestCase {
            window_size: 3,
            input_files: [
                new_file_handle(file_ids[0], -2000, -3, 0),
                new_file_handle(file_ids[1], -3000, -100, 0),
                new_file_handle(file_ids[2], 0, 2999, 0), //active windows
                new_file_handle(file_ids[3], 50, 2998, 0), //active windows
            ]
            .to_vec(),
            expected_outputs: vec![ExpectedOutput {
                input_files: vec![0, 1],
                output_level: 1,
                time_window_sec: 3,
                time_window_bound: 0,
                strict_window: false,
            }],
        }
        .check();

        let file_ids = (0..6).map(|_| FileId::random()).collect::<Vec<_>>();
        // The last file opens a newer active window, so windows 0 and 3 are
        // both inactive and both exceed the inactive threshold.
        CompactionPickerTestCase {
            window_size: 3,
            input_files: [
                new_file_handle(file_ids[0], -2000, -3, 0),
                new_file_handle(file_ids[1], -3000, -100, 0),
                new_file_handle(file_ids[2], 0, 2999, 0),
                new_file_handle(file_ids[3], 50, 2998, 0),
                new_file_handle(file_ids[4], 11, 2990, 0),
                new_file_handle(file_ids[5], 50, 4998, 0),
            ]
            .to_vec(),
            expected_outputs: vec![
                ExpectedOutput {
                    input_files: vec![0, 1],
                    output_level: 1,
                    time_window_sec: 3,
                    time_window_bound: 0,
                    strict_window: false,
                },
                ExpectedOutput {
                    input_files: vec![2, 3, 4],
                    output_level: 1,
                    time_window_sec: 3,
                    time_window_bound: 3,
                    strict_window: false,
                },
            ],
        }
        .check();
    }
}
|
||||
@@ -1,588 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use common_query::logical_plan::{DfExpr, Expr};
|
||||
use common_time::timestamp::TimeUnit;
|
||||
use datafusion_expr::Operator;
|
||||
use datatypes::value::timestamp_to_scalar_value;
|
||||
use store_api::storage::RegionId;
|
||||
|
||||
use crate::chunk::{ChunkReaderBuilder, ChunkReaderImpl};
|
||||
use crate::error;
|
||||
use crate::schema::RegionSchemaRef;
|
||||
use crate::sst::{AccessLayerRef, FileHandle};
|
||||
|
||||
/// Builds an SST reader that only reads rows within given time range.
|
||||
pub(crate) async fn build_sst_reader(
|
||||
region_id: RegionId,
|
||||
schema: RegionSchemaRef,
|
||||
sst_layer: AccessLayerRef,
|
||||
files: &[FileHandle],
|
||||
time_range: (Option<i64>, Option<i64>),
|
||||
) -> error::Result<ChunkReaderImpl> {
|
||||
// TODO(hl): Schemas in different SSTs may differ, thus we should infer
|
||||
// timestamp column name from Parquet metadata.
|
||||
|
||||
// safety: Region schema's timestamp column must present
|
||||
let ts_col = schema.user_schema().timestamp_column().unwrap();
|
||||
let ts_col_unit = ts_col.data_type.as_timestamp().unwrap().unit();
|
||||
let ts_col_name = ts_col.name.clone();
|
||||
|
||||
ChunkReaderBuilder::new(region_id, schema, sst_layer)
|
||||
.pick_ssts(files)
|
||||
.filters(
|
||||
build_time_range_filter(time_range, &ts_col_name, ts_col_unit)
|
||||
.into_iter()
|
||||
.collect(),
|
||||
)
|
||||
.build()
|
||||
.await
|
||||
}
|
||||
|
||||
/// Build time range filter expr from lower (inclusive) and upper bound(exclusive).
|
||||
/// Returns `None` if time range overflows.
|
||||
fn build_time_range_filter(
|
||||
time_range: (Option<i64>, Option<i64>),
|
||||
ts_col_name: &str,
|
||||
ts_col_unit: TimeUnit,
|
||||
) -> Option<Expr> {
|
||||
let (low_ts_inclusive, high_ts_exclusive) = time_range;
|
||||
let ts_col = DfExpr::Column(datafusion_common::Column::from_name(ts_col_name));
|
||||
|
||||
// Converting seconds to whatever unit won't lose precision.
|
||||
// Here only handles overflow.
|
||||
let low_ts = low_ts_inclusive
|
||||
.map(common_time::Timestamp::new_second)
|
||||
.and_then(|ts| ts.convert_to(ts_col_unit))
|
||||
.map(|ts| ts.value());
|
||||
let high_ts = high_ts_exclusive
|
||||
.map(common_time::Timestamp::new_second)
|
||||
.and_then(|ts| ts.convert_to(ts_col_unit))
|
||||
.map(|ts| ts.value());
|
||||
|
||||
let expr = match (low_ts, high_ts) {
|
||||
(Some(low), Some(high)) => {
|
||||
let lower_bound_expr =
|
||||
DfExpr::Literal(timestamp_to_scalar_value(ts_col_unit, Some(low)));
|
||||
let upper_bound_expr =
|
||||
DfExpr::Literal(timestamp_to_scalar_value(ts_col_unit, Some(high)));
|
||||
Some(datafusion_expr::and(
|
||||
datafusion_expr::binary_expr(ts_col.clone(), Operator::GtEq, lower_bound_expr),
|
||||
datafusion_expr::binary_expr(ts_col, Operator::Lt, upper_bound_expr),
|
||||
))
|
||||
}
|
||||
|
||||
(Some(low), None) => {
|
||||
let lower_bound_expr =
|
||||
datafusion_expr::lit(timestamp_to_scalar_value(ts_col_unit, Some(low)));
|
||||
Some(datafusion_expr::binary_expr(
|
||||
ts_col,
|
||||
Operator::GtEq,
|
||||
lower_bound_expr,
|
||||
))
|
||||
}
|
||||
|
||||
(None, Some(high)) => {
|
||||
let upper_bound_expr =
|
||||
datafusion_expr::lit(timestamp_to_scalar_value(ts_col_unit, Some(high)));
|
||||
Some(datafusion_expr::binary_expr(
|
||||
ts_col,
|
||||
Operator::Lt,
|
||||
upper_bound_expr,
|
||||
))
|
||||
}
|
||||
|
||||
(None, None) => None,
|
||||
};
|
||||
|
||||
expr.map(Expr::from)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
use std::sync::Arc;
|
||||
|
||||
use api::v1::OpType;
|
||||
use common_base::readable_size::ReadableSize;
|
||||
use common_test_util::temp_dir::create_temp_dir;
|
||||
use common_time::Timestamp;
|
||||
use datatypes::prelude::{LogicalTypeId, ScalarVector, ScalarVectorBuilder};
|
||||
use datatypes::timestamp::TimestampMillisecond;
|
||||
use datatypes::vectors::{
|
||||
TimestampMillisecondVector, TimestampMillisecondVectorBuilder, UInt64VectorBuilder,
|
||||
};
|
||||
use object_store::services::Fs;
|
||||
use object_store::ObjectStore;
|
||||
use store_api::storage::{ChunkReader, SequenceNumber};
|
||||
|
||||
use super::*;
|
||||
use crate::file_purger::noop::new_noop_file_purger;
|
||||
use crate::memtable::{
|
||||
DefaultMemtableBuilder, IterContext, KeyValues, Memtable, MemtableBuilder,
|
||||
};
|
||||
use crate::metadata::RegionMetadata;
|
||||
use crate::sst::parquet::ParquetWriter;
|
||||
use crate::sst::{self, FileId, FileMeta, FsAccessLayer, Source, SstInfo, WriteOptions};
|
||||
use crate::test_util::descriptor_util::RegionDescBuilder;
|
||||
|
||||
const REGION_ID: RegionId = RegionId::from_u64(1);
|
||||
|
||||
fn schema_for_test() -> RegionSchemaRef {
|
||||
// Just build a region desc and use its columns metadata.
|
||||
let desc = RegionDescBuilder::new("test")
|
||||
.push_field_column(("v", LogicalTypeId::UInt64, true))
|
||||
.build();
|
||||
let metadata: RegionMetadata = desc.try_into().unwrap();
|
||||
metadata.schema().clone()
|
||||
}
|
||||
|
||||
pub fn write_kvs(
|
||||
memtable: &dyn Memtable,
|
||||
sequence: SequenceNumber,
|
||||
op_type: OpType,
|
||||
ts: &[i64], // timestamp
|
||||
values: &[Option<u64>],
|
||||
) {
|
||||
let keys: Vec<TimestampMillisecond> = ts.iter().map(|ts| (*ts).into()).collect();
|
||||
let kvs = kvs_for_test(sequence, op_type, &keys, values);
|
||||
memtable.write(&kvs).unwrap();
|
||||
}
|
||||
|
||||
fn kvs_for_test(
|
||||
sequence: SequenceNumber,
|
||||
op_type: OpType,
|
||||
ts: &[TimestampMillisecond],
|
||||
values: &[Option<u64>],
|
||||
) -> KeyValues {
|
||||
let start_index_in_batch = 0;
|
||||
assert_eq!(ts.len(), values.len());
|
||||
let mut key_builders = TimestampMillisecondVectorBuilder::with_capacity(ts.len());
|
||||
for key in ts {
|
||||
key_builders.push(Some(*key));
|
||||
}
|
||||
let ts_col = Arc::new(key_builders.finish()) as _;
|
||||
let mut value_builders = UInt64VectorBuilder::with_capacity(values.len());
|
||||
|
||||
for value in values {
|
||||
value_builders.push(*value);
|
||||
}
|
||||
let row_values = vec![Arc::new(value_builders.finish()) as _];
|
||||
|
||||
let kvs = KeyValues {
|
||||
sequence,
|
||||
op_type,
|
||||
start_index_in_batch,
|
||||
keys: vec![],
|
||||
values: row_values,
|
||||
timestamp: Some(ts_col),
|
||||
};
|
||||
|
||||
assert_eq!(ts.len(), kvs.len());
|
||||
assert_eq!(ts.is_empty(), kvs.is_empty());
|
||||
|
||||
kvs
|
||||
}
|
||||
|
||||
async fn write_sst(
|
||||
sst_file_id: FileId,
|
||||
schema: RegionSchemaRef,
|
||||
seq: &AtomicU64,
|
||||
object_store: ObjectStore,
|
||||
ts: &[i64],
|
||||
ops: &[OpType],
|
||||
) -> FileHandle {
|
||||
let memtable = DefaultMemtableBuilder::default().build(schema.clone());
|
||||
let mut breaks = ops
|
||||
.iter()
|
||||
.zip(ops.iter().skip(1))
|
||||
.enumerate()
|
||||
.filter_map(
|
||||
|(idx, (prev, next))| {
|
||||
if prev != next {
|
||||
Some(idx + 1)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
},
|
||||
)
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
breaks.insert(0, 0);
|
||||
breaks.push(ts.len());
|
||||
|
||||
for i in 0..breaks.len() - 1 {
|
||||
let op = ops[i];
|
||||
let seg_len = breaks[i + 1] - breaks[i];
|
||||
let ts_seg = ts
|
||||
.iter()
|
||||
.skip(breaks[i])
|
||||
.take(seg_len)
|
||||
.copied()
|
||||
.collect::<Vec<_>>();
|
||||
let value_seg = ts
|
||||
.iter()
|
||||
.skip(breaks[i])
|
||||
.take(seg_len)
|
||||
.map(|i| (*i) as u64)
|
||||
.map(Some)
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
write_kvs(
|
||||
&*memtable,
|
||||
seq.load(Ordering::Relaxed), // sequence
|
||||
op,
|
||||
&ts_seg, // keys
|
||||
&value_seg, // values
|
||||
);
|
||||
let _ = seq.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
let iter = memtable.iter(IterContext::default()).unwrap();
|
||||
let file_path = sst_file_id.as_parquet();
|
||||
let writer = ParquetWriter::new(&file_path, Source::Iter(iter), object_store.clone());
|
||||
|
||||
let SstInfo {
|
||||
time_range,
|
||||
file_size,
|
||||
..
|
||||
} = writer
|
||||
.write_sst(&sst::WriteOptions::default())
|
||||
.await
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
let handle = FileHandle::new(
|
||||
FileMeta {
|
||||
region_id: 0.into(),
|
||||
file_id: sst_file_id,
|
||||
time_range,
|
||||
level: 0,
|
||||
file_size,
|
||||
},
|
||||
Arc::new(crate::test_util::access_layer_util::MockAccessLayer {}),
|
||||
new_noop_file_purger(),
|
||||
);
|
||||
let _ = seq.fetch_add(1, Ordering::Relaxed);
|
||||
handle
|
||||
}
|
||||
|
||||
// The region id is only used to build the reader, we don't check its content.
|
||||
async fn check_reads(
|
||||
region_id: RegionId,
|
||||
schema: RegionSchemaRef,
|
||||
sst_layer: AccessLayerRef,
|
||||
files: &[FileHandle],
|
||||
lower_sec_inclusive: i64,
|
||||
upper_sec_exclusive: i64,
|
||||
expect: &[i64],
|
||||
) {
|
||||
let mut reader = build_sst_reader(
|
||||
region_id,
|
||||
schema,
|
||||
sst_layer,
|
||||
files,
|
||||
(Some(lower_sec_inclusive), Some(upper_sec_exclusive)),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let mut res = vec![];
|
||||
while let Some(f) = reader.next_chunk().await.unwrap() {
|
||||
let ts_col = f.columns[0]
|
||||
.as_any()
|
||||
.downcast_ref::<TimestampMillisecondVector>()
|
||||
.unwrap();
|
||||
res.extend(ts_col.iter_data().map(|t| t.unwrap().0.value()));
|
||||
}
|
||||
assert_eq!(expect, &res);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_sst_reader() {
|
||||
let dir = create_temp_dir("write_parquet");
|
||||
let path = dir.path().to_str().unwrap();
|
||||
let mut builder = Fs::default();
|
||||
let _ = builder.root(path);
|
||||
|
||||
let object_store = ObjectStore::new(builder).unwrap().finish();
|
||||
|
||||
let seq = AtomicU64::new(0);
|
||||
let schema = schema_for_test();
|
||||
let file1 = write_sst(
|
||||
FileId::random(),
|
||||
schema.clone(),
|
||||
&seq,
|
||||
object_store.clone(),
|
||||
&[1000, 2000, 3000, 4001, 5001],
|
||||
&[
|
||||
OpType::Put,
|
||||
OpType::Put,
|
||||
OpType::Put,
|
||||
OpType::Put,
|
||||
OpType::Put,
|
||||
],
|
||||
)
|
||||
.await;
|
||||
let file2 = write_sst(
|
||||
FileId::random(),
|
||||
schema.clone(),
|
||||
&seq,
|
||||
object_store.clone(),
|
||||
&[4002, 5002, 6000, 7000, 8000],
|
||||
&[
|
||||
OpType::Put,
|
||||
OpType::Put,
|
||||
OpType::Put,
|
||||
OpType::Put,
|
||||
OpType::Put,
|
||||
],
|
||||
)
|
||||
.await;
|
||||
let sst_layer = Arc::new(FsAccessLayer::new("./", object_store));
|
||||
|
||||
let files = vec![file1, file2];
|
||||
// read from two sst files with time range filter,
|
||||
check_reads(
|
||||
REGION_ID,
|
||||
schema.clone(),
|
||||
sst_layer.clone(),
|
||||
&files,
|
||||
3,
|
||||
6,
|
||||
&[3000, 4001, 4002, 5001, 5002],
|
||||
)
|
||||
.await;
|
||||
|
||||
check_reads(REGION_ID, schema, sst_layer, &files, 1, 2, &[1000]).await;
|
||||
}
|
||||
|
||||
async fn read_file(
|
||||
files: &[FileHandle],
|
||||
schema: RegionSchemaRef,
|
||||
sst_layer: AccessLayerRef,
|
||||
) -> Vec<i64> {
|
||||
let mut timestamps = vec![];
|
||||
let mut reader = build_sst_reader(
|
||||
REGION_ID,
|
||||
schema,
|
||||
sst_layer,
|
||||
files,
|
||||
(Some(i64::MIN), Some(i64::MAX)),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
while let Some(chunk) = reader.next_chunk().await.unwrap() {
|
||||
let ts = chunk.columns[0]
|
||||
.as_any()
|
||||
.downcast_ref::<TimestampMillisecondVector>()
|
||||
.unwrap();
|
||||
timestamps.extend(ts.iter_data().map(|t| t.unwrap().0.value()));
|
||||
}
|
||||
timestamps
|
||||
}
|
||||
|
||||
/// Writes rows into file i1/i2 and splits these rows into sst file o1/o2/o3,
|
||||
/// and check the output contains the same data as input files.
|
||||
#[tokio::test]
|
||||
async fn test_sst_split() {
|
||||
let dir = create_temp_dir("write_parquet");
|
||||
let path = dir.path().to_str().unwrap();
|
||||
let mut builder = Fs::default();
|
||||
let _ = builder.root(path);
|
||||
let object_store = ObjectStore::new(builder).unwrap().finish();
|
||||
|
||||
let schema = schema_for_test();
|
||||
let seq = AtomicU64::new(0);
|
||||
|
||||
let input_file_ids = [FileId::random(), FileId::random()];
|
||||
let output_file_ids = [FileId::random(), FileId::random(), FileId::random()];
|
||||
|
||||
let file1 = write_sst(
|
||||
input_file_ids[0],
|
||||
schema.clone(),
|
||||
&seq,
|
||||
object_store.clone(),
|
||||
&[1000, 2000, 3000, 4001, 5001],
|
||||
&[
|
||||
OpType::Put,
|
||||
OpType::Put,
|
||||
OpType::Put,
|
||||
OpType::Put,
|
||||
OpType::Put,
|
||||
],
|
||||
)
|
||||
.await;
|
||||
|
||||
// in file2 we delete the row with timestamp 1000.
|
||||
let file2 = write_sst(
|
||||
input_file_ids[1],
|
||||
schema.clone(),
|
||||
&seq,
|
||||
object_store.clone(),
|
||||
&[1000, 5002, 6000, 7000, 8000],
|
||||
&[
|
||||
OpType::Delete, // a deletion
|
||||
OpType::Put,
|
||||
OpType::Put,
|
||||
OpType::Put,
|
||||
OpType::Put,
|
||||
],
|
||||
)
|
||||
.await;
|
||||
let sst_layer = Arc::new(FsAccessLayer::new("./", object_store.clone()));
|
||||
let input_files = vec![file2, file1];
|
||||
|
||||
let reader1 = build_sst_reader(
|
||||
REGION_ID,
|
||||
schema.clone(),
|
||||
sst_layer.clone(),
|
||||
&input_files,
|
||||
(Some(0), Some(3)),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let reader2 = build_sst_reader(
|
||||
REGION_ID,
|
||||
schema.clone(),
|
||||
sst_layer.clone(),
|
||||
&input_files,
|
||||
(Some(3), Some(6)),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let reader3 = build_sst_reader(
|
||||
REGION_ID,
|
||||
schema.clone(),
|
||||
sst_layer.clone(),
|
||||
&input_files,
|
||||
(Some(6), Some(10)),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let opts = WriteOptions {
|
||||
sst_write_buffer_size: ReadableSize::mb(8),
|
||||
};
|
||||
let s1 = ParquetWriter::new(
|
||||
&output_file_ids[0].as_parquet(),
|
||||
Source::Reader(reader1),
|
||||
object_store.clone(),
|
||||
)
|
||||
.write_sst(&opts)
|
||||
.await
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
Some((
|
||||
Timestamp::new_millisecond(2000),
|
||||
Timestamp::new_millisecond(2000)
|
||||
)),
|
||||
s1.time_range,
|
||||
);
|
||||
|
||||
let s2 = ParquetWriter::new(
|
||||
&output_file_ids[1].as_parquet(),
|
||||
Source::Reader(reader2),
|
||||
object_store.clone(),
|
||||
)
|
||||
.write_sst(&opts)
|
||||
.await
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
Some((
|
||||
Timestamp::new_millisecond(3000),
|
||||
Timestamp::new_millisecond(5002)
|
||||
)),
|
||||
s2.time_range,
|
||||
);
|
||||
|
||||
let s3 = ParquetWriter::new(
|
||||
&output_file_ids[2].as_parquet(),
|
||||
Source::Reader(reader3),
|
||||
object_store.clone(),
|
||||
)
|
||||
.write_sst(&opts)
|
||||
.await
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(
|
||||
Some((
|
||||
Timestamp::new_millisecond(6000),
|
||||
Timestamp::new_millisecond(8000)
|
||||
)),
|
||||
s3.time_range
|
||||
);
|
||||
|
||||
let output_files = output_file_ids
|
||||
.into_iter()
|
||||
.map(|f| {
|
||||
FileHandle::new(
|
||||
FileMeta {
|
||||
region_id: 0.into(),
|
||||
file_id: f,
|
||||
level: 1,
|
||||
time_range: None,
|
||||
file_size: 0,
|
||||
},
|
||||
Arc::new(crate::test_util::access_layer_util::MockAccessLayer {}),
|
||||
new_noop_file_purger(),
|
||||
)
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let timestamps_in_inputs = read_file(&input_files, schema.clone(), sst_layer.clone()).await;
|
||||
let timestamps_in_outputs =
|
||||
read_file(&output_files, schema.clone(), sst_layer.clone()).await;
|
||||
|
||||
assert_eq!(timestamps_in_outputs, timestamps_in_inputs);
|
||||
}
|
||||
|
||||
/// Checks the predicates produced by `build_time_range_filter` for a nanosecond `ts` column.
#[test]
fn test_build_time_range_filter() {
    // Expected predicate shape: `ts <op> <timestamp literal in nanoseconds>`.
    let expected = |op: Operator, value: i64| {
        Expr::from(datafusion_expr::binary_expr(
            datafusion_expr::col("ts"),
            op,
            datafusion_expr::lit(timestamp_to_scalar_value(
                TimeUnit::Nanosecond,
                Some(value),
            )),
        ))
    };

    // A range covering the whole i64 domain yields no filter at all.
    let unbounded =
        build_time_range_filter((Some(i64::MIN), Some(i64::MAX)), "ts", TimeUnit::Nanosecond);
    assert!(unbounded.is_none());

    // Only the upper bound (1) restricts the range: expect a single `ts < ...` predicate.
    let upper = TimeUnit::Second.factor() as i64 / TimeUnit::Nanosecond.factor() as i64;
    assert_eq!(
        expected(Operator::Lt, upper),
        build_time_range_filter((Some(i64::MIN), Some(1)), "ts", TimeUnit::Nanosecond).unwrap()
    );

    // Only the lower bound (2) restricts the range: expect a single `ts >= ...` predicate.
    let lower = 2 * TimeUnit::Second.factor() as i64 / TimeUnit::Nanosecond.factor() as i64;
    assert_eq!(
        expected(Operator::GtEq, lower),
        build_time_range_filter((Some(2), Some(i64::MAX)), "ts", TimeUnit::Nanosecond).unwrap()
    );
}
|
||||
}
|
||||
@@ -1,71 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//! storage engine config
|
||||
|
||||
use std::time::Duration;
|
||||
|
||||
use common_base::readable_size::ReadableSize;
|
||||
|
||||
/// Default max flush tasks.
|
||||
pub const DEFAULT_MAX_FLUSH_TASKS: usize = 8;
|
||||
/// Default region write buffer size.
|
||||
pub const DEFAULT_REGION_WRITE_BUFFER_SIZE: ReadableSize = ReadableSize::mb(32);
|
||||
/// Default interval to trigger auto flush in millis.
|
||||
pub const DEFAULT_AUTO_FLUSH_INTERVAL: u32 = 60 * 60 * 1000;
|
||||
/// Default interval to schedule the picker to flush automatically in millis.
|
||||
pub const DEFAULT_PICKER_SCHEDULE_INTERVAL: u32 = 5 * 60 * 1000;
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct EngineConfig {
|
||||
pub compress_manifest: bool,
|
||||
pub manifest_checkpoint_margin: Option<u16>,
|
||||
pub manifest_gc_duration: Option<Duration>,
|
||||
pub max_files_in_l0: usize,
|
||||
pub max_purge_tasks: usize,
|
||||
/// Max inflight flush tasks.
|
||||
pub max_flush_tasks: usize,
|
||||
/// Default write buffer size for a region.
|
||||
pub region_write_buffer_size: ReadableSize,
|
||||
/// Interval to schedule the auto flush picker.
|
||||
pub picker_schedule_interval: Duration,
|
||||
/// Interval to auto flush a region if it has not flushed yet.
|
||||
pub auto_flush_interval: Duration,
|
||||
/// Limit for global write buffer size. Disabled by default.
|
||||
pub global_write_buffer_size: Option<ReadableSize>,
|
||||
/// Global retention period for all regions.
|
||||
///
|
||||
/// The precedence order is: region ttl > global ttl.
|
||||
pub global_ttl: Option<Duration>,
|
||||
}
|
||||
|
||||
impl Default for EngineConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
compress_manifest: false,
|
||||
manifest_checkpoint_margin: Some(10),
|
||||
manifest_gc_duration: Some(Duration::from_secs(30)),
|
||||
max_files_in_l0: 8,
|
||||
max_purge_tasks: 32,
|
||||
max_flush_tasks: DEFAULT_MAX_FLUSH_TASKS,
|
||||
region_write_buffer_size: DEFAULT_REGION_WRITE_BUFFER_SIZE,
|
||||
picker_schedule_interval: Duration::from_millis(
|
||||
DEFAULT_PICKER_SCHEDULE_INTERVAL.into(),
|
||||
),
|
||||
auto_flush_interval: Duration::from_millis(DEFAULT_AUTO_FLUSH_INTERVAL.into()),
|
||||
global_write_buffer_size: None,
|
||||
global_ttl: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,750 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::sync::{Arc, RwLock};
|
||||
use std::time::Duration;
|
||||
|
||||
use async_trait::async_trait;
|
||||
use common_telemetry::logging::{self, debug};
|
||||
use object_store::{util, ObjectStore};
|
||||
use snafu::ResultExt;
|
||||
use store_api::logstore::LogStore;
|
||||
use store_api::manifest::Manifest;
|
||||
use store_api::storage::{
|
||||
CloseContext, CloseOptions, CompactionStrategy, CreateOptions, EngineContext, OpenOptions,
|
||||
Region, RegionDescriptor, StorageEngine,
|
||||
};
|
||||
|
||||
use crate::compaction::CompactionSchedulerRef;
|
||||
use crate::config::EngineConfig;
|
||||
use crate::error::{self, Error, Result};
|
||||
use crate::file_purger::{FilePurgeHandler, FilePurgerRef};
|
||||
use crate::flush::{
|
||||
FlushScheduler, FlushSchedulerRef, FlushStrategyRef, PickerConfig, SizeBasedStrategy,
|
||||
};
|
||||
use crate::manifest::region::RegionManifest;
|
||||
use crate::manifest::storage::manifest_compress_type;
|
||||
use crate::memtable::{DefaultMemtableBuilder, MemtableBuilderRef};
|
||||
use crate::metadata::RegionMetadata;
|
||||
use crate::region::{RegionImpl, StoreConfig};
|
||||
use crate::scheduler::{LocalScheduler, Scheduler, SchedulerConfig};
|
||||
use crate::sst::FsAccessLayer;
|
||||
|
||||
/// [StorageEngine] implementation.
|
||||
pub struct EngineImpl<S: LogStore> {
|
||||
inner: Arc<EngineInner<S>>,
|
||||
}
|
||||
|
||||
impl<S: LogStore> Clone for EngineImpl<S> {
|
||||
fn clone(&self) -> Self {
|
||||
Self {
|
||||
inner: self.inner.clone(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl<S: LogStore> StorageEngine for EngineImpl<S> {
|
||||
type Error = Error;
|
||||
type Region = RegionImpl<S>;
|
||||
|
||||
async fn open_region(
|
||||
&self,
|
||||
_ctx: &EngineContext,
|
||||
name: &str,
|
||||
opts: &OpenOptions,
|
||||
) -> Result<Option<Self::Region>> {
|
||||
self.inner.open_region(name, opts).await
|
||||
}
|
||||
|
||||
async fn close_region(
|
||||
&self,
|
||||
_ctx: &EngineContext,
|
||||
name: &str,
|
||||
opts: &CloseOptions,
|
||||
) -> Result<()> {
|
||||
self.inner.close_region(name, opts).await
|
||||
}
|
||||
|
||||
async fn create_region(
|
||||
&self,
|
||||
_ctx: &EngineContext,
|
||||
descriptor: RegionDescriptor,
|
||||
opts: &CreateOptions,
|
||||
) -> Result<Self::Region> {
|
||||
self.inner.create_region(descriptor, opts).await
|
||||
}
|
||||
|
||||
async fn drop_region(&self, _ctx: &EngineContext, region: Self::Region) -> Result<()> {
|
||||
region.drop_region().await?;
|
||||
self.inner.remove_region(region.name());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn get_region(&self, _ctx: &EngineContext, name: &str) -> Result<Option<Self::Region>> {
|
||||
Ok(self.inner.get_region(name))
|
||||
}
|
||||
|
||||
async fn close(&self, _ctx: &EngineContext) -> Result<()> {
|
||||
logging::info!("Stopping storage engine");
|
||||
|
||||
self.inner.close().await?;
|
||||
|
||||
logging::info!("Storage engine stopped");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl<S: LogStore> EngineImpl<S> {
|
||||
pub fn new(
|
||||
config: EngineConfig,
|
||||
log_store: Arc<S>,
|
||||
object_store: ObjectStore,
|
||||
compaction_scheduler: CompactionSchedulerRef<S>,
|
||||
) -> Result<Self> {
|
||||
Ok(Self {
|
||||
inner: Arc::new(EngineInner::new(
|
||||
config,
|
||||
log_store,
|
||||
object_store,
|
||||
compaction_scheduler,
|
||||
)?),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Generates the SST directory of a region: `{parent_dir}{region_name}/`.
///
/// `parent_dir` is normalized in `region_store_config` so that it ends with '/'.
#[inline]
pub fn region_sst_dir(parent_dir: &str, region_name: &str) -> String {
    [parent_dir, region_name, "/"].concat()
}
|
||||
|
||||
/// Generates the manifest directory of a region: `{parent_dir}{region_name}/manifest/`.
///
/// `parent_dir` is normalized in `region_store_config` so that it ends with '/'.
#[inline]
pub fn region_manifest_dir(parent_dir: &str, region_name: &str) -> String {
    [parent_dir, region_name, "/manifest/"].concat()
}
|
||||
|
||||
/// A slot for region in the engine.
|
||||
///
|
||||
/// Also used as a placeholder in the region map when the region isn't ready, e.g. during
|
||||
/// creating/opening.
|
||||
#[derive(Debug)]
|
||||
pub(crate) enum RegionSlot<S: LogStore> {
|
||||
/// The region is during creation.
|
||||
Creating,
|
||||
/// The region is during opening.
|
||||
Opening,
|
||||
/// The region is ready for access.
|
||||
Ready(RegionImpl<S>),
|
||||
}
|
||||
|
||||
impl<S: LogStore> RegionSlot<S> {
|
||||
/// Try to get a ready region.
|
||||
fn try_get_ready_region(&self) -> Result<RegionImpl<S>> {
|
||||
if let RegionSlot::Ready(region) = self {
|
||||
Ok(region.clone())
|
||||
} else {
|
||||
error::InvalidRegionStateSnafu {
|
||||
state: self.state_name(),
|
||||
}
|
||||
.fail()
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the ready region or `None`.
|
||||
fn get_ready_region(&self) -> Option<RegionImpl<S>> {
|
||||
if let RegionSlot::Ready(region) = self {
|
||||
Some(region.clone())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
fn state_name(&self) -> &'static str {
|
||||
match self {
|
||||
RegionSlot::Creating => "creating",
|
||||
RegionSlot::Opening => "opening",
|
||||
RegionSlot::Ready(_) => "ready",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<S: LogStore> Clone for RegionSlot<S> {
    // `Clone` is implemented by hand because of
    // [rust#26925](https://github.com/rust-lang/rust/issues/26925).
    // Requiring `LogStore` to be clonable might be another way to work around it.
    fn clone(&self) -> RegionSlot<S> {
        match self {
            Self::Creating => Self::Creating,
            Self::Opening => Self::Opening,
            Self::Ready(region) => Self::Ready(region.clone()),
        }
    }
}
|
||||
|
||||
/// Used to update slot or clean the slot on failure.
|
||||
struct SlotGuard<'a, S: LogStore> {
|
||||
name: &'a str,
|
||||
regions: &'a RegionMap<S>,
|
||||
skip_clean: bool,
|
||||
}
|
||||
|
||||
impl<'a, S: LogStore> SlotGuard<'a, S> {
|
||||
fn new(name: &'a str, regions: &'a RegionMap<S>) -> SlotGuard<'a, S> {
|
||||
SlotGuard {
|
||||
name,
|
||||
regions,
|
||||
skip_clean: false,
|
||||
}
|
||||
}
|
||||
|
||||
/// Update the slot and skip cleaning on drop.
|
||||
fn update(&mut self, slot: RegionSlot<S>) {
|
||||
self.regions.update(self.name, slot);
|
||||
self.skip_clean = true;
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, S: LogStore> Drop for SlotGuard<'a, S> {
    /// Removes the guarded slot from the region map unless it was updated through the guard.
    fn drop(&mut self) {
        if self.skip_clean {
            return;
        }
        self.regions.remove(self.name);
    }
}
|
||||
|
||||
/// Region slot map.
|
||||
pub struct RegionMap<S: LogStore>(RwLock<HashMap<String, RegionSlot<S>>>);
|
||||
|
||||
impl<S: LogStore> RegionMap<S> {
|
||||
/// Returns a new region map.
|
||||
pub fn new() -> RegionMap<S> {
|
||||
RegionMap(RwLock::new(HashMap::new()))
|
||||
}
|
||||
|
||||
/// Returns the `Some(slot)` if there is existing slot with given `name`, or insert
|
||||
/// given `slot` and returns `None`.
|
||||
pub(crate) fn get_or_occupy_slot(
|
||||
&self,
|
||||
name: &str,
|
||||
slot: RegionSlot<S>,
|
||||
) -> Option<RegionSlot<S>> {
|
||||
{
|
||||
// Try to get the region under read lock.
|
||||
let regions = self.0.read().unwrap();
|
||||
if let Some(slot) = regions.get(name) {
|
||||
return Some(slot.clone());
|
||||
}
|
||||
}
|
||||
|
||||
// Get the region under write lock.
|
||||
let mut regions = self.0.write().unwrap();
|
||||
if let Some(slot) = regions.get(name) {
|
||||
return Some(slot.clone());
|
||||
}
|
||||
|
||||
// No slot in map, we can insert the slot now.
|
||||
let _ = regions.insert(name.to_string(), slot);
|
||||
|
||||
None
|
||||
}
|
||||
|
||||
/// Gets the region by the specific name.
|
||||
fn get_region(&self, name: &str) -> Option<RegionImpl<S>> {
|
||||
let slot = self.0.read().unwrap().get(name).cloned()?;
|
||||
slot.get_ready_region()
|
||||
}
|
||||
|
||||
/// Update the slot by name.
|
||||
fn update(&self, name: &str, slot: RegionSlot<S>) {
|
||||
let mut regions = self.0.write().unwrap();
|
||||
if let Some(old) = regions.get_mut(name) {
|
||||
*old = slot;
|
||||
}
|
||||
}
|
||||
|
||||
/// Remove region by name.
|
||||
fn remove(&self, name: &str) {
|
||||
let mut regions = self.0.write().unwrap();
|
||||
let _ = regions.remove(name);
|
||||
}
|
||||
|
||||
/// Collects regions.
|
||||
pub(crate) fn list_regions(&self) -> Vec<RegionImpl<S>> {
|
||||
let regions = self.0.read().unwrap();
|
||||
regions
|
||||
.values()
|
||||
.filter_map(|slot| slot.get_ready_region())
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Clear the region map.
|
||||
pub(crate) fn clear(&self) {
|
||||
self.0.write().unwrap().clear();
|
||||
}
|
||||
}
|
||||
|
||||
impl<S: LogStore> Default for RegionMap<S> {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
struct EngineInner<S: LogStore> {
|
||||
object_store: ObjectStore,
|
||||
log_store: Arc<S>,
|
||||
regions: Arc<RegionMap<S>>,
|
||||
memtable_builder: MemtableBuilderRef,
|
||||
flush_scheduler: FlushSchedulerRef<S>,
|
||||
flush_strategy: FlushStrategyRef,
|
||||
compaction_scheduler: CompactionSchedulerRef<S>,
|
||||
file_purger: FilePurgerRef,
|
||||
config: Arc<EngineConfig>,
|
||||
}
|
||||
|
||||
impl<S: LogStore> EngineInner<S> {
|
||||
pub fn new(
|
||||
config: EngineConfig,
|
||||
log_store: Arc<S>,
|
||||
object_store: ObjectStore,
|
||||
compaction_scheduler: CompactionSchedulerRef<S>,
|
||||
) -> Result<Self> {
|
||||
let regions = Arc::new(RegionMap::new());
|
||||
let flush_scheduler = Arc::new(FlushScheduler::new(
|
||||
SchedulerConfig {
|
||||
max_inflight_tasks: config.max_flush_tasks,
|
||||
},
|
||||
compaction_scheduler.clone(),
|
||||
regions.clone(),
|
||||
PickerConfig {
|
||||
schedule_interval: config.picker_schedule_interval,
|
||||
auto_flush_interval: config.auto_flush_interval,
|
||||
},
|
||||
)?);
|
||||
|
||||
let file_purger = Arc::new(LocalScheduler::new(
|
||||
SchedulerConfig {
|
||||
max_inflight_tasks: config.max_purge_tasks,
|
||||
},
|
||||
FilePurgeHandler,
|
||||
));
|
||||
let flush_strategy = Arc::new(SizeBasedStrategy::new(
|
||||
config
|
||||
.global_write_buffer_size
|
||||
.map(|size| size.as_bytes() as usize),
|
||||
));
|
||||
let memtable_builder = if config.global_write_buffer_size.is_some() {
|
||||
// If global write buffer size is provided, we set the flush strategy
|
||||
// to the memtable to track global memtable usage.
|
||||
DefaultMemtableBuilder::with_flush_strategy(Some(flush_strategy.clone()))
|
||||
} else {
|
||||
DefaultMemtableBuilder::default()
|
||||
};
|
||||
Ok(Self {
|
||||
object_store,
|
||||
log_store,
|
||||
regions,
|
||||
memtable_builder: Arc::new(memtable_builder),
|
||||
flush_scheduler,
|
||||
flush_strategy,
|
||||
compaction_scheduler,
|
||||
file_purger,
|
||||
config: Arc::new(config),
|
||||
})
|
||||
}
|
||||
|
||||
async fn close_region(&self, name: &str, opts: &CloseOptions) -> Result<()> {
|
||||
if let Some(region) = self.get_region(name) {
|
||||
let ctx = CloseContext { flush: opts.flush };
|
||||
region.close(&ctx).await?;
|
||||
}
|
||||
|
||||
self.regions.remove(name);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn open_region(&self, name: &str, opts: &OpenOptions) -> Result<Option<RegionImpl<S>>> {
|
||||
// We can wait until the state of the slot has been changed to ready, but this will
|
||||
// make the code more complicate, so we just return the error here.
|
||||
if let Some(slot) = self.regions.get_or_occupy_slot(name, RegionSlot::Opening) {
|
||||
return slot.try_get_ready_region().map(Some);
|
||||
}
|
||||
|
||||
let mut guard = SlotGuard::new(name, &self.regions);
|
||||
|
||||
let store_config = self
|
||||
.region_store_config(
|
||||
&opts.parent_dir,
|
||||
opts.write_buffer_size,
|
||||
name,
|
||||
&self.config,
|
||||
opts.ttl,
|
||||
opts.compaction_strategy.clone(),
|
||||
)
|
||||
.await?;
|
||||
|
||||
let region = match RegionImpl::open(name.to_string(), store_config, opts).await? {
|
||||
None => return Ok(None),
|
||||
Some(v) => v,
|
||||
};
|
||||
guard.update(RegionSlot::Ready(region.clone()));
|
||||
debug!(
|
||||
"Storage engine open region {}, id: {}",
|
||||
region.name(),
|
||||
region.id()
|
||||
);
|
||||
Ok(Some(region))
|
||||
}
|
||||
|
||||
async fn create_region(
|
||||
&self,
|
||||
descriptor: RegionDescriptor,
|
||||
opts: &CreateOptions,
|
||||
) -> Result<RegionImpl<S>> {
|
||||
if let Some(slot) = self
|
||||
.regions
|
||||
.get_or_occupy_slot(&descriptor.name, RegionSlot::Creating)
|
||||
{
|
||||
return slot.try_get_ready_region();
|
||||
}
|
||||
|
||||
// Now the region in under `Creating` state.
|
||||
let region_name = descriptor.name.clone();
|
||||
let mut guard = SlotGuard::new(®ion_name, &self.regions);
|
||||
|
||||
let metadata: RegionMetadata =
|
||||
descriptor
|
||||
.try_into()
|
||||
.context(error::InvalidRegionDescSnafu {
|
||||
region: ®ion_name,
|
||||
})?;
|
||||
let store_config = self
|
||||
.region_store_config(
|
||||
&opts.parent_dir,
|
||||
opts.write_buffer_size,
|
||||
®ion_name,
|
||||
&self.config,
|
||||
opts.ttl,
|
||||
opts.compaction_strategy.clone(),
|
||||
)
|
||||
.await?;
|
||||
|
||||
let region = RegionImpl::create(metadata, store_config).await?;
|
||||
|
||||
guard.update(RegionSlot::Ready(region.clone()));
|
||||
|
||||
debug!(
|
||||
"Storage engine create region {}, id: {}",
|
||||
region.name(),
|
||||
region.id()
|
||||
);
|
||||
|
||||
Ok(region)
|
||||
}
|
||||
|
||||
fn get_region(&self, name: &str) -> Option<RegionImpl<S>> {
|
||||
self.regions.get_region(name)
|
||||
}
|
||||
|
||||
fn remove_region(&self, name: &str) {
|
||||
self.regions.remove(name)
|
||||
}
|
||||
|
||||
async fn region_store_config(
|
||||
&self,
|
||||
parent_dir: &str,
|
||||
write_buffer_size: Option<usize>,
|
||||
region_name: &str,
|
||||
config: &EngineConfig,
|
||||
region_ttl: Option<Duration>,
|
||||
compaction_strategy: CompactionStrategy,
|
||||
) -> Result<StoreConfig<S>> {
|
||||
let parent_dir = util::normalize_dir(parent_dir);
|
||||
|
||||
let sst_dir = ®ion_sst_dir(&parent_dir, region_name);
|
||||
let sst_layer = Arc::new(FsAccessLayer::new(sst_dir, self.object_store.clone()));
|
||||
let manifest_dir = region_manifest_dir(&parent_dir, region_name);
|
||||
let manifest = RegionManifest::with_checkpointer(
|
||||
&manifest_dir,
|
||||
self.object_store.clone(),
|
||||
manifest_compress_type(config.compress_manifest),
|
||||
config.manifest_checkpoint_margin,
|
||||
config.manifest_gc_duration,
|
||||
);
|
||||
manifest.start().await?;
|
||||
let flush_strategy = self.flush_strategy.clone();
|
||||
|
||||
// If region_ttl is `None`, the global ttl takes effect.
|
||||
let ttl = region_ttl.or(self.config.global_ttl);
|
||||
|
||||
Ok(StoreConfig {
|
||||
log_store: self.log_store.clone(),
|
||||
sst_layer,
|
||||
manifest,
|
||||
memtable_builder: self.memtable_builder.clone(),
|
||||
flush_scheduler: self.flush_scheduler.clone(),
|
||||
flush_strategy,
|
||||
compaction_scheduler: self.compaction_scheduler.clone(),
|
||||
engine_config: self.config.clone(),
|
||||
file_purger: self.file_purger.clone(),
|
||||
ttl,
|
||||
write_buffer_size: write_buffer_size
|
||||
.unwrap_or(self.config.region_write_buffer_size.as_bytes() as usize),
|
||||
compaction_strategy,
|
||||
})
|
||||
}
|
||||
|
||||
async fn close(&self) -> Result<()> {
|
||||
let regions = self.regions.list_regions();
|
||||
let ctx = CloseContext::default();
|
||||
for region in regions {
|
||||
// Tolerate failure during closing regions.
|
||||
if let Err(e) = region.close(&ctx).await {
|
||||
logging::error!(e; "Failed to close region {}", region.id());
|
||||
}
|
||||
}
|
||||
// Clear regions to release references to regions in the region map.
|
||||
self.regions.clear();
|
||||
|
||||
self.compaction_scheduler.stop(true).await?;
|
||||
self.flush_scheduler.stop().await?;
|
||||
self.file_purger.stop(true).await
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use std::ffi::OsStr;
    use std::path::Path;

    use common_test_util::temp_dir::{create_temp_dir, TempDir};
    use datatypes::type_id::LogicalTypeId;
    use datatypes::vectors::{Float32Vector, Int32Vector, TimestampMillisecondVector, VectorRef};
    use log_store::raft_engine::log_store::RaftEngineLogStore;
    use log_store::test_util::log_store_util;
    use object_store::services::Fs;
    use store_api::storage::{
        ChunkReader, FlushContext, ReadContext, Region, ScanRequest, Snapshot, WriteContext,
        WriteRequest,
    };

    use super::*;
    use crate::compaction::noop::NoopCompactionScheduler;
    use crate::test_util::descriptor_util::RegionDescBuilder;

    type TestEngine = EngineImpl<RaftEngineLogStore>;
    type TestRegion = RegionImpl<RaftEngineLogStore>;

    /// Builds an engine on a file system object store rooted at `tmp_dir` with a log store
    /// under `log_file_dir`, then creates one region with columns `k1`, `v1` and `ts`.
    async fn create_engine_and_region(
        tmp_dir: &TempDir,
        log_file_dir: &TempDir,
        region_name: &str,
        region_id: u64,
        config: EngineConfig,
    ) -> (TestEngine, TestRegion) {
        let wal_path = log_file_dir.path().to_str().unwrap();
        let log_store = log_store_util::create_tmp_local_file_log_store(wal_path).await;

        let store_dir = tmp_dir.path().to_string_lossy();

        let mut fs_builder = Fs::default();
        let _ = fs_builder.root(&store_dir);
        let object_store = ObjectStore::new(fs_builder).unwrap().finish();

        let compaction_scheduler = Arc::new(NoopCompactionScheduler::default());

        let engine = EngineImpl::new(
            config,
            Arc::new(log_store),
            object_store,
            compaction_scheduler,
        )
        .unwrap();

        let desc = RegionDescBuilder::new(region_name)
            .id(region_id)
            .push_key_column(("k1", LogicalTypeId::Int32, false))
            .push_field_column(("v1", LogicalTypeId::Float32, true))
            .timestamp(("ts", LogicalTypeId::TimestampMillisecond, false))
            .build();

        let ctx = EngineContext::default();
        let region = engine
            .create_region(&ctx, desc, &CreateOptions::default())
            .await
            .unwrap();

        (engine, region)
    }

    /// Counts the entries directly under `path` whose extension is `parquet`.
    fn parquet_file_num(path: &Path) -> usize {
        let parquet_ext = OsStr::new("parquet");
        path.read_dir()
            .unwrap()
            .filter_map(Result::ok)
            .filter(|entry| entry.path().extension() == Some(parquet_ext))
            .count()
    }

    /// Writes three rows (`k1` = 1, 2, 3; every `ts` = 0) into `region`.
    async fn write_sample_rows(region: &TestRegion) {
        let mut wb = region.write_request();
        let k1 = Arc::new(Int32Vector::from_slice([1, 2, 3])) as VectorRef;
        let v1 = Arc::new(Float32Vector::from_slice([0.1, 0.2, 0.3])) as VectorRef;
        let tsv = Arc::new(TimestampMillisecondVector::from_slice([0, 0, 0])) as VectorRef;

        let put_data = HashMap::from([
            ("k1".to_string(), k1),
            ("v1".to_string(), v1),
            ("ts".to_string(), tsv),
        ]);
        wb.put(put_data).unwrap();
        let _ = region.write(&WriteContext::default(), wb).await.unwrap();
    }

    #[tokio::test]
    async fn test_create_new_region() {
        let dir = create_temp_dir("test_create_region");
        let log_file_dir = create_temp_dir("test_engine_wal");

        let region_name = "region-0";
        let region_id = 123456;

        let (engine, region) = create_engine_and_region(
            &dir,
            &log_file_dir,
            region_name,
            region_id,
            EngineConfig::default(),
        )
        .await;
        assert_eq!(region_name, region.name());

        // The created region can be looked up by name; unknown names yield `None`.
        let ctx = EngineContext::default();
        let found = engine.get_region(&ctx, region_name).unwrap().unwrap();
        assert_eq!(region_name, found.name());

        assert!(engine.get_region(&ctx, "no such region").unwrap().is_none());
    }

    #[tokio::test]
    async fn test_create_region_with_buffer_size() {
        let dir = create_temp_dir("test_buffer_size");
        let log_file_dir = create_temp_dir("test_buffer_wal");

        let region_name = "region-0";
        let region_id = 123456;

        // Halve the default so the region must pick the size up from the engine config.
        let mut config = EngineConfig::default();
        let expect_buffer_size = config.region_write_buffer_size / 2;
        config.region_write_buffer_size = expect_buffer_size;

        let (_engine, region) =
            create_engine_and_region(&dir, &log_file_dir, region_name, region_id, config).await;
        assert_eq!(
            expect_buffer_size.as_bytes() as usize,
            region.write_buffer_size().await
        );
    }

    #[tokio::test]
    async fn test_drop_region() {
        common_telemetry::init_default_ut_logging();
        let dir = create_temp_dir("test_drop_region");
        let log_file_dir = create_temp_dir("test_engine_wal");

        let region_name = "test_region";
        let region_id = 123456;

        let (engine, region) = create_engine_and_region(
            &dir,
            &log_file_dir,
            region_name,
            region_id,
            EngineConfig::default(),
        )
        .await;

        assert_eq!(region_name, region.name());

        write_sample_rows(&region).await;

        // Flush memtable to sst.
        region.flush(&FlushContext::default()).await.unwrap();
        let ctx = EngineContext::default();
        engine
            .close_region(&ctx, region.name(), &CloseOptions::default())
            .await
            .unwrap();

        let dir_path = dir.path().join(region_name);

        assert_eq!(1, parquet_file_num(&dir_path));

        {
            let reopened = engine
                .open_region(&ctx, region_name, &OpenOptions::default())
                .await
                .unwrap()
                .unwrap();

            engine.drop_region(&ctx, reopened).await.unwrap();

            assert!(engine.get_region(&ctx, region_name).unwrap().is_none());

            let manifest_dir = dir_path.join("manifest");
            assert!(!engine
                .inner
                .object_store
                .is_exist(manifest_dir.to_str().unwrap())
                .await
                .unwrap());
        }

        // Wait for gc
        tokio::time::sleep(Duration::from_millis(60)).await;
        assert_eq!(0, parquet_file_num(&dir_path));
    }

    #[tokio::test]
    async fn test_truncate_region() {
        common_telemetry::init_default_ut_logging();
        let dir = create_temp_dir("test_truncate_region");
        let log_file_dir = create_temp_dir("test_engine_wal");

        let region_name = "test_region";
        let region_id = 123456;

        let (engine, region) = create_engine_and_region(
            &dir,
            &log_file_dir,
            region_name,
            region_id,
            EngineConfig::default(),
        )
        .await;

        assert_eq!(region_name, region.name());

        // Insert data.
        write_sample_rows(&region).await;
        let ctx = EngineContext::default();

        // Truncate region.
        region.truncate().await.unwrap();
        assert!(engine.get_region(&ctx, region.name()).unwrap().is_some());

        // Scan to verify the region is empty.
        let read_ctx = ReadContext::default();
        let snapshot = region.snapshot(&read_ctx).unwrap();
        let resp = snapshot
            .scan(&read_ctx, ScanRequest::default())
            .await
            .unwrap();
        let mut reader = resp.reader;
        assert!(reader.next_chunk().await.unwrap().is_none());
    }
}
|
||||
@@ -1,635 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::any::Any;
|
||||
use std::io::Error as IoError;
|
||||
use std::str::Utf8Error;
|
||||
|
||||
use common_datasource::compression::CompressionType;
|
||||
use common_error::ext::{BoxedError, ErrorExt};
|
||||
use common_error::status_code::StatusCode;
|
||||
use common_macro::stack_trace_debug;
|
||||
use common_runtime::error::Error as RuntimeError;
|
||||
use datatypes::arrow::error::ArrowError;
|
||||
use datatypes::prelude::ConcreteDataType;
|
||||
use object_store::ErrorKind;
|
||||
use serde_json::error::Error as JsonError;
|
||||
use snafu::{Location, Snafu};
|
||||
use store_api::manifest::action::ProtocolVersion;
|
||||
use store_api::manifest::ManifestVersion;
|
||||
use store_api::storage::{RegionId, SequenceNumber};
|
||||
use tokio::task::JoinError;
|
||||
|
||||
use crate::metadata::Error as MetadataError;
|
||||
use crate::write_batch;
|
||||
|
||||
#[derive(Snafu)]
|
||||
#[snafu(visibility(pub))]
|
||||
#[stack_trace_debug]
|
||||
pub enum Error {
|
||||
#[snafu(display("Invalid region descriptor, region: {}", region))]
|
||||
InvalidRegionDesc {
|
||||
region: String,
|
||||
location: Location,
|
||||
source: MetadataError,
|
||||
},
|
||||
|
||||
#[snafu(display("Missing column {} in write batch", column))]
|
||||
BatchMissingColumn { column: String, location: Location },
|
||||
|
||||
#[snafu(display("Failed to write parquet file"))]
|
||||
WriteParquet {
|
||||
#[snafu(source)]
|
||||
error: parquet::errors::ParquetError,
|
||||
location: Location,
|
||||
},
|
||||
|
||||
#[snafu(display("Failed to write to buffer"))]
|
||||
WriteBuffer {
|
||||
location: Location,
|
||||
source: common_datasource::error::Error,
|
||||
},
|
||||
|
||||
#[snafu(display("Failed to create RecordBatch from vectors"))]
|
||||
NewRecordBatch {
|
||||
location: Location,
|
||||
#[snafu(source)]
|
||||
error: ArrowError,
|
||||
},
|
||||
|
||||
#[snafu(display("Fail to read object from path: {}", path))]
|
||||
ReadObject {
|
||||
path: String,
|
||||
location: Location,
|
||||
#[snafu(source)]
|
||||
error: object_store::Error,
|
||||
},
|
||||
|
||||
#[snafu(display("Fail to write object into path: {}", path))]
|
||||
WriteObject {
|
||||
path: String,
|
||||
location: Location,
|
||||
#[snafu(source)]
|
||||
error: object_store::Error,
|
||||
},
|
||||
|
||||
#[snafu(display("Fail to delete object from path: {}", path))]
|
||||
DeleteObject {
|
||||
path: String,
|
||||
location: Location,
|
||||
#[snafu(source)]
|
||||
error: object_store::Error,
|
||||
},
|
||||
|
||||
#[snafu(display("Fail to compress object by {}, path: {}", compress_type, path))]
|
||||
CompressObject {
|
||||
compress_type: CompressionType,
|
||||
path: String,
|
||||
#[snafu(source)]
|
||||
error: std::io::Error,
|
||||
},
|
||||
|
||||
#[snafu(display("Fail to decompress object by {}, path: {}", compress_type, path))]
|
||||
DecompressObject {
|
||||
compress_type: CompressionType,
|
||||
path: String,
|
||||
#[snafu(source)]
|
||||
error: std::io::Error,
|
||||
},
|
||||
|
||||
#[snafu(display("Fail to list objects in path: {}", path))]
|
||||
ListObjects {
|
||||
path: String,
|
||||
location: Location,
|
||||
#[snafu(source)]
|
||||
error: object_store::Error,
|
||||
},
|
||||
|
||||
#[snafu(display("Fail to create str from bytes"))]
|
||||
Utf8 {
|
||||
location: Location,
|
||||
#[snafu(source)]
|
||||
error: Utf8Error,
|
||||
},
|
||||
|
||||
#[snafu(display("Fail to encode object into json "))]
|
||||
EncodeJson {
|
||||
location: Location,
|
||||
#[snafu(source)]
|
||||
error: JsonError,
|
||||
},
|
||||
|
||||
#[snafu(display("Fail to decode object from json "))]
|
||||
DecodeJson {
|
||||
location: Location,
|
||||
#[snafu(source)]
|
||||
error: JsonError,
|
||||
},
|
||||
|
||||
#[snafu(display("Invalid scan index, start: {}, end: {}", start, end))]
|
||||
InvalidScanIndex {
|
||||
start: ManifestVersion,
|
||||
end: ManifestVersion,
|
||||
location: Location,
|
||||
},
|
||||
|
||||
#[snafu(display("Failed to write WAL, WAL region_id: {}", region_id))]
|
||||
WriteWal {
|
||||
region_id: RegionId,
|
||||
location: Location,
|
||||
source: BoxedError,
|
||||
},
|
||||
|
||||
#[snafu(display("Failed to encode WAL header"))]
|
||||
EncodeWalHeader {
|
||||
location: Location,
|
||||
#[snafu(source)]
|
||||
error: std::io::Error,
|
||||
},
|
||||
|
||||
#[snafu(display("Failed to decode WAL header"))]
|
||||
DecodeWalHeader {
|
||||
location: Location,
|
||||
#[snafu(source)]
|
||||
error: std::io::Error,
|
||||
},
|
||||
|
||||
#[snafu(display("Failed to wait flushing, region_id: {}", region_id))]
|
||||
WaitFlush {
|
||||
region_id: RegionId,
|
||||
#[snafu(source)]
|
||||
error: tokio::sync::oneshot::error::RecvError,
|
||||
location: Location,
|
||||
},
|
||||
|
||||
#[snafu(display(
|
||||
"Manifest protocol forbid to read, min_version: {}, supported_version: {}",
|
||||
min_version,
|
||||
supported_version
|
||||
))]
|
||||
ManifestProtocolForbidRead {
|
||||
min_version: ProtocolVersion,
|
||||
supported_version: ProtocolVersion,
|
||||
location: Location,
|
||||
},
|
||||
|
||||
#[snafu(display(
|
||||
"Manifest protocol forbid to write, min_version: {}, supported_version: {}",
|
||||
min_version,
|
||||
supported_version
|
||||
))]
|
||||
ManifestProtocolForbidWrite {
|
||||
min_version: ProtocolVersion,
|
||||
supported_version: ProtocolVersion,
|
||||
location: Location,
|
||||
},
|
||||
|
||||
#[snafu(display("Failed to decode action list, {}", msg))]
|
||||
DecodeMetaActionList { msg: String, location: Location },
|
||||
|
||||
#[snafu(display("Failed to read line, err"))]
|
||||
Readline {
|
||||
#[snafu(source)]
|
||||
error: IoError,
|
||||
},
|
||||
|
||||
#[snafu(display("Failed to read Parquet file: {}", file))]
|
||||
ReadParquet {
|
||||
file: String,
|
||||
#[snafu(source)]
|
||||
error: parquet::errors::ParquetError,
|
||||
location: Location,
|
||||
},
|
||||
|
||||
#[snafu(display("Region is under {} state, cannot proceed operation", state))]
|
||||
InvalidRegionState {
|
||||
state: &'static str,
|
||||
location: Location,
|
||||
},
|
||||
|
||||
#[snafu(display("Failed to read WAL, region_id: {}", region_id))]
|
||||
ReadWal {
|
||||
region_id: RegionId,
|
||||
location: Location,
|
||||
source: BoxedError,
|
||||
},
|
||||
|
||||
#[snafu(display("Failed to mark WAL as obsolete, region id: {}", region_id))]
|
||||
MarkWalObsolete {
|
||||
region_id: u64,
|
||||
location: Location,
|
||||
source: BoxedError,
|
||||
},
|
||||
|
||||
#[snafu(display("WAL data corrupted, region_id: {}, message: {}", region_id, message))]
|
||||
WalDataCorrupted {
|
||||
region_id: RegionId,
|
||||
message: String,
|
||||
location: Location,
|
||||
},
|
||||
|
||||
#[snafu(display("Failed to delete WAL namespace, region id: {}", region_id))]
|
||||
DeleteWalNamespace {
|
||||
region_id: RegionId,
|
||||
location: Location,
|
||||
source: BoxedError,
|
||||
},
|
||||
|
||||
#[snafu(display(
|
||||
"Sequence of region should increase monotonically (should be {} < {})",
|
||||
prev,
|
||||
given
|
||||
))]
|
||||
SequenceNotMonotonic {
|
||||
prev: SequenceNumber,
|
||||
given: SequenceNumber,
|
||||
location: Location,
|
||||
},
|
||||
|
||||
#[snafu(display("Failed to convert store schema, file: {}", file))]
|
||||
ConvertStoreSchema {
|
||||
file: String,
|
||||
location: Location,
|
||||
source: MetadataError,
|
||||
},
|
||||
|
||||
#[snafu(display("Invalid raw region metadata, region: {}", region))]
|
||||
InvalidRawRegion {
|
||||
region: String,
|
||||
location: Location,
|
||||
source: MetadataError,
|
||||
},
|
||||
|
||||
#[snafu(display("Try to write the closed region"))]
|
||||
ClosedRegion { location: Location },
|
||||
|
||||
#[snafu(display("Invalid projection"))]
|
||||
InvalidProjection {
|
||||
location: Location,
|
||||
source: MetadataError,
|
||||
},
|
||||
|
||||
#[snafu(display("Failed to push data to batch builder"))]
|
||||
PushBatch {
|
||||
location: Location,
|
||||
source: datatypes::error::Error,
|
||||
},
|
||||
|
||||
#[snafu(display("Failed to build batch, {}", msg))]
|
||||
BuildBatch { msg: String, location: Location },
|
||||
|
||||
#[snafu(display("Failed to filter column {}", name))]
|
||||
FilterColumn {
|
||||
name: String,
|
||||
location: Location,
|
||||
source: datatypes::error::Error,
|
||||
},
|
||||
|
||||
#[snafu(display("Invalid alter request"))]
|
||||
InvalidAlterRequest {
|
||||
location: Location,
|
||||
source: MetadataError,
|
||||
},
|
||||
|
||||
#[snafu(display("Failed to alter metadata"))]
|
||||
AlterMetadata {
|
||||
location: Location,
|
||||
source: MetadataError,
|
||||
},
|
||||
|
||||
#[snafu(display("Failed to create default value for column {}", name))]
|
||||
CreateDefault {
|
||||
name: String,
|
||||
location: Location,
|
||||
source: datatypes::error::Error,
|
||||
},
|
||||
|
||||
#[snafu(display(
|
||||
"Not allowed to write data with version {} to schema with version {}",
|
||||
data_version,
|
||||
schema_version
|
||||
))]
|
||||
WriteToOldVersion {
|
||||
/// Schema version of data to write.
|
||||
data_version: u32,
|
||||
schema_version: u32,
|
||||
location: Location,
|
||||
},
|
||||
|
||||
#[snafu(display("Column {} not in schema with version {}", column, version))]
|
||||
NotInSchemaToCompat {
|
||||
column: String,
|
||||
version: u32,
|
||||
location: Location,
|
||||
},
|
||||
|
||||
#[snafu(display("Incompatible schema to read, reason: {}", reason))]
|
||||
CompatRead { reason: String, location: Location },
|
||||
|
||||
#[snafu(display("Failed to read column {}, could not create default value", column))]
|
||||
CreateDefaultToRead {
|
||||
column: String,
|
||||
location: Location,
|
||||
source: datatypes::error::Error,
|
||||
},
|
||||
|
||||
#[snafu(display("Failed to read column {}, no proper default value for it", column))]
|
||||
NoDefaultToRead { column: String, location: Location },
|
||||
|
||||
#[snafu(display("Failed to convert arrow chunk to batch, name: {}", name))]
|
||||
ConvertChunk {
|
||||
name: String,
|
||||
location: Location,
|
||||
source: datatypes::error::Error,
|
||||
},
|
||||
|
||||
#[snafu(display("Unknown column {}", name))]
|
||||
UnknownColumn { name: String, location: Location },
|
||||
|
||||
#[snafu(display("Failed to create record batch for write batch"))]
|
||||
CreateRecordBatch {
|
||||
location: Location,
|
||||
source: common_recordbatch::error::Error,
|
||||
},
|
||||
|
||||
#[snafu(display(
|
||||
"Request is too large, max is {}, current is {}",
|
||||
write_batch::MAX_BATCH_SIZE,
|
||||
num_rows
|
||||
))]
|
||||
RequestTooLarge { num_rows: usize, location: Location },
|
||||
|
||||
#[snafu(display(
|
||||
"Type of column {} does not match type in schema, expect {:?}, given {:?}",
|
||||
name,
|
||||
expect,
|
||||
given
|
||||
))]
|
||||
TypeMismatch {
|
||||
name: String,
|
||||
expect: ConcreteDataType,
|
||||
given: ConcreteDataType,
|
||||
location: Location,
|
||||
},
|
||||
|
||||
#[snafu(display("Column {} is not null but input has null", name))]
|
||||
HasNull { name: String, location: Location },
|
||||
|
||||
#[snafu(display(
|
||||
"Length of column {} not equals to other columns, expect {}, given {}",
|
||||
name,
|
||||
expect,
|
||||
given
|
||||
))]
|
||||
UnequalLengths {
|
||||
name: String,
|
||||
expect: usize,
|
||||
given: usize,
|
||||
location: Location,
|
||||
},
|
||||
|
||||
#[snafu(display("Failed to decode write batch, corrupted data {}", message))]
|
||||
BatchCorrupted { message: String, location: Location },
|
||||
|
||||
#[snafu(display("Failed to decode arrow data"))]
|
||||
DecodeArrow {
|
||||
location: Location,
|
||||
#[snafu(source)]
|
||||
error: ArrowError,
|
||||
},
|
||||
|
||||
#[snafu(display("Failed to encode arrow data"))]
|
||||
EncodeArrow {
|
||||
location: Location,
|
||||
#[snafu(source)]
|
||||
error: ArrowError,
|
||||
},
|
||||
|
||||
#[snafu(display("Failed to parse schema"))]
|
||||
ParseSchema {
|
||||
location: Location,
|
||||
source: datatypes::error::Error,
|
||||
},
|
||||
|
||||
#[snafu(display("More columns than expected in the request"))]
|
||||
MoreColumnThanExpected { location: Location },
|
||||
|
||||
#[snafu(display("Failed to decode parquet file time range, msg: {}", msg))]
|
||||
DecodeParquetTimeRange { msg: String, location: Location },
|
||||
|
||||
#[snafu(display("Scheduler rate limited, msg: {}", msg))]
|
||||
RateLimited { msg: String },
|
||||
|
||||
#[snafu(display("Cannot schedule request, scheduler's already stopped"))]
|
||||
IllegalSchedulerState { location: Location },
|
||||
|
||||
#[snafu(display("Failed to start manifest gc task"))]
|
||||
StartManifestGcTask {
|
||||
location: Location,
|
||||
source: RuntimeError,
|
||||
},
|
||||
|
||||
#[snafu(display("Failed to stop manifest gc task"))]
|
||||
StopManifestGcTask {
|
||||
location: Location,
|
||||
source: RuntimeError,
|
||||
},
|
||||
|
||||
#[snafu(display("Failed to stop scheduler"))]
|
||||
StopScheduler {
|
||||
#[snafu(source)]
|
||||
error: JoinError,
|
||||
location: Location,
|
||||
},
|
||||
|
||||
#[snafu(display("Failed to delete SST file"))]
|
||||
DeleteSst {
|
||||
#[snafu(source)]
|
||||
error: object_store::Error,
|
||||
location: Location,
|
||||
},
|
||||
|
||||
#[snafu(display("Failed to calculate SST expire time"))]
|
||||
TtlCalculation {
|
||||
location: Location,
|
||||
source: common_time::error::Error,
|
||||
},
|
||||
|
||||
#[snafu(display("Failed to create a checkpoint: {}", msg))]
|
||||
ManifestCheckpoint { msg: String, location: Location },
|
||||
|
||||
#[snafu(display("The compaction task is cancelled, region_id: {}", region_id))]
|
||||
CompactTaskCancel {
|
||||
region_id: RegionId,
|
||||
#[snafu(source)]
|
||||
error: tokio::sync::oneshot::error::RecvError,
|
||||
},
|
||||
|
||||
#[snafu(display(
|
||||
"The flush request is duplicate, region_id: {}, sequence: {}",
|
||||
region_id,
|
||||
sequence
|
||||
))]
|
||||
DuplicateFlush {
|
||||
region_id: RegionId,
|
||||
sequence: SequenceNumber,
|
||||
location: Location,
|
||||
},
|
||||
|
||||
#[snafu(display("Failed to start picking task for flush"))]
|
||||
StartPickTask {
|
||||
location: Location,
|
||||
source: RuntimeError,
|
||||
},
|
||||
|
||||
#[snafu(display("Failed to stop picking task for flush"))]
|
||||
StopPickTask {
|
||||
location: Location,
|
||||
source: RuntimeError,
|
||||
},
|
||||
|
||||
#[snafu(display("Failed to convert columns to rows"))]
|
||||
ConvertColumnsToRows {
|
||||
#[snafu(source)]
|
||||
error: ArrowError,
|
||||
location: Location,
|
||||
},
|
||||
|
||||
#[snafu(display("Failed to sort arrays"))]
|
||||
SortArrays {
|
||||
#[snafu(source)]
|
||||
error: ArrowError,
|
||||
location: Location,
|
||||
},
|
||||
|
||||
#[snafu(display("Failed to build scan predicate"))]
|
||||
BuildPredicate {
|
||||
source: table::error::Error,
|
||||
location: Location,
|
||||
},
|
||||
|
||||
#[snafu(display("Failed to join spawned tasks"))]
|
||||
JoinError {
|
||||
#[snafu(source)]
|
||||
error: JoinError,
|
||||
location: Location,
|
||||
},
|
||||
}
|
||||
|
||||
pub type Result<T> = std::result::Result<T, Error>;
|
||||
|
||||
impl Error {
|
||||
/// Returns true if the error is the object path to delete
|
||||
/// doesn't exist.
|
||||
pub(crate) fn is_object_to_delete_not_found(&self) -> bool {
|
||||
if let Error::DeleteObject { error, .. } = self {
|
||||
error.kind() == ErrorKind::NotFound
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl ErrorExt for Error {
|
||||
fn status_code(&self) -> StatusCode {
|
||||
use Error::*;
|
||||
|
||||
match self {
|
||||
InvalidScanIndex { .. }
|
||||
| BatchMissingColumn { .. }
|
||||
| InvalidProjection { .. }
|
||||
| BuildBatch { .. }
|
||||
| NotInSchemaToCompat { .. }
|
||||
| WriteToOldVersion { .. }
|
||||
| CreateRecordBatch { .. }
|
||||
| RequestTooLarge { .. }
|
||||
| TypeMismatch { .. }
|
||||
| HasNull { .. }
|
||||
| UnequalLengths { .. }
|
||||
| MoreColumnThanExpected { .. } => StatusCode::InvalidArguments,
|
||||
|
||||
Utf8 { .. }
|
||||
| EncodeJson { .. }
|
||||
| DecodeJson { .. }
|
||||
| WaitFlush { .. }
|
||||
| DecodeMetaActionList { .. }
|
||||
| Readline { .. }
|
||||
| WalDataCorrupted { .. }
|
||||
| SequenceNotMonotonic { .. }
|
||||
| ConvertStoreSchema { .. }
|
||||
| InvalidRawRegion { .. }
|
||||
| ClosedRegion { .. }
|
||||
| FilterColumn { .. }
|
||||
| AlterMetadata { .. }
|
||||
| CompatRead { .. }
|
||||
| CreateDefaultToRead { .. }
|
||||
| NoDefaultToRead { .. }
|
||||
| NewRecordBatch { .. }
|
||||
| BatchCorrupted { .. }
|
||||
| DecodeArrow { .. }
|
||||
| EncodeArrow { .. }
|
||||
| ManifestCheckpoint { .. }
|
||||
| CompressObject { .. }
|
||||
| DecompressObject { .. }
|
||||
| ParseSchema { .. } => StatusCode::Unexpected,
|
||||
|
||||
WriteParquet { .. }
|
||||
| ReadObject { .. }
|
||||
| WriteObject { .. }
|
||||
| ListObjects { .. }
|
||||
| DeleteObject { .. }
|
||||
| WriteWal { .. }
|
||||
| DecodeWalHeader { .. }
|
||||
| EncodeWalHeader { .. }
|
||||
| ManifestProtocolForbidRead { .. }
|
||||
| ManifestProtocolForbidWrite { .. }
|
||||
| ReadParquet { .. }
|
||||
| InvalidRegionState { .. }
|
||||
| ReadWal { .. } => StatusCode::StorageUnavailable,
|
||||
|
||||
UnknownColumn { .. } => StatusCode::TableColumnNotFound,
|
||||
|
||||
InvalidAlterRequest { source, .. } | InvalidRegionDesc { source, .. } => {
|
||||
source.status_code()
|
||||
}
|
||||
WriteBuffer { source, .. } => source.status_code(),
|
||||
PushBatch { source, .. } => source.status_code(),
|
||||
CreateDefault { source, .. } => source.status_code(),
|
||||
ConvertChunk { source, .. } => source.status_code(),
|
||||
MarkWalObsolete { source, .. } => source.status_code(),
|
||||
DeleteWalNamespace { source, .. } => source.status_code(),
|
||||
DecodeParquetTimeRange { .. } => StatusCode::Unexpected,
|
||||
RateLimited { .. } | StopScheduler { .. } | CompactTaskCancel { .. } => {
|
||||
StatusCode::Internal
|
||||
}
|
||||
DeleteSst { .. } => StatusCode::StorageUnavailable,
|
||||
|
||||
StartManifestGcTask { .. }
|
||||
| StopManifestGcTask { .. }
|
||||
| IllegalSchedulerState { .. }
|
||||
| DuplicateFlush { .. }
|
||||
| StartPickTask { .. }
|
||||
| StopPickTask { .. } => StatusCode::Unexpected,
|
||||
|
||||
TtlCalculation { source, .. } => source.status_code(),
|
||||
ConvertColumnsToRows { .. } | SortArrays { .. } => StatusCode::Unexpected,
|
||||
BuildPredicate { source, .. } => source.status_code(),
|
||||
JoinError { .. } => StatusCode::Unexpected,
|
||||
}
|
||||
}
|
||||
|
||||
fn as_any(&self) -> &dyn Any {
|
||||
self
|
||||
}
|
||||
}
|
||||
@@ -1,235 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use common_telemetry::{debug, error};
|
||||
use store_api::storage::RegionId;
|
||||
use tokio::sync::Notify;
|
||||
|
||||
use crate::error::Result;
|
||||
use crate::scheduler::rate_limit::{BoxedRateLimitToken, RateLimitToken};
|
||||
use crate::scheduler::{Handler, LocalScheduler, Request};
|
||||
use crate::sst::{AccessLayerRef, FileId};
|
||||
|
||||
pub struct FilePurgeRequest {
|
||||
pub region_id: RegionId,
|
||||
pub file_id: FileId,
|
||||
pub sst_layer: AccessLayerRef,
|
||||
}
|
||||
|
||||
impl Request for FilePurgeRequest {
|
||||
type Key = String;
|
||||
|
||||
fn key(&self) -> Self::Key {
|
||||
format!("{}/{}", self.region_id, self.file_id)
|
||||
}
|
||||
|
||||
fn complete(self, _result: Result<()>) {}
|
||||
}
|
||||
|
||||
pub struct FilePurgeHandler;
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl Handler for FilePurgeHandler {
|
||||
type Request = FilePurgeRequest;
|
||||
|
||||
async fn handle_request(
|
||||
&self,
|
||||
req: Self::Request,
|
||||
token: BoxedRateLimitToken,
|
||||
finish_notifier: Arc<Notify>,
|
||||
) -> Result<()> {
|
||||
req.sst_layer.delete_sst(req.file_id).await.map_err(|e| {
|
||||
error!(e; "Failed to delete SST file, file: {}, region: {}",
|
||||
req.file_id.as_parquet(), req.region_id);
|
||||
e
|
||||
})?;
|
||||
debug!(
|
||||
"Successfully deleted SST file: {}, region: {}",
|
||||
req.file_id.as_parquet(),
|
||||
req.region_id
|
||||
);
|
||||
token.try_release();
|
||||
finish_notifier.notify_one();
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
pub type FilePurgerRef = Arc<LocalScheduler<FilePurgeRequest>>;
|
||||
|
||||
/// Test-only file purger that never deletes anything.
#[cfg(test)]
pub mod noop {
    use std::sync::Arc;

    use tokio::sync::Notify;

    use crate::error::Result;
    use crate::file_purger::{FilePurgeRequest, FilePurgerRef};
    use crate::scheduler::rate_limit::{BoxedRateLimitToken, RateLimitToken};
    use crate::scheduler::{Handler, LocalScheduler, SchedulerConfig};

    /// Builds a file purger backed by [NoopFilePurgeHandler] with the default
    /// scheduler config.
    pub fn new_noop_file_purger() -> FilePurgerRef {
        let scheduler = LocalScheduler::new(SchedulerConfig::default(), NoopFilePurgeHandler);
        Arc::new(scheduler)
    }

    /// Handler that ignores the request and only releases the token and
    /// notifies the finish notifier.
    #[derive(Debug)]
    pub struct NoopFilePurgeHandler;

    #[async_trait::async_trait]
    impl Handler for NoopFilePurgeHandler {
        type Request = FilePurgeRequest;

        /// Releases `token`, notifies `finish_notifier` and reports success
        /// without touching the request.
        async fn handle_request(
            &self,
            _req: Self::Request,
            token: BoxedRateLimitToken,
            finish_notifier: Arc<Notify>,
        ) -> Result<()> {
            token.try_release();
            finish_notifier.notify_one();
            Ok(())
        }
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use api::v1::OpType;
    use common_test_util::temp_dir::create_temp_dir;
    use object_store::services::Fs;
    use object_store::ObjectStore;

    use super::*;
    use crate::file_purger::noop::NoopFilePurgeHandler;
    use crate::memtable::tests::{schema_for_test, write_kvs};
    use crate::memtable::{DefaultMemtableBuilder, IterContext, MemtableBuilder};
    use crate::scheduler::{Scheduler, SchedulerConfig};
    use crate::sst::{AccessLayer, FileHandle, FileMeta, FsAccessLayer, Source, WriteOptions};

    /// Rate limit token whose release does nothing.
    struct MockRateLimitToken;

    impl RateLimitToken for MockRateLimitToken {
        fn try_release(&self) {}
    }

    /// Creates a file system backed object store rooted at `root`.
    fn new_fs_object_store(root: &str) -> ObjectStore {
        let mut builder = Fs::default();
        let _ = builder.root(root);
        ObjectStore::new(builder).unwrap().finish()
    }

    /// Path of the parquet file of `file_id` under the SST directory `sst_dir`.
    fn sst_object_path(sst_dir: &str, file_id: FileId) -> String {
        format!("{}/{}", sst_dir, file_id.as_parquet())
    }

    /// Writes a small memtable to an SST file and returns its handle, the SST
    /// directory and the access layer used to write it.
    async fn create_sst_file(
        os: ObjectStore,
        sst_file_id: FileId,
        file_purger: FilePurgerRef,
    ) -> (FileHandle, String, AccessLayerRef) {
        let schema = schema_for_test();
        let memtable = DefaultMemtableBuilder::default().build(schema.clone());

        write_kvs(
            &*memtable,
            10,
            OpType::Put,
            &[1, 2],
            &[(Some(1), Some(1)), (Some(2), Some(2))],
        );

        let iter = memtable.iter(IterContext::default()).unwrap();
        let sst_path = "table1";
        let layer = Arc::new(FsAccessLayer::new(sst_path, os.clone()));
        let sst_info = layer
            .write_sst(sst_file_id, Source::Iter(iter), &WriteOptions::default())
            .await
            .unwrap()
            .unwrap();

        let meta = FileMeta {
            region_id: 0.into(),
            file_id: sst_file_id,
            time_range: None,
            level: 0,
            file_size: sst_info.file_size,
        };
        let handle = FileHandle::new(meta, layer.clone(), file_purger);

        (handle, sst_path.to_string(), layer as _)
    }

    #[tokio::test]
    async fn test_file_purger_handler() {
        let dir = create_temp_dir("file-purge");
        let object_store = new_fs_object_store(dir.path().to_str().unwrap());

        let sst_file_id = FileId::random();

        let noop_file_purger = Arc::new(LocalScheduler::new(
            SchedulerConfig::default(),
            NoopFilePurgeHandler,
        ));
        let (_file, path, layer) =
            create_sst_file(object_store.clone(), sst_file_id, noop_file_purger).await;
        let request = FilePurgeRequest {
            region_id: 0.into(),
            file_id: sst_file_id,
            sst_layer: layer,
        };

        let notify = Arc::new(Notify::new());
        FilePurgeHandler
            .handle_request(request, Box::new(MockRateLimitToken {}), notify.clone())
            .await
            .unwrap();

        // The handler must have signalled completion.
        notify.notified().await;

        let exists = object_store
            .is_exist(&sst_object_path(&path, sst_file_id))
            .await
            .unwrap();
        assert!(!exists);
    }

    #[tokio::test]
    async fn test_file_purge_loop() {
        common_telemetry::init_default_ut_logging();
        let dir = create_temp_dir("file-purge");
        let object_store = new_fs_object_store(dir.path().to_str().unwrap());
        let sst_file_id = FileId::random();
        let scheduler = Arc::new(LocalScheduler::new(
            SchedulerConfig::default(),
            FilePurgeHandler,
        ));
        let (handle, path, _layer) =
            create_sst_file(object_store.clone(), sst_file_id, scheduler.clone()).await;

        {
            // mark file as deleted and drop the handle, we expect the file is deleted.
            handle.mark_deleted();
            drop(handle);
        }
        scheduler.stop(true).await.unwrap();

        let exists = object_store
            .is_exist(&sst_object_path(&path, sst_file_id))
            .await
            .unwrap();
        assert!(!exists);
    }
}
|
||||
@@ -1,495 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
mod picker;
|
||||
mod scheduler;
|
||||
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
use std::sync::Arc;
|
||||
|
||||
use common_base::readable_size::ReadableSize;
|
||||
use common_telemetry::logging;
|
||||
pub use picker::{FlushPicker, PickerConfig};
|
||||
pub use scheduler::{
|
||||
FlushHandle, FlushRegionRequest, FlushRequest, FlushScheduler, FlushSchedulerRef,
|
||||
};
|
||||
use store_api::logstore::LogStore;
|
||||
use store_api::storage::consts::WRITE_ROW_GROUP_SIZE;
|
||||
use store_api::storage::{RegionId, SequenceNumber};
|
||||
|
||||
use crate::config::EngineConfig;
|
||||
use crate::error::Result;
|
||||
use crate::manifest::action::*;
|
||||
use crate::manifest::region::RegionManifest;
|
||||
use crate::memtable::{IterContext, MemtableId, MemtableRef};
|
||||
use crate::metrics::{FLUSH_BYTES_TOTAL, FLUSH_ELAPSED};
|
||||
use crate::region::{RegionWriterRef, SharedDataRef};
|
||||
use crate::sst::{AccessLayerRef, FileId, FileMeta, Source, SstInfo, WriteOptions};
|
||||
use crate::wal::Wal;
|
||||
|
||||
/// Current flush-related status of a region.
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct RegionStatus {
|
||||
/// Id of the region this status belongs to.
|
||||
pub region_id: RegionId,
|
||||
/// Size of the mutable memtable.
|
||||
pub bytes_mutable: usize,
|
||||
/// Write buffer size of the region.
|
||||
pub write_buffer_size: usize,
|
||||
}
|
||||
|
||||
/// Kind of flush request a [FlushStrategy] can ask for.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FlushType {
    /// Flush only the current region.
    Region,
    /// Flush at engine level: regions to flush are picked globally.
    Engine,
}
|
||||
|
||||
/// Strategy to control whether to flush a region before writing to the region.
|
||||
pub trait FlushStrategy: Send + Sync + std::fmt::Debug {
|
||||
/// Returns whether to trigger a flush operation.
|
||||
fn should_flush(&self, status: RegionStatus) -> Option<FlushType>;
|
||||
|
||||
/// Reserves `mem` bytes.
|
||||
fn reserve_mem(&self, mem: usize);
|
||||
|
||||
/// Tells the strategy we are freeing `mem` bytes.
|
||||
///
|
||||
/// We are in the process of freeing `mem` bytes, so it is not considered
|
||||
/// when checking the soft limit.
|
||||
fn schedule_free_mem(&self, mem: usize);
|
||||
|
||||
/// We have freed `mem` bytes.
|
||||
fn free_mem(&self, mem: usize);
|
||||
}
|
||||
|
||||
pub type FlushStrategyRef = Arc<dyn FlushStrategy>;
|
||||
|
||||
/// Flush strategy driven by memory usage.
#[derive(Debug)]
pub struct SizeBasedStrategy {
    /// Write buffer size shared by all memtables; `None` disables the global limit.
    global_write_buffer_size: Option<usize>,
    /// Memory limit for mutable memtables. Only meaningful when
    /// `global_write_buffer_size` is `Some`.
    mutable_limitation: usize,
    /// Memory in use (e.g. by mutable and immutable memtables).
    memory_used: AtomicUsize,
    /// Memory not yet scheduled to be freed (e.g. used by mutable memtables).
    memory_active: AtomicUsize,
}
|
||||
|
||||
impl SizeBasedStrategy {
|
||||
/// Returns a new [SizeBasedStrategy] with specific `global_write_buffer_size`.
|
||||
pub fn new(global_write_buffer_size: Option<usize>) -> Self {
|
||||
Self {
|
||||
global_write_buffer_size,
|
||||
mutable_limitation: get_mutable_limitation(global_write_buffer_size),
|
||||
memory_used: AtomicUsize::new(0),
|
||||
memory_active: AtomicUsize::new(0),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns whether to trigger an engine level flush.
|
||||
///
|
||||
/// Inspired by RocksDB's WriteBufferManager.
|
||||
/// <https://github.com/facebook/rocksdb/blob/main/include/rocksdb/write_buffer_manager.h#L94>
|
||||
fn should_flush_engine(&self) -> bool {
|
||||
// We only check global limit when it is Some.
|
||||
let Some(global_write_buffer_size) = self.global_write_buffer_size else {
|
||||
return false;
|
||||
};
|
||||
|
||||
let mutable_memtable_memory_usage = self.memory_active.load(Ordering::Relaxed);
|
||||
if mutable_memtable_memory_usage > self.mutable_limitation {
|
||||
logging::info!(
|
||||
"Engine should flush (over mutable limit), mutable_usage: {}, mutable_limitation: {}.",
|
||||
mutable_memtable_memory_usage,
|
||||
self.mutable_limitation,
|
||||
);
|
||||
return true;
|
||||
}
|
||||
|
||||
let memory_usage = self.memory_used.load(Ordering::Relaxed);
|
||||
// If the memory exceeds the buffer size, we trigger more aggressive
|
||||
// flush. But if already more than half memory is being flushed,
|
||||
// triggering more flush may not help. We will hold it instead.
|
||||
if memory_usage >= global_write_buffer_size
|
||||
&& mutable_memtable_memory_usage >= global_write_buffer_size / 2
|
||||
{
|
||||
logging::info!(
|
||||
"Engine should flush (over total limit), memory_usage: {}, global_write_buffer_size: {}, \
|
||||
mutable_usage: {}.",
|
||||
memory_usage,
|
||||
global_write_buffer_size,
|
||||
mutable_memtable_memory_usage,
|
||||
);
|
||||
return true;
|
||||
}
|
||||
|
||||
false
|
||||
}
|
||||
|
||||
/// Returns true if the global memory limitation is enabled.
|
||||
#[inline]
|
||||
fn is_global_limit_enabled(&self) -> bool {
|
||||
self.global_write_buffer_size.is_some()
|
||||
}
|
||||
}
|
||||
|
||||
/// Computes the memory limit for mutable memtables: 7/8 of the global write
/// buffer size (rounded down), or 0 when no global size is configured.
///
/// Inspired by RocksDB.
/// <https://github.com/facebook/rocksdb/blob/main/include/rocksdb/write_buffer_manager.h#L86>
#[inline]
fn get_mutable_limitation(global_write_buffer_size: Option<usize>) -> usize {
    // `size * 7 / 8` overflows for sizes above `usize::MAX / 7`. Splitting the
    // size into whole eighths plus a remainder gives the same floor(7 * size / 8)
    // without any intermediate value exceeding `size`.
    global_write_buffer_size.map_or(0, |size| (size / 8) * 7 + ((size % 8) * 7) / 8)
}
|
||||
|
||||
impl Default for SizeBasedStrategy {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
global_write_buffer_size: None,
|
||||
mutable_limitation: 0,
|
||||
memory_used: AtomicUsize::new(0),
|
||||
memory_active: AtomicUsize::new(0),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl FlushStrategy for SizeBasedStrategy {
|
||||
fn should_flush(&self, status: RegionStatus) -> Option<FlushType> {
|
||||
if status.bytes_mutable >= status.write_buffer_size {
|
||||
// If the mutable memtable is full, we should freeze it and flush it.
|
||||
logging::debug!(
|
||||
"Region should flush as mutable memtable is full, region: {}, bytes_mutable: {}, \
|
||||
write_buffer_size: {}.",
|
||||
status.region_id,
|
||||
status.bytes_mutable,
|
||||
status.write_buffer_size,
|
||||
);
|
||||
|
||||
return Some(FlushType::Region);
|
||||
}
|
||||
|
||||
if self.should_flush_engine() {
|
||||
return Some(FlushType::Engine);
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
|
||||
fn reserve_mem(&self, mem: usize) {
|
||||
if self.is_global_limit_enabled() {
|
||||
let _ = self.memory_used.fetch_add(mem, Ordering::Relaxed);
|
||||
let _ = self.memory_active.fetch_add(mem, Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
|
||||
fn schedule_free_mem(&self, mem: usize) {
|
||||
if self.is_global_limit_enabled() {
|
||||
let _ = self.memory_active.fetch_sub(mem, Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
|
||||
fn free_mem(&self, mem: usize) {
|
||||
if self.is_global_limit_enabled() {
|
||||
let _ = self.memory_used.fetch_sub(mem, Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct FlushJob<S: LogStore> {
|
||||
/// Max memtable id in these memtables,
|
||||
/// used to remove immutable memtables in current version.
|
||||
pub max_memtable_id: MemtableId,
|
||||
/// Memtables to be flushed.
|
||||
pub memtables: Vec<MemtableRef>,
|
||||
/// Last sequence of data to be flushed.
|
||||
pub flush_sequence: SequenceNumber,
|
||||
/// Shared data of region to be flushed.
|
||||
pub shared: SharedDataRef,
|
||||
/// Sst access layer of the region.
|
||||
pub sst_layer: AccessLayerRef,
|
||||
/// Region writer, used to persist log entry that points to the latest manifest file.
|
||||
pub writer: RegionWriterRef<S>,
|
||||
/// Region write-ahead logging, used to write data/meta to the log file.
|
||||
pub wal: Wal<S>,
|
||||
/// Region manifest service, used to persist metadata.
|
||||
pub manifest: RegionManifest,
|
||||
/// Storage engine config
|
||||
pub engine_config: Arc<EngineConfig>,
|
||||
}
|
||||
|
||||
impl<S: LogStore> FlushJob<S> {
|
||||
/// Execute the flush job.
|
||||
async fn run(&mut self) -> Result<()> {
|
||||
let _timer = FLUSH_ELAPSED.start_timer();
|
||||
|
||||
let file_metas = self.write_memtables_to_layer().await?;
|
||||
if file_metas.is_empty() {
|
||||
// skip writing manifest and wal if no files are flushed.
|
||||
return Ok(());
|
||||
}
|
||||
self.write_manifest_and_apply(&file_metas).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn write_memtables_to_layer(&mut self) -> Result<Vec<FileMeta>> {
|
||||
let region_id = self.shared.id();
|
||||
let mut futures = Vec::with_capacity(self.memtables.len());
|
||||
let iter_ctx = IterContext {
|
||||
// TODO(ruihang): dynamic row group size based on content (#412)
|
||||
batch_size: WRITE_ROW_GROUP_SIZE,
|
||||
// All sequences are visible by default.
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
for m in &self.memtables {
|
||||
// skip empty memtable
|
||||
if m.num_rows() == 0 {
|
||||
continue;
|
||||
}
|
||||
|
||||
let file_id = FileId::random();
|
||||
// TODO(hl): Check if random file name already exists in meta.
|
||||
let iter = m.iter(iter_ctx.clone())?;
|
||||
let sst_layer = self.sst_layer.clone();
|
||||
let write_options = WriteOptions {
|
||||
sst_write_buffer_size: ReadableSize::mb(8), // deprecated usage
|
||||
};
|
||||
futures.push(async move {
|
||||
Ok(sst_layer
|
||||
.write_sst(file_id, Source::Iter(iter), &write_options)
|
||||
.await?
|
||||
.map(
|
||||
|SstInfo {
|
||||
time_range,
|
||||
file_size,
|
||||
..
|
||||
}| FileMeta {
|
||||
region_id,
|
||||
file_id,
|
||||
time_range,
|
||||
level: 0,
|
||||
file_size,
|
||||
},
|
||||
))
|
||||
});
|
||||
}
|
||||
|
||||
let metas: Vec<_> = futures_util::future::try_join_all(futures)
|
||||
.await?
|
||||
.into_iter()
|
||||
.flatten()
|
||||
.collect();
|
||||
|
||||
let flush_bytes = metas.iter().map(|f| f.file_size).sum();
|
||||
|
||||
FLUSH_BYTES_TOTAL.inc_by(flush_bytes);
|
||||
|
||||
let file_ids = metas.iter().map(|f| f.file_id).collect::<Vec<_>>();
|
||||
logging::info!("Successfully flush memtables, region:{region_id}, files: {file_ids:?}");
|
||||
Ok(metas)
|
||||
}
|
||||
|
||||
async fn write_manifest_and_apply(&mut self, file_metas: &[FileMeta]) -> Result<()> {
|
||||
let edit = RegionEdit {
|
||||
region_version: self.shared.version_control.metadata().version(),
|
||||
flushed_sequence: Some(self.flush_sequence),
|
||||
files_to_add: file_metas.to_vec(),
|
||||
files_to_remove: Vec::default(),
|
||||
compaction_time_window: None,
|
||||
};
|
||||
|
||||
self.writer
|
||||
.write_edit_and_apply(
|
||||
&self.wal,
|
||||
&self.shared,
|
||||
&self.manifest,
|
||||
edit,
|
||||
Some(self.max_memtable_id),
|
||||
)
|
||||
.await?;
|
||||
self.wal.obsolete(self.flush_sequence).await
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::memtable::AllocTracker;

    // Builds a `RegionStatus` for region 1 with the given mutable bytes and
    // write buffer size.
    macro_rules! region_status {
        ($bytes_mutable:expr, $write_buffer_size:expr) => {
            RegionStatus {
                region_id: 1.into(),
                bytes_mutable: $bytes_mutable,
                write_buffer_size: $write_buffer_size,
            }
        };
    }

    /// Returns the `(memory_used, memory_active)` counters of `strategy`.
    fn usage(strategy: &SizeBasedStrategy) -> (usize, usize) {
        let used = strategy.memory_used.load(Ordering::Relaxed);
        let active = strategy.memory_active.load(Ordering::Relaxed);
        (used, active)
    }

    #[test]
    fn test_get_mutable_limitation() {
        let cases = [(Some(8), 7), (Some(10), 8), (Some(64), 56), (None, 0)];
        for (buffer_size, expected) in cases {
            assert_eq!(expected, get_mutable_limitation(buffer_size));
        }
    }

    #[test]
    fn test_strategy_global_disabled() {
        let strategy = SizeBasedStrategy::new(None);

        // None of the accounting calls may change the counters.
        strategy.reserve_mem(1000);
        assert_eq!((0, 0), usage(&strategy));
        strategy.schedule_free_mem(1000);
        assert_eq!((0, 0), usage(&strategy));
        strategy.free_mem(1000);
        assert_eq!((0, 0), usage(&strategy));

        assert_eq!(
            Some(FlushType::Region),
            strategy.should_flush(region_status!(400, 300))
        );
        assert_eq!(None, strategy.should_flush(region_status!(100, 300)));
    }

    #[test]
    fn test_strategy_over_mutable_limit() {
        let strategy = SizeBasedStrategy::new(Some(1000));
        strategy.reserve_mem(500);
        assert_eq!(None, strategy.should_flush(region_status!(300, 500)));
        strategy.reserve_mem(400);

        // Flush region.
        assert_eq!(
            Some(FlushType::Region),
            strategy.should_flush(region_status!(400, 300))
        );

        // More than mutable limitation, Flush global.
        assert_eq!(
            Some(FlushType::Engine),
            strategy.should_flush(region_status!(100, 300))
        );

        strategy.schedule_free_mem(500);
        assert_eq!(None, strategy.should_flush(region_status!(100, 300)));
        assert_eq!((900, 400), usage(&strategy));

        strategy.free_mem(500);
        assert_eq!((400, 400), usage(&strategy));
    }

    #[test]
    fn test_strategy_over_global() {
        common_telemetry::init_default_ut_logging();

        let strategy = SizeBasedStrategy::new(Some(1000));
        strategy.reserve_mem(1100);
        strategy.schedule_free_mem(200);
        // More than global limit.
        assert_eq!(
            Some(FlushType::Engine),
            strategy.should_flush(region_status!(100, 300))
        );

        // More than global limit, but mutable not enough (< 500).
        strategy.schedule_free_mem(450);
        assert_eq!(None, strategy.should_flush(region_status!(100, 300)));
        strategy.schedule_free_mem(100);
        assert_eq!(None, strategy.should_flush(region_status!(100, 300)));

        // Now mutable is enough.
        strategy.reserve_mem(150);
        // We can flush again.
        assert_eq!(
            Some(FlushType::Engine),
            strategy.should_flush(region_status!(100, 300))
        );
        strategy.reserve_mem(100);
        assert_eq!(
            Some(FlushType::Engine),
            strategy.should_flush(region_status!(100, 300))
        );
    }

    #[test]
    fn test_alloc_tracker_without_strategy() {
        let tracker = AllocTracker::new(None);
        assert_eq!(0, tracker.bytes_allocated());

        let mut expected_total = 0;
        for bytes in [100, 200] {
            tracker.on_allocate(bytes);
            expected_total += bytes;
            assert_eq!(expected_total, tracker.bytes_allocated());
        }

        tracker.done_allocating();
        assert_eq!(300, tracker.bytes_allocated());
    }

    #[test]
    fn test_alloc_tracker_with_strategy() {
        let strategy = Arc::new(SizeBasedStrategy::new(Some(1000)));
        let tracker = AllocTracker::new(Some(strategy.clone() as FlushStrategyRef));

        tracker.on_allocate(100);
        assert_eq!(100, tracker.bytes_allocated());
        assert_eq!((100, 100), usage(&strategy));

        for _ in 0..2 {
            // Done allocating won't free the same memory multiple times.
            tracker.done_allocating();
            assert_eq!((100, 0), usage(&strategy));
        }

        // Dropping the tracker releases the remaining accounted memory.
        drop(tracker);
        assert_eq!((0, 0), usage(&strategy));
    }

    #[test]
    fn test_alloc_tracker_without_done_allocating() {
        let strategy = Arc::new(SizeBasedStrategy::new(Some(1000)));
        let tracker = AllocTracker::new(Some(strategy.clone() as FlushStrategyRef));

        tracker.on_allocate(100);
        assert_eq!(100, tracker.bytes_allocated());
        assert_eq!((100, 100), usage(&strategy));

        // Dropping without `done_allocating` must still release both counters.
        drop(tracker);
        assert_eq!((0, 0), usage(&strategy));
    }
}
|
||||
@@ -1,263 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::time::Duration;
|
||||
|
||||
use async_trait::async_trait;
|
||||
use common_telemetry::logging;
|
||||
use common_time::util;
|
||||
use store_api::logstore::LogStore;
|
||||
use store_api::storage::{FlushContext, FlushReason, Region};
|
||||
|
||||
use crate::config::{DEFAULT_AUTO_FLUSH_INTERVAL, DEFAULT_PICKER_SCHEDULE_INTERVAL};
|
||||
use crate::region::RegionImpl;
|
||||
|
||||
/// Configuration of a [FlushPicker].
pub struct PickerConfig {
    /// How often the picker is scheduled.
    pub schedule_interval: Duration,
    /// How long a region may go without a flush before it is flushed
    /// automatically.
    pub auto_flush_interval: Duration,
}
|
||||
|
||||
impl PickerConfig {
|
||||
/// Returns the auto flush interval in millis or a default value
|
||||
/// if overflow occurs.
|
||||
fn auto_flush_interval_millis(&self) -> i64 {
|
||||
self.auto_flush_interval
|
||||
.as_millis()
|
||||
.try_into()
|
||||
.unwrap_or(DEFAULT_AUTO_FLUSH_INTERVAL.into())
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for PickerConfig {
|
||||
fn default() -> Self {
|
||||
PickerConfig {
|
||||
schedule_interval: Duration::from_millis(DEFAULT_PICKER_SCHEDULE_INTERVAL.into()),
|
||||
auto_flush_interval: Duration::from_millis(DEFAULT_AUTO_FLUSH_INTERVAL.into()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Picks the regions that should be flushed.
#[derive(Debug, Clone)]
pub struct FlushPicker {
    /// Auto flush interval of a region, in milliseconds.
    auto_flush_interval_millis: i64,
}
|
||||
|
||||
impl FlushPicker {
|
||||
/// Returns a new FlushPicker.
|
||||
pub fn new(config: PickerConfig) -> FlushPicker {
|
||||
FlushPicker {
|
||||
auto_flush_interval_millis: config.auto_flush_interval_millis(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Picks regions and flushes them by interval.
|
||||
///
|
||||
/// Returns the number of flushed regions.
|
||||
pub async fn pick_by_interval<T: FlushItem>(&self, regions: &[T]) -> usize {
|
||||
let now = util::current_time_millis();
|
||||
// Flush regions by interval.
|
||||
if let Some(earliest_flush_millis) = now.checked_sub(self.auto_flush_interval_millis) {
|
||||
flush_regions_by_interval(regions, earliest_flush_millis).await
|
||||
} else {
|
||||
0
|
||||
}
|
||||
}
|
||||
|
||||
/// Picks and flushes regions when the write buffer is full.
|
||||
pub async fn pick_by_write_buffer_full<T: FlushItem>(&self, regions: &[T]) {
|
||||
// In such case, we pick the oldest region to flush. If this is not enough,
|
||||
// the next time the region writer will trigger the picker again. Then we
|
||||
// can pick another region to flush. The total memory will go down eventually.
|
||||
let target = regions
|
||||
.iter()
|
||||
.filter(|region| region.mutable_memtable_usage() > 0)
|
||||
.min_by_key(|region| region.last_flush_time());
|
||||
if let Some(region) = target {
|
||||
logging::debug!(
|
||||
"Request flush for region {} due to global buffer is full",
|
||||
region.item_id()
|
||||
);
|
||||
|
||||
region.request_flush(FlushReason::GlobalBufferFull).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Item for picker to flush.
|
||||
#[async_trait]
|
||||
pub trait FlushItem {
|
||||
/// Id of the item.
|
||||
fn item_id(&self) -> u64;
|
||||
|
||||
/// Last flush time in millis.
|
||||
fn last_flush_time(&self) -> i64;
|
||||
|
||||
/// Mutable memtable usage.
|
||||
fn mutable_memtable_usage(&self) -> usize;
|
||||
|
||||
/// Requests the item to schedule a flush for specific `reason`.
|
||||
///
|
||||
/// The flush job itself should run in background.
|
||||
async fn request_flush(&self, reason: FlushReason);
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl<S: LogStore> FlushItem for RegionImpl<S> {
|
||||
fn item_id(&self) -> u64 {
|
||||
self.id().into()
|
||||
}
|
||||
|
||||
fn last_flush_time(&self) -> i64 {
|
||||
self.last_flush_millis()
|
||||
}
|
||||
|
||||
fn mutable_memtable_usage(&self) -> usize {
|
||||
let current = self.version_control().current();
|
||||
let memtables = current.memtables();
|
||||
memtables.mutable_bytes_allocated()
|
||||
}
|
||||
|
||||
async fn request_flush(&self, reason: FlushReason) {
|
||||
let ctx = FlushContext {
|
||||
wait: false,
|
||||
reason,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
if let Err(e) = self.flush(&ctx).await {
|
||||
logging::error!(e; "Failed to flush region {}", self.id());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Auto flush regions based on last flush time.
|
||||
///
|
||||
/// Returns the number of flushed regions.
|
||||
async fn flush_regions_by_interval<T: FlushItem>(
|
||||
regions: &[T],
|
||||
earliest_flush_millis: i64,
|
||||
) -> usize {
|
||||
let mut flushed = 0;
|
||||
for region in regions {
|
||||
if region.last_flush_time() < earliest_flush_millis {
|
||||
logging::debug!(
|
||||
"Auto flush region {} due to last flush time ({} < {})",
|
||||
region.item_id(),
|
||||
region.last_flush_time(),
|
||||
earliest_flush_millis,
|
||||
);
|
||||
|
||||
flushed += 1;
|
||||
region.request_flush(FlushReason::Periodically).await;
|
||||
}
|
||||
}
|
||||
|
||||
flushed
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use std::sync::Mutex;

    use super::*;

    /// Flush item that only records the reason of the last requested flush.
    struct MockItem {
        id: u64,
        last_flush_time: i64,
        usage: usize,
        flush_reason: Mutex<Option<FlushReason>>,
    }

    impl MockItem {
        fn new(id: u64, last_flush_time: i64, usage: usize) -> MockItem {
            let flush_reason = Mutex::new(None);
            MockItem {
                id,
                last_flush_time,
                usage,
                flush_reason,
            }
        }

        /// Reason of the last requested flush, if any.
        fn flush_reason(&self) -> Option<FlushReason> {
            let recorded = self.flush_reason.lock().unwrap();
            *recorded
        }
    }

    #[async_trait]
    impl FlushItem for MockItem {
        fn item_id(&self) -> u64 {
            self.id
        }

        fn last_flush_time(&self) -> i64 {
            self.last_flush_time
        }

        fn mutable_memtable_usage(&self) -> usize {
            self.usage
        }

        async fn request_flush(&self, reason: FlushReason) {
            *self.flush_reason.lock().unwrap() = Some(reason);
        }
    }

    /// Picker with a 30 second auto flush interval.
    fn new_picker() -> FlushPicker {
        FlushPicker::new(PickerConfig {
            // schedule_interval is unused in these tests.
            schedule_interval: Duration::from_millis(10),
            auto_flush_interval: Duration::from_millis(30 * 1000),
        })
    }

    #[tokio::test]
    async fn test_pick_by_interval() {
        let regions = [
            MockItem::new(0, util::current_time_millis(), 1),
            MockItem::new(1, util::current_time_millis() - 60 * 1000, 1),
        ];
        let picker = new_picker();

        let flushed = picker.pick_by_interval(&regions).await;

        assert_eq!(1, flushed);
        assert!(regions[0].flush_reason().is_none());
        assert_eq!(Some(FlushReason::Periodically), regions[1].flush_reason());
    }

    #[tokio::test]
    async fn test_pick_by_buffer_full() {
        let regions = [
            MockItem::new(0, util::current_time_millis(), 10),
            MockItem::new(1, util::current_time_millis() - 60 * 1000, 0),
            MockItem::new(1, util::current_time_millis() - 60 * 1000, 10),
        ];
        let picker = new_picker();

        picker.pick_by_write_buffer_full(&regions).await;

        assert!(regions[0].flush_reason().is_none());
        assert!(regions[1].flush_reason().is_none());
        assert_eq!(
            Some(FlushReason::GlobalBufferFull),
            regions[2].flush_reason()
        );

        // No target.
        let regions = [MockItem::new(1, util::current_time_millis(), 0)];
        picker.pick_by_write_buffer_full(&regions).await;
        assert!(regions[0].flush_reason().is_none());
    }
}
|
||||
@@ -1,378 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use async_trait::async_trait;
|
||||
use common_base::readable_size::ReadableSize;
|
||||
use common_runtime::{RepeatedTask, TaskFunction};
|
||||
use common_telemetry::logging;
|
||||
use snafu::{ensure, ResultExt};
|
||||
use store_api::logstore::LogStore;
|
||||
use store_api::storage::{RegionId, SequenceNumber};
|
||||
use tokio::sync::oneshot::{Receiver, Sender};
|
||||
use tokio::sync::{oneshot, Notify};
|
||||
|
||||
use crate::compaction::{CompactionPickerRef, CompactionRequestImpl, CompactionSchedulerRef};
|
||||
use crate::config::EngineConfig;
|
||||
use crate::engine::RegionMap;
|
||||
use crate::error::{
|
||||
DuplicateFlushSnafu, Error, Result, StartPickTaskSnafu, StopPickTaskSnafu, WaitFlushSnafu,
|
||||
};
|
||||
use crate::flush::{FlushJob, FlushPicker, PickerConfig};
|
||||
use crate::manifest::region::RegionManifest;
|
||||
use crate::memtable::{MemtableId, MemtableRef};
|
||||
use crate::metrics::FLUSH_ERRORS_TOTAL;
|
||||
use crate::region;
|
||||
use crate::region::{RegionWriterRef, SharedDataRef};
|
||||
use crate::scheduler::rate_limit::BoxedRateLimitToken;
|
||||
use crate::scheduler::{Handler, LocalScheduler, Request, Scheduler, SchedulerConfig};
|
||||
use crate::sst::AccessLayerRef;
|
||||
use crate::wal::Wal;
|
||||
|
||||
/// Key for [FlushRequest].
|
||||
#[derive(Debug, Clone, Hash, PartialEq, Eq)]
|
||||
pub enum FlushKey {
|
||||
Engine,
|
||||
Region(RegionId, SequenceNumber),
|
||||
}
|
||||
|
||||
/// Flush request.
|
||||
pub enum FlushRequest<S: LogStore> {
|
||||
/// Flush the engine.
|
||||
Engine,
|
||||
/// Flush a region.
|
||||
Region {
|
||||
/// Region flush request.
|
||||
req: FlushRegionRequest<S>,
|
||||
/// Flush result sender.
|
||||
sender: Sender<Result<()>>,
|
||||
},
|
||||
}
|
||||
|
||||
impl<S: LogStore> Request for FlushRequest<S> {
|
||||
type Key = FlushKey;
|
||||
|
||||
#[inline]
|
||||
fn key(&self) -> FlushKey {
|
||||
match &self {
|
||||
FlushRequest::Engine => FlushKey::Engine,
|
||||
FlushRequest::Region { req, .. } => {
|
||||
FlushKey::Region(req.shared.id(), req.flush_sequence)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn complete(self, result: Result<()>) {
|
||||
if let FlushRequest::Region { sender, .. } = self {
|
||||
let _ = sender.send(result);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Region flush request.
|
||||
pub struct FlushRegionRequest<S: LogStore> {
|
||||
/// Max memtable id in these memtables,
|
||||
/// used to remove immutable memtables in current version.
|
||||
pub max_memtable_id: MemtableId,
|
||||
/// Memtables to be flushed.
|
||||
pub memtables: Vec<MemtableRef>,
|
||||
/// Last sequence of data to be flushed.
|
||||
pub flush_sequence: SequenceNumber,
|
||||
/// Shared data of region to be flushed.
|
||||
pub shared: SharedDataRef,
|
||||
/// Sst access layer of the region.
|
||||
pub sst_layer: AccessLayerRef,
|
||||
/// Region writer, used to persist log entry that points to the latest manifest file.
|
||||
pub writer: RegionWriterRef<S>,
|
||||
/// Region write-ahead logging, used to write data/meta to the log file.
|
||||
pub wal: Wal<S>,
|
||||
/// Region manifest service, used to persist metadata.
|
||||
pub manifest: RegionManifest,
|
||||
/// Storage engine config
|
||||
pub engine_config: Arc<EngineConfig>,
|
||||
|
||||
// Compaction related options:
|
||||
/// TTL of the region.
|
||||
pub ttl: Option<Duration>,
|
||||
/// Time window for compaction.
|
||||
pub compaction_time_window: Option<i64>,
|
||||
pub compaction_picker: CompactionPickerRef<S>,
|
||||
}
|
||||
|
||||
impl<S: LogStore> FlushRegionRequest<S> {
|
||||
#[inline]
|
||||
fn region_id(&self) -> RegionId {
|
||||
self.shared.id()
|
||||
}
|
||||
}
|
||||
|
||||
impl<S: LogStore> From<&FlushRegionRequest<S>> for FlushJob<S> {
|
||||
fn from(req: &FlushRegionRequest<S>) -> FlushJob<S> {
|
||||
FlushJob {
|
||||
max_memtable_id: req.max_memtable_id,
|
||||
memtables: req.memtables.clone(),
|
||||
flush_sequence: req.flush_sequence,
|
||||
shared: req.shared.clone(),
|
||||
sst_layer: req.sst_layer.clone(),
|
||||
writer: req.writer.clone(),
|
||||
wal: req.wal.clone(),
|
||||
manifest: req.manifest.clone(),
|
||||
engine_config: req.engine_config.clone(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<S: LogStore> From<&FlushRegionRequest<S>> for CompactionRequestImpl<S> {
|
||||
fn from(req: &FlushRegionRequest<S>) -> CompactionRequestImpl<S> {
|
||||
CompactionRequestImpl {
|
||||
region_id: req.region_id(),
|
||||
sst_layer: req.sst_layer.clone(),
|
||||
writer: req.writer.clone(),
|
||||
shared: req.shared.clone(),
|
||||
manifest: req.manifest.clone(),
|
||||
wal: req.wal.clone(),
|
||||
ttl: req.ttl,
|
||||
compaction_time_window: req.compaction_time_window,
|
||||
sender: None,
|
||||
picker: req.compaction_picker.clone(),
|
||||
sst_write_buffer_size: ReadableSize::mb(8), // deprecated usage
|
||||
// compaction triggered by flush always reschedules
|
||||
reschedule_on_finish: true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A handle to get the flush result.
|
||||
#[derive(Debug)]
|
||||
pub struct FlushHandle {
|
||||
region_id: RegionId,
|
||||
receiver: Receiver<Result<()>>,
|
||||
}
|
||||
|
||||
impl FlushHandle {
|
||||
/// Waits until the flush job is finished.
|
||||
pub async fn wait(self) -> Result<()> {
|
||||
self.receiver.await.context(WaitFlushSnafu {
|
||||
region_id: self.region_id,
|
||||
})?
|
||||
}
|
||||
}
|
||||
|
||||
/// Flush scheduler.
|
||||
pub struct FlushScheduler<S: LogStore> {
|
||||
/// Flush task scheduler.
|
||||
scheduler: LocalScheduler<FlushRequest<S>>,
|
||||
/// Auto flush task.
|
||||
auto_flush_task: RepeatedTask<Error>,
|
||||
#[cfg(test)]
|
||||
pending_tasks: Arc<tokio::sync::RwLock<Vec<tokio::task::JoinHandle<()>>>>,
|
||||
}
|
||||
|
||||
pub type FlushSchedulerRef<S> = Arc<FlushScheduler<S>>;
|
||||
|
||||
impl<S: LogStore> FlushScheduler<S> {
|
||||
/// Returns a new [FlushScheduler].
|
||||
pub fn new(
|
||||
config: SchedulerConfig,
|
||||
compaction_scheduler: CompactionSchedulerRef<S>,
|
||||
regions: Arc<RegionMap<S>>,
|
||||
picker_config: PickerConfig,
|
||||
) -> Result<Self> {
|
||||
let task_interval = picker_config.schedule_interval;
|
||||
let picker = FlushPicker::new(picker_config);
|
||||
// Now we just clone the picker since we don't need to share states and
|
||||
// the clone of picker is cheap.
|
||||
let task_fn = AutoFlushFunction {
|
||||
regions: regions.clone(),
|
||||
picker: picker.clone(),
|
||||
};
|
||||
let auto_flush_task = RepeatedTask::new(task_interval, Box::new(task_fn));
|
||||
auto_flush_task
|
||||
.start(common_runtime::bg_runtime())
|
||||
.context(StartPickTaskSnafu)?;
|
||||
#[cfg(test)]
|
||||
let pending_tasks = Arc::new(tokio::sync::RwLock::new(vec![]));
|
||||
let handler = FlushHandler {
|
||||
compaction_scheduler,
|
||||
regions,
|
||||
picker,
|
||||
#[cfg(test)]
|
||||
pending_tasks: pending_tasks.clone(),
|
||||
};
|
||||
|
||||
Ok(Self {
|
||||
scheduler: LocalScheduler::new(config, handler),
|
||||
auto_flush_task,
|
||||
#[cfg(test)]
|
||||
pending_tasks,
|
||||
})
|
||||
}
|
||||
|
||||
/// Schedules a region flush request and return the handle to the flush task.
|
||||
pub fn schedule_region_flush(&self, req: FlushRegionRequest<S>) -> Result<FlushHandle> {
|
||||
let region_id = req.region_id();
|
||||
let sequence = req.flush_sequence;
|
||||
let (sender, receiver) = oneshot::channel();
|
||||
|
||||
let scheduled = self
|
||||
.scheduler
|
||||
.schedule(FlushRequest::Region { req, sender })?;
|
||||
// Normally we should not have duplicate flush request.
|
||||
ensure!(
|
||||
scheduled,
|
||||
DuplicateFlushSnafu {
|
||||
region_id,
|
||||
sequence,
|
||||
}
|
||||
);
|
||||
|
||||
Ok(FlushHandle {
|
||||
region_id,
|
||||
receiver,
|
||||
})
|
||||
}
|
||||
|
||||
/// Schedules a engine flush request.
|
||||
pub fn schedule_engine_flush(&self) -> Result<()> {
|
||||
let _ = self.scheduler.schedule(FlushRequest::Engine)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Stop the scheduler.
|
||||
pub async fn stop(&self) -> Result<()> {
|
||||
self.auto_flush_task
|
||||
.stop()
|
||||
.await
|
||||
.context(StopPickTaskSnafu)?;
|
||||
self.scheduler.stop(true).await?;
|
||||
|
||||
#[cfg(test)]
|
||||
let _ = futures::future::join_all(self.pending_tasks.write().await.drain(..)).await;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
struct FlushHandler<S: LogStore> {
|
||||
compaction_scheduler: CompactionSchedulerRef<S>,
|
||||
regions: Arc<RegionMap<S>>,
|
||||
picker: FlushPicker,
|
||||
#[cfg(test)]
|
||||
pending_tasks: Arc<tokio::sync::RwLock<Vec<tokio::task::JoinHandle<()>>>>,
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl<S: LogStore> Handler for FlushHandler<S> {
|
||||
type Request = FlushRequest<S>;
|
||||
|
||||
async fn handle_request(
|
||||
&self,
|
||||
req: FlushRequest<S>,
|
||||
token: BoxedRateLimitToken,
|
||||
finish_notifier: Arc<Notify>,
|
||||
) -> Result<()> {
|
||||
let compaction_scheduler = self.compaction_scheduler.clone();
|
||||
let region_map = self.regions.clone();
|
||||
let picker = self.picker.clone();
|
||||
let _handle = common_runtime::spawn_bg(async move {
|
||||
match req {
|
||||
FlushRequest::Engine => {
|
||||
let regions = region_map.list_regions();
|
||||
picker.pick_by_write_buffer_full(®ions).await;
|
||||
}
|
||||
FlushRequest::Region { req, sender } => {
|
||||
execute_flush_region(req, sender, compaction_scheduler).await;
|
||||
}
|
||||
}
|
||||
|
||||
// releases rate limit token
|
||||
token.try_release();
|
||||
// notify scheduler to schedule next task when current task finishes.
|
||||
finish_notifier.notify_one();
|
||||
});
|
||||
|
||||
#[cfg(test)]
|
||||
self.pending_tasks.write().await.push(_handle);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
async fn execute_flush_region<S: LogStore>(
|
||||
req: FlushRegionRequest<S>,
|
||||
sender: Sender<Result<()>>,
|
||||
compaction_scheduler: CompactionSchedulerRef<S>,
|
||||
) {
|
||||
let mut flush_job = FlushJob::from(&req);
|
||||
|
||||
if let Err(e) = flush_job.run().await {
|
||||
logging::error!(e; "Failed to flush region {}", req.region_id());
|
||||
|
||||
FLUSH_ERRORS_TOTAL.inc();
|
||||
|
||||
FlushRequest::Region { req, sender }.complete(Err(e));
|
||||
} else {
|
||||
logging::debug!("Successfully flush region: {}", req.region_id());
|
||||
|
||||
// Update last flush time.
|
||||
req.shared.update_flush_millis();
|
||||
|
||||
let compaction_request = CompactionRequestImpl::from(&req);
|
||||
let max_files_in_l0 = req.engine_config.max_files_in_l0;
|
||||
let shared_data = req.shared.clone();
|
||||
|
||||
let level0_file_num = shared_data
|
||||
.version_control
|
||||
.current()
|
||||
.ssts()
|
||||
.level(0)
|
||||
.file_num();
|
||||
if level0_file_num <= max_files_in_l0 {
|
||||
logging::debug!(
|
||||
"No enough SST files in level 0 (threshold: {}), skip compaction",
|
||||
max_files_in_l0
|
||||
);
|
||||
} else {
|
||||
// If flush is success, schedule a compaction request for this region.
|
||||
let _ =
|
||||
region::schedule_compaction(shared_data, compaction_scheduler, compaction_request);
|
||||
}
|
||||
|
||||
// Complete the request.
|
||||
FlushRequest::Region { req, sender }.complete(Ok(()));
|
||||
}
|
||||
}
|
||||
|
||||
/// Task function to pick regions to flush.
|
||||
struct AutoFlushFunction<S: LogStore> {
|
||||
/// Regions of the engine.
|
||||
regions: Arc<RegionMap<S>>,
|
||||
picker: FlushPicker,
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl<S: LogStore> TaskFunction<Error> for AutoFlushFunction<S> {
|
||||
async fn call(&mut self) -> Result<()> {
|
||||
// Get all regions.
|
||||
let regions = self.regions.list_regions();
|
||||
let _ = self.picker.pick_by_interval(®ions).await;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn name(&self) -> &str {
|
||||
"FlushPicker-pick-task"
|
||||
}
|
||||
}
|
||||
@@ -1,49 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//! Storage engine implementation.
|
||||
|
||||
#![feature(let_chains)]
|
||||
|
||||
mod chunk;
|
||||
pub mod codec;
|
||||
pub mod compaction;
|
||||
pub mod config;
|
||||
mod engine;
|
||||
pub mod error;
|
||||
mod flush;
|
||||
pub mod manifest;
|
||||
pub mod memtable;
|
||||
pub mod metadata;
|
||||
pub mod proto;
|
||||
pub mod read;
|
||||
pub mod region;
|
||||
pub mod scheduler;
|
||||
pub mod schema;
|
||||
mod snapshot;
|
||||
pub mod sst;
|
||||
mod sync;
|
||||
#[cfg(test)]
|
||||
mod test_util;
|
||||
mod version;
|
||||
mod wal;
|
||||
pub mod write_batch;
|
||||
|
||||
pub use engine::EngineImpl;
|
||||
mod file_purger;
|
||||
mod metrics;
|
||||
mod window_infer;
|
||||
|
||||
pub use sst::parquet::ParquetWriter;
|
||||
pub use sst::Source;
|
||||
@@ -1,26 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//! manifest storage
|
||||
pub(crate) mod action;
|
||||
pub mod checkpoint;
|
||||
pub mod helper;
|
||||
mod impl_;
|
||||
pub mod region;
|
||||
pub(crate) mod storage;
|
||||
#[cfg(test)]
|
||||
pub mod test_utils;
|
||||
|
||||
pub use self::impl_::*;
|
||||
pub use self::storage::manifest_compress_type;
|
||||
@@ -1,443 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::io::{BufRead, BufReader};
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json as json;
|
||||
use snafu::{ensure, OptionExt, ResultExt};
|
||||
use store_api::manifest::action::{ProtocolAction, ProtocolVersion, VersionHeader};
|
||||
use store_api::manifest::{Checkpoint, ManifestVersion, MetaAction};
|
||||
use store_api::storage::{RegionId, SequenceNumber};
|
||||
|
||||
use crate::error::{
|
||||
self, DecodeJsonSnafu, DecodeMetaActionListSnafu, ManifestProtocolForbidReadSnafu,
|
||||
ReadlineSnafu, Result,
|
||||
};
|
||||
use crate::manifest::helper;
|
||||
use crate::metadata::{ColumnFamilyMetadata, ColumnMetadata, VersionNumber};
|
||||
use crate::sst::{FileId, FileMeta};
|
||||
|
||||
/// Minimal data that could be used to persist and recover [RegionMetadata](crate::metadata::RegionMetadata).
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Default)]
|
||||
pub struct RawRegionMetadata {
|
||||
pub id: RegionId,
|
||||
pub name: String,
|
||||
pub columns: RawColumnsMetadata,
|
||||
pub column_families: RawColumnFamiliesMetadata,
|
||||
pub version: VersionNumber,
|
||||
}
|
||||
|
||||
/// Minimal data that could be used to persist and recover [ColumnsMetadata](crate::metadata::ColumnsMetadata).
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Default)]
|
||||
pub struct RawColumnsMetadata {
|
||||
pub columns: Vec<ColumnMetadata>,
|
||||
pub row_key_end: usize,
|
||||
pub timestamp_key_index: usize,
|
||||
pub user_column_end: usize,
|
||||
}
|
||||
|
||||
/// Minimal data that could be used to persist and recover [ColumnFamiliesMetadata](crate::metadata::ColumnFamiliesMetadata).
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Default)]
|
||||
pub struct RawColumnFamiliesMetadata {
|
||||
pub column_families: Vec<ColumnFamilyMetadata>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)]
|
||||
pub struct RegionChange {
|
||||
/// The committed sequence of the region when this change happens. So the
|
||||
/// data with sequence **greater than** this sequence would use the new
|
||||
/// metadata.
|
||||
pub committed_sequence: SequenceNumber,
|
||||
/// The metadata after changed.
|
||||
pub metadata: RawRegionMetadata,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)]
|
||||
pub struct RegionRemove {
|
||||
pub region_id: RegionId,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)]
|
||||
pub struct RegionEdit {
|
||||
pub region_version: VersionNumber,
|
||||
pub flushed_sequence: Option<SequenceNumber>,
|
||||
pub files_to_add: Vec<FileMeta>,
|
||||
pub files_to_remove: Vec<FileMeta>,
|
||||
pub compaction_time_window: Option<i64>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)]
|
||||
pub struct RegionTruncate {
|
||||
pub region_id: RegionId,
|
||||
pub committed_sequence: SequenceNumber,
|
||||
}
|
||||
|
||||
/// The region version checkpoint
|
||||
#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)]
|
||||
pub struct RegionVersion {
|
||||
pub manifest_version: ManifestVersion,
|
||||
pub flushed_sequence: Option<SequenceNumber>,
|
||||
pub files: HashMap<FileId, FileMeta>,
|
||||
}
|
||||
|
||||
/// The region manifest data checkpoint
|
||||
#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq, Default)]
|
||||
pub struct RegionManifestData {
|
||||
pub committed_sequence: SequenceNumber,
|
||||
pub metadata: RawRegionMetadata,
|
||||
pub version: Option<RegionVersion>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
pub struct RegionManifestDataBuilder {
|
||||
committed_sequence: SequenceNumber,
|
||||
metadata: RawRegionMetadata,
|
||||
version: Option<RegionVersion>,
|
||||
}
|
||||
|
||||
impl RegionManifestDataBuilder {
|
||||
pub fn with_checkpoint(checkpoint: Option<RegionManifestData>) -> Self {
|
||||
if let Some(s) = checkpoint {
|
||||
Self {
|
||||
metadata: s.metadata,
|
||||
version: s.version,
|
||||
committed_sequence: s.committed_sequence,
|
||||
}
|
||||
} else {
|
||||
Default::default()
|
||||
}
|
||||
}
|
||||
|
||||
pub fn apply_change(&mut self, change: RegionChange) {
|
||||
self.metadata = change.metadata;
|
||||
self.committed_sequence = change.committed_sequence;
|
||||
}
|
||||
|
||||
pub fn apply_edit(&mut self, manifest_version: ManifestVersion, edit: RegionEdit) {
|
||||
if let Some(version) = &mut self.version {
|
||||
version.manifest_version = manifest_version;
|
||||
version.flushed_sequence = edit.flushed_sequence;
|
||||
for file in edit.files_to_add {
|
||||
let _ = version.files.insert(file.file_id, file);
|
||||
}
|
||||
for file in edit.files_to_remove {
|
||||
let _ = version.files.remove(&file.file_id);
|
||||
}
|
||||
} else {
|
||||
self.version = Some(RegionVersion {
|
||||
manifest_version,
|
||||
flushed_sequence: edit.flushed_sequence,
|
||||
files: edit
|
||||
.files_to_add
|
||||
.into_iter()
|
||||
.map(|f| (f.file_id, f))
|
||||
.collect(),
|
||||
});
|
||||
}
|
||||
}
|
||||
pub fn build(self) -> RegionManifestData {
|
||||
RegionManifestData {
|
||||
metadata: self.metadata,
|
||||
version: self.version,
|
||||
committed_sequence: self.committed_sequence,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// The checkpoint of region manifest, generated by checkpoint.
|
||||
#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)]
|
||||
pub struct RegionCheckpoint {
|
||||
/// The snasphot protocol
|
||||
pub protocol: ProtocolAction,
|
||||
/// The last manifest version that this checkpoint compacts(inclusive).
|
||||
pub last_version: ManifestVersion,
|
||||
// The number of manifest actions that this checkpoint compacts.
|
||||
pub compacted_actions: usize,
|
||||
// The checkpoint data
|
||||
pub checkpoint: Option<RegionManifestData>,
|
||||
}
|
||||
|
||||
impl Checkpoint for RegionCheckpoint {
|
||||
type Error = error::Error;
|
||||
|
||||
fn set_protocol(&mut self, action: ProtocolAction) {
|
||||
self.protocol = action;
|
||||
}
|
||||
|
||||
fn last_version(&self) -> ManifestVersion {
|
||||
self.last_version
|
||||
}
|
||||
|
||||
fn encode(&self) -> Result<Vec<u8>> {
|
||||
helper::encode_checkpoint(self)
|
||||
}
|
||||
|
||||
fn decode(bs: &[u8], reader_version: ProtocolVersion) -> Result<Self> {
|
||||
helper::decode_checkpoint(bs, reader_version)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)]
|
||||
pub enum RegionMetaAction {
|
||||
Protocol(ProtocolAction),
|
||||
Change(RegionChange),
|
||||
Remove(RegionRemove),
|
||||
Edit(RegionEdit),
|
||||
Truncate(RegionTruncate),
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)]
|
||||
pub struct RegionMetaActionList {
|
||||
pub actions: Vec<RegionMetaAction>,
|
||||
pub prev_version: ManifestVersion,
|
||||
}
|
||||
|
||||
impl RegionMetaActionList {
|
||||
pub fn with_action(action: RegionMetaAction) -> Self {
|
||||
Self {
|
||||
actions: vec![action],
|
||||
prev_version: 0,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn new(actions: Vec<RegionMetaAction>) -> Self {
|
||||
Self {
|
||||
actions,
|
||||
prev_version: 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl MetaAction for RegionMetaActionList {
|
||||
type Error = error::Error;
|
||||
|
||||
fn set_protocol(&mut self, action: ProtocolAction) {
|
||||
// The protocol action should be the first action in action list by convention.
|
||||
self.actions.insert(0, RegionMetaAction::Protocol(action));
|
||||
}
|
||||
|
||||
fn set_prev_version(&mut self, version: ManifestVersion) {
|
||||
self.prev_version = version;
|
||||
}
|
||||
|
||||
/// Encode self into json in the form of string lines, starts with prev_version and then action json list.
|
||||
fn encode(&self) -> Result<Vec<u8>> {
|
||||
helper::encode_actions(self.prev_version, &self.actions)
|
||||
}
|
||||
|
||||
fn decode(
|
||||
bs: &[u8],
|
||||
reader_version: ProtocolVersion,
|
||||
) -> Result<(Self, Option<ProtocolAction>)> {
|
||||
let mut lines = BufReader::new(bs).lines();
|
||||
|
||||
let mut action_list = RegionMetaActionList {
|
||||
actions: Vec::default(),
|
||||
prev_version: 0,
|
||||
};
|
||||
|
||||
{
|
||||
let first_line = lines
|
||||
.next()
|
||||
.with_context(|| DecodeMetaActionListSnafu {
|
||||
msg: format!(
|
||||
"Invalid content in manifest: {}",
|
||||
std::str::from_utf8(bs).unwrap_or("**invalid bytes**")
|
||||
),
|
||||
})?
|
||||
.context(ReadlineSnafu)?;
|
||||
|
||||
// Decode prev_version
|
||||
let v: VersionHeader = json::from_str(&first_line).context(DecodeJsonSnafu)?;
|
||||
action_list.prev_version = v.prev_version;
|
||||
}
|
||||
|
||||
// Decode actions
|
||||
let mut protocol_action = None;
|
||||
let mut actions = Vec::default();
|
||||
for line in lines {
|
||||
let line = &line.context(ReadlineSnafu)?;
|
||||
let action: RegionMetaAction = json::from_str(line).context(DecodeJsonSnafu)?;
|
||||
|
||||
if let RegionMetaAction::Protocol(p) = &action {
|
||||
ensure!(
|
||||
p.is_readable(reader_version),
|
||||
ManifestProtocolForbidReadSnafu {
|
||||
min_version: p.min_reader_version,
|
||||
supported_version: reader_version,
|
||||
}
|
||||
);
|
||||
protocol_action = Some(p.clone());
|
||||
}
|
||||
|
||||
actions.push(action);
|
||||
}
|
||||
action_list.actions = actions;
|
||||
|
||||
Ok((action_list, protocol_action))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use common_telemetry::logging;
|
||||
use datatypes::type_id::LogicalTypeId;
|
||||
|
||||
use super::*;
|
||||
use crate::manifest::test_utils;
|
||||
use crate::metadata::RegionMetadata;
|
||||
use crate::sst::FileId;
|
||||
use crate::test_util::descriptor_util::RegionDescBuilder;
|
||||
|
||||
/// An action list survives an encode/decode round trip, and decoding is
/// rejected when the reader version is below the protocol's minimum.
#[test]
fn test_encode_decode_action_list() {
    common_telemetry::init_default_ut_logging();

    let mut protocol = ProtocolAction::new();
    protocol.min_reader_version = 1;

    let edit = test_utils::build_region_edit(
        99,
        &[FileId::random(), FileId::random()],
        &[FileId::random()],
    );
    let mut action_list = RegionMetaActionList::new(vec![
        RegionMetaAction::Protocol(protocol.clone()),
        RegionMetaAction::Edit(edit),
    ]);
    action_list.set_prev_version(3);

    // The encoded form is one JSON document per line: the version header
    // first, then one line per action, e.g.
    // {"prev_version":3}
    // {"Protocol":{"min_reader_version":1,"min_writer_version":0}}
    // {"Edit":{...}}
    let encoded = action_list.encode().unwrap();

    logging::debug!(
        "Encoded action list: \r\n{}",
        std::str::from_utf8(&encoded).unwrap()
    );

    // Reader version 0 is below min_reader_version 1.
    let rejected = RegionMetaActionList::decode(&encoded, 0);
    assert!(rejected.is_err());
    assert_eq!(
        "Manifest protocol forbid to read, min_version: 1, supported_version: 0",
        rejected.err().unwrap().to_string()
    );

    let (decoded, decoded_protocol) = RegionMetaActionList::decode(&encoded, 1).unwrap();
    assert_eq!(decoded, action_list);
    assert_eq!(decoded_protocol.unwrap(), protocol);
}
|
||||
|
||||
// These tests guard the backward compatibility of manifest files.
// DO NOT modify the serialized strings when they fail; instead check whether
// your change to the manifest-related structs is compatible with older manifests.
#[test]
fn test_region_manifest_compatibility() {
    let region_edit = r#"{"region_version":0,"flushed_sequence":null,"files_to_add":[{"region_id":4402341478400,"file_name":"4b220a70-2b03-4641-9687-b65d94641208.parquet","time_range":[{"value":1451609210000,"unit":"Millisecond"},{"value":1451609520000,"unit":"Millisecond"}],"level":1}],"files_to_remove":[{"region_id":4402341478400,"file_name":"34b6ebb9-b8a5-4a4b-b744-56f67defad02.parquet","time_range":[{"value":1451609210000,"unit":"Millisecond"},{"value":1451609520000,"unit":"Millisecond"}],"level":0}]}"#;
    let _: RegionEdit = serde_json::from_str(region_edit).unwrap();

    let region_change = r#" {"committed_sequence":42,"metadata":{"id":0,"name":"region-0","columns":{"columns":[{"cf_id":0,"desc":{"id":2,"name":"k1","data_type":{"Int32":{}},"is_nullable":false,"is_time_index":false,"default_constraint":null,"comment":""}},{"cf_id":0,"desc":{"id":1,"name":"timestamp","data_type":{"Timestamp":{"Millisecond":null}},"is_nullable":false,"is_time_index":true,"default_constraint":null,"comment":""}},{"cf_id":1,"desc":{"id":3,"name":"v1","data_type":{"Float32":{}},"is_nullable":true,"is_time_index":false,"default_constraint":null,"comment":""}},{"cf_id":1,"desc":{"id":2147483649,"name":"__sequence","data_type":{"UInt64":{}},"is_nullable":false,"is_time_index":false,"default_constraint":null,"comment":""}},{"cf_id":1,"desc":{"id":2147483650,"name":"__op_type","data_type":{"UInt8":{}},"is_nullable":false,"is_time_index":false,"default_constraint":null,"comment":""}}],"row_key_end":2,"timestamp_key_index":1,"enable_version_column":false,"user_column_end":3},"column_families":{"column_families":[{"name":"default","cf_id":1,"column_index_start":2,"column_index_end":3}]},"version":0}}"#;
    let _: RegionChange = serde_json::from_str(region_change).unwrap();

    let region_remove = r#"{"region_id":42}"#;
    let _: RegionRemove = serde_json::from_str(region_remove).unwrap();

    let protocol_action = r#"{"min_reader_version":1,"min_writer_version":2}"#;
    let _: ProtocolAction = serde_json::from_str(protocol_action).unwrap();
}
|
||||
|
||||
fn mock_file_meta() -> FileMeta {
|
||||
FileMeta {
|
||||
region_id: 0.into(),
|
||||
file_id: FileId::random(),
|
||||
time_range: None,
|
||||
level: 0,
|
||||
file_size: 1024,
|
||||
}
|
||||
}
|
||||
|
||||
/// Applying a change and two edits to an empty builder yields the metadata and
/// committed sequence of the change, and a version that reflects the last edit
/// with the removed file gone.
#[test]
fn test_region_manifest_builder() {
    let desc = RegionDescBuilder::new("test_region_manifest_builder")
        .push_field_column(("v0", LogicalTypeId::Int64, true))
        .build();
    let region_metadata: RegionMetadata = desc.try_into().unwrap();

    let mut builder = RegionManifestDataBuilder::with_checkpoint(None);

    builder.apply_change(RegionChange {
        committed_sequence: 42,
        metadata: RawRegionMetadata::from(&region_metadata),
    });
    let files = vec![mock_file_meta(), mock_file_meta()];
    // First edit adds both files.
    builder.apply_edit(
        84,
        RegionEdit {
            region_version: 0,
            flushed_sequence: Some(99),
            files_to_add: files.clone(),
            files_to_remove: vec![],
            compaction_time_window: None,
        },
    );
    // Second edit removes the first file again.
    builder.apply_edit(
        85,
        RegionEdit {
            region_version: 0,
            flushed_sequence: Some(100),
            files_to_add: vec![],
            files_to_remove: vec![files[0].clone()],
            compaction_time_window: None,
        },
    );

    let manifest = builder.build();
    assert_eq!(manifest.metadata, RawRegionMetadata::from(&region_metadata));
    assert_eq!(manifest.committed_sequence, 42);
    assert_eq!(
        manifest.version,
        Some(RegionVersion {
            manifest_version: 85,
            flushed_sequence: Some(100),
            files: files[1..].iter().map(|f| (f.file_id, f.clone())).collect(),
        })
    );
}
|
||||
|
||||
/// A region checkpoint survives an encode/decode round trip.
#[test]
fn test_encode_decode_region_checkpoint() {
    let files = vec![mock_file_meta(), mock_file_meta()]
        .into_iter()
        .map(|f| (f.file_id, f))
        .collect();
    let version = RegionVersion {
        manifest_version: 84,
        flushed_sequence: Some(99),
        files,
    };
    let region_checkpoint = RegionCheckpoint {
        protocol: ProtocolAction::default(),
        last_version: 42,
        compacted_actions: 10,
        checkpoint: Some(RegionManifestData {
            committed_sequence: 100,
            metadata: RawRegionMetadata::default(),
            version: Some(version),
        }),
    };

    let encoded = region_checkpoint.encode().unwrap();
    assert!(!encoded.is_empty());
    let decoded = RegionCheckpoint::decode(&encoded, 0).unwrap();
    assert_eq!(region_checkpoint, decoded);
}
|
||||
}
|
||||
@@ -1,35 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::any::Any;
|
||||
|
||||
use async_trait::async_trait;
|
||||
use store_api::manifest::{Checkpoint, MetaAction};
|
||||
|
||||
use crate::error::{Error, Result};
|
||||
use crate::manifest::ManifestImpl;
|
||||
|
||||
#[async_trait]
|
||||
pub trait Checkpointer: Send + Sync + std::fmt::Debug {
|
||||
type Checkpoint: Checkpoint<Error = Error>;
|
||||
type MetaAction: MetaAction<Error = Error>;
|
||||
|
||||
/// Try to create a checkpoint, return the checkpoint if successes.
|
||||
async fn do_checkpoint(
|
||||
&self,
|
||||
manifest: &ManifestImpl<Self::Checkpoint, Self::MetaAction>,
|
||||
) -> Result<Option<Self::Checkpoint>>;
|
||||
|
||||
fn as_any(&self) -> &dyn Any;
|
||||
}
|
||||
@@ -1,69 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::io::Write;
|
||||
|
||||
use serde::Serialize;
|
||||
use serde_json::to_writer;
|
||||
use snafu::{ensure, ResultExt};
|
||||
use store_api::manifest::action::{ProtocolVersion, VersionHeader};
|
||||
use store_api::manifest::ManifestVersion;
|
||||
|
||||
use crate::error::{
|
||||
DecodeJsonSnafu, EncodeJsonSnafu, ManifestProtocolForbidReadSnafu, Result, Utf8Snafu,
|
||||
};
|
||||
use crate::manifest::action::RegionCheckpoint;
|
||||
|
||||
/// Line separator written after every JSON-encoded entry.
pub const NEWLINE: &[u8] = b"\n";
|
||||
|
||||
pub fn encode_actions<T: Serialize>(
|
||||
prev_version: ManifestVersion,
|
||||
actions: &[T],
|
||||
) -> Result<Vec<u8>> {
|
||||
let mut bytes = Vec::default();
|
||||
{
|
||||
// Encode prev_version
|
||||
let v = VersionHeader { prev_version };
|
||||
|
||||
to_writer(&mut bytes, &v).context(EncodeJsonSnafu)?;
|
||||
// unwrap is fine here, because we write into a buffer.
|
||||
bytes.write_all(NEWLINE).unwrap();
|
||||
}
|
||||
|
||||
for action in actions {
|
||||
to_writer(&mut bytes, action).context(EncodeJsonSnafu)?;
|
||||
bytes.write_all(NEWLINE).unwrap();
|
||||
}
|
||||
|
||||
Ok(bytes)
|
||||
}
|
||||
|
||||
pub fn encode_checkpoint(snasphot: &RegionCheckpoint) -> Result<Vec<u8>> {
|
||||
let s = serde_json::to_string(snasphot).context(EncodeJsonSnafu)?;
|
||||
Ok(s.into_bytes())
|
||||
}
|
||||
|
||||
pub fn decode_checkpoint(bs: &[u8], reader_version: ProtocolVersion) -> Result<RegionCheckpoint> {
|
||||
let s = std::str::from_utf8(bs).context(Utf8Snafu)?;
|
||||
let checkpoint: RegionCheckpoint = serde_json::from_str(s).context(DecodeJsonSnafu)?;
|
||||
ensure!(
|
||||
checkpoint.protocol.is_readable(reader_version),
|
||||
ManifestProtocolForbidReadSnafu {
|
||||
min_version: checkpoint.protocol.min_reader_version,
|
||||
supported_version: reader_version,
|
||||
}
|
||||
);
|
||||
|
||||
Ok(checkpoint)
|
||||
}
|
||||
@@ -1,405 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::marker::PhantomData;
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use arc_swap::ArcSwap;
|
||||
use async_trait::async_trait;
|
||||
use common_datasource::compression::CompressionType;
|
||||
use common_runtime::{RepeatedTask, TaskFunction};
|
||||
use common_telemetry::{debug, logging, warn};
|
||||
use object_store::ObjectStore;
|
||||
use snafu::{ensure, ResultExt};
|
||||
use store_api::manifest::action::{self, ProtocolAction, ProtocolVersion};
|
||||
use store_api::manifest::*;
|
||||
|
||||
use crate::error::{
|
||||
Error, ManifestProtocolForbidWriteSnafu, Result, StartManifestGcTaskSnafu,
|
||||
StopManifestGcTaskSnafu,
|
||||
};
|
||||
use crate::manifest::action::RegionCheckpoint;
|
||||
use crate::manifest::checkpoint::Checkpointer;
|
||||
use crate::manifest::storage::{ManifestObjectStore, ObjectStoreLogIterator};
|
||||
|
||||
/// Default checkpoint actions margin, used when the caller does not pass one.
const CHECKPOINT_ACTIONS_MARGIN: u16 = 10;
|
||||
/// Default duration, in seconds, passed to the manifest gc task when the
/// caller does not pass one.
const GC_DURATION_SECS: u64 = 600;
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct ManifestImpl<S: Checkpoint<Error = Error>, M: MetaAction<Error = Error>> {
|
||||
inner: Arc<ManifestImplInner<S, M>>,
|
||||
checkpointer: Option<Arc<dyn Checkpointer<Checkpoint = S, MetaAction = M>>>,
|
||||
last_checkpoint_version: Arc<AtomicU64>,
|
||||
checkpoint_actions_margin: u16,
|
||||
gc_task: Option<Arc<RepeatedTask<Error>>>,
|
||||
}
|
||||
|
||||
impl<S: 'static + Checkpoint<Error = Error>, M: 'static + MetaAction<Error = Error>>
|
||||
ManifestImpl<S, M>
|
||||
{
|
||||
pub fn new(
|
||||
manifest_dir: &str,
|
||||
object_store: ObjectStore,
|
||||
compress_type: CompressionType,
|
||||
checkpoint_actions_margin: Option<u16>,
|
||||
gc_duration: Option<Duration>,
|
||||
checkpointer: Option<Arc<dyn Checkpointer<Checkpoint = S, MetaAction = M>>>,
|
||||
) -> Self {
|
||||
let inner = Arc::new(ManifestImplInner::new(
|
||||
manifest_dir,
|
||||
object_store,
|
||||
compress_type,
|
||||
));
|
||||
let gc_task = if checkpointer.is_some() {
|
||||
// only start gc task when checkpoint is enabled.
|
||||
Some(Arc::new(RepeatedTask::new(
|
||||
gc_duration.unwrap_or_else(|| Duration::from_secs(GC_DURATION_SECS)),
|
||||
Box::new(ManifestGcTask {
|
||||
inner: inner.clone(),
|
||||
}),
|
||||
)))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
ManifestImpl {
|
||||
inner,
|
||||
checkpointer,
|
||||
checkpoint_actions_margin: checkpoint_actions_margin
|
||||
.unwrap_or(CHECKPOINT_ACTIONS_MARGIN),
|
||||
last_checkpoint_version: Arc::new(AtomicU64::new(MIN_VERSION)),
|
||||
gc_task,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn create(
|
||||
manifest_dir: &str,
|
||||
object_store: ObjectStore,
|
||||
compress_type: CompressionType,
|
||||
) -> Self {
|
||||
Self::new(manifest_dir, object_store, compress_type, None, None, None)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub(crate) fn checkpointer(
|
||||
&self,
|
||||
) -> &Option<Arc<dyn Checkpointer<Checkpoint = S, MetaAction = M>>> {
|
||||
&self.checkpointer
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub(crate) fn set_last_checkpoint_version(&self, version: ManifestVersion) {
|
||||
self.last_checkpoint_version
|
||||
.store(version, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
/// Update inner state.
|
||||
pub fn update_state(&self, version: ManifestVersion, protocol: Option<ProtocolAction>) {
|
||||
self.inner.update_state(version, protocol);
|
||||
}
|
||||
|
||||
pub(crate) async fn save_checkpoint(&self, checkpoint: &RegionCheckpoint) -> Result<()> {
|
||||
ensure!(
|
||||
checkpoint
|
||||
.protocol
|
||||
.is_writable(self.inner.supported_writer_version),
|
||||
ManifestProtocolForbidWriteSnafu {
|
||||
min_version: checkpoint.protocol.min_writer_version,
|
||||
supported_version: self.inner.supported_writer_version,
|
||||
}
|
||||
);
|
||||
let bytes = checkpoint.encode()?;
|
||||
self.manifest_store()
|
||||
.save_checkpoint(checkpoint.last_version, &bytes)
|
||||
.await
|
||||
}
|
||||
|
||||
pub(crate) async fn may_do_checkpoint(&self, version: ManifestVersion) -> Result<()> {
|
||||
if version - self.last_checkpoint_version.load(Ordering::Relaxed)
|
||||
>= self.checkpoint_actions_margin as u64
|
||||
{
|
||||
let s = self.do_checkpoint().await?;
|
||||
debug!("Manifest checkpoint, checkpoint: {:#?}", s);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub(crate) fn manifest_store(&self) -> &Arc<ManifestObjectStore> {
|
||||
self.inner.manifest_store()
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl<S: 'static + Checkpoint<Error = Error>, M: 'static + MetaAction<Error = Error>> Manifest
|
||||
for ManifestImpl<S, M>
|
||||
{
|
||||
type Error = Error;
|
||||
type Checkpoint = S;
|
||||
type MetaAction = M;
|
||||
type MetaActionIterator = MetaActionIteratorImpl<M>;
|
||||
|
||||
async fn update(&self, action_list: M) -> Result<ManifestVersion> {
|
||||
let version = self.inner.save(action_list).await?;
|
||||
|
||||
self.may_do_checkpoint(version).await?;
|
||||
Ok(version)
|
||||
}
|
||||
|
||||
async fn scan(
|
||||
&self,
|
||||
start: ManifestVersion,
|
||||
end: ManifestVersion,
|
||||
) -> Result<Self::MetaActionIterator> {
|
||||
self.inner.scan(start, end).await
|
||||
}
|
||||
|
||||
async fn do_checkpoint(&self) -> Result<Option<S>> {
|
||||
if let Some(cp) = &self.checkpointer {
|
||||
let checkpoint = cp.do_checkpoint(self).await?;
|
||||
if let Some(checkpoint) = &checkpoint {
|
||||
self.set_last_checkpoint_version(checkpoint.last_version());
|
||||
}
|
||||
return Ok(checkpoint);
|
||||
}
|
||||
|
||||
Ok(None)
|
||||
}
|
||||
|
||||
async fn last_checkpoint(&self) -> Result<Option<S>> {
|
||||
self.inner.last_checkpoint().await
|
||||
}
|
||||
|
||||
fn last_version(&self) -> ManifestVersion {
|
||||
self.inner.last_version()
|
||||
}
|
||||
|
||||
async fn start(&self) -> Result<()> {
|
||||
if let Some(task) = &self.gc_task {
|
||||
task.start(common_runtime::bg_runtime())
|
||||
.context(StartManifestGcTaskSnafu)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn stop(&self) -> Result<()> {
|
||||
if let Some(task) = &self.gc_task {
|
||||
task.stop().await.context(StopManifestGcTaskSnafu)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct ManifestImplInner<S: Checkpoint<Error = Error>, M: MetaAction<Error = Error>> {
|
||||
store: Arc<ManifestObjectStore>,
|
||||
version: AtomicU64,
|
||||
/// Current using protocol
|
||||
protocol: ArcSwap<ProtocolAction>,
|
||||
/// Current node supported protocols (reader_version, writer_version)
|
||||
supported_reader_version: ProtocolVersion,
|
||||
supported_writer_version: ProtocolVersion,
|
||||
_phantom: PhantomData<(S, M)>,
|
||||
}
|
||||
|
||||
pub struct MetaActionIteratorImpl<M: MetaAction<Error = Error>> {
|
||||
log_iter: ObjectStoreLogIterator,
|
||||
reader_version: ProtocolVersion,
|
||||
last_protocol: Option<ProtocolAction>,
|
||||
_phantom: PhantomData<M>,
|
||||
}
|
||||
|
||||
impl<M: MetaAction<Error = Error>> MetaActionIteratorImpl<M> {
|
||||
pub fn last_protocol(&self) -> &Option<ProtocolAction> {
|
||||
&self.last_protocol
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl<M: MetaAction<Error = Error>> MetaActionIterator for MetaActionIteratorImpl<M> {
|
||||
type Error = Error;
|
||||
type MetaAction = M;
|
||||
|
||||
async fn next_action(&mut self) -> Result<Option<(ManifestVersion, M)>> {
|
||||
match self.log_iter.next_log().await? {
|
||||
Some((v, bytes)) => {
|
||||
let (action_list, protocol) = M::decode(&bytes, self.reader_version)?;
|
||||
|
||||
if protocol.is_some() {
|
||||
self.last_protocol = protocol;
|
||||
}
|
||||
|
||||
Ok(Some((v, action_list)))
|
||||
}
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct ManifestGcTask<S: Checkpoint<Error = Error>, M: MetaAction<Error = Error>> {
|
||||
inner: Arc<ManifestImplInner<S, M>>,
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl<S: Checkpoint<Error = Error>, M: MetaAction<Error = Error>> TaskFunction<Error>
|
||||
for ManifestGcTask<S, M>
|
||||
{
|
||||
fn name(&self) -> &str {
|
||||
"region-manifest-gc"
|
||||
}
|
||||
|
||||
async fn call(&mut self) -> Result<()> {
|
||||
if let Some((last_version, _)) = self.inner.store.load_last_checkpoint().await? {
|
||||
// Purge all manifest <= last_version and checkpoint files < last_version.
|
||||
let deleted = self
|
||||
.inner
|
||||
.store
|
||||
.delete_until(last_version + 1, true)
|
||||
.await?;
|
||||
debug!(
|
||||
"Deleted {} logs from region manifest storage(path={}), last_version: {}.",
|
||||
deleted,
|
||||
self.inner.store.path(),
|
||||
last_version,
|
||||
);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl<S: Checkpoint<Error = Error>, M: MetaAction<Error = Error>> ManifestImplInner<S, M> {
|
||||
fn new(manifest_dir: &str, object_store: ObjectStore, compress_type: CompressionType) -> Self {
|
||||
let (reader_version, writer_version) = action::supported_protocol_version();
|
||||
|
||||
Self {
|
||||
store: Arc::new(ManifestObjectStore::new(
|
||||
manifest_dir,
|
||||
object_store,
|
||||
compress_type,
|
||||
)),
|
||||
version: AtomicU64::new(0),
|
||||
protocol: ArcSwap::new(Arc::new(ProtocolAction::new())),
|
||||
supported_reader_version: reader_version,
|
||||
supported_writer_version: writer_version,
|
||||
_phantom: PhantomData,
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn manifest_store(&self) -> &Arc<ManifestObjectStore> {
|
||||
&self.store
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn inc_version(&self) -> ManifestVersion {
|
||||
self.version.fetch_add(1, Ordering::Relaxed)
|
||||
}
|
||||
|
||||
fn update_state(&self, version: ManifestVersion, protocol: Option<ProtocolAction>) {
|
||||
self.version.store(version, Ordering::Relaxed);
|
||||
if let Some(p) = protocol {
|
||||
self.protocol.store(Arc::new(p));
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn last_version(&self) -> ManifestVersion {
|
||||
self.version.load(Ordering::Relaxed)
|
||||
}
|
||||
|
||||
async fn save(&self, mut action_list: M) -> Result<ManifestVersion> {
|
||||
let protocol = self.protocol.load();
|
||||
|
||||
ensure!(
|
||||
protocol.is_writable(self.supported_writer_version),
|
||||
ManifestProtocolForbidWriteSnafu {
|
||||
min_version: protocol.min_writer_version,
|
||||
supported_version: self.supported_writer_version,
|
||||
}
|
||||
);
|
||||
|
||||
let version = self.inc_version();
|
||||
|
||||
if version == 0 || protocol.min_writer_version < self.supported_writer_version {
|
||||
let new_protocol = ProtocolAction {
|
||||
min_reader_version: self.supported_reader_version,
|
||||
min_writer_version: self.supported_writer_version,
|
||||
};
|
||||
action_list.set_protocol(new_protocol.clone());
|
||||
|
||||
logging::info!(
|
||||
"Updated manifest protocol from {} to {}.",
|
||||
protocol,
|
||||
new_protocol
|
||||
);
|
||||
|
||||
self.protocol.store(Arc::new(new_protocol));
|
||||
}
|
||||
|
||||
logging::debug!(
|
||||
"Save region metadata action: {:?}, version: {}",
|
||||
action_list,
|
||||
version
|
||||
);
|
||||
|
||||
self.store.save(version, &action_list.encode()?).await?;
|
||||
|
||||
Ok(version)
|
||||
}
|
||||
|
||||
async fn scan(
|
||||
&self,
|
||||
start: ManifestVersion,
|
||||
end: ManifestVersion,
|
||||
) -> Result<MetaActionIteratorImpl<M>> {
|
||||
Ok(MetaActionIteratorImpl {
|
||||
log_iter: self.store.scan(start, end).await?,
|
||||
reader_version: self.supported_reader_version,
|
||||
last_protocol: None,
|
||||
_phantom: PhantomData,
|
||||
})
|
||||
}
|
||||
|
||||
async fn last_checkpoint(&self) -> Result<Option<S>> {
|
||||
let protocol = self.protocol.load();
|
||||
let last_checkpoint = self.store.load_last_checkpoint().await?;
|
||||
|
||||
if let Some((version, bytes)) = last_checkpoint {
|
||||
let checkpoint = S::decode(&bytes, protocol.min_reader_version)?;
|
||||
assert!(checkpoint.last_version() >= version);
|
||||
if checkpoint.last_version() > version {
|
||||
// It happens when saving checkpoint successfully, but failed at saving checkpoint metadata(the "__last_checkpoint" file).
|
||||
// Then we try to use the old checkpoint and do the checkpoint next time.
|
||||
// If the old checkpoint was deleted, it's fine that we return the latest checkpoint.
|
||||
// The only side effect is leaving some unused checkpoint files,
|
||||
// and they will be purged by gc task.
|
||||
warn!("The checkpoint manifest version {} in {} is greater than checkpoint metadata version {}.", self.store.path(), checkpoint.last_version(), version);
|
||||
|
||||
if let Some((_, bytes)) = self.store.load_checkpoint(version).await? {
|
||||
let old_checkpoint = S::decode(&bytes, protocol.min_reader_version)?;
|
||||
return Ok(Some(old_checkpoint));
|
||||
}
|
||||
}
|
||||
Ok(Some(checkpoint))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,690 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//! Region manifest impl
|
||||
use std::any::Any;
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use async_trait::async_trait;
|
||||
use common_datasource::compression::CompressionType;
|
||||
use common_telemetry::{info, warn};
|
||||
use object_store::ObjectStore;
|
||||
use store_api::manifest::action::ProtocolAction;
|
||||
use store_api::manifest::{
|
||||
Manifest, ManifestLogStorage, ManifestVersion, MetaActionIterator, MIN_VERSION,
|
||||
};
|
||||
|
||||
use crate::error::{ManifestCheckpointSnafu, Result};
|
||||
use crate::manifest::action::*;
|
||||
use crate::manifest::checkpoint::Checkpointer;
|
||||
use crate::manifest::ManifestImpl;
|
||||
|
||||
/// [`ManifestImpl`] specialized for regions: checkpoints are [`RegionCheckpoint`]s and
/// log entries are [`RegionMetaActionList`]s.
pub type RegionManifest = ManifestImpl<RegionCheckpoint, RegionMetaActionList>;
|
||||
|
||||
/// Checkpointer for region manifests that never checkpoints past the flushed manifest version.
#[derive(Debug)]
pub struct RegionManifestCheckpointer {
    /// The latest manifest version when flushing memtables.
    ///
    /// Checkpoint can't exceed over flushed manifest version because we have to keep
    /// the region metadata for replaying WAL to ensure correct data schema.
    flushed_manifest_version: AtomicU64,
}
|
||||
|
||||
impl RegionManifestCheckpointer {
|
||||
pub(crate) fn set_flushed_manifest_version(&self, manifest_version: ManifestVersion) {
|
||||
let current = self.flushed_manifest_version.load(Ordering::Relaxed);
|
||||
|
||||
self.flushed_manifest_version
|
||||
.store(current.max(manifest_version), Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl Checkpointer for RegionManifestCheckpointer {
|
||||
type Checkpoint = RegionCheckpoint;
|
||||
type MetaAction = RegionMetaActionList;
|
||||
|
||||
async fn do_checkpoint(
|
||||
&self,
|
||||
manifest: &ManifestImpl<RegionCheckpoint, RegionMetaActionList>,
|
||||
) -> Result<Option<RegionCheckpoint>> {
|
||||
let last_checkpoint = manifest.last_checkpoint().await?;
|
||||
|
||||
let current_version = manifest.last_version();
|
||||
let (start_version, mut protocol, mut manifest_builder) =
|
||||
if let Some(checkpoint) = last_checkpoint {
|
||||
(
|
||||
checkpoint.last_version + 1,
|
||||
checkpoint.protocol,
|
||||
RegionManifestDataBuilder::with_checkpoint(checkpoint.checkpoint),
|
||||
)
|
||||
} else {
|
||||
(
|
||||
MIN_VERSION,
|
||||
ProtocolAction::default(),
|
||||
RegionManifestDataBuilder::default(),
|
||||
)
|
||||
};
|
||||
|
||||
let end_version =
|
||||
current_version.min(self.flushed_manifest_version.load(Ordering::Relaxed)) + 1;
|
||||
if start_version >= end_version {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
info!("Begin to do region manifest checkpoint, path: {}, start_version: {}, end_version: {}, flushed_manifest_version: {}",
|
||||
manifest.manifest_store().path(),
|
||||
start_version,
|
||||
end_version,
|
||||
self.flushed_manifest_version.load(Ordering::Relaxed));
|
||||
|
||||
let mut iter = manifest.scan(start_version, end_version).await?;
|
||||
|
||||
let mut last_version = start_version;
|
||||
let mut compacted_actions = 0;
|
||||
while let Some((version, action_list)) = iter.next_action().await? {
|
||||
for action in action_list.actions {
|
||||
match action {
|
||||
RegionMetaAction::Change(c) => manifest_builder.apply_change(c),
|
||||
RegionMetaAction::Edit(e) => manifest_builder.apply_edit(version, e),
|
||||
RegionMetaAction::Protocol(p) => protocol = p,
|
||||
action => {
|
||||
return ManifestCheckpointSnafu {
|
||||
msg: format!("can't apply region action: {:?}", action),
|
||||
}
|
||||
.fail();
|
||||
}
|
||||
}
|
||||
}
|
||||
last_version = version;
|
||||
compacted_actions += 1;
|
||||
}
|
||||
|
||||
if compacted_actions == 0 {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let region_manifest = manifest_builder.build();
|
||||
let checkpoint = RegionCheckpoint {
|
||||
protocol,
|
||||
last_version,
|
||||
compacted_actions,
|
||||
checkpoint: Some(region_manifest),
|
||||
};
|
||||
|
||||
manifest.save_checkpoint(&checkpoint).await?;
|
||||
if let Err(e) = manifest
|
||||
.manifest_store()
|
||||
.delete(start_version, last_version + 1)
|
||||
.await
|
||||
{
|
||||
// We only log when the error kind isn't `NotFound`
|
||||
if !e.is_object_to_delete_not_found() {
|
||||
// It doesn't matter when deletion fails, they will be purged by gc task.
|
||||
warn!(
|
||||
"Failed to delete manifest logs [{},{}] in path: {}. err: {}",
|
||||
start_version,
|
||||
last_version,
|
||||
manifest.manifest_store().path(),
|
||||
e
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
info!("Region manifest checkpoint, path: {}, start_version: {}, last_version: {}, compacted actions: {}",
|
||||
manifest.manifest_store().path(),
|
||||
start_version,
|
||||
last_version,
|
||||
compacted_actions);
|
||||
|
||||
Ok(Some(checkpoint))
|
||||
}
|
||||
|
||||
fn as_any(&self) -> &dyn Any {
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
impl RegionManifest {
|
||||
pub fn with_checkpointer(
|
||||
manifest_dir: &str,
|
||||
object_store: ObjectStore,
|
||||
compress_type: CompressionType,
|
||||
checkpoint_actions_margin: Option<u16>,
|
||||
gc_duration: Option<Duration>,
|
||||
) -> Self {
|
||||
Self::new(
|
||||
manifest_dir,
|
||||
object_store,
|
||||
compress_type,
|
||||
checkpoint_actions_margin,
|
||||
gc_duration,
|
||||
Some(Arc::new(RegionManifestCheckpointer {
|
||||
flushed_manifest_version: AtomicU64::new(0),
|
||||
})),
|
||||
)
|
||||
}
|
||||
|
||||
// Update flushed manifest version in checkpointer
|
||||
pub fn set_flushed_manifest_version(&self, manifest_version: ManifestVersion) {
|
||||
if let Some(checkpointer) = self.checkpointer() {
|
||||
if let Some(checkpointer) = checkpointer
|
||||
.as_any()
|
||||
.downcast_ref::<RegionManifestCheckpointer>()
|
||||
{
|
||||
checkpointer.set_flushed_manifest_version(manifest_version);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::sync::Arc;
|
||||
|
||||
use common_test_util::temp_dir::create_temp_dir;
|
||||
use object_store::services::{Fs, S3};
|
||||
use object_store::test_util::{s3_test_config, TempFolder};
|
||||
use object_store::ObjectStore;
|
||||
use store_api::manifest::action::ProtocolAction;
|
||||
use store_api::manifest::{Manifest, MetaActionIterator, MAX_VERSION};
|
||||
|
||||
use super::*;
|
||||
use crate::manifest::manifest_compress_type;
|
||||
use crate::manifest::test_utils::*;
|
||||
use crate::metadata::RegionMetadata;
|
||||
use crate::sst::FileId;
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_fs_region_manifest_compress() {
|
||||
let manifest = new_fs_manifest(true, None).await;
|
||||
test_region_manifest(&manifest).await
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_fs_region_manifest_uncompress() {
|
||||
let manifest = new_fs_manifest(false, None).await;
|
||||
test_region_manifest(&manifest).await
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_s3_region_manifest_compress() {
|
||||
if s3_test_config().is_some() {
|
||||
let (manifest, temp_dir) = new_s3_manifest(true, None).await;
|
||||
test_region_manifest(&manifest).await;
|
||||
temp_dir.remove_all().await.unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_s3_region_manifest_uncompress() {
|
||||
if s3_test_config().is_some() {
|
||||
let (manifest, temp_dir) = new_s3_manifest(false, None).await;
|
||||
test_region_manifest(&manifest).await;
|
||||
temp_dir.remove_all().await.unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
async fn new_fs_manifest(compress: bool, gc_duration: Option<Duration>) -> RegionManifest {
|
||||
let tmp_dir = create_temp_dir("test_region_manifest");
|
||||
let mut builder = Fs::default();
|
||||
let _ = builder.root(&tmp_dir.path().to_string_lossy());
|
||||
let object_store = ObjectStore::new(builder).unwrap().finish();
|
||||
|
||||
let manifest = RegionManifest::with_checkpointer(
|
||||
"/manifest/",
|
||||
object_store,
|
||||
manifest_compress_type(compress),
|
||||
None,
|
||||
gc_duration,
|
||||
);
|
||||
manifest.start().await.unwrap();
|
||||
manifest
|
||||
}
|
||||
|
||||
async fn new_s3_manifest(
|
||||
compress: bool,
|
||||
gc_duration: Option<Duration>,
|
||||
) -> (RegionManifest, TempFolder) {
|
||||
let s3_config = s3_test_config().unwrap();
|
||||
let mut builder = S3::default();
|
||||
let _ = builder
|
||||
.root(&s3_config.root)
|
||||
.access_key_id(&s3_config.access_key_id)
|
||||
.secret_access_key(&s3_config.secret_access_key)
|
||||
.bucket(&s3_config.bucket);
|
||||
|
||||
if s3_config.region.is_some() {
|
||||
let _ = builder.region(s3_config.region.as_ref().unwrap());
|
||||
}
|
||||
let store = ObjectStore::new(builder).unwrap().finish();
|
||||
let temp_folder = TempFolder::new(&store, "/");
|
||||
let manifest = RegionManifest::with_checkpointer(
|
||||
"/manifest/",
|
||||
store,
|
||||
manifest_compress_type(compress),
|
||||
None,
|
||||
gc_duration,
|
||||
);
|
||||
manifest.start().await.unwrap();
|
||||
|
||||
(manifest, temp_folder)
|
||||
}
|
||||
|
||||
async fn test_region_manifest(manifest: &RegionManifest) {
|
||||
common_telemetry::init_default_ut_logging();
|
||||
|
||||
let region_meta = Arc::new(build_region_meta());
|
||||
|
||||
assert_eq!(
|
||||
None,
|
||||
manifest
|
||||
.scan(0, MAX_VERSION)
|
||||
.await
|
||||
.unwrap()
|
||||
.next_action()
|
||||
.await
|
||||
.unwrap()
|
||||
);
|
||||
|
||||
assert!(manifest
|
||||
.update(RegionMetaActionList::with_action(RegionMetaAction::Change(
|
||||
RegionChange {
|
||||
metadata: region_meta.as_ref().into(),
|
||||
committed_sequence: 99,
|
||||
},
|
||||
)))
|
||||
.await
|
||||
.is_ok());
|
||||
|
||||
let mut iter = manifest.scan(0, MAX_VERSION).await.unwrap();
|
||||
|
||||
let (v, action_list) = iter.next_action().await.unwrap().unwrap();
|
||||
assert_eq!(0, v);
|
||||
assert_eq!(2, action_list.actions.len());
|
||||
let protocol = &action_list.actions[0];
|
||||
assert!(matches!(
|
||||
protocol,
|
||||
RegionMetaAction::Protocol(ProtocolAction { .. })
|
||||
));
|
||||
|
||||
let action = &action_list.actions[1];
|
||||
|
||||
match action {
|
||||
RegionMetaAction::Change(c) => {
|
||||
assert_eq!(
|
||||
RegionMetadata::try_from(c.metadata.clone()).unwrap(),
|
||||
*region_meta
|
||||
);
|
||||
assert_eq!(c.committed_sequence, 99);
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
|
||||
// Save some actions
|
||||
assert!(manifest
|
||||
.update(RegionMetaActionList::new(vec![
|
||||
RegionMetaAction::Edit(build_region_edit(1, &[FileId::random()], &[])),
|
||||
RegionMetaAction::Edit(build_region_edit(
|
||||
2,
|
||||
&[FileId::random(), FileId::random()],
|
||||
&[],
|
||||
)),
|
||||
]))
|
||||
.await
|
||||
.is_ok());
|
||||
|
||||
let mut iter = manifest.scan(0, MAX_VERSION).await.unwrap();
|
||||
let (v, action_list) = iter.next_action().await.unwrap().unwrap();
|
||||
assert_eq!(0, v);
|
||||
assert_eq!(2, action_list.actions.len());
|
||||
let protocol = &action_list.actions[0];
|
||||
assert!(matches!(
|
||||
protocol,
|
||||
RegionMetaAction::Protocol(ProtocolAction { .. })
|
||||
));
|
||||
|
||||
let action = &action_list.actions[1];
|
||||
match action {
|
||||
RegionMetaAction::Change(c) => {
|
||||
assert_eq!(
|
||||
RegionMetadata::try_from(c.metadata.clone()).unwrap(),
|
||||
*region_meta
|
||||
);
|
||||
assert_eq!(c.committed_sequence, 99);
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
|
||||
let (v, action_list) = iter.next_action().await.unwrap().unwrap();
|
||||
assert_eq!(1, v);
|
||||
assert_eq!(2, action_list.actions.len());
|
||||
assert!(matches!(&action_list.actions[0], RegionMetaAction::Edit(_)));
|
||||
assert!(matches!(&action_list.actions[1], RegionMetaAction::Edit(_)));
|
||||
|
||||
// Reach end
|
||||
assert!(iter.next_action().await.unwrap().is_none());
|
||||
|
||||
manifest.stop().await.unwrap();
|
||||
}
|
||||
|
||||
async fn assert_scan(manifest: &RegionManifest, start_version: ManifestVersion, expected: u64) {
|
||||
let mut iter = manifest.scan(0, MAX_VERSION).await.unwrap();
|
||||
let mut actions = 0;
|
||||
while let Some((v, _)) = iter.next_action().await.unwrap() {
|
||||
assert_eq!(v, start_version + actions);
|
||||
actions += 1;
|
||||
}
|
||||
assert_eq!(expected, actions);
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "multi_thread")]
|
||||
async fn test_fs_region_manifest_checkpoint_compress() {
|
||||
let duration = Duration::from_millis(50);
|
||||
let manifest = new_fs_manifest(true, Some(duration)).await;
|
||||
|
||||
test_region_manifest_checkpoint(&manifest, duration).await
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_fs_region_manifest_checkpoint_uncompress() {
|
||||
let duration = Duration::from_millis(50);
|
||||
let manifest = new_fs_manifest(false, Some(duration)).await;
|
||||
|
||||
test_region_manifest_checkpoint(&manifest, duration).await
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_s3_region_manifest_checkpoint_compress() {
|
||||
if s3_test_config().is_some() {
|
||||
let duration = Duration::from_millis(50);
|
||||
let (manifest, temp_dir) = new_s3_manifest(true, Some(duration)).await;
|
||||
|
||||
test_region_manifest_checkpoint(&manifest, duration).await;
|
||||
temp_dir.remove_all().await.unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_s3_region_manifest_checkpoint_uncompress() {
|
||||
if s3_test_config().is_some() {
|
||||
let duration = Duration::from_millis(50);
|
||||
let (manifest, temp_dir) = new_s3_manifest(false, Some(duration)).await;
|
||||
|
||||
test_region_manifest_checkpoint(&manifest, duration).await;
|
||||
temp_dir.remove_all().await.unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
async fn test_region_manifest_checkpoint(
|
||||
manifest: &RegionManifest,
|
||||
test_gc_duration: Duration,
|
||||
) {
|
||||
common_telemetry::init_default_ut_logging();
|
||||
|
||||
let region_meta = Arc::new(build_region_meta());
|
||||
let new_region_meta = Arc::new(build_altered_region_meta());
|
||||
|
||||
let file = FileId::random();
|
||||
let file_ids = vec![FileId::random(), FileId::random()];
|
||||
|
||||
let actions: Vec<RegionMetaActionList> = vec![
|
||||
RegionMetaActionList::with_action(RegionMetaAction::Change(RegionChange {
|
||||
metadata: region_meta.as_ref().into(),
|
||||
committed_sequence: 1,
|
||||
})),
|
||||
RegionMetaActionList::new(vec![
|
||||
RegionMetaAction::Edit(build_region_edit(2, &[file], &[])),
|
||||
RegionMetaAction::Edit(build_region_edit(3, &file_ids, &[file])),
|
||||
]),
|
||||
RegionMetaActionList::with_action(RegionMetaAction::Change(RegionChange {
|
||||
metadata: new_region_meta.as_ref().into(),
|
||||
committed_sequence: 99,
|
||||
})),
|
||||
];
|
||||
|
||||
for action in actions {
|
||||
let _ = manifest.update(action).await.unwrap();
|
||||
}
|
||||
assert!(manifest.last_checkpoint().await.unwrap().is_none());
|
||||
assert_scan(manifest, 0, 3).await;
|
||||
// update flushed manifest version for doing checkpoint
|
||||
manifest.set_flushed_manifest_version(2);
|
||||
|
||||
let mut checkpoint_versions = vec![];
|
||||
|
||||
// do a checkpoint
|
||||
let checkpoint = manifest.do_checkpoint().await.unwrap().unwrap();
|
||||
let last_checkpoint = manifest.last_checkpoint().await.unwrap().unwrap();
|
||||
assert_eq!(checkpoint, last_checkpoint);
|
||||
assert_eq!(checkpoint.compacted_actions, 3);
|
||||
assert_eq!(checkpoint.last_version, 2);
|
||||
checkpoint_versions.push(2);
|
||||
let alterd_raw_meta = RawRegionMetadata::from(new_region_meta.as_ref());
|
||||
assert!(matches!(&checkpoint.checkpoint, Some(RegionManifestData {
|
||||
committed_sequence: 99,
|
||||
metadata,
|
||||
version: Some(RegionVersion {
|
||||
manifest_version: 1,
|
||||
flushed_sequence: Some(3),
|
||||
files,
|
||||
}),
|
||||
}) if files.len() == 2 &&
|
||||
files.contains_key(&file_ids[0]) &&
|
||||
files.contains_key(&file_ids[1]) &&
|
||||
*metadata == alterd_raw_meta));
|
||||
// all actions were compacted
|
||||
assert_eq!(
|
||||
None,
|
||||
manifest
|
||||
.scan(0, MAX_VERSION)
|
||||
.await
|
||||
.unwrap()
|
||||
.next_action()
|
||||
.await
|
||||
.unwrap()
|
||||
);
|
||||
|
||||
assert!(manifest.do_checkpoint().await.unwrap().is_none());
|
||||
let last_checkpoint = manifest.last_checkpoint().await.unwrap().unwrap();
|
||||
assert_eq!(checkpoint, last_checkpoint);
|
||||
|
||||
// add new actions
|
||||
let new_file = FileId::random();
|
||||
let actions: Vec<RegionMetaActionList> = vec![
|
||||
RegionMetaActionList::with_action(RegionMetaAction::Change(RegionChange {
|
||||
metadata: region_meta.as_ref().into(),
|
||||
committed_sequence: 200,
|
||||
})),
|
||||
RegionMetaActionList::new(vec![RegionMetaAction::Edit(build_region_edit(
|
||||
201,
|
||||
&[new_file],
|
||||
&file_ids,
|
||||
))]),
|
||||
];
|
||||
for action in actions {
|
||||
let _ = manifest.update(action).await.unwrap();
|
||||
}
|
||||
|
||||
assert_scan(manifest, 3, 2).await;
|
||||
|
||||
// do another checkpoints
|
||||
// compacted RegionChange
|
||||
manifest.set_flushed_manifest_version(3);
|
||||
let checkpoint = manifest.do_checkpoint().await.unwrap().unwrap();
|
||||
let last_checkpoint = manifest.last_checkpoint().await.unwrap().unwrap();
|
||||
assert_eq!(checkpoint, last_checkpoint);
|
||||
assert_eq!(checkpoint.compacted_actions, 1);
|
||||
assert_eq!(checkpoint.last_version, 3);
|
||||
checkpoint_versions.push(3);
|
||||
assert!(matches!(&checkpoint.checkpoint, Some(RegionManifestData {
|
||||
committed_sequence: 200,
|
||||
metadata,
|
||||
version: Some(RegionVersion {
|
||||
manifest_version: 1,
|
||||
flushed_sequence: Some(3),
|
||||
files,
|
||||
}),
|
||||
}) if files.len() == 2 &&
|
||||
files.contains_key(&file_ids[0]) &&
|
||||
files.contains_key(&file_ids[1]) &&
|
||||
*metadata == RawRegionMetadata::from(region_meta.as_ref())));
|
||||
|
||||
assert_scan(manifest, 4, 1).await;
|
||||
// compacted RegionEdit
|
||||
manifest.set_flushed_manifest_version(4);
|
||||
let checkpoint = manifest.do_checkpoint().await.unwrap().unwrap();
|
||||
let last_checkpoint = manifest.last_checkpoint().await.unwrap().unwrap();
|
||||
assert_eq!(checkpoint, last_checkpoint);
|
||||
assert_eq!(checkpoint.compacted_actions, 1);
|
||||
assert_eq!(checkpoint.last_version, 4);
|
||||
checkpoint_versions.push(4);
|
||||
assert!(matches!(&checkpoint.checkpoint, Some(RegionManifestData {
|
||||
committed_sequence: 200,
|
||||
metadata,
|
||||
version: Some(RegionVersion {
|
||||
manifest_version: 4,
|
||||
flushed_sequence: Some(201),
|
||||
files,
|
||||
}),
|
||||
}) if files.len() == 1 &&
|
||||
files.contains_key(&new_file) &&
|
||||
*metadata == RawRegionMetadata::from(region_meta.as_ref())));
|
||||
|
||||
// all actions were compacted
|
||||
assert_eq!(
|
||||
None,
|
||||
manifest
|
||||
.scan(0, MAX_VERSION)
|
||||
.await
|
||||
.unwrap()
|
||||
.next_action()
|
||||
.await
|
||||
.unwrap()
|
||||
);
|
||||
|
||||
// wait for gc
|
||||
tokio::time::sleep(test_gc_duration * 3).await;
|
||||
|
||||
for v in checkpoint_versions {
|
||||
if v < 4 {
|
||||
// ensure old checkpoints were purged.
|
||||
assert!(manifest
|
||||
.manifest_store()
|
||||
.load_checkpoint(v)
|
||||
.await
|
||||
.unwrap()
|
||||
.is_none());
|
||||
} else {
|
||||
// the last checkpoints is still exists.
|
||||
let last_checkpoint = manifest.last_checkpoint().await.unwrap().unwrap();
|
||||
assert_eq!(checkpoint, last_checkpoint);
|
||||
}
|
||||
}
|
||||
|
||||
manifest.stop().await.unwrap();
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_region_manifest_truncate() {
|
||||
common_telemetry::init_default_ut_logging();
|
||||
|
||||
let manifest = new_fs_manifest(false, None).await;
|
||||
let region_meta = Arc::new(build_region_meta());
|
||||
let committed_sequence = 99;
|
||||
|
||||
let file = FileId::random();
|
||||
let file_ids = vec![FileId::random(), FileId::random()];
|
||||
|
||||
// Save some actions.
|
||||
let actions: Vec<RegionMetaActionList> = vec![
|
||||
RegionMetaActionList::with_action(RegionMetaAction::Change(RegionChange {
|
||||
metadata: region_meta.as_ref().into(),
|
||||
committed_sequence: 1,
|
||||
})),
|
||||
RegionMetaActionList::new(vec![
|
||||
RegionMetaAction::Edit(build_region_edit(2, &[file], &[])),
|
||||
RegionMetaAction::Edit(build_region_edit(3, &file_ids, &[file])),
|
||||
]),
|
||||
RegionMetaActionList::with_action(RegionMetaAction::Truncate(RegionTruncate {
|
||||
region_id: 0.into(),
|
||||
committed_sequence,
|
||||
})),
|
||||
RegionMetaActionList::with_action(RegionMetaAction::Change(RegionChange {
|
||||
metadata: region_meta.as_ref().into(),
|
||||
committed_sequence: 1,
|
||||
})),
|
||||
];
|
||||
|
||||
for action in actions {
|
||||
manifest.update(action).await.unwrap();
|
||||
}
|
||||
|
||||
// Scan manifest.
|
||||
let mut iter = manifest.scan(0, MAX_VERSION).await.unwrap();
|
||||
|
||||
let (v, action_list) = iter.next_action().await.unwrap().unwrap();
|
||||
info!("action_list = {:?}", action_list.actions);
|
||||
assert_eq!(0, v);
|
||||
assert_eq!(2, action_list.actions.len());
|
||||
let protocol = &action_list.actions[0];
|
||||
assert!(matches!(
|
||||
protocol,
|
||||
RegionMetaAction::Protocol(ProtocolAction { .. })
|
||||
));
|
||||
|
||||
let change = &action_list.actions[1];
|
||||
assert!(matches!(
|
||||
change,
|
||||
RegionMetaAction::Change(RegionChange {
|
||||
committed_sequence: 1,
|
||||
..
|
||||
})
|
||||
));
|
||||
|
||||
let (v, action_list) = iter.next_action().await.unwrap().unwrap();
|
||||
assert_eq!(1, v);
|
||||
assert_eq!(2, action_list.actions.len());
|
||||
assert!(matches!(&action_list.actions[0], RegionMetaAction::Edit(_)));
|
||||
assert!(matches!(&action_list.actions[1], RegionMetaAction::Edit(_)));
|
||||
|
||||
let (v, action_list) = iter.next_action().await.unwrap().unwrap();
|
||||
assert_eq!(2, v);
|
||||
assert_eq!(1, action_list.actions.len());
|
||||
let truncate = &action_list.actions[0];
|
||||
assert!(matches!(
|
||||
truncate,
|
||||
RegionMetaAction::Truncate(RegionTruncate {
|
||||
committed_sequence: 99,
|
||||
..
|
||||
})
|
||||
));
|
||||
|
||||
let (v, action_list) = iter.next_action().await.unwrap().unwrap();
|
||||
assert_eq!(3, v);
|
||||
assert_eq!(1, action_list.actions.len());
|
||||
let change = &action_list.actions[0];
|
||||
assert!(matches!(
|
||||
change,
|
||||
RegionMetaAction::Change(RegionChange {
|
||||
committed_sequence: 1,
|
||||
..
|
||||
})
|
||||
));
|
||||
|
||||
// Reach end
|
||||
assert!(iter.next_action().await.unwrap().is_none());
|
||||
}
|
||||
}
|
||||
@@ -1,741 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::iter::Iterator;
|
||||
use std::str::FromStr;
|
||||
|
||||
use async_trait::async_trait;
|
||||
use common_datasource::compression::CompressionType;
|
||||
use common_telemetry::logging;
|
||||
use futures::TryStreamExt;
|
||||
use lazy_static::lazy_static;
|
||||
use object_store::{raw_normalize_path, util, Entry, ErrorKind, ObjectStore};
|
||||
use regex::Regex;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use snafu::{ensure, ResultExt};
|
||||
use store_api::manifest::{LogIterator, ManifestLogStorage, ManifestVersion};
|
||||
|
||||
use crate::error::{
|
||||
CompressObjectSnafu, DecodeJsonSnafu, DecompressObjectSnafu, DeleteObjectSnafu,
|
||||
EncodeJsonSnafu, Error, InvalidScanIndexSnafu, ListObjectsSnafu, ReadObjectSnafu, Result,
|
||||
Utf8Snafu, WriteObjectSnafu,
|
||||
};
|
||||
|
||||
lazy_static! {
    /// Matches names that begin with one or more digits followed by `.json`.
    /// The pattern is not anchored at the end, so trailing text is accepted.
    static ref DELTA_RE: Regex = Regex::new("^\\d+\\.json").unwrap();
    /// Matches names that begin with one or more digits followed by `.checkpoint`.
    /// The pattern is not anchored at the end, so trailing text is accepted.
    static ref CHECKPOINT_RE: Regex = Regex::new("^\\d+\\.checkpoint").unwrap();
}
|
||||
|
||||
// NOTE(review): presumably the name of the file that records the last checkpoint's
// metadata; its usage is outside this view — confirm against the store implementation.
const LAST_CHECKPOINT_FILE: &str = "_last_checkpoint";
|
||||
/// Compression type returned by `manifest_compress_type` when compression is enabled.
const DEFAULT_MANIFEST_COMPRESSION_TYPE: CompressionType = CompressionType::Gzip;
|
||||
/// Due to backward compatibility, it is possible that the user's manifest file has not been compressed.
|
||||
/// So when we encounter problems, we need to fall back to `FALL_BACK_COMPRESS_TYPE` for processing.
|
||||
const FALL_BACK_COMPRESS_TYPE: CompressionType = CompressionType::Uncompressed;
|
||||
|
||||
#[inline]
|
||||
pub const fn manifest_compress_type(compress: bool) -> CompressionType {
|
||||
if compress {
|
||||
DEFAULT_MANIFEST_COMPRESSION_TYPE
|
||||
} else {
|
||||
FALL_BACK_COMPRESS_TYPE
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn delta_file(version: ManifestVersion) -> String {
|
||||
format!("{version:020}.json")
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn checkpoint_file(version: ManifestVersion) -> String {
|
||||
format!("{version:020}.checkpoint")
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn gen_path(path: &str, file: &str, compress_type: CompressionType) -> String {
|
||||
if compress_type == CompressionType::Uncompressed {
|
||||
format!("{}{}", path, file)
|
||||
} else {
|
||||
format!("{}{}.{}", path, file, compress_type.file_extension())
|
||||
}
|
||||
}
|
||||
|
||||
/// Return's the file manifest version from path
|
||||
///
|
||||
/// # Panics
|
||||
/// Panics if the file path is not a valid delta or checkpoint file.
|
||||
#[inline]
|
||||
pub fn file_version(path: &str) -> ManifestVersion {
|
||||
let s = path.split('.').next().unwrap();
|
||||
s.parse().unwrap_or_else(|_| panic!("Invalid file: {path}"))
|
||||
}
|
||||
|
||||
/// Return's the file compress algorithm by file extension.
|
||||
///
|
||||
/// for example file
|
||||
/// `00000000000000000000.json.gz` -> `CompressionType::GZIP`
|
||||
#[inline]
|
||||
pub fn file_compress_type(path: &str) -> CompressionType {
|
||||
let s = path.rsplit('.').next().unwrap_or("");
|
||||
CompressionType::from_str(s).unwrap_or(CompressionType::Uncompressed)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn is_delta_file(file_name: &str) -> bool {
|
||||
DELTA_RE.is_match(file_name)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn is_checkpoint_file(file_name: &str) -> bool {
|
||||
CHECKPOINT_RE.is_match(file_name)
|
||||
}
|
||||
|
||||
pub struct ObjectStoreLogIterator {
|
||||
object_store: ObjectStore,
|
||||
iter: Box<dyn Iterator<Item = (ManifestVersion, Entry)> + Send + Sync>,
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl LogIterator for ObjectStoreLogIterator {
|
||||
type Error = Error;
|
||||
|
||||
async fn next_log(&mut self) -> Result<Option<(ManifestVersion, Vec<u8>)>> {
|
||||
match self.iter.next() {
|
||||
Some((v, entry)) => {
|
||||
let compress_type = file_compress_type(entry.name());
|
||||
let bytes = self
|
||||
.object_store
|
||||
.read(entry.path())
|
||||
.await
|
||||
.context(ReadObjectSnafu { path: entry.path() })?;
|
||||
let data = compress_type
|
||||
.decode(bytes)
|
||||
.await
|
||||
.context(DecompressObjectSnafu {
|
||||
compress_type,
|
||||
path: entry.path(),
|
||||
})?;
|
||||
Ok(Some((v, data)))
|
||||
}
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct ManifestObjectStore {
|
||||
object_store: ObjectStore,
|
||||
compress_type: CompressionType,
|
||||
path: String,
|
||||
}
|
||||
|
||||
impl ManifestObjectStore {
|
||||
pub fn new(path: &str, object_store: ObjectStore, compress_type: CompressionType) -> Self {
|
||||
Self {
|
||||
object_store,
|
||||
compress_type,
|
||||
path: util::normalize_dir(path),
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
/// Returns the delta file path under the **current** compression algorithm
|
||||
fn delta_file_path(&self, version: ManifestVersion) -> String {
|
||||
gen_path(&self.path, &delta_file(version), self.compress_type)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
/// Returns the checkpoint file path under the **current** compression algorithm
|
||||
fn checkpoint_file_path(&self, version: ManifestVersion) -> String {
|
||||
gen_path(&self.path, &checkpoint_file(version), self.compress_type)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
/// Returns the last checkpoint path, because the last checkpoint is not compressed,
|
||||
/// so its path name has nothing to do with the compression algorithm used by `ManifestObjectStore`
|
||||
fn last_checkpoint_path(&self) -> String {
|
||||
format!("{}{}", self.path, LAST_CHECKPOINT_FILE)
|
||||
}
|
||||
|
||||
/// Return all `R`s in the root directory that meet the `filter` conditions (that is, the `filter` closure returns `Some(R)`),
|
||||
/// and discard `R` that does not meet the conditions (that is, the `filter` closure returns `None`)
|
||||
async fn get_paths<F, R>(&self, filter: F) -> Result<Vec<R>>
|
||||
where
|
||||
F: Fn(Entry) -> Option<R>,
|
||||
{
|
||||
let streamer = self
|
||||
.object_store
|
||||
.lister_with(&self.path)
|
||||
.await
|
||||
.context(ListObjectsSnafu { path: &self.path })?;
|
||||
streamer
|
||||
.try_filter_map(|e| async { Ok(filter(e)) })
|
||||
.try_collect::<Vec<_>>()
|
||||
.await
|
||||
.context(ListObjectsSnafu { path: &self.path })
|
||||
}
|
||||
|
||||
pub(crate) fn path(&self) -> &str {
|
||||
&self.path
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
struct CheckpointMetadata {
|
||||
pub size: usize,
|
||||
/// The latest version this checkpoint contains.
|
||||
pub version: ManifestVersion,
|
||||
pub checksum: Option<String>,
|
||||
pub extend_metadata: Option<HashMap<String, String>>,
|
||||
}
|
||||
|
||||
impl CheckpointMetadata {
|
||||
fn encode(&self) -> Result<impl AsRef<[u8]>> {
|
||||
serde_json::to_string(self).context(EncodeJsonSnafu)
|
||||
}
|
||||
|
||||
fn decode(bs: &[u8]) -> Result<Self> {
|
||||
let data = std::str::from_utf8(bs).context(Utf8Snafu)?;
|
||||
|
||||
serde_json::from_str(data).context(DecodeJsonSnafu)
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl ManifestLogStorage for ManifestObjectStore {
|
||||
type Error = Error;
|
||||
type Iter = ObjectStoreLogIterator;
|
||||
|
||||
async fn scan(
|
||||
&self,
|
||||
start: ManifestVersion,
|
||||
end: ManifestVersion,
|
||||
) -> Result<ObjectStoreLogIterator> {
|
||||
ensure!(start <= end, InvalidScanIndexSnafu { start, end });
|
||||
|
||||
let mut entries: Vec<(ManifestVersion, Entry)> = self
|
||||
.get_paths(|entry| {
|
||||
let file_name = entry.name();
|
||||
if is_delta_file(file_name) {
|
||||
let version = file_version(file_name);
|
||||
if start <= version && version < end {
|
||||
return Some((version, entry));
|
||||
}
|
||||
}
|
||||
None
|
||||
})
|
||||
.await?;
|
||||
|
||||
entries.sort_unstable_by(|(v1, _), (v2, _)| v1.cmp(v2));
|
||||
|
||||
Ok(ObjectStoreLogIterator {
|
||||
object_store: self.object_store.clone(),
|
||||
iter: Box::new(entries.into_iter()),
|
||||
})
|
||||
}
|
||||
|
||||
async fn delete_until(
|
||||
&self,
|
||||
end: ManifestVersion,
|
||||
keep_last_checkpoint: bool,
|
||||
) -> Result<usize> {
|
||||
// Stores (entry, is_checkpoint, version) in a Vec.
|
||||
let entries: Vec<_> = self
|
||||
.get_paths(|entry| {
|
||||
let file_name = entry.name();
|
||||
let is_checkpoint = is_checkpoint_file(file_name);
|
||||
if is_delta_file(file_name) || is_checkpoint_file(file_name) {
|
||||
let version = file_version(file_name);
|
||||
if version < end {
|
||||
return Some((entry, is_checkpoint, version));
|
||||
}
|
||||
}
|
||||
None
|
||||
})
|
||||
.await?;
|
||||
let checkpoint_version = if keep_last_checkpoint {
|
||||
// Note that the order of entries is unspecific.
|
||||
entries
|
||||
.iter()
|
||||
.filter_map(
|
||||
|(_e, is_checkpoint, version)| {
|
||||
if *is_checkpoint {
|
||||
Some(version)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
},
|
||||
)
|
||||
.max()
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let paths: Vec<_> = entries
|
||||
.iter()
|
||||
.filter(|(_e, is_checkpoint, version)| {
|
||||
if let Some(max_version) = checkpoint_version {
|
||||
if *is_checkpoint {
|
||||
// We need to keep the checkpoint file.
|
||||
version < max_version
|
||||
} else {
|
||||
// We can delete the log file with max_version as the checkpoint
|
||||
// file contains the log file's content.
|
||||
version <= max_version
|
||||
}
|
||||
} else {
|
||||
true
|
||||
}
|
||||
})
|
||||
.map(|e| e.0.path().to_string())
|
||||
.collect();
|
||||
let ret = paths.len();
|
||||
|
||||
logging::debug!(
|
||||
"Deleting {} logs from manifest storage path {} until {}, checkpoint: {:?}, paths: {:?}",
|
||||
ret,
|
||||
self.path,
|
||||
end,
|
||||
checkpoint_version,
|
||||
paths,
|
||||
);
|
||||
|
||||
self.object_store
|
||||
.remove(paths)
|
||||
.await
|
||||
.with_context(|_| DeleteObjectSnafu {
|
||||
path: self.path.clone(),
|
||||
})?;
|
||||
|
||||
Ok(ret)
|
||||
}
|
||||
|
||||
async fn delete_all(&self, remove_action_manifest: ManifestVersion) -> Result<()> {
|
||||
let entries: Vec<Entry> = self.get_paths(Some).await?;
|
||||
|
||||
// Filter out the latest delta file.
|
||||
let paths: Vec<_> = entries
|
||||
.iter()
|
||||
.filter(|e| {
|
||||
let name = e.name();
|
||||
if is_delta_file(name) && file_version(name) == remove_action_manifest {
|
||||
return false;
|
||||
}
|
||||
true
|
||||
})
|
||||
.map(|e| e.path().to_string())
|
||||
.collect();
|
||||
|
||||
logging::info!(
|
||||
"Deleting {} from manifest storage path {} paths: {:?}",
|
||||
paths.len(),
|
||||
self.path,
|
||||
paths,
|
||||
);
|
||||
|
||||
// Delete all files except the latest delta file.
|
||||
self.object_store
|
||||
.remove(paths)
|
||||
.await
|
||||
.with_context(|_| DeleteObjectSnafu {
|
||||
path: self.path.clone(),
|
||||
})?;
|
||||
|
||||
// Delete the latest delta file and the manifest directory.
|
||||
self.object_store
|
||||
.remove_all(&self.path)
|
||||
.await
|
||||
.with_context(|_| DeleteObjectSnafu {
|
||||
path: self.path.clone(),
|
||||
})?;
|
||||
logging::info!("Deleted manifest storage path {}", self.path);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn save(&self, version: ManifestVersion, bytes: &[u8]) -> Result<()> {
|
||||
let path = self.delta_file_path(version);
|
||||
logging::debug!("Save log to manifest storage, version: {}", version);
|
||||
let data = self
|
||||
.compress_type
|
||||
.encode(bytes)
|
||||
.await
|
||||
.context(CompressObjectSnafu {
|
||||
compress_type: self.compress_type,
|
||||
path: &path,
|
||||
})?;
|
||||
self.object_store
|
||||
.write(&path, data)
|
||||
.await
|
||||
.context(WriteObjectSnafu { path })
|
||||
}
|
||||
|
||||
async fn delete(&self, start: ManifestVersion, end: ManifestVersion) -> Result<()> {
|
||||
ensure!(start <= end, InvalidScanIndexSnafu { start, end });
|
||||
|
||||
// Due to backward compatibility, it is possible that the user's log between start and end has not been compressed,
|
||||
// so we need to delete the uncompressed file corresponding to that version, even if the uncompressed file in that version do not exist.
|
||||
let mut paths = Vec::with_capacity(((end - start) * 2) as usize);
|
||||
for version in start..end {
|
||||
paths.push(raw_normalize_path(&self.delta_file_path(version)));
|
||||
if self.compress_type != FALL_BACK_COMPRESS_TYPE {
|
||||
paths.push(raw_normalize_path(&gen_path(
|
||||
&self.path,
|
||||
&delta_file(version),
|
||||
FALL_BACK_COMPRESS_TYPE,
|
||||
)));
|
||||
}
|
||||
}
|
||||
|
||||
logging::debug!(
|
||||
"Deleting logs from manifest storage, start: {}, end: {}",
|
||||
start,
|
||||
end
|
||||
);
|
||||
|
||||
self.object_store
|
||||
.remove(paths.clone())
|
||||
.await
|
||||
.with_context(|_| DeleteObjectSnafu {
|
||||
path: paths.join(","),
|
||||
})?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn save_checkpoint(&self, version: ManifestVersion, bytes: &[u8]) -> Result<()> {
|
||||
let path = self.checkpoint_file_path(version);
|
||||
let data = self
|
||||
.compress_type
|
||||
.encode(bytes)
|
||||
.await
|
||||
.context(CompressObjectSnafu {
|
||||
compress_type: self.compress_type,
|
||||
path: &path,
|
||||
})?;
|
||||
self.object_store
|
||||
.write(&path, data)
|
||||
.await
|
||||
.context(WriteObjectSnafu { path })?;
|
||||
|
||||
// Because last checkpoint file only contain size and version, which is tiny, so we don't compress it.
|
||||
let last_checkpoint_path = self.last_checkpoint_path();
|
||||
|
||||
let checkpoint_metadata = CheckpointMetadata {
|
||||
size: bytes.len(),
|
||||
version,
|
||||
checksum: None,
|
||||
extend_metadata: None,
|
||||
};
|
||||
|
||||
logging::debug!(
|
||||
"Save checkpoint in path: {}, metadata: {:?}",
|
||||
last_checkpoint_path,
|
||||
checkpoint_metadata
|
||||
);
|
||||
|
||||
let bs = checkpoint_metadata.encode()?;
|
||||
self.object_store
|
||||
.write(&last_checkpoint_path, bs.as_ref().to_vec())
|
||||
.await
|
||||
.context(WriteObjectSnafu {
|
||||
path: last_checkpoint_path,
|
||||
})?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn load_checkpoint(
|
||||
&self,
|
||||
version: ManifestVersion,
|
||||
) -> Result<Option<(ManifestVersion, Vec<u8>)>> {
|
||||
let path = self.checkpoint_file_path(version);
|
||||
// Due to backward compatibility, it is possible that the user's checkpoint not compressed,
|
||||
// so if we don't find file by compressed type. fall back to checkpoint not compressed find again.
|
||||
let checkpoint_data =
|
||||
match self.object_store.read(&path).await {
|
||||
Ok(checkpoint) => {
|
||||
let decompress_data = self.compress_type.decode(checkpoint).await.context(
|
||||
DecompressObjectSnafu {
|
||||
compress_type: self.compress_type,
|
||||
path,
|
||||
},
|
||||
)?;
|
||||
Ok(Some(decompress_data))
|
||||
}
|
||||
Err(e) => {
|
||||
if e.kind() == ErrorKind::NotFound {
|
||||
if self.compress_type != FALL_BACK_COMPRESS_TYPE {
|
||||
let fall_back_path = gen_path(
|
||||
&self.path,
|
||||
&checkpoint_file(version),
|
||||
FALL_BACK_COMPRESS_TYPE,
|
||||
);
|
||||
logging::debug!(
|
||||
"Failed to load checkpoint from path: {}, fall back to path: {}",
|
||||
path,
|
||||
fall_back_path
|
||||
);
|
||||
match self.object_store.read(&fall_back_path).await {
|
||||
Ok(checkpoint) => {
|
||||
let decompress_data = FALL_BACK_COMPRESS_TYPE
|
||||
.decode(checkpoint)
|
||||
.await
|
||||
.context(DecompressObjectSnafu {
|
||||
compress_type: FALL_BACK_COMPRESS_TYPE,
|
||||
path,
|
||||
})?;
|
||||
Ok(Some(decompress_data))
|
||||
}
|
||||
Err(e) if e.kind() == ErrorKind::NotFound => Ok(None),
|
||||
Err(e) => Err(e).context(ReadObjectSnafu {
|
||||
path: &fall_back_path,
|
||||
}),
|
||||
}
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
} else {
|
||||
Err(e).context(ReadObjectSnafu { path: &path })
|
||||
}
|
||||
}
|
||||
}?;
|
||||
Ok(checkpoint_data.map(|data| (version, data)))
|
||||
}
|
||||
|
||||
async fn delete_checkpoint(&self, version: ManifestVersion) -> Result<()> {
|
||||
// Due to backward compatibility, it is possible that the user's checkpoint file has not been compressed,
|
||||
// so we need to delete the uncompressed checkpoint file corresponding to that version, even if the uncompressed checkpoint file in that version do not exist.
|
||||
let paths = if self.compress_type != FALL_BACK_COMPRESS_TYPE {
|
||||
vec![
|
||||
raw_normalize_path(&self.checkpoint_file_path(version)),
|
||||
raw_normalize_path(&gen_path(
|
||||
&self.path,
|
||||
&checkpoint_file(version),
|
||||
FALL_BACK_COMPRESS_TYPE,
|
||||
)),
|
||||
]
|
||||
} else {
|
||||
vec![raw_normalize_path(&self.checkpoint_file_path(version))]
|
||||
};
|
||||
|
||||
self.object_store
|
||||
.remove(paths.clone())
|
||||
.await
|
||||
.context(DeleteObjectSnafu {
|
||||
path: paths.join(","),
|
||||
})?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn load_last_checkpoint(&self) -> Result<Option<(ManifestVersion, Vec<u8>)>> {
|
||||
let last_checkpoint_path = self.last_checkpoint_path();
|
||||
let last_checkpoint_data = match self.object_store.read(&last_checkpoint_path).await {
|
||||
Ok(data) => data,
|
||||
Err(e) if e.kind() == ErrorKind::NotFound => {
|
||||
return Ok(None);
|
||||
}
|
||||
Err(e) => {
|
||||
return Err(e).context(ReadObjectSnafu {
|
||||
path: last_checkpoint_path,
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
let checkpoint_metadata = CheckpointMetadata::decode(&last_checkpoint_data)?;
|
||||
|
||||
logging::debug!(
|
||||
"Load checkpoint in path: {}, metadata: {:?}",
|
||||
last_checkpoint_path,
|
||||
checkpoint_metadata
|
||||
);
|
||||
|
||||
self.load_checkpoint(checkpoint_metadata.version).await
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use common_test_util::temp_dir::create_temp_dir;
    use object_store::services::Fs;
    use object_store::ObjectStore;

    use super::*;

    /// Creates a manifest store on the local file system, rooted at a fresh temporary directory.
    ///
    /// NOTE(review): `tmp_dir` goes out of scope when this function returns. If the guard
    /// removes the directory on drop, the store outlives its directory — confirm this is intended.
    fn new_test_manifest_store() -> ManifestObjectStore {
        common_telemetry::init_default_ut_logging();
        let tmp_dir = create_temp_dir("test_manifest_log_store");
        let mut builder = Fs::default();
        let _ = builder.root(&tmp_dir.path().to_string_lossy());
        let object_store = ObjectStore::new(builder).unwrap().finish();
        ManifestObjectStore::new("/", object_store, CompressionType::Uncompressed)
    }

    /// Saves a `hello, {v}` log for every version `v` in `versions`.
    async fn save_hello_logs(
        log_store: &ManifestObjectStore,
        versions: std::ops::Range<ManifestVersion>,
    ) {
        for v in versions {
            log_store
                .save(v, format!("hello, {v}").as_bytes())
                .await
                .unwrap();
        }
    }

    /// Asserts that the next logs yielded by `it` are `hello, {v}` for every `v` in
    /// `versions`, in order.
    async fn assert_hello_logs(
        it: &mut ObjectStoreLogIterator,
        versions: std::ops::Range<ManifestVersion>,
    ) {
        for v in versions {
            let (version, bytes) = it.next_log().await.unwrap().unwrap();
            assert_eq!(v, version);
            assert_eq!(format!("hello, {v}").as_bytes(), bytes);
        }
    }

    #[test]
    // Define this test mainly to prevent future unintentional changes may break the backward compatibility.
    fn test_compress_file_path_generation() {
        let path = "/foo/bar/";
        let version: ManifestVersion = 0;
        let file_path = gen_path(path, &delta_file(version), CompressionType::Gzip);
        assert_eq!(file_path.as_str(), "/foo/bar/00000000000000000000.json.gz")
    }

    #[tokio::test]
    async fn test_manifest_log_store_uncompress() {
        let mut log_store = new_test_manifest_store();
        log_store.compress_type = CompressionType::Uncompressed;
        test_manifest_log_store_case(log_store).await;
    }

    #[tokio::test]
    async fn test_manifest_log_store_compress() {
        let mut log_store = new_test_manifest_store();
        log_store.compress_type = CompressionType::Gzip;
        test_manifest_log_store_case(log_store).await;
    }

    async fn test_manifest_log_store_case(log_store: ManifestObjectStore) {
        save_hello_logs(&log_store, 0..5).await;

        let mut it = log_store.scan(1, 4).await.unwrap();
        assert_hello_logs(&mut it, 1..4).await;
        assert!(it.next_log().await.unwrap().is_none());

        let mut it = log_store.scan(0, 11).await.unwrap();
        assert_hello_logs(&mut it, 0..5).await;
        assert!(it.next_log().await.unwrap().is_none());

        // Delete [0, 3)
        log_store.delete(0, 3).await.unwrap();

        // [3, 5) remains
        let mut it = log_store.scan(0, 11).await.unwrap();
        assert_hello_logs(&mut it, 3..5).await;
        assert!(it.next_log().await.unwrap().is_none());

        // test checkpoint
        assert!(log_store.load_last_checkpoint().await.unwrap().is_none());
        log_store
            .save_checkpoint(3, "checkpoint".as_bytes())
            .await
            .unwrap();

        let (v, checkpoint) = log_store.load_last_checkpoint().await.unwrap().unwrap();
        assert_eq!(checkpoint, "checkpoint".as_bytes());
        assert_eq!(3, v);

        // delete (,4) logs and keep checkpoint 3.
        let _ = log_store.delete_until(4, true).await.unwrap();
        let _ = log_store.load_checkpoint(3).await.unwrap().unwrap();
        let _ = log_store.load_last_checkpoint().await.unwrap().unwrap();
        let mut it = log_store.scan(0, 11).await.unwrap();
        let (version, bytes) = it.next_log().await.unwrap().unwrap();
        assert_eq!(4, version);
        assert_eq!("hello, 4".as_bytes(), bytes);
        assert!(it.next_log().await.unwrap().is_none());

        // delete all logs and checkpoints
        let _ = log_store.delete_until(11, false).await.unwrap();
        assert!(log_store.load_checkpoint(3).await.unwrap().is_none());
        assert!(log_store.load_last_checkpoint().await.unwrap().is_none());
        let mut it = log_store.scan(0, 11).await.unwrap();
        assert!(it.next_log().await.unwrap().is_none());
    }

    #[tokio::test]
    // test ManifestObjectStore can read/delete previously uncompressed data correctly
    async fn test_compress_backward_compatible() {
        let mut log_store = new_test_manifest_store();

        // write uncompressed data to simulate previously uncompressed data
        log_store.compress_type = CompressionType::Uncompressed;
        save_hello_logs(&log_store, 0..5).await;
        log_store
            .save_checkpoint(5, "checkpoint_uncompressed".as_bytes())
            .await
            .unwrap();

        // change compress type
        log_store.compress_type = CompressionType::Gzip;

        // test load_last_checkpoint work correctly for previously uncompressed data
        let (v, checkpoint) = log_store.load_last_checkpoint().await.unwrap().unwrap();
        assert_eq!(v, 5);
        assert_eq!(checkpoint, "checkpoint_uncompressed".as_bytes());

        // write compressed data to simulate the compression algorithm taking effect
        save_hello_logs(&log_store, 5..10).await;
        log_store
            .save_checkpoint(10, "checkpoint_compressed".as_bytes())
            .await
            .unwrap();

        // test data reading
        let mut it = log_store.scan(0, 10).await.unwrap();
        assert_hello_logs(&mut it, 0..10).await;
        let (v, checkpoint) = log_store.load_checkpoint(5).await.unwrap().unwrap();
        assert_eq!(v, 5);
        assert_eq!(checkpoint, "checkpoint_uncompressed".as_bytes());
        let (v, checkpoint) = log_store.load_last_checkpoint().await.unwrap().unwrap();
        assert_eq!(v, 10);
        assert_eq!(checkpoint, "checkpoint_compressed".as_bytes());

        // Delete previously uncompressed checkpoint
        log_store.delete_checkpoint(5).await.unwrap();
        assert!(log_store.load_checkpoint(5).await.unwrap().is_none());

        // Delete [3, 7), contain uncompressed/compressed data
        log_store.delete(3, 7).await.unwrap();
        // [3, 7) deleted
        let mut it = log_store.scan(3, 7).await.unwrap();
        assert!(it.next_log().await.unwrap().is_none());

        // Delete until 10, contain uncompressed/compressed data
        // log 0, 1, 2, 7, 8, 9 will be deleted
        assert_eq!(6, log_store.delete_until(10, false).await.unwrap());
        let mut it = log_store.scan(0, 10).await.unwrap();
        assert!(it.next_log().await.unwrap().is_none());
    }
}
|
||||
@@ -1,83 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use datatypes::type_id::LogicalTypeId;
|
||||
use store_api::storage::SequenceNumber;
|
||||
|
||||
use crate::manifest::action::*;
|
||||
use crate::metadata::RegionMetadata;
|
||||
use crate::sst::{FileId, FileMeta};
|
||||
use crate::test_util::descriptor_util::RegionDescBuilder;
|
||||
|
||||
/// File size assigned to every `FileMeta` built by the test helpers in this module.
pub const DEFAULT_TEST_FILE_SIZE: u64 = 1024;
|
||||
|
||||
pub fn build_region_meta() -> RegionMetadata {
|
||||
let region_name = "region-0";
|
||||
let desc = RegionDescBuilder::new(region_name)
|
||||
.id(0)
|
||||
.push_key_column(("k1", LogicalTypeId::Int32, false))
|
||||
.push_field_column(("v1", LogicalTypeId::Float32, true))
|
||||
.build();
|
||||
desc.try_into().unwrap()
|
||||
}
|
||||
|
||||
pub fn build_altered_region_meta() -> RegionMetadata {
|
||||
let region_name = "region-0";
|
||||
let desc = RegionDescBuilder::new(region_name)
|
||||
.id(0)
|
||||
.push_key_column(("k1", LogicalTypeId::Int32, false))
|
||||
.push_field_column(("v1", LogicalTypeId::Float32, true))
|
||||
.push_field_column(("v2", LogicalTypeId::Float32, true))
|
||||
.build();
|
||||
desc.try_into().unwrap()
|
||||
}
|
||||
|
||||
pub fn build_region_edit(
|
||||
sequence: SequenceNumber,
|
||||
files_to_add: &[FileId],
|
||||
files_to_remove: &[FileId],
|
||||
) -> RegionEdit {
|
||||
RegionEdit {
|
||||
region_version: 0,
|
||||
flushed_sequence: Some(sequence),
|
||||
files_to_add: files_to_add
|
||||
.iter()
|
||||
.map(|f| FileMeta {
|
||||
region_id: 0.into(),
|
||||
file_id: *f,
|
||||
time_range: None,
|
||||
level: 0,
|
||||
file_size: DEFAULT_TEST_FILE_SIZE,
|
||||
})
|
||||
.collect(),
|
||||
files_to_remove: files_to_remove
|
||||
.iter()
|
||||
.map(|f| FileMeta {
|
||||
region_id: 0.into(),
|
||||
file_id: *f,
|
||||
time_range: None,
|
||||
level: 0,
|
||||
file_size: DEFAULT_TEST_FILE_SIZE,
|
||||
})
|
||||
.collect(),
|
||||
compaction_time_window: None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn build_region_truncate(committed_sequence: u64) -> RegionTruncate {
|
||||
RegionTruncate {
|
||||
region_id: 0.into(),
|
||||
committed_sequence,
|
||||
}
|
||||
}
|
||||
@@ -1,294 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
mod btree;
|
||||
mod inserter;
|
||||
#[cfg(test)]
|
||||
pub mod tests;
|
||||
mod version;
|
||||
|
||||
use std::fmt;
|
||||
use std::sync::atomic::{AtomicBool, AtomicU32, AtomicUsize, Ordering};
|
||||
use std::sync::Arc;
|
||||
|
||||
use api::v1::OpType;
|
||||
use common_time::range::TimestampRange;
|
||||
use common_time::Timestamp;
|
||||
use datatypes::vectors::VectorRef;
|
||||
use store_api::storage::{consts, SequenceNumber};
|
||||
|
||||
use crate::error::Result;
|
||||
use crate::flush::FlushStrategyRef;
|
||||
use crate::memtable::btree::BTreeMemtable;
|
||||
pub use crate::memtable::inserter::Inserter;
|
||||
pub use crate::memtable::version::MemtableVersion;
|
||||
use crate::metrics::WRITE_BUFFER_BYTES;
|
||||
use crate::read::Batch;
|
||||
use crate::schema::{ProjectedSchemaRef, RegionSchemaRef};
|
||||
|
||||
/// Id that uniquely identifies a memtable among the memtables of the same region.
pub type MemtableId = u32;
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
pub struct MemtableStats {
|
||||
/// The estimated bytes allocated by this memtable from heap. Result
|
||||
/// of this method may be larger than the estimated based on `num_rows` because
|
||||
/// of the implementor's pre-alloc behavior.
|
||||
pub estimated_bytes: usize,
|
||||
/// The max timestamp that this memtable contains.
|
||||
pub max_timestamp: Timestamp,
|
||||
/// The min timestamp that this memtable contains.
|
||||
pub min_timestamp: Timestamp,
|
||||
}
|
||||
|
||||
impl MemtableStats {
|
||||
pub fn bytes_allocated(&self) -> usize {
|
||||
self.estimated_bytes
|
||||
}
|
||||
}
|
||||
|
||||
/// In memory storage.
|
||||
pub trait Memtable: Send + Sync + fmt::Debug {
|
||||
/// Returns id of this memtable.
|
||||
fn id(&self) -> MemtableId;
|
||||
|
||||
/// Returns schema of the memtable.
|
||||
fn schema(&self) -> RegionSchemaRef;
|
||||
|
||||
/// Write key/values to the memtable.
|
||||
///
|
||||
/// # Panics
|
||||
/// Panics if the schema of key/value differs from memtable's schema.
|
||||
fn write(&self, kvs: &KeyValues) -> Result<()>;
|
||||
|
||||
/// Iterates the memtable.
|
||||
fn iter(&self, ctx: IterContext) -> Result<BoxedBatchIterator>;
|
||||
|
||||
/// Returns the number of rows in the memtable.
|
||||
fn num_rows(&self) -> usize;
|
||||
|
||||
/// Returns stats of this memtable.
|
||||
fn stats(&self) -> MemtableStats;
|
||||
|
||||
/// Mark the memtable is immutable.
|
||||
///
|
||||
/// The region MUST call this inside the region writer's write lock.
|
||||
fn mark_immutable(&self);
|
||||
}
|
||||
|
||||
/// Shared, reference-counted handle to a [`Memtable`].
pub type MemtableRef = Arc<dyn Memtable>;
|
||||
|
||||
/// Context for iterating memtable.
|
||||
///
|
||||
/// Should be cheap to clone.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct IterContext {
|
||||
/// The suggested batch size of the iterator.
|
||||
pub batch_size: usize,
|
||||
/// Max visible sequence (inclusive).
|
||||
pub visible_sequence: SequenceNumber,
|
||||
|
||||
/// Schema the reader expect to read.
|
||||
///
|
||||
/// Set to `None` to read all columns.
|
||||
pub projected_schema: Option<ProjectedSchemaRef>,
|
||||
|
||||
/// Timestamp range
|
||||
pub time_range: Option<TimestampRange>,
|
||||
}
|
||||
|
||||
impl Default for IterContext {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
batch_size: consts::READ_BATCH_SIZE,
|
||||
// All data in memory is visible by default.
|
||||
visible_sequence: SequenceNumber::MAX,
|
||||
projected_schema: None,
|
||||
time_range: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Ordering of the rows produced by an iterator.
#[derive(Debug, PartialEq, Eq)]
pub enum RowOrdering {
    /// The output rows are unordered.
    Unordered,

    /// The output rows are ordered by key.
    Key,
}
|
||||
|
||||
/// Iterator of memtable.
|
||||
///
|
||||
/// Since data of memtable are stored in memory, so avoid defining this trait
|
||||
/// as an async trait.
|
||||
pub trait BatchIterator: Iterator<Item = Result<Batch>> + Send + Sync {
|
||||
/// Returns the schema of this iterator.
|
||||
fn schema(&self) -> ProjectedSchemaRef;
|
||||
|
||||
/// Returns the ordering of the output rows from this iterator.
|
||||
fn ordering(&self) -> RowOrdering;
|
||||
}
|
||||
|
||||
/// Boxed, dynamically dispatched [`BatchIterator`].
pub type BoxedBatchIterator = Box<dyn BatchIterator>;
|
||||
|
||||
pub trait MemtableBuilder: Send + Sync + fmt::Debug {
|
||||
fn build(&self, schema: RegionSchemaRef) -> MemtableRef;
|
||||
}
|
||||
|
||||
/// Shared, reference-counted handle to a [`MemtableBuilder`].
pub type MemtableBuilderRef = Arc<dyn MemtableBuilder>;
|
||||
|
||||
/// Key-value pairs in columnar format.
|
||||
pub struct KeyValues {
|
||||
pub sequence: SequenceNumber,
|
||||
pub op_type: OpType,
|
||||
/// Start index of these key-value paris in batch. Each row in the same batch has
|
||||
/// a unique index to identify it.
|
||||
pub start_index_in_batch: usize,
|
||||
pub keys: Vec<VectorRef>,
|
||||
pub values: Vec<VectorRef>,
|
||||
pub timestamp: Option<VectorRef>,
|
||||
}
|
||||
|
||||
impl KeyValues {
|
||||
// Note that `sequence` is not reset.
|
||||
fn reset(&mut self, op_type: OpType, index_in_batch: usize) {
|
||||
self.op_type = op_type;
|
||||
self.start_index_in_batch = index_in_batch;
|
||||
self.keys.clear();
|
||||
self.values.clear();
|
||||
self.timestamp = None;
|
||||
}
|
||||
|
||||
pub fn len(&self) -> usize {
|
||||
self.timestamp.as_ref().map(|v| v.len()).unwrap_or_default()
|
||||
}
|
||||
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.len() == 0
|
||||
}
|
||||
|
||||
pub fn estimated_memory_size(&self) -> usize {
|
||||
self.keys.iter().fold(0, |acc, v| acc + v.memory_size())
|
||||
+ self.values.iter().fold(0, |acc, v| acc + v.memory_size())
|
||||
+ self
|
||||
.timestamp
|
||||
.as_ref()
|
||||
.map(|t| t.memory_size())
|
||||
.unwrap_or_default()
|
||||
}
|
||||
}
|
||||
|
||||
/// Memtable memory allocation tracker.
|
||||
pub struct AllocTracker {
|
||||
flush_strategy: Option<FlushStrategyRef>,
|
||||
/// Bytes allocated by the tracker.
|
||||
bytes_allocated: AtomicUsize,
|
||||
/// Whether allocating is done.
|
||||
is_done_allocating: AtomicBool,
|
||||
}
|
||||
|
||||
impl fmt::Debug for AllocTracker {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
f.debug_struct("AllocTracker")
|
||||
.field("bytes_allocated", &self.bytes_allocated)
|
||||
.field("is_done_allocating", &self.is_done_allocating)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
impl AllocTracker {
|
||||
/// Returns a new [AllocTracker].
|
||||
pub fn new(flush_strategy: Option<FlushStrategyRef>) -> AllocTracker {
|
||||
AllocTracker {
|
||||
flush_strategy,
|
||||
bytes_allocated: AtomicUsize::new(0),
|
||||
is_done_allocating: AtomicBool::new(false),
|
||||
}
|
||||
}
|
||||
|
||||
/// Tracks `bytes` memory is allocated.
|
||||
pub(crate) fn on_allocate(&self, bytes: usize) {
|
||||
let _ = self.bytes_allocated.fetch_add(bytes, Ordering::Relaxed);
|
||||
WRITE_BUFFER_BYTES.add(bytes as i64);
|
||||
if let Some(flush_strategy) = &self.flush_strategy {
|
||||
flush_strategy.reserve_mem(bytes);
|
||||
}
|
||||
}
|
||||
|
||||
/// Marks we have finished allocating memory so we can free it from
|
||||
/// the write buffer's limit.
|
||||
///
|
||||
/// The region MUST ensure that it calls this method inside the region writer's write lock.
|
||||
pub(crate) fn done_allocating(&self) {
|
||||
if let Some(flush_strategy) = &self.flush_strategy {
|
||||
if self
|
||||
.is_done_allocating
|
||||
.compare_exchange(false, true, Ordering::Relaxed, Ordering::Relaxed)
|
||||
.is_ok()
|
||||
{
|
||||
flush_strategy.schedule_free_mem(self.bytes_allocated.load(Ordering::Relaxed));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns bytes allocated.
|
||||
pub(crate) fn bytes_allocated(&self) -> usize {
|
||||
self.bytes_allocated.load(Ordering::Relaxed)
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for AllocTracker {
|
||||
fn drop(&mut self) {
|
||||
if !self.is_done_allocating.load(Ordering::Relaxed) {
|
||||
self.done_allocating();
|
||||
}
|
||||
|
||||
let bytes_allocated = self.bytes_allocated.load(Ordering::Relaxed);
|
||||
WRITE_BUFFER_BYTES.sub(bytes_allocated as i64);
|
||||
|
||||
// Memory tracked by this tracker is freed.
|
||||
if let Some(flush_strategy) = &self.flush_strategy {
|
||||
flush_strategy.free_mem(bytes_allocated);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Default memtable builder that builds `BTreeMemtable`.
|
||||
#[derive(Debug, Default)]
|
||||
pub struct DefaultMemtableBuilder {
|
||||
memtable_id: AtomicU32,
|
||||
flush_strategy: Option<FlushStrategyRef>,
|
||||
}
|
||||
|
||||
impl DefaultMemtableBuilder {
|
||||
/// Returns a new [DefaultMemtableBuilder] with specific `flush_strategy`.
|
||||
///
|
||||
/// If `flush_strategy` is `Some`, the memtable will report its memory usage
|
||||
/// to the `flush_strategy`.
|
||||
pub fn with_flush_strategy(flush_strategy: Option<FlushStrategyRef>) -> Self {
|
||||
Self {
|
||||
memtable_id: AtomicU32::new(0),
|
||||
flush_strategy,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl MemtableBuilder for DefaultMemtableBuilder {
|
||||
fn build(&self, schema: RegionSchemaRef) -> MemtableRef {
|
||||
let id = self.memtable_id.fetch_add(1, Ordering::Relaxed);
|
||||
Arc::new(BTreeMemtable::new(id, schema, self.flush_strategy.clone()))
|
||||
}
|
||||
}
|
||||
@@ -1,573 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::{btree_map, BTreeMap};
|
||||
use std::fmt;
|
||||
use std::ops::Bound;
|
||||
use std::sync::atomic::{AtomicI64, Ordering as AtomicOrdering};
|
||||
use std::sync::{Arc, RwLock};
|
||||
|
||||
use api::v1::OpType;
|
||||
use common_time::range::TimestampRange;
|
||||
use datatypes::data_type::DataType;
|
||||
use datatypes::prelude::*;
|
||||
use datatypes::value::Value;
|
||||
use datatypes::vectors::{UInt64Vector, UInt64VectorBuilder, UInt8Vector, UInt8VectorBuilder};
|
||||
use store_api::storage::{SequenceNumber, MIN_OP_TYPE};
|
||||
|
||||
use crate::error::Result;
|
||||
use crate::flush::FlushStrategyRef;
|
||||
use crate::memtable::{
|
||||
AllocTracker, BatchIterator, BoxedBatchIterator, IterContext, KeyValues, Memtable, MemtableId,
|
||||
MemtableStats, RowOrdering,
|
||||
};
|
||||
use crate::read::Batch;
|
||||
use crate::schema::compat::ReadAdapter;
|
||||
use crate::schema::{ProjectedSchema, ProjectedSchemaRef, RegionSchemaRef};
|
||||
|
||||
type RwLockMap = RwLock<BTreeMap<InnerKey, RowValue>>;
|
||||
|
||||
/// A simple memtable implementation based on std's [`BTreeMap`].
|
||||
///
|
||||
/// Mainly for test purpose, don't use in production.
|
||||
pub struct BTreeMemtable {
|
||||
id: MemtableId,
|
||||
schema: RegionSchemaRef,
|
||||
map: Arc<RwLockMap>,
|
||||
alloc_tracker: AllocTracker,
|
||||
max_timestamp: AtomicI64,
|
||||
min_timestamp: AtomicI64,
|
||||
}
|
||||
|
||||
impl BTreeMemtable {
|
||||
pub fn new(
|
||||
id: MemtableId,
|
||||
schema: RegionSchemaRef,
|
||||
flush_strategy: Option<FlushStrategyRef>,
|
||||
) -> BTreeMemtable {
|
||||
BTreeMemtable {
|
||||
id,
|
||||
schema,
|
||||
map: Arc::new(RwLock::new(BTreeMap::new())),
|
||||
alloc_tracker: AllocTracker::new(flush_strategy),
|
||||
max_timestamp: AtomicI64::new(i64::MIN),
|
||||
min_timestamp: AtomicI64::new(i64::MAX),
|
||||
}
|
||||
}
|
||||
|
||||
/// Updates memtable stats.
|
||||
/// This function is guarded by `BTreeMemtable::map` so that store-after-load is safe.
|
||||
fn update_stats(&self, request_size: usize, min: Option<Value>, max: Option<Value>) {
|
||||
self.alloc_tracker.on_allocate(request_size);
|
||||
|
||||
if let Some(min) = min {
|
||||
let min_val = min
|
||||
.as_timestamp()
|
||||
.expect("Min timestamp must be a valid timestamp value")
|
||||
.value();
|
||||
let cur_min = self.min_timestamp.load(AtomicOrdering::Relaxed);
|
||||
if min_val < cur_min {
|
||||
self.min_timestamp.store(min_val, AtomicOrdering::Relaxed);
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(max) = max {
|
||||
let cur_max = self.max_timestamp.load(AtomicOrdering::Relaxed);
|
||||
let max_val = max
|
||||
.as_timestamp()
|
||||
.expect("Max timestamp must be a valid timestamp value")
|
||||
.value();
|
||||
if max_val > cur_max {
|
||||
self.max_timestamp.store(max_val, AtomicOrdering::Relaxed);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for BTreeMemtable {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
let len = self.map.read().unwrap().len();
|
||||
|
||||
f.debug_struct("BTreeMemtable")
|
||||
.field("id", &self.id)
|
||||
// Only show StoreSchema
|
||||
.field("schema", &self.schema)
|
||||
.field("rows", &len)
|
||||
.field("alloc_tracker", &self.alloc_tracker)
|
||||
.field("max_timestamp", &self.max_timestamp)
|
||||
.field("min_timestamp", &self.min_timestamp)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
impl Memtable for BTreeMemtable {
|
||||
fn id(&self) -> MemtableId {
|
||||
self.id
|
||||
}
|
||||
|
||||
fn schema(&self) -> RegionSchemaRef {
|
||||
self.schema.clone()
|
||||
}
|
||||
|
||||
fn write(&self, kvs: &KeyValues) -> Result<()> {
|
||||
debug_assert!(kvs.timestamp.is_some());
|
||||
let iter_row = IterRow::new(kvs);
|
||||
let mut map = self.map.write().unwrap();
|
||||
|
||||
let mut min_ts = None;
|
||||
let mut max_ts = None;
|
||||
for (inner_key, row_value) in iter_row {
|
||||
let ts = inner_key.timestamp();
|
||||
let min_ts = min_ts.get_or_insert_with(|| ts.clone());
|
||||
let max_ts = max_ts.get_or_insert_with(|| ts.clone());
|
||||
if ts < min_ts {
|
||||
*min_ts = ts.clone();
|
||||
}
|
||||
if ts > max_ts {
|
||||
*max_ts = ts.clone();
|
||||
}
|
||||
let _ = map.insert(inner_key, row_value);
|
||||
}
|
||||
|
||||
self.update_stats(kvs.estimated_memory_size(), min_ts, max_ts);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn iter(&self, ctx: IterContext) -> Result<BoxedBatchIterator> {
|
||||
assert!(ctx.batch_size > 0);
|
||||
|
||||
let iter = BTreeIterator::new(ctx, self.schema.clone(), self.map.clone())?;
|
||||
|
||||
Ok(Box::new(iter))
|
||||
}
|
||||
|
||||
fn num_rows(&self) -> usize {
|
||||
self.map.read().unwrap().len()
|
||||
}
|
||||
|
||||
fn stats(&self) -> MemtableStats {
|
||||
let ts_meta = self.schema.column_metadata(self.schema.timestamp_index());
|
||||
|
||||
let Some(timestamp_type) = ts_meta.desc.data_type.as_timestamp() else {
|
||||
// safety: timestamp column always has timestamp type, otherwise it's a bug.
|
||||
panic!(
|
||||
"Timestamp column is not a valid timestamp type: {:?}",
|
||||
self.schema
|
||||
);
|
||||
};
|
||||
|
||||
MemtableStats {
|
||||
estimated_bytes: self.alloc_tracker.bytes_allocated(),
|
||||
max_timestamp: timestamp_type
|
||||
.create_timestamp(self.max_timestamp.load(AtomicOrdering::Relaxed)),
|
||||
min_timestamp: timestamp_type
|
||||
.create_timestamp(self.min_timestamp.load(AtomicOrdering::Relaxed)),
|
||||
}
|
||||
}
|
||||
|
||||
fn mark_immutable(&self) {
|
||||
self.alloc_tracker.done_allocating();
|
||||
}
|
||||
}
|
||||
|
||||
struct BTreeIterator {
|
||||
ctx: IterContext,
|
||||
/// Schema of this memtable.
|
||||
schema: RegionSchemaRef,
|
||||
/// Projected schema that user expect to read.
|
||||
projected_schema: ProjectedSchemaRef,
|
||||
adapter: ReadAdapter,
|
||||
map: Arc<RwLockMap>,
|
||||
last_key: Option<InnerKey>,
|
||||
}
|
||||
|
||||
impl BatchIterator for BTreeIterator {
|
||||
fn schema(&self) -> ProjectedSchemaRef {
|
||||
self.projected_schema.clone()
|
||||
}
|
||||
|
||||
fn ordering(&self) -> RowOrdering {
|
||||
RowOrdering::Key
|
||||
}
|
||||
}
|
||||
|
||||
impl Iterator for BTreeIterator {
|
||||
type Item = Result<Batch>;
|
||||
|
||||
fn next(&mut self) -> Option<Result<Batch>> {
|
||||
self.next_batch().transpose()
|
||||
}
|
||||
}
|
||||
|
||||
impl BTreeIterator {
|
||||
fn new(
|
||||
ctx: IterContext,
|
||||
schema: RegionSchemaRef,
|
||||
map: Arc<RwLockMap>,
|
||||
) -> Result<BTreeIterator> {
|
||||
let projected_schema = ctx
|
||||
.projected_schema
|
||||
.clone()
|
||||
.unwrap_or_else(|| Arc::new(ProjectedSchema::no_projection(schema.clone())));
|
||||
let adapter = ReadAdapter::new(schema.store_schema().clone(), projected_schema.clone())?;
|
||||
|
||||
Ok(BTreeIterator {
|
||||
ctx,
|
||||
schema,
|
||||
projected_schema,
|
||||
adapter,
|
||||
map,
|
||||
last_key: None,
|
||||
})
|
||||
}
|
||||
|
||||
fn next_batch(&mut self) -> Result<Option<Batch>> {
|
||||
let map = self.map.read().unwrap();
|
||||
let iter = if let Some(last_key) = &self.last_key {
|
||||
map.range((Bound::Excluded(last_key), Bound::Unbounded))
|
||||
} else {
|
||||
map.range(..)
|
||||
};
|
||||
|
||||
let iter = MapIterWrapper::new(iter, self.ctx.visible_sequence, self.ctx.time_range);
|
||||
let (keys, sequences, op_types, values) = collect_iter(iter, self.ctx.batch_size);
|
||||
|
||||
if keys.is_empty() {
|
||||
return Ok(None);
|
||||
}
|
||||
self.last_key = keys.last().map(|k| {
|
||||
let mut last_key = (*k).clone();
|
||||
last_key.reset_for_seek();
|
||||
last_key
|
||||
});
|
||||
|
||||
let key_data_types = self
|
||||
.schema
|
||||
.row_key_columns()
|
||||
.map(|column_meta| column_meta.desc.data_type.clone());
|
||||
let value_data_types = self
|
||||
.schema
|
||||
.field_columns()
|
||||
.map(|column_meta| column_meta.desc.data_type.clone());
|
||||
|
||||
let key_columns = rows_to_vectors(
|
||||
key_data_types,
|
||||
self.adapter.source_key_needed(),
|
||||
keys.as_slice(),
|
||||
);
|
||||
let field_columns = rows_to_vectors(
|
||||
value_data_types,
|
||||
self.adapter.source_value_needed(),
|
||||
values.as_slice(),
|
||||
);
|
||||
|
||||
let batch = self.adapter.batch_from_parts(
|
||||
key_columns,
|
||||
field_columns,
|
||||
Arc::new(sequences),
|
||||
Arc::new(op_types),
|
||||
)?;
|
||||
|
||||
Ok(Some(batch))
|
||||
}
|
||||
}
|
||||
|
||||
fn collect_iter<'a, I: Iterator<Item = (&'a InnerKey, &'a RowValue)>>(
|
||||
iter: I,
|
||||
batch_size: usize,
|
||||
) -> (
|
||||
Vec<&'a InnerKey>,
|
||||
UInt64Vector,
|
||||
UInt8Vector,
|
||||
Vec<&'a RowValue>,
|
||||
) {
|
||||
let mut keys = Vec::with_capacity(batch_size);
|
||||
let mut sequences = UInt64VectorBuilder::with_capacity(batch_size);
|
||||
let mut op_types = UInt8VectorBuilder::with_capacity(batch_size);
|
||||
let mut values = Vec::with_capacity(batch_size);
|
||||
for (inner_key, row_value) in iter.take(batch_size) {
|
||||
keys.push(inner_key);
|
||||
sequences.push(Some(inner_key.sequence));
|
||||
op_types.push(Some(inner_key.op_type as u8));
|
||||
values.push(row_value);
|
||||
}
|
||||
|
||||
(keys, sequences.finish(), op_types.finish(), values)
|
||||
}
|
||||
|
||||
/// `MapIterWrapper` removes same user key with invisible sequence.
|
||||
struct MapIterWrapper<'a, InnerKey, RowValue> {
|
||||
iter: btree_map::Range<'a, InnerKey, RowValue>,
|
||||
prev_key: Option<InnerKey>,
|
||||
visible_sequence: SequenceNumber,
|
||||
time_range: Option<TimestampRange>,
|
||||
}
|
||||
|
||||
impl<'a> MapIterWrapper<'a, InnerKey, RowValue> {
|
||||
fn new(
|
||||
iter: btree_map::Range<'a, InnerKey, RowValue>,
|
||||
visible_sequence: SequenceNumber,
|
||||
time_range: Option<TimestampRange>,
|
||||
) -> MapIterWrapper<'a, InnerKey, RowValue> {
|
||||
MapIterWrapper {
|
||||
iter,
|
||||
prev_key: None,
|
||||
visible_sequence,
|
||||
time_range,
|
||||
}
|
||||
}
|
||||
|
||||
fn next_visible_entry(&mut self) -> Option<(&'a InnerKey, &'a RowValue)> {
|
||||
for (k, v) in self.iter.by_ref() {
|
||||
if k.is_visible(self.visible_sequence) && k.is_in_time_range(&self.time_range) {
|
||||
return Some((k, v));
|
||||
}
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Iterator for MapIterWrapper<'a, InnerKey, RowValue> {
    type Item = (&'a InnerKey, &'a RowValue);

    /// Returns the next visible entry whose row key differs from the row key of
    /// the previously returned entry, so each row key is yielded at most once.
    fn next(&mut self) -> Option<(&'a InnerKey, &'a RowValue)> {
        let mut entry = self.next_visible_entry()?;

        // `take()` leaves `prev_key` empty; it is refilled below unless the
        // range runs out while skipping duplicates.
        if let Some(previous) = self.prev_key.take() {
            while previous.is_row_key_equal(entry.0) {
                entry = self.next_visible_entry()?;
            }
        }

        self.prev_key = Some(entry.0.clone());

        Some(entry)
    }
}
|
||||
|
||||
struct IterRow<'a> {
|
||||
kvs: &'a KeyValues,
|
||||
index: usize,
|
||||
len: usize,
|
||||
}
|
||||
|
||||
impl<'a> IterRow<'a> {
|
||||
fn new(kvs: &KeyValues) -> IterRow {
|
||||
IterRow {
|
||||
kvs,
|
||||
index: 0,
|
||||
len: kvs.len(),
|
||||
}
|
||||
}
|
||||
|
||||
fn fetch_row(&mut self) -> (InnerKey, RowValue) {
|
||||
let mut row_key: Vec<_> = self
|
||||
.kvs
|
||||
.keys
|
||||
.iter()
|
||||
.map(|vector| vector.get(self.index))
|
||||
.collect();
|
||||
|
||||
// unwrap safety: KeyValues always contains a timestamp as guaranteed in [Inserter::write_one_mutation]
|
||||
row_key.push(self.kvs.timestamp.as_ref().unwrap().get(self.index));
|
||||
let inner_key = InnerKey {
|
||||
row_key,
|
||||
sequence: self.kvs.sequence,
|
||||
index_in_batch: self.kvs.start_index_in_batch + self.index,
|
||||
op_type: self.kvs.op_type,
|
||||
};
|
||||
|
||||
let row_value = RowValue {
|
||||
values: self
|
||||
.kvs
|
||||
.values
|
||||
.iter()
|
||||
.map(|vector| vector.get(self.index))
|
||||
.collect(),
|
||||
};
|
||||
|
||||
self.index += 1;
|
||||
|
||||
(inner_key, row_value)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Iterator for IterRow<'a> {
|
||||
type Item = (InnerKey, RowValue);
|
||||
|
||||
fn next(&mut self) -> Option<(InnerKey, RowValue)> {
|
||||
if self.index >= self.len {
|
||||
return None;
|
||||
}
|
||||
|
||||
Some(self.fetch_row())
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> (usize, Option<usize>) {
|
||||
(self.kvs.keys.len(), Some(self.kvs.keys.len()))
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Eq)]
|
||||
struct InnerKey {
|
||||
/// User defined primary keys
|
||||
row_key: Vec<Value>,
|
||||
/// Sequence number of row
|
||||
sequence: SequenceNumber,
|
||||
index_in_batch: usize,
|
||||
op_type: OpType,
|
||||
}
|
||||
|
||||
impl Ord for InnerKey {
|
||||
fn cmp(&self, other: &InnerKey) -> Ordering {
|
||||
// Order by (row_key asc, sequence desc, index_in_batch desc, op_type desc), though (key,
|
||||
// sequence, index_in_batch) should be enough to disambiguate.
|
||||
self.row_key
|
||||
.cmp(&other.row_key)
|
||||
.then_with(|| other.sequence.cmp(&self.sequence))
|
||||
.then_with(|| other.index_in_batch.cmp(&self.index_in_batch))
|
||||
.then_with(|| other.op_type.cmp(&self.op_type))
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialOrd for InnerKey {
|
||||
fn partial_cmp(&self, other: &InnerKey) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
impl InnerKey {
|
||||
#[inline]
|
||||
fn timestamp(&self) -> &Value {
|
||||
// safety: row key shall at least contain a timestamp column
|
||||
self.row_key.last().unwrap()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn is_row_key_equal(&self, other: &InnerKey) -> bool {
|
||||
self.row_key == other.row_key
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn is_visible(&self, sequence: SequenceNumber) -> bool {
|
||||
self.sequence <= sequence
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn is_in_time_range(&self, range: &Option<TimestampRange>) -> bool {
|
||||
let Some(range) = range else {
|
||||
return true;
|
||||
};
|
||||
range.contains(
|
||||
&self
|
||||
.timestamp()
|
||||
.as_timestamp()
|
||||
.expect("Timestamp field must be a valid timestamp value"),
|
||||
)
|
||||
}
|
||||
|
||||
/// Reset the `InnerKey` so that we can use it to seek next key that
|
||||
/// has different row key.
|
||||
fn reset_for_seek(&mut self) {
|
||||
// sequence, index_in_batch, op_type are ordered in desc order, so
|
||||
// we can represent the last inner key with same row key by setting them
|
||||
// to zero (Minimum value).
|
||||
self.sequence = 0;
|
||||
self.index_in_batch = 0;
|
||||
self.op_type = MIN_OP_TYPE;
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
struct RowValue {
|
||||
values: Vec<Value>,
|
||||
}
|
||||
|
||||
trait RowsProvider {
|
||||
fn row_num(&self) -> usize;
|
||||
|
||||
fn column_num(&self) -> usize {
|
||||
self.row_by_index(0).len()
|
||||
}
|
||||
|
||||
fn is_empty(&self) -> bool {
|
||||
self.row_num() == 0
|
||||
}
|
||||
|
||||
fn row_by_index(&self, idx: usize) -> &Vec<Value>;
|
||||
}
|
||||
|
||||
impl<'a> RowsProvider for &'a [&InnerKey] {
|
||||
fn row_num(&self) -> usize {
|
||||
self.len()
|
||||
}
|
||||
|
||||
fn row_by_index(&self, idx: usize) -> &Vec<Value> {
|
||||
&self[idx].row_key
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> RowsProvider for &'a [&RowValue] {
|
||||
fn row_num(&self) -> usize {
|
||||
self.len()
|
||||
}
|
||||
|
||||
fn row_by_index(&self, idx: usize) -> &Vec<Value> {
|
||||
&self[idx].values
|
||||
}
|
||||
}
|
||||
|
||||
/// Transposes the rows of `provider` into column vectors.
///
/// `data_types` yields the type of each column in order and `column_needed[i]`
/// tells whether column `i` is wanted. Only needed columns are built, so the
/// result holds one vector per needed column, in column order. An empty
/// provider yields an empty `Vec`.
///
/// A vector builder is created only for needed columns; skipped columns cost
/// no allocation.
///
/// # Panics
///
/// Panics if `column_needed` is shorter than the number of data types, or if a
/// row has fewer values than there are data types.
fn rows_to_vectors<I: Iterator<Item = ConcreteDataType>, T: RowsProvider>(
    data_types: I,
    column_needed: &[bool],
    provider: T,
) -> Vec<VectorRef> {
    if provider.is_empty() {
        return Vec::new();
    }

    let row_num = provider.row_num();
    let mut vectors = Vec::with_capacity(provider.column_num());

    for (col_idx, data_type) in data_types.enumerate() {
        if !column_needed[col_idx] {
            continue;
        }

        let mut builder = data_type.create_mutable_vector(row_num);
        for row_idx in 0..row_num {
            let row = provider.row_by_index(row_idx);
            let value = &row[col_idx];
            builder.as_mut().push_value_ref(value.as_value_ref());
        }

        vectors.push(builder.to_vector());
    }

    vectors
}
|
||||
@@ -1,251 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use api::v1::OpType;
|
||||
use store_api::storage::SequenceNumber;
|
||||
|
||||
use super::MemtableRef;
|
||||
use crate::error::Result;
|
||||
use crate::memtable::KeyValues;
|
||||
use crate::metrics::MEMTABLE_WRITE_ELAPSED;
|
||||
use crate::write_batch::{Mutation, Payload};
|
||||
|
||||
/// Wraps logic of inserting key/values in [WriteBatch](crate::write_batch::WriteBatch) to [Memtable](crate::memtable::Memtable).
|
||||
pub struct Inserter {
|
||||
/// Sequence of the batch to be inserted.
|
||||
sequence: SequenceNumber,
|
||||
/// Used to calculate the start index in batch for `KeyValues`.
|
||||
index_in_batch: usize,
|
||||
}
|
||||
|
||||
impl Inserter {
|
||||
pub fn new(sequence: SequenceNumber) -> Inserter {
|
||||
Inserter {
|
||||
sequence,
|
||||
index_in_batch: 0,
|
||||
}
|
||||
}
|
||||
|
||||
/// Insert write batch payload into memtable.
|
||||
///
|
||||
/// Won't do schema validation if not configured. Caller (mostly the `RegionWriter` should ensure the
|
||||
/// schemas of `memtable` are consistent with `payload`'s.
|
||||
pub fn insert_memtable(&mut self, payload: &Payload, memtable: &MemtableRef) -> Result<()> {
|
||||
let _timer = MEMTABLE_WRITE_ELAPSED.start_timer();
|
||||
|
||||
if payload.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// This function only makes effect in debug mode.
|
||||
validate_input_and_memtable_schemas(payload, memtable);
|
||||
|
||||
// Enough to hold all key or value columns.
|
||||
let total_column_num = payload.schema.num_columns();
|
||||
// Reusable KeyValues buffer.
|
||||
let mut kvs = KeyValues {
|
||||
sequence: self.sequence,
|
||||
op_type: OpType::Put,
|
||||
start_index_in_batch: self.index_in_batch,
|
||||
keys: Vec::with_capacity(total_column_num),
|
||||
values: Vec::with_capacity(total_column_num),
|
||||
timestamp: None,
|
||||
};
|
||||
|
||||
for mutation in &payload.mutations {
|
||||
self.write_one_mutation(mutation, memtable, &mut kvs)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn write_one_mutation(
|
||||
&mut self,
|
||||
mutation: &Mutation,
|
||||
memtable: &MemtableRef,
|
||||
kvs: &mut KeyValues,
|
||||
) -> Result<()> {
|
||||
let schema = memtable.schema();
|
||||
let num_rows = mutation.record_batch.num_rows();
|
||||
|
||||
kvs.reset(mutation.op_type, self.index_in_batch);
|
||||
|
||||
let ts_idx = schema.timestamp_index();
|
||||
kvs.timestamp = Some(mutation.record_batch.column(ts_idx).clone());
|
||||
for key_idx in 0..ts_idx {
|
||||
kvs.keys.push(mutation.record_batch.column(key_idx).clone());
|
||||
}
|
||||
for value_idx in schema.value_indices() {
|
||||
kvs.values
|
||||
.push(mutation.record_batch.column(value_idx).clone());
|
||||
}
|
||||
|
||||
memtable.write(kvs)?;
|
||||
|
||||
self.index_in_batch += num_rows;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Debug-only sanity check: asserts that `payload`'s schema has the same
/// version and column schemas as the memtable's user schema.
///
/// Does nothing when debug assertions are disabled.
fn validate_input_and_memtable_schemas(payload: &Payload, memtable: &MemtableRef) {
    if !cfg!(debug_assertions) {
        return;
    }

    let memtable_schema = memtable.schema();
    let expected = memtable_schema.user_schema();
    let actual = &payload.schema;

    debug_assert_eq!(actual.version(), expected.version());
    // Only validate column schemas.
    debug_assert_eq!(actual.column_schemas(), expected.column_schemas());
}
|
||||
|
||||
/// Holds `start` and `end` indexes to get a slice `[start, end)` from the vector whose
/// timestamps belong to same time range at `range_index`.
#[derive(Debug, PartialEq)]
struct SliceIndex {
    /// Inclusive start of the slice.
    start: usize,
    /// Exclusive end of the slice.
    end: usize,
    /// Index in time ranges.
    range_index: usize,
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use std::collections::HashMap;
    use std::sync::Arc;

    use common_time::timestamp::Timestamp;
    use datatypes::type_id::LogicalTypeId;
    use datatypes::value::Value;
    use datatypes::vectors::{Int64Vector, TimestampMillisecondVector, VectorRef};
    use store_api::storage::WriteRequest;

    use super::*;
    use crate::memtable::{DefaultMemtableBuilder, IterContext, MemtableBuilder};
    use crate::metadata::RegionMetadata;
    use crate::schema::RegionSchemaRef;
    use crate::test_util::descriptor_util::RegionDescBuilder;
    use crate::test_util::write_batch_util;
    use crate::write_batch::WriteBatch;

    /// Write batch with a millisecond timestamp column `ts` and an `Int64` column `value`.
    fn new_test_write_batch() -> WriteBatch {
        let columns = [
            ("ts", LogicalTypeId::TimestampMillisecond, false),
            ("value", LogicalTypeId::Int64, true),
        ];

        write_batch_util::new_write_batch(&columns, Some(0), 1)
    }

    /// Region schema with the same two columns as [`new_test_write_batch`].
    fn new_region_schema() -> RegionSchemaRef {
        let descriptor = RegionDescBuilder::new("test")
            .timestamp(("ts", LogicalTypeId::TimestampMillisecond, false))
            .push_field_column(("value", LogicalTypeId::Int64, true))
            .build();
        let metadata: RegionMetadata = descriptor.try_into().unwrap();

        metadata.schema().clone()
    }

    /// Adds one put mutation holding `data` as `(ts, value)` rows to `batch`.
    fn put_batch(batch: &mut WriteBatch, data: &[(i64, Option<i64>)]) {
        let ts_column =
            TimestampMillisecondVector::from_values(data.iter().map(|(ts, _)| *ts));
        let value_column =
            Int64Vector::from(data.iter().map(|(_, value)| *value).collect::<Vec<_>>());

        let columns = HashMap::from([
            ("ts".to_string(), Arc::new(ts_column) as VectorRef),
            ("value".to_string(), Arc::new(value_column) as VectorRef),
        ]);

        batch.put(columns).unwrap();
    }

    /// Asserts that iterating `mem` yields exactly `data` (in order), every row
    /// carrying `sequence`, and that the memtable stats report `min_ts` / `max_ts`.
    fn check_memtable_content(
        mem: &MemtableRef,
        sequence: SequenceNumber,
        data: &[(i64, Option<i64>)],
        max_ts: i64,
        min_ts: i64,
    ) {
        let iter = mem.iter(IterContext::default()).unwrap();
        let stats = mem.stats();
        assert_eq!(min_ts, stats.min_timestamp.value());
        assert_eq!(max_ts, stats.max_timestamp.value());

        let mut checked = 0;
        for batch in iter {
            let batch = batch.unwrap();
            for row in 0..batch.column(0).len() {
                let (expected_ts, expected_value) = data[checked];

                assert_eq!(
                    Value::Timestamp(Timestamp::new_millisecond(expected_ts)),
                    batch.column(0).get(row)
                );
                assert_eq!(Value::from(expected_value), batch.column(1).get(row));
                assert_eq!(Value::from(sequence), batch.column(2).get(row));

                checked += 1;
            }
        }

        assert_eq!(data.len(), checked);
    }

    #[test]
    fn test_inserter_put_one_memtable() {
        let sequence = 11111;
        let mutable_memtable = DefaultMemtableBuilder::default().build(new_region_schema());
        let mut inserter = Inserter::new(sequence);

        let mut batch = new_test_write_batch();
        let first_put = [(1, Some(1)), (2, None)];
        // Also test multiple put data in one batch.
        let second_put = [
            (3, None),
            (2, None), // Duplicate entries in same put data.
            (2, Some(2)),
            (4, Some(4)),
            (201, Some(201)),
            (102, None),
            (101, Some(101)),
        ];
        put_batch(&mut batch, &first_put);
        put_batch(&mut batch, &second_put);

        inserter
            .insert_memtable(batch.payload(), &mutable_memtable)
            .unwrap();

        let expected_rows = [
            (1, Some(1)),
            (2, Some(2)),
            (3, None),
            (4, Some(4)),
            (101, Some(101)),
            (102, None),
            (201, Some(201)),
        ];
        check_memtable_content(&mutable_memtable, sequence, &expected_rows, 201, 1);
    }
}
|
||||
@@ -1,595 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use common_time::Timestamp;
|
||||
use datatypes::prelude::*;
|
||||
use datatypes::timestamp::TimestampMillisecond;
|
||||
use datatypes::type_id::LogicalTypeId;
|
||||
use datatypes::vectors::{
|
||||
TimestampMillisecondVector, TimestampMillisecondVectorBuilder, UInt64Vector,
|
||||
UInt64VectorBuilder, UInt8Vector,
|
||||
};
|
||||
|
||||
use super::*;
|
||||
use crate::metadata::RegionMetadata;
|
||||
use crate::schema::{ProjectedSchema, RegionSchemaRef};
|
||||
use crate::test_util::descriptor_util::RegionDescBuilder;
|
||||
|
||||
// Schema for testing memtable:
|
||||
// - key: Int64(timestamp), UInt64(version),
|
||||
// - value: UInt64, UInt64
|
||||
pub fn schema_for_test() -> RegionSchemaRef {
|
||||
// Just build a region desc and use its columns metadata.
|
||||
let desc = RegionDescBuilder::new("test")
|
||||
.push_field_column(("v0", LogicalTypeId::UInt64, true))
|
||||
.push_field_column(("v1", LogicalTypeId::UInt64, true))
|
||||
.build();
|
||||
let metadata: RegionMetadata = desc.try_into().unwrap();
|
||||
|
||||
metadata.schema().clone()
|
||||
}
|
||||
|
||||
fn kvs_for_test_with_index(
|
||||
sequence: SequenceNumber,
|
||||
op_type: OpType,
|
||||
start_index_in_batch: usize,
|
||||
keys: &[TimestampMillisecond],
|
||||
values: &[(Option<u64>, Option<u64>)],
|
||||
) -> KeyValues {
|
||||
assert_eq!(keys.len(), values.len());
|
||||
|
||||
let mut key_builders = TimestampMillisecondVectorBuilder::with_capacity(keys.len());
|
||||
for key in keys {
|
||||
key_builders.push(Some(*key));
|
||||
}
|
||||
let ts_col = Arc::new(key_builders.finish()) as _;
|
||||
|
||||
let mut value_builders = (
|
||||
UInt64VectorBuilder::with_capacity(values.len()),
|
||||
UInt64VectorBuilder::with_capacity(values.len()),
|
||||
);
|
||||
for value in values {
|
||||
value_builders.0.push(value.0);
|
||||
value_builders.1.push(value.1);
|
||||
}
|
||||
let row_values = vec![
|
||||
Arc::new(value_builders.0.finish()) as _,
|
||||
Arc::new(value_builders.1.finish()) as _,
|
||||
];
|
||||
|
||||
let kvs = KeyValues {
|
||||
sequence,
|
||||
op_type,
|
||||
start_index_in_batch,
|
||||
keys: vec![],
|
||||
values: row_values,
|
||||
timestamp: Some(ts_col),
|
||||
};
|
||||
|
||||
assert_eq!(keys.len(), kvs.len());
|
||||
assert_eq!(keys.is_empty(), kvs.is_empty());
|
||||
|
||||
kvs
|
||||
}
|
||||
|
||||
fn kvs_for_test(
|
||||
sequence: SequenceNumber,
|
||||
op_type: OpType,
|
||||
keys: &[TimestampMillisecond],
|
||||
values: &[(Option<u64>, Option<u64>)],
|
||||
) -> KeyValues {
|
||||
kvs_for_test_with_index(sequence, op_type, 0, keys, values)
|
||||
}
|
||||
|
||||
pub fn write_kvs(
|
||||
memtable: &dyn Memtable,
|
||||
sequence: SequenceNumber,
|
||||
op_type: OpType,
|
||||
keys: &[i64],
|
||||
values: &[(Option<u64>, Option<u64>)],
|
||||
) {
|
||||
let keys: Vec<TimestampMillisecond> = keys.iter().map(|l| ((*l).into())).collect();
|
||||
|
||||
let kvs = kvs_for_test(sequence, op_type, &keys, values);
|
||||
|
||||
memtable.write(&kvs).unwrap();
|
||||
}
|
||||
|
||||
fn check_batch_valid(batch: &Batch) {
|
||||
assert_eq!(5, batch.num_columns());
|
||||
let row_num = batch.column(0).len();
|
||||
for i in 1..5 {
|
||||
assert_eq!(row_num, batch.column(i).len());
|
||||
}
|
||||
}
|
||||
|
||||
fn check_iter_content(
|
||||
iter: &mut dyn BatchIterator,
|
||||
keys: &[i64],
|
||||
sequences: &[u64],
|
||||
op_types: &[OpType],
|
||||
values: &[(Option<u64>, Option<u64>)],
|
||||
) {
|
||||
let keys: Vec<TimestampMillisecond> = keys.iter().map(|l| (*l).into()).collect();
|
||||
|
||||
let mut index = 0;
|
||||
for batch in iter {
|
||||
let batch = batch.unwrap();
|
||||
check_batch_valid(&batch);
|
||||
|
||||
let row_num = batch.column(0).len();
|
||||
for i in 0..row_num {
|
||||
let k0 = batch.column(0).get(i);
|
||||
let (v0, v1) = (batch.column(1).get(i), batch.column(2).get(i));
|
||||
let sequence = batch.column(3).get(i);
|
||||
let op_type = batch.column(4).get(i);
|
||||
|
||||
assert_eq!(Value::from(keys[index]), k0);
|
||||
assert_eq!(Value::from(values[index].0), v0);
|
||||
assert_eq!(Value::from(values[index].1), v1);
|
||||
assert_eq!(Value::from(sequences[index]), sequence);
|
||||
assert_eq!(Value::from(op_types[index] as u8), op_type);
|
||||
|
||||
index += 1;
|
||||
}
|
||||
}
|
||||
|
||||
assert_eq!(keys.len(), index);
|
||||
}
|
||||
|
||||
struct MemtableTester {
|
||||
schema: RegionSchemaRef,
|
||||
builders: Vec<MemtableBuilderRef>,
|
||||
}
|
||||
|
||||
impl Default for MemtableTester {
|
||||
fn default() -> MemtableTester {
|
||||
MemtableTester::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl MemtableTester {
|
||||
fn new() -> MemtableTester {
|
||||
let schema = schema_for_test();
|
||||
let builders = vec![Arc::new(DefaultMemtableBuilder::default()) as _];
|
||||
|
||||
MemtableTester { schema, builders }
|
||||
}
|
||||
|
||||
fn new_memtables(&self) -> Vec<MemtableRef> {
|
||||
self.builders
|
||||
.iter()
|
||||
.map(|b| b.build(self.schema.clone()))
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn run_testcase<F>(&self, testcase: F)
|
||||
where
|
||||
F: Fn(TestContext),
|
||||
{
|
||||
for memtable in self.new_memtables() {
|
||||
let test_ctx = TestContext {
|
||||
schema: self.schema.clone(),
|
||||
memtable,
|
||||
};
|
||||
|
||||
testcase(test_ctx);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct TestContext {
|
||||
schema: RegionSchemaRef,
|
||||
memtable: MemtableRef,
|
||||
}
|
||||
|
||||
fn write_iter_memtable_case(ctx: &TestContext) {
|
||||
// Test iterating an empty memtable.
|
||||
let mut iter = ctx.memtable.iter(IterContext::default()).unwrap();
|
||||
assert!(iter.next().is_none());
|
||||
// Poll the empty iterator again.
|
||||
assert!(iter.next().is_none());
|
||||
assert_eq!(0, ctx.memtable.stats().bytes_allocated());
|
||||
|
||||
// Init test data.
|
||||
write_kvs(
|
||||
&*ctx.memtable,
|
||||
10, // sequence
|
||||
OpType::Put,
|
||||
&[1000, 1000, 2002, 2003, 2003, 1001], // keys
|
||||
&[
|
||||
(Some(1), None),
|
||||
(Some(2), None),
|
||||
(Some(7), None),
|
||||
(Some(8), None),
|
||||
(Some(9), None),
|
||||
(Some(3), None),
|
||||
], // values
|
||||
);
|
||||
write_kvs(
|
||||
&*ctx.memtable,
|
||||
11, // sequence
|
||||
OpType::Put,
|
||||
&[1002, 1003, 1004], // keys
|
||||
&[(None, None), (Some(5), None), (None, None)], // values
|
||||
);
|
||||
|
||||
// 9 key value pairs (6 + 3).
|
||||
assert_eq!(576, ctx.memtable.stats().bytes_allocated());
|
||||
|
||||
let batch_sizes = [1, 4, 8, consts::READ_BATCH_SIZE];
|
||||
for batch_size in batch_sizes {
|
||||
let iter_ctx = IterContext {
|
||||
batch_size,
|
||||
..Default::default()
|
||||
};
|
||||
let mut iter = ctx.memtable.iter(iter_ctx.clone()).unwrap();
|
||||
assert_eq!(
|
||||
ctx.schema.user_schema(),
|
||||
iter.schema().projected_user_schema()
|
||||
);
|
||||
assert_eq!(RowOrdering::Key, iter.ordering());
|
||||
|
||||
check_iter_content(
|
||||
&mut *iter,
|
||||
&[1000, 1001, 1002, 1003, 1004, 2002, 2003], // keys
|
||||
&[10, 10, 11, 11, 11, 10, 10], // sequences
|
||||
&[
|
||||
OpType::Put,
|
||||
OpType::Put,
|
||||
OpType::Put,
|
||||
OpType::Put,
|
||||
OpType::Put,
|
||||
OpType::Put,
|
||||
OpType::Put,
|
||||
], // op_types
|
||||
&[
|
||||
(Some(2), None),
|
||||
(Some(3), None),
|
||||
(None, None),
|
||||
(Some(5), None),
|
||||
(None, None),
|
||||
(Some(7), None),
|
||||
(Some(9), None),
|
||||
], // values
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
/// The default iteration context must make every sequence visible.
#[test]
fn test_iter_context_default() {
    let IterContext {
        visible_sequence, ..
    } = IterContext::default();

    assert_eq!(SequenceNumber::MAX, visible_sequence);
}
|
||||
|
||||
/// Runs the write-then-iterate scenario against every memtable builder.
#[test]
fn test_write_iter_memtable() {
    MemtableTester::default().run_testcase(|ctx| write_iter_memtable_case(&ctx));
}
|
||||
|
||||
fn check_iter_batch_size(iter: &mut dyn BatchIterator, total: usize, batch_size: usize) {
|
||||
let mut remains = total;
|
||||
for batch in iter {
|
||||
let batch = batch.unwrap();
|
||||
check_batch_valid(&batch);
|
||||
|
||||
let row_num = batch.column(0).len();
|
||||
if remains >= batch_size {
|
||||
assert_eq!(batch_size, row_num);
|
||||
remains -= batch_size;
|
||||
} else {
|
||||
assert_eq!(remains, row_num);
|
||||
remains = 0;
|
||||
}
|
||||
}
|
||||
|
||||
assert_eq!(0, remains);
|
||||
}
|
||||
|
||||
/// Checks that the iterator splits its output into batches of the requested
/// size, for batch sizes below, equal to and above the number of rows.
#[test]
fn test_iter_batch_size() {
    let tester = MemtableTester::default();
    tester.run_testcase(|ctx| {
        write_kvs(
            &*ctx.memtable,
            10, // sequence
            OpType::Put,
            &[1000, 1000, 1001, 2002, 2003, 2003], // keys
            &[
                (Some(1), None),
                (Some(2), None),
                (Some(3), None),
                (Some(4), None),
                (None, None),
                (None, None),
            ], // values
        );

        // Six rows are written, but they only contain four distinct keys, so
        // four rows are expected from the iterator.
        let total = 4;
        // Batch sizes less than, equal to and greater than `total`. The
        // original list `[1, 6, 8]` never covered the "equal to" case.
        let batch_sizes = [1, total, 6, 8];
        for batch_size in batch_sizes {
            let iter_ctx = IterContext {
                batch_size,
                ..Default::default()
            };

            let mut iter = ctx.memtable.iter(iter_ctx).unwrap();
            check_iter_batch_size(&mut *iter, total, batch_size);
        }
    });
}
|
||||
|
||||
/// Writes the same keys in two separate writes and expects the row with the
/// larger sequence to be returned for each duplicated key.
#[test]
fn test_duplicate_key_across_batch() {
    MemtableTester::default().run_testcase(|ctx| {
        // First write: four distinct keys at sequence 10.
        let first_keys: [i64; 4] = [1000, 1001, 2000, 2001];
        let first_values: [(Option<u64>, Option<u64>); 4] =
            [(Some(1), None), (None, None), (None, None), (None, None)];
        write_kvs(&*ctx.memtable, 10, OpType::Put, &first_keys, &first_values);

        // Second write: keys 1000 and 2001 again, at sequence 11.
        let second_keys: [i64; 2] = [1000, 2001];
        let second_values: [(Option<u64>, Option<u64>); 2] =
            [(Some(1231), None), (Some(1232), None)];
        write_kvs(&*ctx.memtable, 11, OpType::Put, &second_keys, &second_values);

        let expected_keys: [i64; 4] = [1000, 1001, 2000, 2001];
        let expected_sequences: [u64; 4] = [11, 10, 10, 11];
        let expected_op_types = [OpType::Put; 4];
        let expected_values: [(Option<u64>, Option<u64>); 4] = [
            (Some(1231), None),
            (None, None),
            (None, None),
            (Some(1232), None),
        ];

        for batch_size in 1..=5 {
            let iter_ctx = IterContext {
                batch_size,
                ..Default::default()
            };

            let mut iter = ctx.memtable.iter(iter_ctx).unwrap();
            check_iter_content(
                &mut *iter,
                &expected_keys,
                &expected_sequences,
                &expected_op_types,
                &expected_values,
            );
        }
    });
}
|
||||
|
||||
/// Writes a duplicated key inside a single write and expects only one row per
/// key to be returned.
#[test]
fn test_duplicate_key_in_batch() {
    MemtableTester::default().run_testcase(|ctx| {
        // Key 1000 appears twice in the same write.
        let keys: [i64; 4] = [1000, 1000, 1001, 2001];
        let values: [(Option<u64>, Option<u64>); 4] =
            [(None, None), (None, None), (Some(1234), None), (None, None)];
        write_kvs(&*ctx.memtable, 10, OpType::Put, &keys, &values);

        let expected_keys: [i64; 3] = [1000, 1001, 2001];
        let expected_sequences: [u64; 3] = [10, 10, 10];
        let expected_op_types = [OpType::Put; 3];
        let expected_values: [(Option<u64>, Option<u64>); 3] =
            [(None, None), (Some(1234), None), (None, None)];

        for batch_size in 1..=5 {
            let iter_ctx = IterContext {
                batch_size,
                ..Default::default()
            };

            let mut iter = ctx.memtable.iter(iter_ctx).unwrap();
            check_iter_content(
                &mut *iter,
                &expected_keys,
                &expected_sequences,
                &expected_op_types,
                &expected_values,
            );
        }
    });
}
|
||||
|
||||
/// Writes the same key at sequences 10, 11 and 12 and checks which row the
/// iterator returns for different `visible_sequence` values.
#[test]
fn test_sequence_visibility() {
    let tester = MemtableTester::default();
    tester.run_testcase(|ctx| {
        write_kvs(
            &*ctx.memtable,
            10, // sequence
            OpType::Put,
            &[1000, 1000],                       // keys
            &[(Some(1), None), (Some(2), None)], // values
        );

        write_kvs(
            &*ctx.memtable,
            11, // sequence
            OpType::Put,
            &[1000, 1000],                         // keys
            &[(Some(11), None), (Some(12), None)], // values
        );

        write_kvs(
            &*ctx.memtable,
            12, // sequence
            OpType::Put,
            &[1000, 1000],                         // keys
            &[(Some(21), None), (Some(22), None)], // values
        );

        // Iterates with the given `visible_sequence` (batch size 1, no
        // projection, no time range) and checks the returned rows.
        let check_visible = |visible_sequence: SequenceNumber,
                             keys: &[i64],
                             sequences: &[u64],
                             op_types: &[OpType],
                             values: &[(Option<u64>, Option<u64>)]| {
            let iter_ctx = IterContext {
                batch_size: 1,
                visible_sequence,
                projected_schema: None,
                time_range: None,
            };

            let mut iter = ctx.memtable.iter(iter_ctx).unwrap();
            check_iter_content(&mut *iter, keys, sequences, op_types, values);
        };

        // Nothing was written at or below sequence 9.
        check_visible(9, &[], &[], &[], &[]);

        // Only the write at sequence 10 is visible. Exactly one row is
        // expected, so exactly one op type is listed (the original listed two).
        check_visible(10, &[1000], &[10], &[OpType::Put], &[(Some(2), None)]);

        // The write at sequence 11 shadows sequence 10; sequence 12 is hidden.
        check_visible(11, &[1000], &[11], &[OpType::Put], &[(Some(12), None)]);
    });
}
|
||||
|
||||
/// After the iterator has returned `None` once, it must keep returning `None`.
#[test]
fn test_iter_after_none() {
    MemtableTester::default().run_testcase(|ctx| {
        let keys: [i64; 3] = [1000, 1001, 1002];
        let values: [(Option<u64>, Option<u64>); 3] =
            [(Some(0), None), (Some(1), None), (Some(2), None)];
        write_kvs(&*ctx.memtable, 10, OpType::Put, &keys, &values);

        let mut iter = ctx
            .memtable
            .iter(IterContext {
                batch_size: 4,
                ..Default::default()
            })
            .unwrap();

        // The first poll must produce an item; its content is not inspected.
        assert!(iter.next().is_some());
        // Every later poll must report exhaustion.
        assert!(iter.next().is_none());
        assert!(iter.next().is_none());
    });
}
|
||||
|
||||
/// Iterates with a time range and checks that only the matching key is
/// returned in the first batch.
#[test]
fn test_filter_memtable() {
    MemtableTester::default().run_testcase(|ctx| {
        let keys: [i64; 3] = [1000, 1001, 1002];
        let values: [(Option<u64>, Option<u64>); 3] =
            [(Some(0), None), (Some(1), None), (Some(2), None)];
        write_kvs(&*ctx.memtable, 10, OpType::Put, &keys, &values);

        // Range built from 0 ms and 1001 ms; the test expects only key 1000 to
        // pass, i.e. key 1001 itself is not returned.
        let time_range = TimestampRange::new(
            Timestamp::new_millisecond(0),
            Timestamp::new_millisecond(1001),
        )
        .unwrap();
        let iter_ctx = IterContext {
            batch_size: 4,
            time_range: Some(time_range),
            ..Default::default()
        };

        let mut iter = ctx.memtable.iter(iter_ctx).unwrap();
        let batch = iter.next().unwrap().unwrap();
        assert_eq!(5, batch.columns.len());

        let expected_keys: VectorRef = Arc::new(TimestampMillisecondVector::from_slice([1000]));
        assert_eq!(expected_keys, batch.columns[0]);
    });
}
|
||||
|
||||
/// Iterates with a projection and checks the columns of the single batch that
/// is returned.
#[test]
fn test_memtable_projection() {
    let tester = MemtableTester::default();
    // Project schema column index 2 only. The batch still carries the key
    // column and two trailing columns, checked below as sequences and op types.
    // NOTE(review): the expected values below are the second element of each
    // written tuple; confirm the column name against `schema_for_test`.
    let projected_schema =
        Arc::new(ProjectedSchema::new(tester.schema.clone(), Some(vec![2])).unwrap());

    tester.run_testcase(|ctx| {
        let keys: [i64; 3] = [1000, 1001, 1002];
        let values: [(Option<u64>, Option<u64>); 3] = [
            (Some(10), Some(20)),
            (Some(11), Some(21)),
            (Some(12), Some(22)),
        ];
        write_kvs(&*ctx.memtable, 9, OpType::Put, &keys, &values);

        let iter_ctx = IterContext {
            batch_size: 4,
            projected_schema: Some(projected_schema.clone()),
            ..Default::default()
        };

        let mut iter = ctx.memtable.iter(iter_ctx).unwrap();
        let batch = iter.next().unwrap().unwrap();
        assert!(iter.next().is_none());

        assert_eq!(4, batch.num_columns());

        // Expected columns in batch order: key, projected value, sequence, op type.
        let expected_columns: [VectorRef; 4] = [
            Arc::new(TimestampMillisecondVector::from_slice([1000, 1001, 1002])),
            Arc::new(UInt64Vector::from_slice([20, 21, 22])),
            Arc::new(UInt64Vector::from_slice([9, 9, 9])),
            Arc::new(UInt8Vector::from_slice([1, 1, 1])),
        ];
        for (idx, expected) in expected_columns.iter().enumerate() {
            assert_eq!(expected, batch.column(idx));
        }
    });
}
|
||||
@@ -1,166 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::cmp::Ordering;
|
||||
|
||||
use common_time::RangeMillis;
|
||||
|
||||
use crate::memtable::{MemtableId, MemtableRef};
|
||||
|
||||
/// A version of all memtables.
|
||||
///
|
||||
/// This structure is immutable now.
|
||||
#[derive(Debug)]
|
||||
pub struct MemtableVersion {
|
||||
mutable: MemtableRef,
|
||||
/// Immutable memtables.
|
||||
immutables: Vec<MemtableRef>,
|
||||
}
|
||||
|
||||
impl MemtableVersion {
|
||||
pub fn new(mutable: MemtableRef) -> MemtableVersion {
|
||||
Self {
|
||||
mutable,
|
||||
immutables: vec![],
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn mutable_memtable(&self) -> &MemtableRef {
|
||||
&self.mutable
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn immutable_memtables(&self) -> &[MemtableRef] {
|
||||
&self.immutables
|
||||
}
|
||||
|
||||
pub fn num_memtables(&self) -> usize {
|
||||
// the last `1` is for `mutable`
|
||||
self.immutable_memtables().len() + 1
|
||||
}
|
||||
|
||||
/// Clone current memtable version and freeze its mutable memtables, which moves
|
||||
/// all mutable memtables to immutable memtable list.
|
||||
///
|
||||
/// This method also calls [Memtable::mark_immutable()](crate::memtable::Memtable::mark_immutable()) to
|
||||
/// mark the mutable memtable as immutable.
|
||||
pub fn freeze_mutable(&self, new_mutable: MemtableRef) -> MemtableVersion {
|
||||
let mut immutables = self.immutables.clone();
|
||||
// Marks the mutable memtable as immutable so it can free the memory usage from our
|
||||
// soft limit.
|
||||
self.mutable.mark_immutable();
|
||||
immutables.push(self.mutable.clone());
|
||||
|
||||
MemtableVersion {
|
||||
mutable: new_mutable,
|
||||
immutables,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn mutable_bytes_allocated(&self) -> usize {
|
||||
self.mutable.stats().bytes_allocated()
|
||||
}
|
||||
|
||||
pub fn total_bytes_allocated(&self) -> usize {
|
||||
self.immutables
|
||||
.iter()
|
||||
.map(|m| m.stats().bytes_allocated())
|
||||
.sum::<usize>()
|
||||
+ self.mutable.stats().bytes_allocated()
|
||||
}
|
||||
|
||||
/// Creates a new `MemtableVersion` that removes immutable memtables
|
||||
/// less than or equal to max_memtable_id.
|
||||
pub fn remove_immutables(&self, max_memtable_id: MemtableId) -> MemtableVersion {
|
||||
let immutables = self
|
||||
.immutables
|
||||
.iter()
|
||||
.filter(|immem| immem.id() > max_memtable_id)
|
||||
.cloned()
|
||||
.collect();
|
||||
|
||||
MemtableVersion {
|
||||
mutable: self.mutable.clone(),
|
||||
immutables,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn memtables_to_flush(&self) -> (Option<MemtableId>, Vec<MemtableRef>) {
|
||||
let max_memtable_id = self.immutables.iter().map(|immem| immem.id()).max();
|
||||
let memtables = self.immutables.clone();
|
||||
|
||||
(max_memtable_id, memtables)
|
||||
}
|
||||
}
|
||||
|
||||
// We use a new type to order time ranges by (end, start).
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
struct RangeKey(RangeMillis);
|
||||
|
||||
impl Ord for RangeKey {
|
||||
fn cmp(&self, other: &RangeKey) -> Ordering {
|
||||
self.0
|
||||
.end()
|
||||
.cmp(other.0.end())
|
||||
.then_with(|| self.0.start().cmp(other.0.start()))
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialOrd for RangeKey {
|
||||
fn partial_cmp(&self, other: &RangeKey) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use std::sync::Arc;

    use super::*;
    use crate::memtable::{DefaultMemtableBuilder, MemtableBuilder};
    use crate::test_util::schema_util;

    /// Walks a `MemtableVersion` through freeze, flush selection and removal.
    #[test]
    fn test_memtable_version() {
        let builder = DefaultMemtableBuilder::default();
        let schema = Arc::new(schema_util::new_region_schema(1, 1));

        // A new version only holds its mutable memtable.
        let first = builder.build(schema.clone());
        let v1 = MemtableVersion::new(first);
        assert_eq!(1, v1.num_memtables());

        // Freeze and add new mutable.
        let second = builder.build(schema.clone());
        let v2 = v1.freeze_mutable(second);
        let frozen = v2.immutable_memtables();
        assert_eq!(1, frozen.len());
        assert_eq!(0, frozen[0].id());
        assert_eq!(1, v2.mutable_memtable().id());
        assert_eq!(2, v2.num_memtables());

        // Add another one and check immutable memtables that need flush
        let third = builder.build(schema);
        let v3 = v2.freeze_mutable(third);
        let (max_id, to_flush) = v3.memtables_to_flush();
        assert_eq!(1, max_id.unwrap());
        assert_eq!(2, to_flush.len());

        // Remove memtables
        let v4 = v3.remove_immutables(1);
        assert_eq!(1, v4.num_memtables());
        assert_eq!(0, v4.immutable_memtables().len());
        assert_eq!(2, v4.mutable_memtable().id());
    }
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,66 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//! storage metrics
|
||||
|
||||
use lazy_static::lazy_static;
|
||||
use prometheus::*;
|
||||
|
||||
/// Reason to flush.
|
||||
pub const FLUSH_REASON: &str = "reason";
|
||||
|
||||
lazy_static! {
|
||||
/// Elapsed time of updating manifest when creating regions.
|
||||
pub static ref CREATE_REGION_UPDATE_MANIFEST: Histogram =
|
||||
register_histogram!("storage_create_region_update_manifest", "storage create region update manifest").unwrap();
|
||||
/// Counter of scheduled flush requests.
|
||||
pub static ref FLUSH_REQUESTS_TOTAL: IntCounterVec =
|
||||
register_int_counter_vec!("storage_flush_requests_total", "storage flush requests total", &[FLUSH_REASON]).unwrap();
|
||||
/// Counter of scheduled failed flush jobs.
|
||||
pub static ref FLUSH_ERRORS_TOTAL: IntCounter =
|
||||
register_int_counter!("storage_flush_errors_total", "storage flush errors total").unwrap();
|
||||
//// Elapsed time of a flush job.
|
||||
pub static ref FLUSH_ELAPSED: Histogram =
|
||||
register_histogram!("storage_flush_elapsed", "storage flush elapsed").unwrap();
|
||||
/// Counter of flushed bytes.
|
||||
pub static ref FLUSH_BYTES_TOTAL: IntCounter =
|
||||
register_int_counter!("storage_flush_bytes_total", "storage flush bytes total").unwrap();
|
||||
/// Gauge for open regions
|
||||
pub static ref REGION_COUNT: IntGauge =
|
||||
register_int_gauge!("storage_region_count", "storage region count").unwrap();
|
||||
/// Timer for logstore write
|
||||
pub static ref LOG_STORE_WRITE_ELAPSED: Histogram =
|
||||
register_histogram!("storage_logstore_write_elapsed", "storage logstore write elapsed").unwrap();
|
||||
/// Elapsed time of a compact job.
|
||||
pub static ref COMPACT_ELAPSED: Histogram =
|
||||
register_histogram!("storage_compact_elapsed", "storage compact elapsed").unwrap();
|
||||
/// Elapsed time for merging SST files.
|
||||
pub static ref MERGE_ELAPSED: Histogram =
|
||||
register_histogram!("storage_compaction_merge_elapsed", "storage compaction merge elapsed").unwrap();
|
||||
/// Global write buffer size in bytes.
|
||||
pub static ref WRITE_BUFFER_BYTES: IntGauge =
|
||||
register_int_gauge!("storage_write_buffer_bytes", "storage write buffer bytes").unwrap();
|
||||
/// Elapsed time of inserting memtable.
|
||||
pub static ref MEMTABLE_WRITE_ELAPSED: Histogram =
|
||||
register_histogram!("storage_memtable_write_elapsed", "storage memtable write elapsed").unwrap();
|
||||
/// Elapsed time of preprocessing write batch.
|
||||
pub static ref PREPROCESS_ELAPSED: Histogram =
|
||||
register_histogram!("storage_write_preprocess_elapsed", "storage write preprocess elapsed").unwrap();
|
||||
/// Elapsed time for windowed scan
|
||||
pub static ref WINDOW_SCAN_ELAPSED: Histogram =
|
||||
register_histogram!("query_scan_window_scan_elapsed", "query scan window scan elapsed").unwrap();
|
||||
/// Rows per window during window scan
|
||||
pub static ref WINDOW_SCAN_ROWS_PER_WINDOW: Histogram =
|
||||
register_histogram!("query_scan_window_scan_window_row_size", "query scan window scan window row size").unwrap();
|
||||
}
|
||||
@@ -1,15 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
pub mod wal;
|
||||
@@ -1,40 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#![allow(clippy::all)]
|
||||
tonic::include_proto!("greptime.storage.wal.v1");
|
||||
|
||||
use api::v1::OpType;
|
||||
|
||||
use crate::write_batch::Payload;
|
||||
|
||||
pub fn gen_mutation_types(payload: &Payload) -> Vec<i32> {
|
||||
payload
|
||||
.mutations
|
||||
.iter()
|
||||
.map(|m| match m.op_type {
|
||||
OpType::Delete => MutationType::Delete.into(),
|
||||
OpType::Put => MutationType::Put.into(),
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
}
|
||||
|
||||
impl WalHeader {
|
||||
pub fn with_last_manifest_version(last_manifest_version: u64) -> Self {
|
||||
Self {
|
||||
last_manifest_version,
|
||||
..Default::default()
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,271 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//! Common structs and utilities for read.
|
||||
|
||||
mod chain;
|
||||
mod dedup;
|
||||
mod merge;
|
||||
mod windowed;
|
||||
|
||||
use std::cmp::Ordering;
|
||||
|
||||
use async_trait::async_trait;
|
||||
use common_base::BitVec;
|
||||
use datatypes::data_type::DataType;
|
||||
use datatypes::prelude::ConcreteDataType;
|
||||
use datatypes::vectors::{BooleanVector, MutableVector, VectorRef};
|
||||
use snafu::{ensure, ResultExt};
|
||||
|
||||
use crate::error::{self, Result};
|
||||
pub use crate::read::chain::ChainReader;
|
||||
pub use crate::read::dedup::DedupReader;
|
||||
pub use crate::read::merge::{MergeReader, MergeReaderBuilder};
|
||||
pub use crate::read::windowed::WindowedReader;
|
||||
|
||||
/// Storage internal representation of a batch of rows.
|
||||
// Now the structure of `Batch` is still unstable, all pub fields may be changed.
|
||||
#[derive(Debug, Default, PartialEq, Eq, Clone)]
|
||||
pub struct Batch {
|
||||
/// Rows organized in columnar format.
|
||||
///
|
||||
/// Columns follow the same order convention of region schema:
|
||||
/// key, value, internal columns.
|
||||
pub columns: Vec<VectorRef>,
|
||||
}
|
||||
|
||||
impl Batch {
|
||||
/// Create a new `Batch` from `columns`.
|
||||
///
|
||||
/// # Panics
|
||||
/// Panics if vectors in `columns` have different length.
|
||||
pub fn new(columns: Vec<VectorRef>) -> Batch {
|
||||
Self::assert_columns(&columns);
|
||||
|
||||
Batch { columns }
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn num_columns(&self) -> usize {
|
||||
self.columns.len()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn num_rows(&self) -> usize {
|
||||
self.columns.get(0).map(|v| v.len()).unwrap_or(0)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.num_rows() == 0
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn columns(&self) -> &[VectorRef] {
|
||||
&self.columns
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn column(&self, idx: usize) -> &VectorRef {
|
||||
&self.columns[idx]
|
||||
}
|
||||
|
||||
/// Slice the batch, returning a new batch.
|
||||
///
|
||||
/// # Panics
|
||||
/// Panics if `offset + length > self.num_rows()`.
|
||||
fn slice(&self, offset: usize, length: usize) -> Batch {
|
||||
let columns = self
|
||||
.columns
|
||||
.iter()
|
||||
.map(|v| v.slice(offset, length))
|
||||
.collect();
|
||||
Batch { columns }
|
||||
}
|
||||
|
||||
fn assert_columns(columns: &[VectorRef]) {
|
||||
if columns.is_empty() {
|
||||
return;
|
||||
}
|
||||
|
||||
let length = columns[0].len();
|
||||
assert!(columns.iter().all(|col| col.len() == length));
|
||||
}
|
||||
}
|
||||
|
||||
/// Compute operations for Batch.
|
||||
pub trait BatchOp {
|
||||
/// Compare `i-th` row in `left` to `j-th` row in `right` by key (row key + internal columns).
|
||||
///
|
||||
/// The caller should ensure `left` and `right` have same schema as `self`.
|
||||
///
|
||||
/// # Panics
|
||||
/// Panics if
|
||||
/// - `i` or `j` is out of bound.
|
||||
/// - `left` or `right` has insufficient column num.
|
||||
fn compare_row(&self, left: &Batch, i: usize, right: &Batch, j: usize) -> Ordering;
|
||||
|
||||
/// Find unique rows in `batch` by row key.
|
||||
///
|
||||
/// If `prev` is `Some` and not empty, the last row of `prev` would be used to dedup
|
||||
/// current `batch`. Set `i-th` bit of `selected` to `true` if `i-th` row is unique,
|
||||
/// which means the row key of `i-th` row is different from `i+1-th`'s.
|
||||
///
|
||||
/// The caller could use `selected` to build a [BooleanVector] to filter the
|
||||
/// batch, and must ensure `selected` is initialized by filling `batch.num_rows()` bits
|
||||
/// to zero.
|
||||
///
|
||||
/// # Panics
|
||||
/// Panics if
|
||||
/// - `batch` and `prev` have different number of columns (unless `prev` is
|
||||
/// empty).
|
||||
/// - `selected.len()` is less than the number of rows.
|
||||
fn find_unique(&self, batch: &Batch, selected: &mut BitVec, prev: Option<&Batch>);
|
||||
|
||||
/// Filters the `batch`, returns elements matching the `filter` (i.e. where the values
|
||||
/// are true).
|
||||
///
|
||||
/// Note that the nulls of `filter` are interpreted as `false`, which will lead to these elements
|
||||
/// being masked out.
|
||||
fn filter(&self, batch: &Batch, filter: &BooleanVector) -> Result<Batch>;
|
||||
|
||||
/// Unselect deleted rows according to the [`OpType`](api::v1::OpType).
|
||||
///
|
||||
/// # Panics
|
||||
/// Panics if
|
||||
/// - `batch` doesn't have a valid op type column.
|
||||
/// - `selected.len()` is less than the number of rows.
|
||||
fn unselect_deleted(&self, batch: &Batch, selected: &mut BitVec);
|
||||
}
|
||||
|
||||
/// Reusable [Batch] builder.
|
||||
pub struct BatchBuilder {
|
||||
builders: Vec<Box<dyn MutableVector>>,
|
||||
}
|
||||
|
||||
impl BatchBuilder {
|
||||
/// Create a new `BatchBuilder` from data types with given `capacity`.
|
||||
///
|
||||
/// # Panics
|
||||
/// Panics if `types` is empty.
|
||||
pub fn with_capacity<'a, I>(types: I, capacity: usize) -> BatchBuilder
|
||||
where
|
||||
I: IntoIterator<Item = &'a ConcreteDataType>,
|
||||
{
|
||||
let builders: Vec<_> = types
|
||||
.into_iter()
|
||||
.map(|t| t.create_mutable_vector(capacity))
|
||||
.collect();
|
||||
assert!(!builders.is_empty());
|
||||
|
||||
BatchBuilder { builders }
|
||||
}
|
||||
|
||||
/// Returns number of rows already in this builder.
|
||||
#[inline]
|
||||
pub fn num_rows(&self) -> usize {
|
||||
self.builders[0].len()
|
||||
}
|
||||
|
||||
/// Returns true if no rows in this builder.
|
||||
#[inline]
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.num_rows() == 0
|
||||
}
|
||||
|
||||
/// Extend the builder by slice of batch.
|
||||
///
|
||||
/// # Panics
|
||||
/// Panics if
|
||||
/// - `offset + length > batch.num_rows()`.
|
||||
/// - Number of columns in `batch` is not equal to the builder's.
|
||||
pub fn extend_slice_of(&mut self, batch: &Batch, offset: usize, length: usize) -> Result<()> {
|
||||
assert_eq!(self.builders.len(), batch.num_columns());
|
||||
|
||||
for (builder, column) in self.builders.iter_mut().zip(batch.columns()) {
|
||||
builder
|
||||
.extend_slice_of(&**column, offset, length)
|
||||
.context(error::PushBatchSnafu)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Push `i-th` row of batch into the builder.
|
||||
///
|
||||
/// # Panics
|
||||
/// Panics if
|
||||
/// - `i` is out of bound.
|
||||
/// - Number of columns in `batch` is not equal to the builder's.
|
||||
pub fn push_row_of(&mut self, batch: &Batch, i: usize) -> Result<()> {
|
||||
assert_eq!(self.builders.len(), batch.num_columns());
|
||||
|
||||
for (builder, column) in self.builders.iter_mut().zip(batch.columns()) {
|
||||
let value = column.get_ref(i);
|
||||
builder
|
||||
.try_push_value_ref(value)
|
||||
.context(error::PushBatchSnafu)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Create a new [Batch] and reset this builder.
|
||||
pub fn build(&mut self) -> Result<Batch> {
|
||||
// Checks length of each builder.
|
||||
let rows = self.num_rows();
|
||||
for (i, builder) in self.builders.iter().enumerate() {
|
||||
ensure!(
|
||||
rows == builder.len(),
|
||||
error::BuildBatchSnafu {
|
||||
msg: format!(
|
||||
"expect row num {} but builder {} has {}",
|
||||
rows,
|
||||
i,
|
||||
builder.len()
|
||||
),
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
let columns = self.builders.iter_mut().map(|b| b.to_vector()).collect();
|
||||
|
||||
Ok(Batch { columns })
|
||||
}
|
||||
}
|
||||
|
||||
/// Async batch reader.
|
||||
#[async_trait]
|
||||
pub trait BatchReader: Send {
|
||||
// TODO(yingwen): Schema of batch.
|
||||
|
||||
/// Fetch next [Batch].
|
||||
///
|
||||
/// Returns `Ok(None)` when the reader has reached its end and calling `next_batch()`
|
||||
/// again won't return batch again.
|
||||
///
|
||||
/// If `Err` is returned, caller **must** not call this method again, the implementor
|
||||
/// may or may not panic in such case.
|
||||
async fn next_batch(&mut self) -> Result<Option<Batch>>;
|
||||
}
|
||||
|
||||
/// Pointer to [BatchReader].
|
||||
pub type BoxedBatchReader = Box<dyn BatchReader>;
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl<T: BatchReader + ?Sized> BatchReader for Box<T> {
|
||||
async fn next_batch(&mut self) -> Result<Option<Batch>> {
|
||||
(**self).next_batch().await
|
||||
}
|
||||
}
|
||||
@@ -1,124 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use crate::error::Result;
|
||||
use crate::read::{Batch, BatchReader};
|
||||
use crate::schema::ProjectedSchemaRef;
|
||||
|
||||
/// A reader that simply chain the outputs of input readers.
|
||||
pub struct ChainReader<R> {
|
||||
/// Schema to read
|
||||
pub schema: ProjectedSchemaRef,
|
||||
/// Each reader reads a slice of time window
|
||||
pub readers: Vec<R>,
|
||||
}
|
||||
|
||||
impl<R> ChainReader<R> {
|
||||
/// Returns a new [ChainReader] with specific input `readers`.
|
||||
pub fn new(schema: ProjectedSchemaRef, mut readers: Vec<R>) -> Self {
|
||||
// Reverse readers since we iter them backward.
|
||||
readers.reverse();
|
||||
Self { schema, readers }
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl<R> BatchReader for ChainReader<R>
|
||||
where
|
||||
R: BatchReader,
|
||||
{
|
||||
async fn next_batch(&mut self) -> Result<Option<Batch>> {
|
||||
while let Some(reader) = self.readers.last_mut() {
|
||||
if let Some(batch) = reader.next_batch().await? {
|
||||
return Ok(Some(batch));
|
||||
} else {
|
||||
// Remove the exhausted reader.
|
||||
self.readers.pop();
|
||||
}
|
||||
}
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::test_util::read_util::{self, Batches, VecBatchReader};

    /// Builds a [ChainReader] with one [VecBatchReader] per entry of `sources`.
    fn build_chain_reader(sources: &[Batches]) -> ChainReader<VecBatchReader> {
        let schema = read_util::new_projected_schema();
        let mut readers = Vec::with_capacity(sources.len());
        for source in sources {
            readers.push(read_util::build_vec_reader(source));
        }

        ChainReader::new(schema, readers)
    }

    /// Asserts that `reader` yields every row of `input` in order and then stays exhausted.
    async fn check_chain_reader_result(
        mut reader: ChainReader<VecBatchReader>,
        input: &[Batches<'_>],
    ) {
        // Flatten sources -> batches -> rows into the expected row sequence.
        let mut expect = Vec::new();
        for batches in input {
            for rows in batches.iter() {
                expect.extend(rows.iter().copied());
            }
        }

        let result = read_util::collect_kv_batch(&mut reader).await;
        assert_eq!(expect, result);

        // Calling next_batch() after exhaustion is allowed.
        assert!(reader.next_batch().await.unwrap().is_none());
    }

    #[tokio::test]
    async fn test_chain_empty() {
        let mut reader = build_chain_reader(&[]);

        // Both the first call and a repeated call report the end of input.
        assert!(reader.next_batch().await.unwrap().is_none());
        assert!(reader.next_batch().await.unwrap().is_none());
    }

    #[tokio::test]
    async fn test_chain_one() {
        let input: &[Batches] = &[&[
            &[(1, Some(1)), (2, Some(2))],
            &[(3, Some(3)), (4, Some(4))],
            &[(5, Some(5))],
        ]];

        check_chain_reader_result(build_chain_reader(input), input).await;
    }

    #[tokio::test]
    async fn test_chain_multi() {
        let input: &[Batches] = &[
            &[
                &[(1, Some(1)), (2, Some(2))],
                &[(3, Some(3)), (4, Some(4))],
                &[(5, Some(5))],
            ],
            &[&[(6, Some(3)), (7, Some(4)), (8, Some(8))], &[(9, Some(9))]],
            &[&[(10, Some(10)), (11, Some(11))], &[(12, Some(12))]],
        ];

        check_chain_reader_result(build_chain_reader(input), input).await;
    }
}
|
||||
@@ -1,181 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use async_trait::async_trait;
|
||||
use common_base::BitVec;
|
||||
use datatypes::prelude::ScalarVector;
|
||||
use datatypes::vectors::BooleanVector;
|
||||
|
||||
use crate::error::Result;
|
||||
use crate::read::{Batch, BatchOp, BatchReader};
|
||||
use crate::schema::ProjectedSchemaRef;
|
||||
|
||||
/// A reader that dedup rows from inner reader.
|
||||
pub struct DedupReader<R> {
|
||||
/// Projected schema to read.
|
||||
schema: ProjectedSchemaRef,
|
||||
/// The inner reader.
|
||||
reader: R,
|
||||
/// Previous batch from the reader.
|
||||
prev_batch: Option<Batch>,
|
||||
/// Reused bitmap buffer.
|
||||
selected: BitVec,
|
||||
}
|
||||
|
||||
impl<R> DedupReader<R> {
|
||||
pub fn new(schema: ProjectedSchemaRef, reader: R) -> DedupReader<R> {
|
||||
DedupReader {
|
||||
schema,
|
||||
reader,
|
||||
prev_batch: None,
|
||||
selected: BitVec::default(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Take `batch` and then returns a new batch with no duplicated rows.
|
||||
///
|
||||
/// This method may returns empty `Batch`.
|
||||
fn dedup_batch(&mut self, batch: Batch) -> Result<Batch> {
|
||||
if batch.is_empty() {
|
||||
// No need to update `prev_batch` if current batch is empty.
|
||||
return Ok(batch);
|
||||
}
|
||||
|
||||
// Reinitialize the bit map to zeros.
|
||||
self.selected.clear();
|
||||
self.selected.resize(batch.num_rows(), false);
|
||||
self.schema
|
||||
.find_unique(&batch, &mut self.selected, self.prev_batch.as_ref());
|
||||
|
||||
// Store current batch to `prev_batch` so we could compare the next batch
|
||||
// with this batch. We store batch before filtering it mainly for correctness, as
|
||||
// once we supports `DELETE`, rows with `OpType::Delete` would be removed from the
|
||||
// batch after filter, then we may store an incorrect `last row` of previous batch.
|
||||
self.prev_batch
|
||||
.get_or_insert_with(Batch::default)
|
||||
.clone_from(&batch); // Use `clone_from` to reuse allocated memory if possible.
|
||||
|
||||
// Find all rows whose op_types are `OpType::Delete`, mark their `selected` to false.
|
||||
self.schema.unselect_deleted(&batch, &mut self.selected);
|
||||
|
||||
let filter = BooleanVector::from_iterator(self.selected.iter().by_vals());
|
||||
// Filter duplicate rows.
|
||||
self.schema.filter(&batch, &filter)
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl<R: BatchReader> BatchReader for DedupReader<R> {
|
||||
async fn next_batch(&mut self) -> Result<Option<Batch>> {
|
||||
while let Some(batch) = self.reader.next_batch().await? {
|
||||
let filtered = self.dedup_batch(batch)?;
|
||||
// Skip empty batch.
|
||||
if !filtered.is_empty() {
|
||||
return Ok(Some(filtered));
|
||||
}
|
||||
}
|
||||
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use api::v1::OpType;

    use super::*;
    use crate::test_util::read_util;

    #[tokio::test]
    async fn test_dedup_reader_empty() {
        let source = read_util::build_vec_reader(&[]);
        let mut dedup = DedupReader::new(read_util::new_projected_schema(), source);

        assert!(dedup.next_batch().await.unwrap().is_none());
        // Calling next_batch() after exhaustion is allowed.
        assert!(dedup.next_batch().await.unwrap().is_none());
    }

    #[tokio::test]
    async fn test_dedup_by_sequence() {
        let projected = read_util::new_projected_schema();
        let source = read_util::build_full_vec_reader(&[
            // key, value, sequence, op_type
            &[
                (100, 1, 1000, OpType::Put),
                (100, 2, 999, OpType::Put),
                (100, 3, 998, OpType::Put),
                (101, 1, 1000, OpType::Put),
            ],
            &[
                (101, 2, 999, OpType::Put),
                (102, 12, 1000, OpType::Put),
                (103, 13, 1000, OpType::Put),
            ],
            &[(103, 2, 999, OpType::Put)],
        ]);
        let mut dedup = DedupReader::new(projected, source);

        let rows = read_util::collect_kv_batch(&mut dedup).await;
        // Only the first row of each key is expected to survive.
        let expected = [
            (100, Some(1)),
            (101, Some(1)),
            (102, Some(12)),
            (103, Some(13)),
        ];
        assert_eq!(&expected, &rows[..]);
    }

    #[tokio::test]
    async fn test_dedup_contains_empty_input() {
        let projected = read_util::new_projected_schema();
        let source = read_util::build_full_vec_reader(&[
            // key, value, sequence, op_type
            &[
                (100, 1, 1000, OpType::Put),
                (100, 2, 999, OpType::Put),
                (101, 1, 1000, OpType::Put),
            ],
            // An empty batch sits between two non-empty ones.
            &[],
            &[(101, 2, 999, OpType::Put), (102, 12, 1000, OpType::Put)],
        ]);
        let mut dedup = DedupReader::new(projected, source);

        let rows = read_util::collect_kv_batch(&mut dedup).await;
        let expected = [(100, Some(1)), (101, Some(1)), (102, Some(12))];
        assert_eq!(&expected, &rows[..]);
    }

    #[tokio::test]
    async fn test_dedup_contains_empty_output() {
        let projected = read_util::new_projected_schema();
        let source = read_util::build_full_vec_reader(&[
            // key, value, sequence, op_type
            &[
                (100, 1, 1000, OpType::Put),
                (100, 2, 999, OpType::Put),
                (101, 1, 1000, OpType::Put),
            ],
            // These two batches only repeat key 101, which the first batch already holds.
            &[(101, 2, 999, OpType::Put)],
            &[(101, 3, 998, OpType::Put), (101, 4, 997, OpType::Put)],
            &[(102, 12, 998, OpType::Put)],
        ]);
        let mut dedup = DedupReader::new(projected, source);

        let rows = read_util::collect_kv_batch(&mut dedup).await;
        let expected = [(100, Some(1)), (101, Some(1)), (102, Some(12))];
        assert_eq!(&expected, &rows[..]);
    }
}
|
||||
@@ -1,828 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//! Merge reader.
|
||||
//!
|
||||
//! The implementation of [`MergeReader`] is inspired by
|
||||
//! [`kudu's MergeIterator`](https://github.com/apache/kudu/blob/9021f275824faa2bdfe699786957c40c219697c1/src/kudu/common/generic_iterators.cc#L107)
|
||||
//! and [`CeresDB's MergeIterator`](https://github.com/CeresDB/ceresdb/blob/02a7e3100f47cf16aa6c245ed529a6978be20fbd/analytic_engine/src/row_iter/merge.rs)
|
||||
//!
|
||||
//! The main idea of the merge algorithm is to maintain a `merge window`. The window describes,
|
||||
//! at any given time, the key range where we expect to find the row with the smallest key.
|
||||
//! A [`Node`] (known as the sub-iterator in kudu) whose NEXT overlaps with the `merge window`
|
||||
//! is said to be actively participating in the merge.
|
||||
//!
|
||||
//! The `merge window` is defined as follows:
|
||||
//! 1. The window's start is the smallest lower bound of all nodes. We
|
||||
//! refer to the node that owns this lower bound as LOW.
|
||||
//! 2. The window's end is the smallest upper bound of all nodes whose
|
||||
//! lower bounds are less than or equal to LOW's upper bound.
|
||||
//! 2a. The window's end could be LOW's upper bound itself, if it is the smallest
|
||||
//! upper bound, but this isn't necessarily the case.
|
||||
//! 3. The merge window's dimensions change as the merge proceeds, though it
|
||||
//! only ever moves "to the right" (i.e. the window start/end only increase).
|
||||
//!
|
||||
//! We can divide the nodes into two sets (HOT and COLD): one for whose next rows overlap with the `merge window`,
|
||||
//! another for whose next rows do not. The merge steady state resembles that of a traditional
|
||||
//! heap-based merge: the top-most node is popped from HOT, the lower bound is copied to the output
|
||||
//! and advanced, and the node is pushed back to HOT.
|
||||
//!
|
||||
//! In the steady state, we need to move nodes from COLD to HOT whenever the end of the merge window
|
||||
//! moves; that's a sign that the window may now overlap with a NEXT belonging to a node in the
|
||||
//! second set (COLD). The end of the merge window moves when a node is fully exhausted (i.e. all rows have
|
||||
//! been copied to the output), or when a node finishes its NEXT and needs to peek again.
|
||||
//!
|
||||
//! At any given time, the NEXT belonging to the top-most node in COLD is nearest the merge window.
|
||||
//! When the merge window's end has moved and we need to refill HOT, the top-most node in COLD is
|
||||
//! the best candidate. To figure out whether it should be moved, we compare its NEXT's lower bound
|
||||
//! against the upper bound in HOT's first node: if the lower bound is less than or equal to the key,
|
||||
//! we move the node from COLD to HOT. On the flip side, when a node from HOT finishes its NEXT and peeks
|
||||
//! again, we also need to check whether it has exited the merge window. The approach is similar: if
|
||||
//! its NEXT's lower bound is greater than the upper bound of HOT's first node, it's time to move it to COLD.
|
||||
//!
|
||||
//! A full description of the merge algorithm could be found in [`kudu's comment`](https://github.com/apache/kudu/blob/9021f275824faa2bdfe699786957c40c219697c1/src/kudu/common/generic_iterators.cc#L349)
|
||||
//! and the [google doc](https://docs.google.com/document/d/1uP0ubjM6ulnKVCRrXtwT_dqrTWjF9tlFSRk0JN2e_O0/edit#).
|
||||
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::BinaryHeap;
|
||||
use std::fmt;
|
||||
|
||||
use async_trait::async_trait;
|
||||
use store_api::storage::consts;
|
||||
|
||||
use crate::error::Result;
|
||||
use crate::memtable::BoxedBatchIterator;
|
||||
use crate::read::{Batch, BatchBuilder, BatchOp, BatchReader, BoxedBatchReader};
|
||||
use crate::schema::{ProjectedSchema, ProjectedSchemaRef};
|
||||
|
||||
/// Batch data source.
|
||||
enum Source {
|
||||
// To avoid the overhead of async-trait (typically a heap allocation), wraps the
|
||||
// BatchIterator into an enum instead of converting the iterator into a BatchReader.
|
||||
Iter(BoxedBatchIterator),
|
||||
Reader(BoxedBatchReader),
|
||||
}
|
||||
|
||||
impl Source {
|
||||
async fn next_batch(&mut self) -> Result<Option<Batch>> {
|
||||
match self {
|
||||
Source::Iter(iter) => iter.next().transpose(),
|
||||
Source::Reader(reader) => reader.next_batch().await,
|
||||
}
|
||||
}
|
||||
|
||||
/// Fetch next non empty batch.
|
||||
async fn next_non_empty_batch(&mut self) -> Result<Option<Batch>> {
|
||||
while let Some(batch) = self.next_batch().await? {
|
||||
if !batch.is_empty() {
|
||||
return Ok(Some(batch));
|
||||
}
|
||||
}
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for Source {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
match self {
|
||||
Source::Iter(_) => write!(f, "Iter(..)"),
|
||||
Source::Reader(_) => write!(f, "Reader(..)"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Reference to a row in [BatchCursor].
|
||||
#[derive(Debug)]
|
||||
struct RowCursor<'a> {
|
||||
batch: &'a Batch,
|
||||
pos: usize,
|
||||
}
|
||||
|
||||
impl<'a> RowCursor<'a> {
|
||||
#[inline]
|
||||
fn compare(&self, schema: &ProjectedSchema, other: &RowCursor) -> Ordering {
|
||||
schema.compare_row(self.batch, self.pos, other.batch, other.pos)
|
||||
}
|
||||
}
|
||||
|
||||
/// A `BatchCursor` wraps the `Batch` and allows reading the `Batch` by row.
|
||||
#[derive(Debug)]
|
||||
struct BatchCursor {
|
||||
/// Current buffered `Batch`.
|
||||
///
|
||||
/// `Batch` must contains at least one row.
|
||||
batch: Batch,
|
||||
/// Index of current row.
|
||||
///
|
||||
/// `pos == batch.num_rows()` indicates no more rows to read.
|
||||
pos: usize,
|
||||
}
|
||||
|
||||
impl BatchCursor {
|
||||
/// Create a new `BatchCursor`.
|
||||
///
|
||||
/// # Panics
|
||||
/// Panics if `batch` is empty.
|
||||
fn new(batch: Batch) -> BatchCursor {
|
||||
assert!(!batch.is_empty());
|
||||
|
||||
BatchCursor { batch, pos: 0 }
|
||||
}
|
||||
|
||||
/// Returns true if there are remaining rows to read.
|
||||
#[inline]
|
||||
fn is_valid(&self) -> bool {
|
||||
!self.is_empty()
|
||||
}
|
||||
|
||||
/// Returns first row of current batch.
|
||||
///
|
||||
/// # Panics
|
||||
/// Panics if `self` is invalid.
|
||||
fn first_row(&self) -> RowCursor {
|
||||
assert!(self.is_valid());
|
||||
|
||||
RowCursor {
|
||||
batch: &self.batch,
|
||||
pos: self.pos,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns last row of current batch.
|
||||
///
|
||||
/// # Panics
|
||||
/// Panics if `self` is invalid.
|
||||
fn last_row(&self) -> RowCursor {
|
||||
assert!(self.is_valid());
|
||||
|
||||
RowCursor {
|
||||
batch: &self.batch,
|
||||
pos: self.batch.num_rows() - 1,
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn is_empty(&self) -> bool {
|
||||
self.pos >= self.batch.num_rows()
|
||||
}
|
||||
|
||||
/// Take slice of batch with at most `length` rows from the cursor, then
|
||||
/// advance the cursor.
|
||||
///
|
||||
/// # Panics
|
||||
/// Panics if `self` is invalid.
|
||||
fn take_batch_slice(&mut self, length: usize) -> Batch {
|
||||
let length = length.min(self.batch.num_rows() - self.pos);
|
||||
let batch = self.batch.slice(self.pos, length);
|
||||
self.pos += batch.num_rows();
|
||||
|
||||
batch
|
||||
}
|
||||
|
||||
/// Push at most `length` rows from `self` to the `builder` and advance the cursor.
|
||||
///
|
||||
/// # Panics
|
||||
/// Panics if `self` is invalid.
|
||||
fn push_rows_to(&mut self, builder: &mut BatchBuilder, length: usize) -> Result<()> {
|
||||
let length = length.min(self.batch.num_rows() - self.pos);
|
||||
builder.extend_slice_of(&self.batch, self.pos, length)?;
|
||||
self.pos += length;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Push next row from `self` to the `builder` and advance the cursor.
|
||||
///
|
||||
/// # Panics
|
||||
/// Panics if `self` is invalid.
|
||||
fn push_next_row_to(&mut self, builder: &mut BatchBuilder) -> Result<()> {
|
||||
builder.push_row_of(&self.batch, self.pos)?;
|
||||
self.pos += 1;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// A `Node` represent an individual input data source to be merged.
|
||||
struct Node {
|
||||
/// Schema of data source.
|
||||
schema: ProjectedSchemaRef,
|
||||
/// Data source of this `Node`.
|
||||
source: Source,
|
||||
/// Current batch to be read.
|
||||
///
|
||||
/// `None` means the `source` has reached EOF.
|
||||
cursor: Option<BatchCursor>,
|
||||
}
|
||||
|
||||
impl fmt::Debug for Node {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
f.debug_struct("Node")
|
||||
.field("source", &self.source)
|
||||
.field("cursor", &self.cursor)
|
||||
.finish_non_exhaustive()
|
||||
}
|
||||
}
|
||||
|
||||
impl Node {
|
||||
async fn new(schema: ProjectedSchemaRef, mut source: Source) -> Result<Node> {
|
||||
let cursor = source.next_non_empty_batch().await?.map(BatchCursor::new);
|
||||
Ok(Node {
|
||||
schema,
|
||||
source,
|
||||
cursor,
|
||||
})
|
||||
}
|
||||
|
||||
/// Returns the reference to the cursor.
|
||||
///
|
||||
/// # Panics
|
||||
/// Panics if `self` is EOF.
|
||||
fn cursor_ref(&self) -> &BatchCursor {
|
||||
self.cursor.as_ref().unwrap()
|
||||
}
|
||||
|
||||
/// Returns first row in cursor.
|
||||
///
|
||||
/// # Panics
|
||||
/// Panics if `self` is EOF.
|
||||
fn first_row(&self) -> RowCursor {
|
||||
self.cursor_ref().first_row()
|
||||
}
|
||||
|
||||
/// Returns last row in cursor.
|
||||
///
|
||||
/// # Panics
|
||||
/// Panics if `self` is EOF.
|
||||
fn last_row(&self) -> RowCursor {
|
||||
self.cursor_ref().last_row()
|
||||
}
|
||||
|
||||
/// Compare first row of two nodes.
|
||||
///
|
||||
/// # Panics
|
||||
/// Panics if
|
||||
/// - either `self` or `other` is EOF.
|
||||
fn compare_first_row(&self, other: &Node) -> Ordering {
|
||||
self.first_row().compare(&self.schema, &other.first_row())
|
||||
}
|
||||
|
||||
/// Returns true if no more batch could be fetched from this node.
|
||||
fn is_eof(&self) -> bool {
|
||||
self.cursor.is_none()
|
||||
}
|
||||
|
||||
/// Returns true if the key range of current batch in `self` is behind (exclusive) current
|
||||
/// batch in `other`.
|
||||
///
|
||||
/// # Panics
|
||||
/// Panics if
|
||||
/// - either `self` or `other` is EOF.
|
||||
fn is_behind(&self, other: &Node) -> bool {
|
||||
let first = self.first_row();
|
||||
let last = other.last_row();
|
||||
// `self` is after `other` if min (first) row of `self` is greater than
|
||||
// max (last) row of `other`.
|
||||
first.compare(&self.schema, &last) == Ordering::Greater
|
||||
}
|
||||
|
||||
/// Fetch next batch and reset its cursor if `self` isn't EOF and the cursor
|
||||
/// is empty.
|
||||
///
|
||||
/// Returns true if a new batch has been fetched.
|
||||
async fn maybe_fetch_next_batch(&mut self) -> Result<bool> {
|
||||
let need_fetch = !self.is_eof() && self.cursor_ref().is_empty();
|
||||
if !need_fetch {
|
||||
// Still has remaining rows, no need to fetch.
|
||||
return Ok(false);
|
||||
}
|
||||
|
||||
// This ensure the cursor is either non empty or None (EOF).
|
||||
match self.source.next_non_empty_batch().await? {
|
||||
Some(batch) => {
|
||||
self.cursor = Some(BatchCursor::new(batch));
|
||||
Ok(true)
|
||||
}
|
||||
None => {
|
||||
// EOF
|
||||
self.cursor = None;
|
||||
Ok(false)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the mutable reference to the cursor.
|
||||
///
|
||||
/// # Panics
|
||||
/// Panics if `self` is EOF.
|
||||
fn cursor_mut(&mut self) -> &mut BatchCursor {
|
||||
self.cursor.as_mut().unwrap()
|
||||
}
|
||||
|
||||
/// Take batch from this node.
|
||||
///
|
||||
/// # Panics
|
||||
/// Panics if `self` is EOF.
|
||||
fn take_batch_slice(&mut self, length: usize) -> Batch {
|
||||
self.cursor_mut().take_batch_slice(length)
|
||||
}
|
||||
|
||||
/// Push at most `length` rows from `self` to the `builder`.
|
||||
///
|
||||
/// # Panics
|
||||
/// Panics if `self` is EOF.
|
||||
fn push_rows_to(&mut self, builder: &mut BatchBuilder, length: usize) -> Result<()> {
|
||||
self.cursor_mut().push_rows_to(builder, length)
|
||||
}
|
||||
|
||||
/// Push next row from `self` to the `builder`.
|
||||
///
|
||||
/// # Panics
|
||||
/// Panics if `self` is EOF.
|
||||
fn push_next_row_to(&mut self, builder: &mut BatchBuilder) -> Result<()> {
|
||||
self.cursor_mut().push_next_row_to(builder)
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialEq for Node {
|
||||
fn eq(&self, other: &Node) -> bool {
|
||||
self.compare_first_row(other) == Ordering::Equal
|
||||
}
|
||||
}
|
||||
|
||||
impl Eq for Node {}
|
||||
|
||||
impl PartialOrd for Node {
|
||||
fn partial_cmp(&self, other: &Node) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
impl Ord for Node {
|
||||
fn cmp(&self, other: &Node) -> Ordering {
|
||||
// The std binary heap is a max heap, but we want the nodes are ordered in
|
||||
// ascend order, so we compare the nodes in reverse order.
|
||||
other.compare_first_row(self)
|
||||
}
|
||||
}
|
||||
|
||||
/// A reader that would sort and merge `Batch` from multiple sources by key.
|
||||
///
|
||||
/// `Batch` from each `Source` **must** be sorted.
|
||||
pub struct MergeReader {
|
||||
/// Whether the reader has been initialized.
|
||||
initialized: bool,
|
||||
/// Schema of data source.
|
||||
schema: ProjectedSchemaRef,
|
||||
/// Input data sources.
|
||||
///
|
||||
/// All data source must have same schema. Initialize the reader would
|
||||
/// convert all `Source`s into `Node`s and then clear this vector.
|
||||
sources: Vec<Source>,
|
||||
/// Holds `Node` whose key range of current batch **is** overlapped with the merge window.
|
||||
///
|
||||
/// `Node` in this heap **must** not be empty. A `merge window` is the key range of the
|
||||
/// root node in the `hot` heap.
|
||||
hot: BinaryHeap<Node>,
|
||||
/// Holds `Node` whose key range of current batch **isn't** overlapped with the merge window.
|
||||
///
|
||||
/// `Node` in this heap **must** not be empty.
|
||||
cold: BinaryHeap<Node>,
|
||||
/// Suggested row number of each batch.
|
||||
///
|
||||
/// The size of the batch yield from this reader may not always equal to this suggested size.
|
||||
batch_size: usize,
|
||||
/// Buffered batch.
|
||||
batch_builder: BatchBuilder,
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl BatchReader for MergeReader {
|
||||
async fn next_batch(&mut self) -> Result<Option<Batch>> {
|
||||
self.fetch_next_batch().await
|
||||
}
|
||||
}
|
||||
|
||||
pub struct MergeReaderBuilder {
|
||||
schema: ProjectedSchemaRef,
|
||||
sources: Vec<Source>,
|
||||
batch_size: usize,
|
||||
}
|
||||
|
||||
impl MergeReaderBuilder {
|
||||
pub fn new(schema: ProjectedSchemaRef) -> Self {
|
||||
MergeReaderBuilder::with_capacity(schema, 0)
|
||||
}
|
||||
|
||||
pub fn with_capacity(schema: ProjectedSchemaRef, capacity: usize) -> Self {
|
||||
MergeReaderBuilder {
|
||||
schema,
|
||||
sources: Vec::with_capacity(capacity),
|
||||
batch_size: consts::READ_BATCH_SIZE,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn push_batch_iter(mut self, iter: BoxedBatchIterator) -> Self {
|
||||
self.sources.push(Source::Iter(iter));
|
||||
self
|
||||
}
|
||||
|
||||
pub fn push_batch_reader(mut self, reader: BoxedBatchReader) -> Self {
|
||||
self.sources.push(Source::Reader(reader));
|
||||
self
|
||||
}
|
||||
|
||||
pub fn batch_size(mut self, size: usize) -> Self {
|
||||
self.batch_size = size;
|
||||
self
|
||||
}
|
||||
|
||||
pub fn build(self) -> MergeReader {
|
||||
let num_sources = self.sources.len();
|
||||
let column_schemas = self.schema.schema_to_read().schema().column_schemas();
|
||||
let batch_builder = BatchBuilder::with_capacity(
|
||||
column_schemas.iter().map(|c| &c.data_type),
|
||||
self.batch_size,
|
||||
);
|
||||
|
||||
MergeReader {
|
||||
initialized: false,
|
||||
schema: self.schema,
|
||||
sources: self.sources,
|
||||
hot: BinaryHeap::with_capacity(num_sources),
|
||||
cold: BinaryHeap::with_capacity(num_sources),
|
||||
batch_size: self.batch_size,
|
||||
batch_builder,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl MergeReader {
|
||||
/// Initialize the reader if it has not yet been initialized.
|
||||
async fn try_init(&mut self) -> Result<()> {
|
||||
if self.initialized {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
if self.sources.is_empty() {
|
||||
self.initialized = true;
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
for source in self.sources.drain(..) {
|
||||
let node = Node::new(self.schema.clone(), source).await?;
|
||||
|
||||
if !node.is_eof() {
|
||||
self.cold.push(node);
|
||||
}
|
||||
}
|
||||
|
||||
self.refill_hot();
|
||||
|
||||
self.initialized = true;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn fetch_next_batch(&mut self) -> Result<Option<Batch>> {
|
||||
self.try_init().await?;
|
||||
|
||||
while !self.hot.is_empty() && self.batch_builder.num_rows() < self.batch_size {
|
||||
if self.hot.len() == 1 {
|
||||
// No need to do merge sort if only one batch in the hot heap.
|
||||
let fetch_row_num = self.batch_size - self.batch_builder.num_rows();
|
||||
if let Some(batch) = self.fetch_batch_from_hottest(fetch_row_num).await? {
|
||||
// The builder is empty and we have fetched a new batch from this node.
|
||||
return Ok(Some(batch));
|
||||
}
|
||||
// Otherwise, some rows may have been pushed into the builder.
|
||||
} else {
|
||||
// We could only fetch one row from the hottest node.
|
||||
self.fetch_one_row_from_hottest().await?;
|
||||
}
|
||||
}
|
||||
|
||||
// Check buffered rows in the builder.
|
||||
if self.batch_builder.is_empty() {
|
||||
Ok(None)
|
||||
} else {
|
||||
self.batch_builder.build().map(Some)
|
||||
}
|
||||
}
|
||||
|
||||
/// Move nodes in `cold` heap, whose key range is overlapped with current merge
|
||||
/// window to `hot` heap.
|
||||
fn refill_hot(&mut self) {
|
||||
while !self.cold.is_empty() {
|
||||
if let Some(merge_window) = self.hot.peek() {
|
||||
let warmest = self.cold.peek().unwrap();
|
||||
if warmest.is_behind(merge_window) {
|
||||
// if the warmest node in the `cold` heap is totally after the
|
||||
// `merge_window`, then no need to add more nodes into the `hot`
|
||||
// heap for merge sorting.
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
let warmest = self.cold.pop().unwrap();
|
||||
self.hot.push(warmest);
|
||||
}
|
||||
}
|
||||
|
||||
/// Fetch at most `fetch_row_num` from the hottest node and attempt to return them directly
|
||||
/// instead of pushing into the builder if the `self.batch_builder` is empty.
|
||||
async fn fetch_batch_from_hottest(&mut self, fetch_row_num: usize) -> Result<Option<Batch>> {
|
||||
assert_eq!(1, self.hot.len());
|
||||
|
||||
let mut hottest = self.hot.pop().unwrap();
|
||||
let batch = if self.batch_builder.is_empty() {
|
||||
Some(hottest.take_batch_slice(fetch_row_num))
|
||||
} else {
|
||||
hottest.push_rows_to(&mut self.batch_builder, fetch_row_num)?;
|
||||
|
||||
None
|
||||
};
|
||||
|
||||
self.reheap(hottest).await?;
|
||||
|
||||
Ok(batch)
|
||||
}
|
||||
|
||||
/// Fetch one row from the hottest node.
|
||||
async fn fetch_one_row_from_hottest(&mut self) -> Result<()> {
|
||||
let mut hottest = self.hot.pop().unwrap();
|
||||
hottest.push_next_row_to(&mut self.batch_builder)?;
|
||||
|
||||
self.reheap(hottest).await
|
||||
}
|
||||
|
||||
/// Fetch next batch from this node and reset its cursor, then push the node back to a
|
||||
/// proper heap.
|
||||
async fn reheap(&mut self, mut node: Node) -> Result<()> {
|
||||
let fetched_new_batch = node.maybe_fetch_next_batch().await?;
|
||||
|
||||
if node.is_eof() {
|
||||
// The merge window would be updated, need to refill the hot heap.
|
||||
self.refill_hot();
|
||||
} else if fetched_new_batch {
|
||||
// A new batch has been fetched from the node, thus the key range of this node
|
||||
// has been changed. Try to find a proper heap for this node.
|
||||
let node_is_cold = if let Some(hottest) = self.hot.peek() {
|
||||
// Now key range of this node is behind the hottest node's.
|
||||
node.is_behind(hottest)
|
||||
} else {
|
||||
// Setting this to false should not affect correctness but performance because
|
||||
// `refille_hot()` ensures the hottest node is correct.
|
||||
true
|
||||
};
|
||||
|
||||
if node_is_cold {
|
||||
self.cold.push(node);
|
||||
} else {
|
||||
self.hot.push(node);
|
||||
}
|
||||
// Anyway, the merge window has been changed, we need to refill the hot heap.
|
||||
self.refill_hot();
|
||||
} else {
|
||||
// No new batch has been fetched, so the end key of merge window has not been
|
||||
// changed, we could just put the node back to the hot heap.
|
||||
self.hot.push(node);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use datatypes::prelude::ScalarVector;
    use datatypes::vectors::{Int64Vector, TimestampMillisecondVector};

    use super::*;
    use crate::test_util::read_util::{self, Batches};

    /// A reader without any source yields nothing, even when polled repeatedly.
    #[tokio::test]
    async fn test_merge_reader_empty() {
        let schema = read_util::new_projected_schema();

        let mut reader = MergeReaderBuilder::new(schema).build();

        assert!(reader.next_batch().await.unwrap().is_none());
        // Call next_batch() again is allowed.
        assert!(reader.next_batch().await.unwrap().is_none());
    }

    /// Checks ordering, equality and `Debug` output of [Node].
    #[tokio::test]
    async fn test_node() {
        let schema = read_util::new_projected_schema();
        let left_source = read_util::build_boxed_iter(&[&[(1, None), (3, None), (5, None)]]);
        let mut left = Node::new(schema.clone(), Source::Iter(left_source))
            .await
            .unwrap();

        let right_source = read_util::build_boxed_reader(&[&[(2, None), (3, None), (6, None)]]);
        let mut right = Node::new(schema.clone(), Source::Reader(right_source))
            .await
            .unwrap();

        // We use reverse order for a node.
        assert!(left > right);
        assert_ne!(left, right);

        // Advance the left and right node.
        left.cursor_mut().pos += 1;
        right.cursor_mut().pos += 1;
        assert_eq!(left, right);

        // Check Debug is implemented.
        let output = format!("{left:?}");
        assert!(output.contains("cursor"));
        assert!(output.contains("pos: 1"));
        let output = format!("{right:?}");
        assert!(output.contains("cursor"));
        let output = format!("{:?}", left.first_row());
        assert!(output.contains("pos: 1"));
    }

    /// Builds a [MergeReader] over `sources`. The first `num_iter` sources are pushed as
    /// batch iterators, the remaining ones as batch readers.
    fn build_merge_reader(sources: &[Batches], num_iter: usize, batch_size: usize) -> MergeReader {
        let schema = read_util::new_projected_schema();
        let mut builder =
            MergeReaderBuilder::with_capacity(schema, sources.len()).batch_size(batch_size);

        for (i, source) in sources.iter().enumerate() {
            if i < num_iter {
                builder = builder.push_batch_iter(read_util::build_boxed_iter(source));
            } else {
                builder = builder.push_batch_reader(read_util::build_boxed_reader(source));
            }
        }

        builder.build()
    }

    /// Asserts that `reader` yields every row of `input`, sorted by key.
    async fn check_merge_reader_result(mut reader: MergeReader, input: &[Batches<'_>]) {
        let mut expect: Vec<_> = input
            .iter()
            .flat_map(|v| v.iter())
            .flat_map(|v| v.iter().copied())
            .collect();
        expect.sort_by_key(|k| k.0);

        let result = read_util::collect_kv_batch(&mut reader).await;
        assert_eq!(expect, result);

        // Call next_batch() again is allowed.
        assert!(reader.next_batch().await.unwrap().is_none());
    }

    /// Asserts that `reader` yields exactly the batches in `expect_batches`, in order.
    async fn check_merge_reader_by_batch(mut reader: MergeReader, expect_batches: Batches<'_>) {
        let mut result = Vec::new();
        while let Some(batch) = reader.next_batch().await.unwrap() {
            let key = batch
                .column(0)
                .as_any()
                .downcast_ref::<TimestampMillisecondVector>()
                .unwrap();
            let value = batch
                .column(1)
                .as_any()
                .downcast_ref::<Int64Vector>()
                .unwrap();

            let batch: Vec<_> = key
                .iter_data()
                .zip(value.iter_data())
                .map(|(k, v)| (k.unwrap().into(), v))
                .collect();
            result.push(batch);
        }

        for (expect, actual) in expect_batches.iter().zip(result.iter()) {
            assert_eq!(expect, actual);
        }
        // `zip` stops at the shorter side, so missing or extra batches would otherwise go
        // unnoticed.
        assert_eq!(
            expect_batches.len(),
            result.len(),
            "unexpected number of batches"
        );
    }

    #[tokio::test]
    async fn test_merge_multiple_interleave() {
        common_telemetry::init_default_ut_logging();

        let input: &[Batches] = &[
            &[&[(1, Some(1)), (5, Some(5)), (9, Some(9))]],
            &[&[(2, Some(2)), (3, Some(3)), (8, Some(8))]],
            &[&[(7, Some(7)), (12, Some(12))]],
        ];
        let reader = build_merge_reader(input, 1, 3);
        check_merge_reader_result(reader, input).await;

        let input: &[Batches] = &[
            &[
                &[(1, Some(1)), (2, Some(2))],
                &[(3, Some(3)), (4, Some(4))],
                &[(5, Some(5)), (12, Some(12))],
            ],
            &[&[(6, Some(6)), (7, Some(7)), (18, Some(18))]],
            &[&[(13, Some(13)), (15, Some(15))]],
        ];
        let reader = build_merge_reader(input, 1, 3);
        check_merge_reader_by_batch(
            reader,
            &[
                // The former two batches could be returned directly.
                &[(1, Some(1)), (2, Some(2))],
                &[(3, Some(3)), (4, Some(4))],
                &[(5, Some(5)), (6, Some(6)), (7, Some(7))],
                &[(12, Some(12)), (13, Some(13)), (15, Some(15))],
                &[(18, Some(18))],
            ],
        )
        .await;

        let input: &[Batches] = &[
            &[
                &[(1, Some(1)), (2, Some(2))],
                &[(5, Some(5)), (9, Some(9))],
                &[(14, Some(14)), (17, Some(17))],
            ],
            &[&[(6, Some(6)), (7, Some(7))], &[(15, Some(15))]],
        ];
        let reader = build_merge_reader(input, 1, 2);
        check_merge_reader_by_batch(
            reader,
            &[
                &[(1, Some(1)), (2, Some(2))],
                // Could not return batch (6, 7) directly.
                &[(5, Some(5)), (6, Some(6))],
                &[(7, Some(7)), (9, Some(9))],
                &[(14, Some(14)), (15, Some(15))],
                &[(17, Some(17))],
            ],
        )
        .await;
    }

    #[tokio::test]
    async fn test_merge_one_source() {
        common_telemetry::init_default_ut_logging();

        let input: &[Batches] = &[&[
            &[(1, Some(1)), (2, Some(2)), (3, Some(3))],
            &[(4, Some(4)), (5, Some(5)), (6, Some(6))],
        ]];
        let reader = build_merge_reader(input, 1, 2);

        check_merge_reader_result(reader, input).await;
    }

    #[tokio::test]
    async fn test_merge_with_empty_batch() {
        let input: &[Batches] = &[
            &[
                &[(1, Some(1)), (2, Some(2))],
                &[(3, Some(3)), (6, Some(6))],
                &[],
                &[],
                &[(8, Some(8)), (12, Some(12))],
                &[],
            ],
            &[
                &[(4, Some(4)), (5, Some(5))],
                &[],
                &[(15, None), (18, None), (20, None)],
            ],
            &[&[(13, Some(13)), (19, None)], &[], &[]],
        ];
        let reader = build_merge_reader(input, 1, 2);

        check_merge_reader_result(reader, input).await;
    }

    #[tokio::test]
    async fn test_merge_duplicate_key() {
        let input: &[Batches] = &[
            &[
                &[(1, Some(1)), (5, Some(5)), (8, Some(8))],
                &[(9, None), (11, None)],
                &[(12, Some(12)), (15, None)],
            ],
            &[&[(1, Some(1)), (3, Some(3)), (8, Some(8))], &[(16, None)]],
            &[
                &[(7, Some(7)), (12, Some(12))],
                &[(15, None), (16, None), (17, None)],
            ],
            &[&[(15, None)]],
        ];
        let reader = build_merge_reader(input, 2, 2);
        check_merge_reader_result(reader, input).await;
    }
}
|
||||
@@ -1,171 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use arrow::compute::SortOptions;
|
||||
use arrow::row::{RowConverter, SortField};
|
||||
use arrow_array::{Array, ArrayRef};
|
||||
use common_recordbatch::OrderOption;
|
||||
use datatypes::data_type::DataType;
|
||||
use datatypes::vectors::Helper;
|
||||
use snafu::ResultExt;
|
||||
|
||||
use crate::error::{self, Result};
|
||||
use crate::read::{Batch, BatchReader};
|
||||
use crate::schema::{ProjectedSchemaRef, StoreSchema};
|
||||
|
||||
/// [WindowedReader] provides a windowed record batch reader that scans all rows within a window
|
||||
/// at a time and sort these rows ordered in `[<timestamp>, <PK>]` order.
|
||||
pub struct WindowedReader<R> {
|
||||
/// Schema to read
|
||||
pub schema: ProjectedSchemaRef,
|
||||
/// Each reader reads a slice of time window
|
||||
pub readers: Vec<R>,
|
||||
/// `order_options` defines how records within windows are sorted.
|
||||
pub order_options: Vec<OrderOption>,
|
||||
}
|
||||
|
||||
impl<R> WindowedReader<R> {
|
||||
/// Creates a new [WindowedReader] from given schema and a set of boxed readers.
|
||||
///
|
||||
/// ### Note
|
||||
/// [WindowedReader] always reads the readers in a reverse order. The last reader in `readers`
|
||||
/// gets polled first.
|
||||
pub fn new(
|
||||
schema: ProjectedSchemaRef,
|
||||
readers: Vec<R>,
|
||||
order_options: Vec<OrderOption>,
|
||||
) -> Self {
|
||||
Self {
|
||||
schema,
|
||||
readers,
|
||||
order_options,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl<R> BatchReader for WindowedReader<R>
|
||||
where
|
||||
R: BatchReader,
|
||||
{
|
||||
async fn next_batch(&mut self) -> Result<Option<Batch>> {
|
||||
let _window_scan_elapsed = crate::metrics::WINDOW_SCAN_ELAPSED.start_timer();
|
||||
let Some(mut reader) = self.readers.pop() else {
|
||||
return Ok(None);
|
||||
};
|
||||
|
||||
let store_schema = self.schema.schema_to_read();
|
||||
let mut batches = vec![];
|
||||
while let Some(batch) = reader.next_batch().await? {
|
||||
batches.push(
|
||||
batch
|
||||
.columns
|
||||
.into_iter()
|
||||
.map(|v| v.to_arrow_array())
|
||||
.collect::<Vec<_>>(),
|
||||
);
|
||||
}
|
||||
|
||||
let Some(num_columns) = batches.get(0).map(|b| b.len()) else {
|
||||
// the reader does not yield data, a batch of empty vectors must be returned instead of
|
||||
// an empty batch without any column.
|
||||
let empty_columns = store_schema
|
||||
.columns()
|
||||
.iter()
|
||||
.map(|s| s.desc.data_type.create_mutable_vector(0).to_vector())
|
||||
.collect();
|
||||
return Ok(Some(Batch::new(empty_columns)));
|
||||
};
|
||||
let mut vectors_in_batch = Vec::with_capacity(num_columns);
|
||||
|
||||
for idx in 0..num_columns {
|
||||
let columns: Vec<&dyn Array> =
|
||||
batches.iter().map(|b| b[idx].as_ref()).collect::<Vec<_>>();
|
||||
vectors_in_batch
|
||||
.push(arrow::compute::concat(&columns).context(error::ConvertColumnsToRowsSnafu)?);
|
||||
}
|
||||
if let Some(v) = vectors_in_batch.get(0) {
|
||||
crate::metrics::WINDOW_SCAN_ROWS_PER_WINDOW.observe(v.len() as f64);
|
||||
}
|
||||
let sorted = sort_by_rows(&self.schema, vectors_in_batch, &self.order_options)?;
|
||||
let vectors = sorted
|
||||
.iter()
|
||||
.zip(store_schema.columns().iter().map(|c| &c.desc.name))
|
||||
.map(|(arr, name)| {
|
||||
Helper::try_into_vector(arr).context(error::ConvertChunkSnafu { name })
|
||||
})
|
||||
.collect::<Result<_>>()?;
|
||||
Ok(Some(Batch::new(vectors)))
|
||||
}
|
||||
}
|
||||
|
||||
fn sort_by_rows(
|
||||
schema: &ProjectedSchemaRef,
|
||||
arrays: Vec<ArrayRef>,
|
||||
order_options: &[OrderOption],
|
||||
) -> Result<Vec<ArrayRef>> {
|
||||
let store_schema = schema.schema_to_read();
|
||||
let sort_columns = build_sorted_columns(store_schema, order_options);
|
||||
// Convert columns to rows to speed lexicographic sort
|
||||
// TODO(hl): maybe optimize to lexsort_to_index when only timestamp column is involved.
|
||||
let row_converter = RowConverter::new(
|
||||
sort_columns
|
||||
.iter()
|
||||
.map(|(idx, descending)| {
|
||||
SortField::new_with_options(
|
||||
store_schema.columns()[*idx].desc.data_type.as_arrow_type(),
|
||||
SortOptions {
|
||||
descending: *descending,
|
||||
nulls_first: true,
|
||||
},
|
||||
)
|
||||
})
|
||||
.collect(),
|
||||
)
|
||||
.context(error::ConvertColumnsToRowsSnafu)?;
|
||||
|
||||
let columns_to_sort = sort_columns
|
||||
.into_iter()
|
||||
.map(|(idx, _)| arrays[idx].clone())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let rows_to_sort = row_converter
|
||||
.convert_columns(&columns_to_sort)
|
||||
.context(error::ConvertColumnsToRowsSnafu)?;
|
||||
|
||||
let mut sort_pairs = rows_to_sort.iter().enumerate().collect::<Vec<_>>();
|
||||
sort_pairs.sort_unstable_by(|(_, a), (_, b)| a.cmp(b));
|
||||
|
||||
let idx =
|
||||
arrow::array::UInt32Array::from_iter_values(sort_pairs.iter().map(|(i, _)| *i as u32));
|
||||
|
||||
let sorted = arrays
|
||||
.iter()
|
||||
.map(|arr| arrow::compute::take(arr, &idx, None))
|
||||
.collect::<arrow::error::Result<Vec<_>>>()
|
||||
.context(error::SortArraysSnafu)?;
|
||||
|
||||
debug_assert_eq!(sorted.len(), store_schema.num_columns());
|
||||
|
||||
Ok(sorted)
|
||||
}
|
||||
|
||||
/// Builds sorted columns from `order_options`.
|
||||
/// Returns a vector of columns indices to sort and sort orders (true means descending order).
|
||||
fn build_sorted_columns(schema: &StoreSchema, order_options: &[OrderOption]) -> Vec<(usize, bool)> {
|
||||
order_options
|
||||
.iter()
|
||||
.map(|o| (schema.column_index(&o.name), o.options.descending))
|
||||
.collect()
|
||||
}
|
||||
@@ -1,808 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests;
|
||||
mod writer;
|
||||
|
||||
use std::collections::BTreeMap;
|
||||
use std::fmt;
|
||||
use std::sync::atomic::{AtomicI64, Ordering};
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use async_trait::async_trait;
|
||||
use common_telemetry::{info, logging};
|
||||
use common_time::util;
|
||||
use snafu::ResultExt;
|
||||
use store_api::logstore::LogStore;
|
||||
use store_api::manifest::{
|
||||
self, Manifest, ManifestLogStorage, ManifestVersion, MetaActionIterator,
|
||||
};
|
||||
use store_api::storage::{
|
||||
AlterRequest, CloseContext, CompactContext, CompactionStrategy, FlushContext, FlushReason,
|
||||
OpenOptions, ReadContext, Region, RegionId, SequenceNumber, WriteContext, WriteResponse,
|
||||
};
|
||||
|
||||
use crate::compaction::{
|
||||
compaction_strategy_to_picker, CompactionPickerRef, CompactionSchedulerRef,
|
||||
};
|
||||
use crate::config::EngineConfig;
|
||||
use crate::error::{self, Error, Result};
|
||||
use crate::file_purger::FilePurgerRef;
|
||||
use crate::flush::{FlushSchedulerRef, FlushStrategyRef};
|
||||
use crate::manifest::action::{
|
||||
RawRegionMetadata, RegionChange, RegionCheckpoint, RegionMetaAction, RegionMetaActionList,
|
||||
};
|
||||
use crate::manifest::region::RegionManifest;
|
||||
use crate::memtable::{MemtableBuilderRef, MemtableVersion};
|
||||
use crate::metadata::{RegionMetaImpl, RegionMetadata, RegionMetadataRef};
|
||||
pub(crate) use crate::region::writer::schedule_compaction;
|
||||
pub use crate::region::writer::{
|
||||
AlterContext, RegionWriter, RegionWriterRef, WriterCompactRequest, WriterContext,
|
||||
};
|
||||
use crate::region::writer::{DropContext, TruncateContext};
|
||||
use crate::schema::compat::CompatWrite;
|
||||
use crate::snapshot::SnapshotImpl;
|
||||
use crate::sst::{AccessLayerRef, LevelMetas};
|
||||
use crate::version::{
|
||||
Version, VersionControl, VersionControlRef, VersionEdit, INIT_COMMITTED_SEQUENCE,
|
||||
};
|
||||
use crate::wal::Wal;
|
||||
use crate::write_batch::WriteBatch;
|
||||
|
||||
/// [Region] implementation.
|
||||
pub struct RegionImpl<S: LogStore> {
|
||||
inner: Arc<RegionInner<S>>,
|
||||
}
|
||||
|
||||
impl<S: LogStore> Clone for RegionImpl<S> {
|
||||
fn clone(&self) -> Self {
|
||||
Self {
|
||||
inner: self.inner.clone(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<S: LogStore> fmt::Debug for RegionImpl<S> {
    /// Formats the region as a `RegionImpl` struct showing its id, name and main components.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let inner = &self.inner;
        let mut debug = f.debug_struct("RegionImpl");
        debug
            .field("id", &inner.shared.id)
            .field("name", &inner.shared.name)
            .field("wal", &inner.wal)
            .field("flush_strategy", &inner.flush_strategy)
            .field("compaction_scheduler", &inner.compaction_scheduler)
            .field("sst_layer", &inner.sst_layer)
            .field("manifest", &inner.manifest);
        debug.finish()
    }
}
|
||||
|
||||
#[async_trait]
|
||||
impl<S: LogStore> Region for RegionImpl<S> {
|
||||
type Error = Error;
|
||||
type Meta = RegionMetaImpl;
|
||||
type WriteRequest = WriteBatch;
|
||||
type Snapshot = SnapshotImpl;
|
||||
|
||||
fn id(&self) -> RegionId {
|
||||
self.inner.shared.id
|
||||
}
|
||||
|
||||
fn name(&self) -> &str {
|
||||
&self.inner.shared.name
|
||||
}
|
||||
|
||||
fn in_memory_metadata(&self) -> RegionMetaImpl {
|
||||
self.inner.in_memory_metadata()
|
||||
}
|
||||
|
||||
async fn write(&self, ctx: &WriteContext, mut request: WriteBatch) -> Result<WriteResponse> {
|
||||
// Compat the schema of the write batch outside of the write lock.
|
||||
self.inner.compat_write_batch(&mut request)?;
|
||||
|
||||
self.inner.write(ctx, request).await
|
||||
}
|
||||
|
||||
fn snapshot(&self, _ctx: &ReadContext) -> Result<SnapshotImpl> {
|
||||
Ok(self.inner.create_snapshot())
|
||||
}
|
||||
|
||||
fn write_request(&self) -> Self::WriteRequest {
|
||||
let metadata = self.inner.version_control().metadata();
|
||||
let user_schema = metadata.user_schema().clone();
|
||||
let row_key_end = metadata.schema().store_schema().row_key_end();
|
||||
|
||||
WriteBatch::new(user_schema, row_key_end)
|
||||
}
|
||||
|
||||
async fn alter(&self, request: AlterRequest) -> Result<()> {
|
||||
self.inner.alter(request).await
|
||||
}
|
||||
|
||||
async fn drop_region(&self) -> Result<()> {
|
||||
crate::metrics::REGION_COUNT.dec();
|
||||
self.inner.drop_region().await
|
||||
}
|
||||
|
||||
fn disk_usage_bytes(&self) -> u64 {
|
||||
let version = self.inner.version_control().current();
|
||||
version
|
||||
.ssts()
|
||||
.levels()
|
||||
.iter()
|
||||
.map(|level_ssts| level_ssts.files().map(|sst| sst.file_size()).sum::<u64>())
|
||||
.sum()
|
||||
}
|
||||
|
||||
async fn flush(&self, ctx: &FlushContext) -> Result<()> {
|
||||
self.inner.flush(ctx).await
|
||||
}
|
||||
|
||||
async fn compact(&self, ctx: &CompactContext) -> std::result::Result<(), Self::Error> {
|
||||
self.inner.compact(ctx).await
|
||||
}
|
||||
|
||||
async fn truncate(&self) -> Result<()> {
|
||||
self.inner.truncate().await
|
||||
}
|
||||
}
|
||||
|
||||
/// Storage related config for region.
|
||||
///
|
||||
/// Contains all necessary storage related components needed by the region, such as logstore,
|
||||
/// manifest, memtable builder.
|
||||
pub struct StoreConfig<S: LogStore> {
|
||||
pub log_store: Arc<S>,
|
||||
pub sst_layer: AccessLayerRef,
|
||||
pub manifest: RegionManifest,
|
||||
pub memtable_builder: MemtableBuilderRef,
|
||||
pub flush_scheduler: FlushSchedulerRef<S>,
|
||||
pub flush_strategy: FlushStrategyRef,
|
||||
pub compaction_scheduler: CompactionSchedulerRef<S>,
|
||||
pub engine_config: Arc<EngineConfig>,
|
||||
pub file_purger: FilePurgerRef,
|
||||
pub ttl: Option<Duration>,
|
||||
pub write_buffer_size: usize,
|
||||
pub compaction_strategy: CompactionStrategy,
|
||||
}
|
||||
|
||||
pub type RecoveredMetadata = (SequenceNumber, (ManifestVersion, RawRegionMetadata));
|
||||
pub type RecoveredMetadataMap = BTreeMap<SequenceNumber, (ManifestVersion, RawRegionMetadata)>;
|
||||
|
||||
impl<S: LogStore> RegionImpl<S> {
|
||||
/// Create a new region and also persist the region metadata to manifest.
|
||||
///
|
||||
/// The caller should avoid calling this method simultaneously.
|
||||
pub async fn create(
|
||||
metadata: RegionMetadata,
|
||||
store_config: StoreConfig<S>,
|
||||
) -> Result<RegionImpl<S>> {
|
||||
let metadata = Arc::new(metadata);
|
||||
|
||||
// Try to persist region data to manifest, ensure the new region could be recovered from
|
||||
// the manifest.
|
||||
let manifest_version = {
|
||||
let _timer = crate::metrics::CREATE_REGION_UPDATE_MANIFEST.start_timer();
|
||||
store_config
|
||||
.manifest
|
||||
.update(RegionMetaActionList::with_action(RegionMetaAction::Change(
|
||||
RegionChange {
|
||||
metadata: metadata.as_ref().into(),
|
||||
committed_sequence: INIT_COMMITTED_SEQUENCE,
|
||||
},
|
||||
)))
|
||||
.await?
|
||||
};
|
||||
|
||||
let mutable_memtable = store_config
|
||||
.memtable_builder
|
||||
.build(metadata.schema().clone());
|
||||
let version = Version::with_manifest_version(
|
||||
metadata,
|
||||
manifest_version,
|
||||
mutable_memtable,
|
||||
store_config.sst_layer.clone(),
|
||||
store_config.file_purger.clone(),
|
||||
);
|
||||
let region = RegionImpl::new(version, store_config);
|
||||
crate::metrics::REGION_COUNT.inc();
|
||||
|
||||
Ok(region)
|
||||
}
|
||||
|
||||
/// Create a new region without persisting manifest.
|
||||
fn new(version: Version, store_config: StoreConfig<S>) -> RegionImpl<S> {
|
||||
let metadata = version.metadata();
|
||||
let id = metadata.id();
|
||||
let name = metadata.name().to_string();
|
||||
let version_control = VersionControl::with_version(version);
|
||||
let wal = Wal::new(id, store_config.log_store);
|
||||
|
||||
let compaction_picker = compaction_strategy_to_picker(&store_config.compaction_strategy);
|
||||
let inner = Arc::new(RegionInner {
|
||||
shared: Arc::new(SharedData {
|
||||
id,
|
||||
name,
|
||||
version_control: Arc::new(version_control),
|
||||
last_flush_millis: AtomicI64::new(0),
|
||||
}),
|
||||
writer: Arc::new(RegionWriter::new(
|
||||
store_config.memtable_builder,
|
||||
store_config.engine_config.clone(),
|
||||
store_config.ttl,
|
||||
store_config.write_buffer_size,
|
||||
store_config.compaction_scheduler.clone(),
|
||||
compaction_picker.clone(),
|
||||
)),
|
||||
wal,
|
||||
flush_strategy: store_config.flush_strategy,
|
||||
flush_scheduler: store_config.flush_scheduler,
|
||||
compaction_scheduler: store_config.compaction_scheduler,
|
||||
compaction_picker,
|
||||
sst_layer: store_config.sst_layer,
|
||||
manifest: store_config.manifest,
|
||||
});
|
||||
|
||||
RegionImpl { inner }
|
||||
}
|
||||
|
||||
/// Open an existing region and recover its data.
|
||||
///
|
||||
/// The caller should avoid calling this method simultaneously.
|
||||
pub async fn open(
|
||||
name: String,
|
||||
store_config: StoreConfig<S>,
|
||||
_opts: &OpenOptions,
|
||||
) -> Result<Option<RegionImpl<S>>> {
|
||||
// Load version meta data from manifest.
|
||||
let (version, mut recovered_metadata) = match Self::recover_from_manifest(
|
||||
&store_config.manifest,
|
||||
&store_config.memtable_builder,
|
||||
&store_config.sst_layer,
|
||||
&store_config.file_purger,
|
||||
)
|
||||
.await?
|
||||
{
|
||||
(None, _) => return Ok(None),
|
||||
(Some(v), m) => (v, m),
|
||||
};
|
||||
|
||||
logging::debug!(
|
||||
"Region recovered version from manifest, version: {:?}",
|
||||
version
|
||||
);
|
||||
|
||||
let metadata = version.metadata().clone();
|
||||
let flushed_sequence = version.flushed_sequence();
|
||||
let version_control = Arc::new(VersionControl::with_version(version));
|
||||
|
||||
let recovered_metadata_after_flushed =
|
||||
recovered_metadata.split_off(&(flushed_sequence + 1));
|
||||
// apply the last flushed metadata
|
||||
if let Some((sequence, (manifest_version, metadata))) = recovered_metadata.pop_last() {
|
||||
let metadata: RegionMetadataRef = Arc::new(
|
||||
metadata
|
||||
.try_into()
|
||||
.context(error::InvalidRawRegionSnafu { region: &name })?,
|
||||
);
|
||||
let mutable_memtable = store_config
|
||||
.memtable_builder
|
||||
.build(metadata.schema().clone());
|
||||
version_control.freeze_mutable_and_apply_metadata(
|
||||
metadata,
|
||||
manifest_version,
|
||||
mutable_memtable,
|
||||
);
|
||||
|
||||
logging::debug!(
|
||||
"Applied the last flushed metadata to region: {}, sequence: {}, manifest: {}",
|
||||
name,
|
||||
sequence,
|
||||
manifest_version,
|
||||
);
|
||||
}
|
||||
|
||||
let wal = Wal::new(metadata.id(), store_config.log_store);
|
||||
wal.obsolete(flushed_sequence).await?;
|
||||
info!(
|
||||
"Obsolete WAL entries on startup, region: {}, flushed sequence: {}",
|
||||
metadata.id(),
|
||||
flushed_sequence
|
||||
);
|
||||
|
||||
let shared = Arc::new(SharedData {
|
||||
id: metadata.id(),
|
||||
name,
|
||||
version_control,
|
||||
last_flush_millis: AtomicI64::new(0),
|
||||
});
|
||||
|
||||
let compaction_picker = compaction_strategy_to_picker(&store_config.compaction_strategy);
|
||||
let writer = Arc::new(RegionWriter::new(
|
||||
store_config.memtable_builder,
|
||||
store_config.engine_config.clone(),
|
||||
store_config.ttl,
|
||||
store_config.write_buffer_size,
|
||||
store_config.compaction_scheduler.clone(),
|
||||
compaction_picker.clone(),
|
||||
));
|
||||
|
||||
let writer_ctx = WriterContext {
|
||||
shared: &shared,
|
||||
flush_strategy: &store_config.flush_strategy,
|
||||
flush_scheduler: &store_config.flush_scheduler,
|
||||
compaction_scheduler: &store_config.compaction_scheduler,
|
||||
sst_layer: &store_config.sst_layer,
|
||||
wal: &wal,
|
||||
writer: &writer,
|
||||
manifest: &store_config.manifest,
|
||||
compaction_picker: compaction_picker.clone(),
|
||||
};
|
||||
// Replay all unflushed data.
|
||||
writer
|
||||
.replay(recovered_metadata_after_flushed, writer_ctx)
|
||||
.await?;
|
||||
|
||||
let inner = Arc::new(RegionInner {
|
||||
shared,
|
||||
writer,
|
||||
wal,
|
||||
flush_strategy: store_config.flush_strategy,
|
||||
flush_scheduler: store_config.flush_scheduler,
|
||||
compaction_scheduler: store_config.compaction_scheduler,
|
||||
compaction_picker,
|
||||
sst_layer: store_config.sst_layer,
|
||||
manifest: store_config.manifest,
|
||||
});
|
||||
|
||||
crate::metrics::REGION_COUNT.inc();
|
||||
Ok(Some(RegionImpl { inner }))
|
||||
}
|
||||
|
||||
/// Get ID of this region.
|
||||
pub fn id(&self) -> RegionId {
|
||||
self.inner.shared.id()
|
||||
}
|
||||
|
||||
/// Returns last flush timestamp in millis.
|
||||
pub(crate) fn last_flush_millis(&self) -> i64 {
|
||||
self.inner.shared.last_flush_millis()
|
||||
}
|
||||
|
||||
/// Returns the [VersionControl] of the region.
|
||||
pub(crate) fn version_control(&self) -> &VersionControl {
|
||||
self.inner.version_control()
|
||||
}
|
||||
|
||||
fn create_version_with_checkpoint(
|
||||
checkpoint: RegionCheckpoint,
|
||||
memtable_builder: &MemtableBuilderRef,
|
||||
sst_layer: &AccessLayerRef,
|
||||
file_purger: &FilePurgerRef,
|
||||
) -> Result<Option<Version>> {
|
||||
if checkpoint.checkpoint.is_none() {
|
||||
return Ok(None);
|
||||
}
|
||||
// Safety: it's safe to unwrap here, checking it above.
|
||||
let s = checkpoint.checkpoint.unwrap();
|
||||
|
||||
let region = s.metadata.name.clone();
|
||||
let region_metadata: RegionMetadata = s
|
||||
.metadata
|
||||
.try_into()
|
||||
.context(error::InvalidRawRegionSnafu { region })?;
|
||||
|
||||
let memtable = memtable_builder.build(region_metadata.schema().clone());
|
||||
let mut version = Version::with_manifest_version(
|
||||
Arc::new(region_metadata),
|
||||
checkpoint.last_version,
|
||||
memtable,
|
||||
sst_layer.clone(),
|
||||
file_purger.clone(),
|
||||
);
|
||||
|
||||
if let Some(v) = s.version {
|
||||
version.apply_checkpoint(
|
||||
v.flushed_sequence,
|
||||
v.manifest_version,
|
||||
v.files.into_values(),
|
||||
);
|
||||
}
|
||||
|
||||
Ok(Some(version))
|
||||
}
|
||||
|
||||
async fn recover_from_manifest(
|
||||
manifest: &RegionManifest,
|
||||
memtable_builder: &MemtableBuilderRef,
|
||||
sst_layer: &AccessLayerRef,
|
||||
file_purger: &FilePurgerRef,
|
||||
) -> Result<(Option<Version>, RecoveredMetadataMap)> {
|
||||
let checkpoint = manifest.last_checkpoint().await?;
|
||||
|
||||
let (start, end, mut version) = if let Some(checkpoint) = checkpoint {
|
||||
(
|
||||
checkpoint.last_version + 1,
|
||||
manifest::MAX_VERSION,
|
||||
Self::create_version_with_checkpoint(
|
||||
checkpoint,
|
||||
memtable_builder,
|
||||
sst_layer,
|
||||
file_purger,
|
||||
)?,
|
||||
)
|
||||
} else {
|
||||
(manifest::MIN_VERSION, manifest::MAX_VERSION, None)
|
||||
};
|
||||
|
||||
let mut iter = manifest.scan(start, end).await?;
|
||||
|
||||
let mut actions = Vec::new();
|
||||
let mut last_manifest_version = manifest::MIN_VERSION;
|
||||
let mut recovered_metadata = BTreeMap::new();
|
||||
|
||||
while let Some((manifest_version, action_list)) = iter.next_action().await? {
|
||||
last_manifest_version = manifest_version;
|
||||
|
||||
for action in action_list.actions {
|
||||
match (action, version) {
|
||||
(RegionMetaAction::Change(c), None) => {
|
||||
let region = c.metadata.name.clone();
|
||||
let region_metadata: RegionMetadata = c
|
||||
.metadata
|
||||
.try_into()
|
||||
.context(error::InvalidRawRegionSnafu { region })?;
|
||||
// Use current schema to build a memtable. This might be replaced later
|
||||
// in `freeze_mutable_and_apply_metadata()`.
|
||||
let memtable = memtable_builder.build(region_metadata.schema().clone());
|
||||
version = Some(Version::with_manifest_version(
|
||||
Arc::new(region_metadata),
|
||||
last_manifest_version,
|
||||
memtable,
|
||||
sst_layer.clone(),
|
||||
file_purger.clone(),
|
||||
));
|
||||
for (manifest_version, action) in actions.drain(..) {
|
||||
version = Self::replay_edit(manifest_version, action, version);
|
||||
}
|
||||
}
|
||||
(RegionMetaAction::Change(c), Some(v)) => {
|
||||
let _ = recovered_metadata
|
||||
.insert(c.committed_sequence, (manifest_version, c.metadata));
|
||||
version = Some(v);
|
||||
}
|
||||
(RegionMetaAction::Remove(r), Some(v)) => {
|
||||
manifest.stop().await?;
|
||||
|
||||
let files = v.ssts().mark_all_files_deleted();
|
||||
logging::info!(
|
||||
"Try to remove all SSTs, region: {}, files: {:?}",
|
||||
r.region_id,
|
||||
files
|
||||
);
|
||||
|
||||
manifest
|
||||
.manifest_store()
|
||||
.delete_all(v.manifest_version())
|
||||
.await?;
|
||||
return Ok((None, recovered_metadata));
|
||||
}
|
||||
(RegionMetaAction::Truncate(t), Some(mut v)) => {
|
||||
let files = v.ssts().mark_all_files_deleted();
|
||||
logging::info!(
|
||||
"Try to remove all SSTs on truncate, region: {}, files: {:?}",
|
||||
t.region_id,
|
||||
files
|
||||
);
|
||||
let region_metadata = v.metadata().clone();
|
||||
let memtables = Arc::new(MemtableVersion::new(
|
||||
memtable_builder.build(region_metadata.schema().clone()),
|
||||
));
|
||||
let ssts =
|
||||
Arc::new(LevelMetas::new(sst_layer.clone(), file_purger.clone()));
|
||||
v.reset(
|
||||
v.manifest_version() + 1,
|
||||
memtables,
|
||||
ssts,
|
||||
t.committed_sequence,
|
||||
);
|
||||
version = Some(v);
|
||||
}
|
||||
(action, None) => {
|
||||
actions.push((manifest_version, action));
|
||||
version = None;
|
||||
}
|
||||
(action, Some(v)) => {
|
||||
version = Self::replay_edit(manifest_version, action, Some(v));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
assert!(actions.is_empty() || version.is_none());
|
||||
|
||||
if let Some(version) = &version {
|
||||
// update manifest state after recovering
|
||||
let protocol = iter.last_protocol();
|
||||
manifest.update_state(last_manifest_version + 1, protocol.clone());
|
||||
manifest.set_flushed_manifest_version(version.manifest_version());
|
||||
}
|
||||
|
||||
Ok((version, recovered_metadata))
|
||||
}
|
||||
|
||||
fn replay_edit(
|
||||
manifest_version: ManifestVersion,
|
||||
action: RegionMetaAction,
|
||||
version: Option<Version>,
|
||||
) -> Option<Version> {
|
||||
if let RegionMetaAction::Edit(e) = action {
|
||||
let edit = VersionEdit {
|
||||
files_to_add: e.files_to_add,
|
||||
files_to_remove: e.files_to_remove,
|
||||
flushed_sequence: e.flushed_sequence,
|
||||
manifest_version,
|
||||
max_memtable_id: None,
|
||||
compaction_time_window: e.compaction_time_window,
|
||||
};
|
||||
version.map(|mut v| {
|
||||
v.apply_edit(edit);
|
||||
v
|
||||
})
|
||||
} else {
|
||||
version
|
||||
}
|
||||
}
|
||||
|
||||
/// Compact the region manually.
|
||||
pub async fn compact(&self, ctx: &CompactContext) -> Result<()> {
|
||||
self.inner.compact(ctx).await
|
||||
}
|
||||
|
||||
pub async fn close(&self, ctx: &CloseContext) -> Result<()> {
|
||||
crate::metrics::REGION_COUNT.dec();
|
||||
self.inner.close(ctx).await
|
||||
}
|
||||
}
|
||||
|
||||
// Helpers that are only compiled for tests.
#[cfg(test)]
impl<S: LogStore> RegionImpl<S> {
    /// Returns the committed sequence reported by the region's version control.
    #[inline]
    fn committed_sequence(&self) -> store_api::storage::SequenceNumber {
        let version_control = self.inner.version_control();
        version_control.committed_sequence()
    }

    /// Returns the current manifest version reported by the region's version control.
    fn current_manifest_version(&self) -> ManifestVersion {
        let version_control = self.inner.version_control();
        version_control.current_manifest_version()
    }

    /// Writes `request` through the inner region, i.e. directly to the `RegionWriter`.
    async fn write_inner(&self, ctx: &WriteContext, request: WriteBatch) -> Result<WriteResponse> {
        let inner = &self.inner;
        inner.write(ctx, request).await
    }

    /// Replays `recovered_metadata` through the inner writer.
    async fn replay_inner(&self, recovered_metadata: RecoveredMetadataMap) -> Result<()> {
        // The writer needs a context that borrows every component of the inner region.
        let writer_ctx = WriterContext {
            shared: &self.inner.shared,
            flush_strategy: &self.inner.flush_strategy,
            flush_scheduler: &self.inner.flush_scheduler,
            compaction_scheduler: &self.inner.compaction_scheduler,
            sst_layer: &self.inner.sst_layer,
            wal: &self.inner.wal,
            writer: &self.inner.writer,
            manifest: &self.inner.manifest,
            compaction_picker: self.inner.compaction_picker.clone(),
        };

        self.inner
            .writer
            .replay(recovered_metadata, writer_ctx)
            .await
    }

    /// Returns the write buffer size reported by the inner writer.
    pub(crate) async fn write_buffer_size(&self) -> usize {
        let writer = &self.inner.writer;
        writer.write_buffer_size().await
    }
}
|
||||
|
||||
/// Shared data of region.
|
||||
#[derive(Debug)]
|
||||
pub struct SharedData {
|
||||
// Region id and name is immutable, so we cache them in shared data to avoid loading
|
||||
// current version from `version_control` each time we need to access them.
|
||||
id: RegionId,
|
||||
name: String,
|
||||
// TODO(yingwen): Maybe no need to use Arc for version control.
|
||||
pub version_control: VersionControlRef,
|
||||
|
||||
/// Last flush time in millis.
|
||||
last_flush_millis: AtomicI64,
|
||||
}
|
||||
|
||||
impl SharedData {
|
||||
#[inline]
|
||||
pub fn id(&self) -> RegionId {
|
||||
self.id
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn name(&self) -> &str {
|
||||
&self.name
|
||||
}
|
||||
|
||||
/// Update flush time to current time.
|
||||
pub(crate) fn update_flush_millis(&self) {
|
||||
let now = util::current_time_millis();
|
||||
self.last_flush_millis.store(now, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
/// Returns last flush timestamp in millis.
|
||||
fn last_flush_millis(&self) -> i64 {
|
||||
self.last_flush_millis.load(Ordering::Relaxed)
|
||||
}
|
||||
}
|
||||
|
||||
pub type SharedDataRef = Arc<SharedData>;
|
||||
|
||||
struct RegionInner<S: LogStore> {
|
||||
shared: SharedDataRef,
|
||||
writer: RegionWriterRef<S>,
|
||||
wal: Wal<S>,
|
||||
flush_strategy: FlushStrategyRef,
|
||||
flush_scheduler: FlushSchedulerRef<S>,
|
||||
compaction_scheduler: CompactionSchedulerRef<S>,
|
||||
compaction_picker: CompactionPickerRef<S>,
|
||||
sst_layer: AccessLayerRef,
|
||||
manifest: RegionManifest,
|
||||
}
|
||||
|
||||
impl<S: LogStore> RegionInner<S> {
|
||||
#[inline]
|
||||
fn version_control(&self) -> &VersionControl {
|
||||
&self.shared.version_control
|
||||
}
|
||||
|
||||
fn in_memory_metadata(&self) -> RegionMetaImpl {
|
||||
let metadata = self.version_control().metadata();
|
||||
|
||||
RegionMetaImpl::new(metadata)
|
||||
}
|
||||
|
||||
fn create_snapshot(&self) -> SnapshotImpl {
|
||||
let version = self.version_control().current();
|
||||
let sequence = self.version_control().committed_sequence();
|
||||
|
||||
SnapshotImpl::new(version, sequence, self.sst_layer.clone())
|
||||
}
|
||||
|
||||
fn compat_write_batch(&self, request: &mut WriteBatch) -> Result<()> {
|
||||
let metadata = self.version_control().metadata();
|
||||
let schema = metadata.schema();
|
||||
|
||||
// Try to make request schema compatible with region's outside of write lock. Note that
|
||||
// schema might be altered after this step.
|
||||
request.compat_write(schema.user_schema())
|
||||
}
|
||||
|
||||
/// Write to writer directly.
|
||||
async fn write(&self, ctx: &WriteContext, request: WriteBatch) -> Result<WriteResponse> {
|
||||
let writer_ctx = WriterContext {
|
||||
shared: &self.shared,
|
||||
flush_strategy: &self.flush_strategy,
|
||||
flush_scheduler: &self.flush_scheduler,
|
||||
compaction_scheduler: &self.compaction_scheduler,
|
||||
sst_layer: &self.sst_layer,
|
||||
wal: &self.wal,
|
||||
writer: &self.writer,
|
||||
manifest: &self.manifest,
|
||||
compaction_picker: self.compaction_picker.clone(),
|
||||
};
|
||||
// The writer would also try to compat the schema of write batch if it finds out the
|
||||
// schema version of request is less than current schema version.
|
||||
self.writer.write(ctx, request, writer_ctx).await
|
||||
}
|
||||
|
||||
async fn alter(&self, request: AlterRequest) -> Result<()> {
|
||||
logging::info!(
|
||||
"Alter region {}, name: {}, request: {:?}",
|
||||
self.shared.id,
|
||||
self.shared.name,
|
||||
request
|
||||
);
|
||||
|
||||
let alter_ctx = AlterContext {
|
||||
shared: &self.shared,
|
||||
wal: &self.wal,
|
||||
manifest: &self.manifest,
|
||||
};
|
||||
|
||||
self.writer.alter(alter_ctx, request).await
|
||||
}
|
||||
|
||||
async fn close(&self, ctx: &CloseContext) -> Result<()> {
|
||||
self.writer.close().await?;
|
||||
if ctx.flush {
|
||||
let ctx = FlushContext {
|
||||
wait: true,
|
||||
reason: FlushReason::Manually,
|
||||
force: true,
|
||||
};
|
||||
self.flush(&ctx).await?;
|
||||
}
|
||||
self.manifest.stop().await
|
||||
}
|
||||
|
||||
async fn drop_region(&self) -> Result<()> {
|
||||
logging::info!("Drop region {}, name: {}", self.shared.id, self.shared.name);
|
||||
let drop_ctx = DropContext {
|
||||
shared: &self.shared,
|
||||
wal: &self.wal,
|
||||
manifest: &self.manifest,
|
||||
flush_scheduler: &self.flush_scheduler,
|
||||
compaction_scheduler: &self.compaction_scheduler,
|
||||
sst_layer: &self.sst_layer,
|
||||
};
|
||||
|
||||
self.manifest.stop().await?;
|
||||
self.writer.on_drop(drop_ctx).await
|
||||
}
|
||||
|
||||
async fn flush(&self, ctx: &FlushContext) -> Result<()> {
|
||||
let writer_ctx = WriterContext {
|
||||
shared: &self.shared,
|
||||
flush_strategy: &self.flush_strategy,
|
||||
flush_scheduler: &self.flush_scheduler,
|
||||
compaction_scheduler: &self.compaction_scheduler,
|
||||
sst_layer: &self.sst_layer,
|
||||
wal: &self.wal,
|
||||
writer: &self.writer,
|
||||
manifest: &self.manifest,
|
||||
compaction_picker: self.compaction_picker.clone(),
|
||||
};
|
||||
self.writer.flush(writer_ctx, ctx).await
|
||||
}
|
||||
|
||||
/// Compact the region manually.
|
||||
async fn compact(&self, compact_ctx: &CompactContext) -> Result<()> {
|
||||
self.writer
|
||||
.compact(WriterCompactRequest {
|
||||
shared_data: self.shared.clone(),
|
||||
sst_layer: self.sst_layer.clone(),
|
||||
manifest: self.manifest.clone(),
|
||||
wal: self.wal.clone(),
|
||||
region_writer: self.writer.clone(),
|
||||
compact_ctx: *compact_ctx,
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
async fn truncate(&self) -> Result<()> {
|
||||
logging::info!(
|
||||
"Truncate region {}, name: {}",
|
||||
self.shared.id,
|
||||
self.shared.name
|
||||
);
|
||||
|
||||
let ctx = TruncateContext {
|
||||
shared: &self.shared,
|
||||
wal: &self.wal,
|
||||
manifest: &self.manifest,
|
||||
sst_layer: &self.sst_layer,
|
||||
};
|
||||
|
||||
self.writer.truncate(&ctx).await?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -1,833 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//! Region tests.
|
||||
|
||||
use std::collections::{HashMap, HashSet};
|
||||
|
||||
use arrow::compute::SortOptions;
|
||||
use common_base::readable_size::ReadableSize;
|
||||
use common_datasource::compression::CompressionType;
|
||||
use common_recordbatch::OrderOption;
|
||||
use common_telemetry::logging;
|
||||
use common_test_util::temp_dir::{create_temp_dir, TempDir};
|
||||
use datatypes::prelude::{LogicalTypeId, ScalarVector, WrapperType};
|
||||
use datatypes::timestamp::TimestampMillisecond;
|
||||
use datatypes::vectors::{
|
||||
BooleanVector, Int64Vector, StringVector, TimestampMillisecondVector, VectorRef,
|
||||
};
|
||||
use log_store::raft_engine::log_store::RaftEngineLogStore;
|
||||
use log_store::NoopLogStore;
|
||||
use object_store::services::Fs;
|
||||
use object_store::ObjectStore;
|
||||
use store_api::manifest::{Manifest, MAX_VERSION};
|
||||
use store_api::storage::{
|
||||
Chunk, ChunkReader, FlushContext, FlushReason, ReadContext, Region, RegionMeta, ScanRequest,
|
||||
SequenceNumber, Snapshot, WriteContext, WriteRequest,
|
||||
};
|
||||
|
||||
use super::*;
|
||||
use crate::chunk::ChunkReaderImpl;
|
||||
use crate::compaction::noop::NoopCompactionScheduler;
|
||||
use crate::engine;
|
||||
use crate::engine::RegionMap;
|
||||
use crate::file_purger::noop::NoopFilePurgeHandler;
|
||||
use crate::flush::{FlushScheduler, PickerConfig, SizeBasedStrategy};
|
||||
use crate::manifest::action::{RegionChange, RegionMetaActionList};
|
||||
use crate::manifest::manifest_compress_type;
|
||||
use crate::manifest::region::RegionManifest;
|
||||
use crate::manifest::test_utils::*;
|
||||
use crate::memtable::DefaultMemtableBuilder;
|
||||
use crate::metadata::RegionMetadata;
|
||||
use crate::region::{RegionImpl, StoreConfig};
|
||||
use crate::scheduler::{LocalScheduler, SchedulerConfig};
|
||||
use crate::sst::{FileId, FsAccessLayer};
|
||||
use crate::test_util::descriptor_util::RegionDescBuilder;
|
||||
use crate::test_util::{self, config_util, schema_util, write_batch_util};
|
||||
|
||||
mod alter;
|
||||
mod basic;
|
||||
mod close;
|
||||
mod compact;
|
||||
mod drop;
|
||||
mod flush;
|
||||
mod projection;
|
||||
mod truncate;
|
||||
|
||||
/// Create metadata of a region with schema: (timestamp, v0).
|
||||
pub fn new_metadata(region_name: &str) -> RegionMetadata {
|
||||
let desc = RegionDescBuilder::new(region_name)
|
||||
.id(123)
|
||||
.push_field_column(("v0", LogicalTypeId::String, true))
|
||||
.build();
|
||||
desc.try_into().unwrap()
|
||||
}
|
||||
|
||||
/// Test region with schema (timestamp, v0).
|
||||
pub struct TesterBase<S: LogStore> {
|
||||
pub region: RegionImpl<S>,
|
||||
pub write_ctx: WriteContext,
|
||||
pub read_ctx: ReadContext,
|
||||
}
|
||||
|
||||
impl<S: LogStore> TesterBase<S> {
|
||||
pub fn with_region(region: RegionImpl<S>) -> TesterBase<S> {
|
||||
TesterBase {
|
||||
region,
|
||||
write_ctx: WriteContext::default(),
|
||||
read_ctx: ReadContext::default(),
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn checkpoint_manifest(&self) {
|
||||
let manifest = &self.region.inner.manifest;
|
||||
manifest.set_flushed_manifest_version(manifest.last_version() - 1);
|
||||
let _ = manifest.do_checkpoint().await.unwrap().unwrap();
|
||||
}
|
||||
|
||||
pub async fn close(&self) {
|
||||
self.region.inner.flush_scheduler.stop().await.unwrap();
|
||||
self.region
|
||||
.inner
|
||||
.compaction_scheduler
|
||||
.stop(true)
|
||||
.await
|
||||
.unwrap();
|
||||
self.region.close(&CloseContext::default()).await.unwrap();
|
||||
self.region.inner.wal.close().await.unwrap();
|
||||
}
|
||||
|
||||
/// Put without version specified.
|
||||
///
|
||||
/// Format of data: (timestamp, v0), timestamp is key, v0 is value.
|
||||
pub async fn put(&self, data: &[(i64, Option<String>)]) -> WriteResponse {
|
||||
self.try_put(data).await.unwrap()
|
||||
}
|
||||
|
||||
/// Put without version specified, returns [`Result<WriteResponse>`]
|
||||
///
|
||||
/// Format of data: (timestamp, v0), timestamp is key, v0 is value.
|
||||
pub async fn try_put(&self, data: &[(i64, Option<String>)]) -> Result<WriteResponse> {
|
||||
let data: Vec<(TimestampMillisecond, Option<String>)> =
|
||||
data.iter().map(|(l, r)| ((*l).into(), r.clone())).collect();
|
||||
// Build a batch without version.
|
||||
let mut batch = new_write_batch_for_test(false);
|
||||
let put_data = new_put_data(&data);
|
||||
batch.put(put_data).unwrap();
|
||||
|
||||
self.region.write(&self.write_ctx, batch).await
|
||||
}
|
||||
|
||||
/// Put without version specified directly to inner writer.
|
||||
pub async fn put_inner(&self, data: &[(i64, Option<String>)]) -> WriteResponse {
|
||||
let data: Vec<(TimestampMillisecond, Option<String>)> =
|
||||
data.iter().map(|(l, r)| ((*l).into(), r.clone())).collect();
|
||||
let mut batch = new_write_batch_for_test(false);
|
||||
let put_data = new_put_data(&data);
|
||||
batch.put(put_data).unwrap();
|
||||
|
||||
self.region
|
||||
.write_inner(&self.write_ctx, batch)
|
||||
.await
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
pub async fn replay_inner(&self, recovered_metadata: RecoveredMetadataMap) {
|
||||
self.region.replay_inner(recovered_metadata).await.unwrap()
|
||||
}
|
||||
|
||||
/// Scan all data.
|
||||
pub async fn full_scan(&self) -> Vec<(i64, Option<String>)> {
|
||||
logging::info!("Full scan with ctx {:?}", self.read_ctx);
|
||||
let snapshot = self.region.snapshot(&self.read_ctx).unwrap();
|
||||
|
||||
let resp = snapshot
|
||||
.scan(&self.read_ctx, ScanRequest::default())
|
||||
.await
|
||||
.unwrap();
|
||||
let mut reader = resp.reader;
|
||||
|
||||
let metadata = self.region.in_memory_metadata();
|
||||
assert_eq!(metadata.schema(), reader.user_schema());
|
||||
|
||||
let mut dst = Vec::new();
|
||||
while let Some(chunk) = reader.next_chunk().await.unwrap() {
|
||||
let chunk = reader.project_chunk(chunk);
|
||||
append_chunk_to(&chunk, &mut dst);
|
||||
}
|
||||
|
||||
dst
|
||||
}
|
||||
|
||||
pub async fn scan(&self, req: ScanRequest) -> Vec<(i64, Option<String>)> {
|
||||
logging::info!("Full scan with ctx {:?}", self.read_ctx);
|
||||
let snapshot = self.region.snapshot(&self.read_ctx).unwrap();
|
||||
|
||||
let resp = snapshot.scan(&self.read_ctx, req).await.unwrap();
|
||||
let mut reader = resp.reader;
|
||||
|
||||
let metadata = self.region.in_memory_metadata();
|
||||
assert_eq!(metadata.schema(), reader.user_schema());
|
||||
|
||||
let mut dst = Vec::new();
|
||||
while let Some(chunk) = reader.next_chunk().await.unwrap() {
|
||||
let chunk = reader.project_chunk(chunk);
|
||||
append_chunk_to(&chunk, &mut dst);
|
||||
}
|
||||
dst
|
||||
}
|
||||
|
||||
pub fn committed_sequence(&self) -> SequenceNumber {
|
||||
self.region.committed_sequence()
|
||||
}
|
||||
|
||||
/// Delete by keys (timestamp).
|
||||
pub async fn delete(&self, keys: &[i64]) -> WriteResponse {
|
||||
let keys: Vec<TimestampMillisecond> = keys.iter().map(|v| (*v).into()).collect();
|
||||
// Build a batch without version.
|
||||
let mut batch = new_write_batch_for_test(false);
|
||||
let keys = new_delete_data(&keys);
|
||||
batch.delete(keys).unwrap();
|
||||
|
||||
self.region.write(&self.write_ctx, batch).await.unwrap()
|
||||
}
|
||||
|
||||
/// Returns a reader to scan all data.
|
||||
pub async fn full_scan_reader(&self) -> ChunkReaderImpl {
|
||||
let snapshot = self.region.snapshot(&self.read_ctx).unwrap();
|
||||
|
||||
let resp = snapshot
|
||||
.scan(&self.read_ctx, ScanRequest::default())
|
||||
.await
|
||||
.unwrap();
|
||||
resp.reader
|
||||
}
|
||||
|
||||
/// Collect data from the reader.
|
||||
pub async fn collect_reader(&self, mut reader: ChunkReaderImpl) -> Vec<(i64, Option<String>)> {
|
||||
let mut dst = Vec::new();
|
||||
while let Some(chunk) = reader.next_chunk().await.unwrap() {
|
||||
let chunk = reader.project_chunk(chunk);
|
||||
append_chunk_to(&chunk, &mut dst);
|
||||
}
|
||||
|
||||
dst
|
||||
}
|
||||
}
|
||||
|
||||
pub type FileTesterBase = TesterBase<RaftEngineLogStore>;
|
||||
|
||||
fn new_write_batch_for_test(enable_version_column: bool) -> WriteBatch {
|
||||
if enable_version_column {
|
||||
write_batch_util::new_write_batch(
|
||||
&[
|
||||
(
|
||||
test_util::TIMESTAMP_NAME,
|
||||
LogicalTypeId::TimestampMillisecond,
|
||||
false,
|
||||
),
|
||||
("v0", LogicalTypeId::String, true),
|
||||
],
|
||||
Some(0),
|
||||
2,
|
||||
)
|
||||
} else {
|
||||
write_batch_util::new_write_batch(
|
||||
&[
|
||||
(
|
||||
test_util::TIMESTAMP_NAME,
|
||||
LogicalTypeId::TimestampMillisecond,
|
||||
false,
|
||||
),
|
||||
("v0", LogicalTypeId::String, true),
|
||||
],
|
||||
Some(0),
|
||||
1,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
fn new_put_data(data: &[(TimestampMillisecond, Option<String>)]) -> HashMap<String, VectorRef> {
|
||||
let timestamps =
|
||||
TimestampMillisecondVector::from_vec(data.iter().map(|v| v.0.into()).collect());
|
||||
let values = StringVector::from(data.iter().map(|kv| kv.1.clone()).collect::<Vec<_>>());
|
||||
|
||||
HashMap::from([
|
||||
(
|
||||
test_util::TIMESTAMP_NAME.to_string(),
|
||||
Arc::new(timestamps) as VectorRef,
|
||||
),
|
||||
("v0".to_string(), Arc::new(values) as VectorRef),
|
||||
])
|
||||
}
|
||||
|
||||
fn new_delete_data(keys: &[TimestampMillisecond]) -> HashMap<String, VectorRef> {
|
||||
let timestamps =
|
||||
TimestampMillisecondVector::from_vec(keys.iter().map(|v| v.0.into()).collect());
|
||||
HashMap::from([(
|
||||
test_util::TIMESTAMP_NAME.to_string(),
|
||||
Arc::new(timestamps) as VectorRef,
|
||||
)])
|
||||
}
|
||||
|
||||
fn append_chunk_to(chunk: &Chunk, dst: &mut Vec<(i64, Option<String>)>) {
|
||||
assert_eq!(2, chunk.columns.len());
|
||||
|
||||
let timestamps = chunk.columns[0]
|
||||
.as_any()
|
||||
.downcast_ref::<TimestampMillisecondVector>()
|
||||
.unwrap();
|
||||
let values = chunk.columns[1]
|
||||
.as_any()
|
||||
.downcast_ref::<StringVector>()
|
||||
.unwrap();
|
||||
for (ts, value) in timestamps.iter_data().zip(values.iter_data()) {
|
||||
dst.push((ts.unwrap().into_native(), value.map(|s| s.to_string())));
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_new_region() {
|
||||
let region_name = "region-0";
|
||||
let desc = RegionDescBuilder::new(region_name)
|
||||
.push_key_column(("k1", LogicalTypeId::Int32, false))
|
||||
.push_field_column(("v0", LogicalTypeId::Float32, true))
|
||||
.build();
|
||||
let metadata: RegionMetadata = desc.try_into().unwrap();
|
||||
|
||||
let dir = create_temp_dir("test_new_region");
|
||||
let store_dir = dir.path().to_str().unwrap();
|
||||
|
||||
let store_config =
|
||||
config_util::new_store_config(region_name, store_dir, EngineConfig::default()).await;
|
||||
let placeholder_memtable = store_config
|
||||
.memtable_builder
|
||||
.build(metadata.schema().clone());
|
||||
|
||||
let region = RegionImpl::new(
|
||||
Version::new(Arc::new(metadata), placeholder_memtable),
|
||||
store_config,
|
||||
);
|
||||
|
||||
let expect_schema = schema_util::new_schema_ref(
|
||||
&[
|
||||
("k1", LogicalTypeId::Int32, false),
|
||||
(
|
||||
test_util::TIMESTAMP_NAME,
|
||||
LogicalTypeId::TimestampMillisecond,
|
||||
false,
|
||||
),
|
||||
("v0", LogicalTypeId::Float32, true),
|
||||
],
|
||||
Some(1),
|
||||
);
|
||||
|
||||
assert_eq!(region_name, region.name());
|
||||
assert_eq!(expect_schema, *region.in_memory_metadata().schema());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_recover_region_manifets_compress() {
|
||||
test_recover_region_manifets(true).await;
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_recover_region_manifets_uncompress() {
|
||||
test_recover_region_manifets(false).await;
|
||||
}
|
||||
|
||||
async fn test_recover_region_manifets(compress: bool) {
|
||||
common_telemetry::init_default_ut_logging();
|
||||
let tmp_dir = create_temp_dir("test_recover_region_manifets");
|
||||
let memtable_builder = Arc::new(DefaultMemtableBuilder::default()) as _;
|
||||
|
||||
let mut builder = Fs::default();
|
||||
let _ = builder.root(&tmp_dir.path().to_string_lossy());
|
||||
let object_store = ObjectStore::new(builder).unwrap().finish();
|
||||
|
||||
let manifest = RegionManifest::with_checkpointer(
|
||||
"/manifest/",
|
||||
object_store.clone(),
|
||||
manifest_compress_type(compress),
|
||||
None,
|
||||
None,
|
||||
);
|
||||
let region_meta = Arc::new(build_region_meta());
|
||||
|
||||
let sst_layer = Arc::new(FsAccessLayer::new("sst", object_store)) as _;
|
||||
let file_purger = Arc::new(LocalScheduler::new(
|
||||
SchedulerConfig::default(),
|
||||
NoopFilePurgeHandler,
|
||||
));
|
||||
// Recover from empty
|
||||
assert!(RegionImpl::<NoopLogStore>::recover_from_manifest(
|
||||
&manifest,
|
||||
&memtable_builder,
|
||||
&sst_layer,
|
||||
&file_purger,
|
||||
)
|
||||
.await
|
||||
.unwrap()
|
||||
.0
|
||||
.is_none());
|
||||
|
||||
let file_id_a = FileId::random();
|
||||
let file_id_b = FileId::random();
|
||||
let file_id_c = FileId::random();
|
||||
|
||||
{
|
||||
// save some actions into region_meta
|
||||
assert!(manifest
|
||||
.update(RegionMetaActionList::with_action(RegionMetaAction::Change(
|
||||
RegionChange {
|
||||
metadata: region_meta.as_ref().into(),
|
||||
committed_sequence: 40,
|
||||
},
|
||||
)))
|
||||
.await
|
||||
.is_ok());
|
||||
|
||||
assert!(manifest
|
||||
.update(RegionMetaActionList::new(vec![
|
||||
RegionMetaAction::Edit(build_region_edit(1, &[file_id_a], &[])),
|
||||
RegionMetaAction::Edit(build_region_edit(2, &[file_id_b, file_id_c], &[])),
|
||||
]))
|
||||
.await
|
||||
.is_ok());
|
||||
|
||||
assert!(manifest
|
||||
.update(RegionMetaActionList::with_action(RegionMetaAction::Change(
|
||||
RegionChange {
|
||||
metadata: region_meta.as_ref().into(),
|
||||
committed_sequence: 42,
|
||||
},
|
||||
)))
|
||||
.await
|
||||
.is_ok());
|
||||
}
|
||||
|
||||
// try to recover
|
||||
let (version, recovered_metadata) = RegionImpl::<NoopLogStore>::recover_from_manifest(
|
||||
&manifest,
|
||||
&memtable_builder,
|
||||
&sst_layer,
|
||||
&file_purger,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
assert_recovered_manifest(
|
||||
version,
|
||||
recovered_metadata,
|
||||
&file_id_a,
|
||||
&file_id_b,
|
||||
&file_id_c,
|
||||
®ion_meta,
|
||||
);
|
||||
|
||||
// do a manifest checkpoint
|
||||
let checkpoint = manifest.do_checkpoint().await.unwrap().unwrap();
|
||||
assert_eq!(1, checkpoint.last_version);
|
||||
assert_eq!(2, checkpoint.compacted_actions);
|
||||
assert_eq!(
|
||||
manifest.last_checkpoint().await.unwrap().unwrap(),
|
||||
checkpoint
|
||||
);
|
||||
// recover from checkpoint
|
||||
let (version, recovered_metadata) = RegionImpl::<NoopLogStore>::recover_from_manifest(
|
||||
&manifest,
|
||||
&memtable_builder,
|
||||
&sst_layer,
|
||||
&file_purger,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
assert_recovered_manifest(
|
||||
version,
|
||||
recovered_metadata,
|
||||
&file_id_a,
|
||||
&file_id_b,
|
||||
&file_id_c,
|
||||
®ion_meta,
|
||||
);
|
||||
|
||||
// check manifest state
|
||||
assert_eq!(3, manifest.last_version());
|
||||
let mut iter = manifest.scan(0, MAX_VERSION).await.unwrap();
|
||||
let (version, action) = iter.next_action().await.unwrap().unwrap();
|
||||
assert_eq!(2, version);
|
||||
assert!(matches!(action.actions[0], RegionMetaAction::Change(..)));
|
||||
assert!(iter.next_action().await.unwrap().is_none());
|
||||
}
|
||||
|
||||
fn assert_recovered_manifest(
|
||||
version: Option<Version>,
|
||||
recovered_metadata: RecoveredMetadataMap,
|
||||
file_id_a: &FileId,
|
||||
file_id_b: &FileId,
|
||||
file_id_c: &FileId,
|
||||
region_meta: &Arc<RegionMetadata>,
|
||||
) {
|
||||
assert_eq!(42, *recovered_metadata.first_key_value().unwrap().0);
|
||||
let version = version.unwrap();
|
||||
assert_eq!(*version.metadata(), *region_meta);
|
||||
assert_eq!(version.flushed_sequence(), 2);
|
||||
assert_eq!(version.manifest_version(), 1);
|
||||
let ssts = version.ssts();
|
||||
let files = ssts.levels()[0]
|
||||
.files()
|
||||
.map(|f| f.file_name())
|
||||
.collect::<HashSet<_>>();
|
||||
assert_eq!(3, files.len());
|
||||
assert_eq!(
|
||||
HashSet::from([
|
||||
file_id_a.as_parquet(),
|
||||
file_id_b.as_parquet(),
|
||||
file_id_c.as_parquet()
|
||||
]),
|
||||
files
|
||||
);
|
||||
}
|
||||
|
||||
fn create_region_meta(region_name: &str) -> RegionMetadata {
|
||||
let desc = RegionDescBuilder::new(region_name)
|
||||
.push_field_column(("v0", LogicalTypeId::Int64, true))
|
||||
.push_field_column(("v1", LogicalTypeId::String, true))
|
||||
.push_field_column(("v2", LogicalTypeId::Boolean, true))
|
||||
.build();
|
||||
desc.try_into().unwrap()
|
||||
}
|
||||
|
||||
async fn create_store_config(region_name: &str, root: &str) -> StoreConfig<NoopLogStore> {
|
||||
let mut builder = Fs::default();
|
||||
let _ = builder.root(root);
|
||||
let object_store = ObjectStore::new(builder).unwrap().finish();
|
||||
let parent_dir = "";
|
||||
let sst_dir = engine::region_sst_dir(parent_dir, region_name);
|
||||
let manifest_dir = engine::region_manifest_dir(parent_dir, region_name);
|
||||
|
||||
let sst_layer = Arc::new(FsAccessLayer::new(&sst_dir, object_store.clone()));
|
||||
let manifest = RegionManifest::with_checkpointer(
|
||||
&manifest_dir,
|
||||
object_store,
|
||||
CompressionType::Uncompressed,
|
||||
None,
|
||||
None,
|
||||
);
|
||||
manifest.start().await.unwrap();
|
||||
|
||||
let compaction_scheduler = Arc::new(NoopCompactionScheduler::default());
|
||||
|
||||
let regions = Arc::new(RegionMap::new());
|
||||
|
||||
let flush_scheduler = Arc::new(
|
||||
FlushScheduler::new(
|
||||
SchedulerConfig::default(),
|
||||
compaction_scheduler.clone(),
|
||||
regions,
|
||||
PickerConfig::default(),
|
||||
)
|
||||
.unwrap(),
|
||||
);
|
||||
|
||||
let log_store = Arc::new(NoopLogStore);
|
||||
|
||||
let file_purger = Arc::new(LocalScheduler::new(
|
||||
SchedulerConfig::default(),
|
||||
NoopFilePurgeHandler,
|
||||
));
|
||||
StoreConfig {
|
||||
log_store,
|
||||
sst_layer,
|
||||
manifest,
|
||||
memtable_builder: Arc::new(DefaultMemtableBuilder::default()),
|
||||
flush_scheduler,
|
||||
flush_strategy: Arc::new(SizeBasedStrategy::default()),
|
||||
compaction_scheduler,
|
||||
engine_config: Default::default(),
|
||||
file_purger,
|
||||
ttl: None,
|
||||
write_buffer_size: ReadableSize::mb(32).0 as usize,
|
||||
compaction_strategy: Default::default(),
|
||||
}
|
||||
}
|
||||
|
||||
struct WindowedReaderTester {
|
||||
data_written: Vec<Vec<(i64, i64, String, bool)>>,
|
||||
expected: Vec<(i64, i64, String, bool)>,
|
||||
region: RegionImpl<NoopLogStore>,
|
||||
_temp_dir: TempDir,
|
||||
}
|
||||
|
||||
impl WindowedReaderTester {
|
||||
async fn new(
|
||||
region_name: &'static str,
|
||||
data_written: Vec<Vec<(i64, i64, String, bool)>>,
|
||||
expected: Vec<(i64, i64, String, bool)>,
|
||||
) -> Self {
|
||||
let temp_dir = create_temp_dir(&format!("write_and_read_windowed_{}", region_name));
|
||||
let root = temp_dir.path().to_str().unwrap();
|
||||
let metadata = create_region_meta(region_name);
|
||||
let store_config = create_store_config(region_name, root).await;
|
||||
let region = RegionImpl::create(metadata, store_config).await.unwrap();
|
||||
|
||||
let tester = Self {
|
||||
data_written,
|
||||
expected,
|
||||
region,
|
||||
_temp_dir: temp_dir,
|
||||
};
|
||||
tester.prepare().await;
|
||||
tester
|
||||
}
|
||||
|
||||
async fn prepare(&self) {
|
||||
for batch in &self.data_written {
|
||||
let mut write_batch = self.region.write_request();
|
||||
let ts = TimestampMillisecondVector::from_iterator(
|
||||
batch
|
||||
.iter()
|
||||
.map(|(v, _, _, _)| TimestampMillisecond::new(*v)),
|
||||
);
|
||||
let v0 = Int64Vector::from_iterator(batch.iter().map(|(_, v, _, _)| *v));
|
||||
let v1 = StringVector::from_iterator(batch.iter().map(|(_, _, v, _)| v.as_str()));
|
||||
let v2 = BooleanVector::from_iterator(batch.iter().map(|(_, _, _, v)| *v));
|
||||
|
||||
let columns = [
|
||||
("timestamp".to_string(), Arc::new(ts) as VectorRef),
|
||||
("v0".to_string(), Arc::new(v0) as VectorRef),
|
||||
("v1".to_string(), Arc::new(v1) as VectorRef),
|
||||
("v2".to_string(), Arc::new(v2) as VectorRef),
|
||||
]
|
||||
.into_iter()
|
||||
.collect::<HashMap<String, VectorRef>>();
|
||||
write_batch.put(columns).unwrap();
|
||||
|
||||
assert!(self
|
||||
.region
|
||||
.write(&WriteContext {}, write_batch)
|
||||
.await
|
||||
.is_ok());
|
||||
|
||||
// flush the region to ensure data resides across SST files.
|
||||
self.region
|
||||
.flush(&FlushContext {
|
||||
wait: true,
|
||||
reason: FlushReason::Others,
|
||||
..Default::default()
|
||||
})
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
async fn check(&self, order_options: Vec<OrderOption>) {
|
||||
let read_context = ReadContext::default();
|
||||
let snapshot = self.region.snapshot(&read_context).unwrap();
|
||||
let response = snapshot
|
||||
.scan(
|
||||
&read_context,
|
||||
ScanRequest {
|
||||
sequence: None,
|
||||
projection: None,
|
||||
filters: vec![],
|
||||
limit: None,
|
||||
output_ordering: Some(order_options),
|
||||
},
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let mut timestamps = Vec::with_capacity(self.expected.len());
|
||||
let mut col1 = Vec::with_capacity(self.expected.len());
|
||||
let mut col2 = Vec::with_capacity(self.expected.len());
|
||||
let mut col3 = Vec::with_capacity(self.expected.len());
|
||||
|
||||
let mut reader = response.reader;
|
||||
let ts_index = reader.user_schema().timestamp_index().unwrap();
|
||||
while let Some(chunk) = reader.next_chunk().await.unwrap() {
|
||||
let ts_col = &chunk.columns[ts_index];
|
||||
let ts_col = ts_col
|
||||
.as_any()
|
||||
.downcast_ref::<TimestampMillisecondVector>()
|
||||
.unwrap();
|
||||
let v1_col = chunk.columns[1]
|
||||
.as_any()
|
||||
.downcast_ref::<Int64Vector>()
|
||||
.unwrap();
|
||||
let v2_col = chunk.columns[2]
|
||||
.as_any()
|
||||
.downcast_ref::<StringVector>()
|
||||
.unwrap();
|
||||
let v3_col = chunk.columns[3]
|
||||
.as_any()
|
||||
.downcast_ref::<BooleanVector>()
|
||||
.unwrap();
|
||||
|
||||
for ts in ts_col.iter_data() {
|
||||
timestamps.push(ts.unwrap().0.value());
|
||||
}
|
||||
for v in v1_col.iter_data() {
|
||||
col1.push(v.unwrap());
|
||||
}
|
||||
for v in v2_col.iter_data() {
|
||||
col2.push(v.unwrap().to_string());
|
||||
}
|
||||
for v in v3_col.iter_data() {
|
||||
col3.push(v.unwrap());
|
||||
}
|
||||
}
|
||||
|
||||
assert_eq!(
|
||||
timestamps,
|
||||
self.expected
|
||||
.iter()
|
||||
.map(|(v, _, _, _)| *v)
|
||||
.collect::<Vec<_>>()
|
||||
);
|
||||
assert_eq!(
|
||||
col1,
|
||||
self.expected
|
||||
.iter()
|
||||
.map(|(_, v, _, _)| *v)
|
||||
.collect::<Vec<_>>()
|
||||
);
|
||||
assert_eq!(
|
||||
col2,
|
||||
self.expected
|
||||
.iter()
|
||||
.map(|(_, _, v, _)| v.clone())
|
||||
.collect::<Vec<_>>()
|
||||
);
|
||||
assert_eq!(
|
||||
col3,
|
||||
self.expected
|
||||
.iter()
|
||||
.map(|(_, _, _, v)| *v)
|
||||
.collect::<Vec<_>>()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_read_by_chunk_reader() {
|
||||
common_telemetry::init_default_ut_logging();
|
||||
|
||||
WindowedReaderTester::new(
|
||||
"test_region",
|
||||
vec![vec![(1, 1, "1".to_string(), false)]],
|
||||
vec![(1, 1, "1".to_string(), false)],
|
||||
)
|
||||
.await
|
||||
.check(vec![OrderOption {
|
||||
name: "timestamp".to_string(),
|
||||
options: SortOptions {
|
||||
descending: true,
|
||||
nulls_first: true,
|
||||
},
|
||||
}])
|
||||
.await;
|
||||
|
||||
WindowedReaderTester::new(
|
||||
"test_region",
|
||||
vec![
|
||||
vec![
|
||||
(1, 1, "1".to_string(), false),
|
||||
(2, 2, "2".to_string(), false),
|
||||
],
|
||||
vec![
|
||||
(3, 3, "3".to_string(), false),
|
||||
(4, 4, "4".to_string(), false),
|
||||
],
|
||||
],
|
||||
vec![
|
||||
(4, 4, "4".to_string(), false),
|
||||
(3, 3, "3".to_string(), false),
|
||||
(2, 2, "2".to_string(), false),
|
||||
(1, 1, "1".to_string(), false),
|
||||
],
|
||||
)
|
||||
.await
|
||||
.check(vec![OrderOption {
|
||||
name: "timestamp".to_string(),
|
||||
options: SortOptions {
|
||||
descending: true,
|
||||
nulls_first: true,
|
||||
},
|
||||
}])
|
||||
.await;
|
||||
|
||||
WindowedReaderTester::new(
|
||||
"test_region",
|
||||
vec![
|
||||
vec![
|
||||
(1, 1, "1".to_string(), false),
|
||||
(2, 2, "2".to_string(), false),
|
||||
(60000, 60000, "60".to_string(), false),
|
||||
],
|
||||
vec![
|
||||
(3, 3, "3".to_string(), false),
|
||||
(61000, 61000, "61".to_string(), false),
|
||||
],
|
||||
],
|
||||
vec![
|
||||
(61000, 61000, "61".to_string(), false),
|
||||
(60000, 60000, "60".to_string(), false),
|
||||
(3, 3, "3".to_string(), false),
|
||||
(2, 2, "2".to_string(), false),
|
||||
(1, 1, "1".to_string(), false),
|
||||
],
|
||||
)
|
||||
.await
|
||||
.check(vec![OrderOption {
|
||||
name: "timestamp".to_string(),
|
||||
options: SortOptions {
|
||||
descending: true,
|
||||
nulls_first: true,
|
||||
},
|
||||
}])
|
||||
.await;
|
||||
|
||||
WindowedReaderTester::new(
|
||||
"test_region",
|
||||
vec![
|
||||
vec![
|
||||
(1, 1, "1".to_string(), false),
|
||||
(2, 2, "2".to_string(), false),
|
||||
(60000, 60000, "60".to_string(), false),
|
||||
],
|
||||
vec![
|
||||
(3, 3, "3".to_string(), false),
|
||||
(61000, 61000, "61".to_string(), false),
|
||||
],
|
||||
],
|
||||
vec![
|
||||
(1, 1, "1".to_string(), false),
|
||||
(2, 2, "2".to_string(), false),
|
||||
(3, 3, "3".to_string(), false),
|
||||
(60000, 60000, "60".to_string(), false),
|
||||
(61000, 61000, "61".to_string(), false),
|
||||
],
|
||||
)
|
||||
.await
|
||||
.check(vec![OrderOption {
|
||||
name: "timestamp".to_string(),
|
||||
options: SortOptions {
|
||||
descending: false,
|
||||
nulls_first: true,
|
||||
},
|
||||
}])
|
||||
.await;
|
||||
}
|
||||
@@ -1,491 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::collections::{BTreeMap, HashMap};
|
||||
use std::sync::Arc;
|
||||
|
||||
use common_test_util::temp_dir::create_temp_dir;
|
||||
use datatypes::prelude::*;
|
||||
use datatypes::timestamp::TimestampMillisecond;
|
||||
use datatypes::vectors::{Int64Vector, StringVector, TimestampMillisecondVector, VectorRef};
|
||||
use log_store::raft_engine::log_store::RaftEngineLogStore;
|
||||
use store_api::storage::{
|
||||
AddColumn, AlterOperation, AlterRequest, Chunk, ChunkReader, ColumnDescriptor,
|
||||
ColumnDescriptorBuilder, ColumnId, FlushContext, FlushReason, Region, RegionMeta, ScanRequest,
|
||||
SchemaRef, Snapshot, WriteRequest,
|
||||
};
|
||||
|
||||
use crate::config::EngineConfig;
|
||||
use crate::region::tests::{self, FileTesterBase};
|
||||
use crate::region::{OpenOptions, RawRegionMetadata, RegionImpl, RegionMetadata};
|
||||
use crate::test_util;
|
||||
use crate::test_util::config_util;
|
||||
use crate::test_util::descriptor_util::RegionDescBuilder;
|
||||
|
||||
const REGION_NAME: &str = "region-alter-0";
|
||||
|
||||
async fn create_region_for_alter(store_dir: &str) -> RegionImpl<RaftEngineLogStore> {
|
||||
// Always disable version column in this test.
|
||||
let metadata = tests::new_metadata(REGION_NAME);
|
||||
|
||||
let store_config =
|
||||
config_util::new_store_config(REGION_NAME, store_dir, EngineConfig::default()).await;
|
||||
|
||||
RegionImpl::create(metadata, store_config).await.unwrap()
|
||||
}
|
||||
|
||||
/// Tester for region alter.
|
||||
struct AlterTester {
|
||||
store_dir: String,
|
||||
base: Option<FileTesterBase>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq)]
|
||||
struct DataRow {
|
||||
key: Option<i64>,
|
||||
ts: TimestampMillisecond,
|
||||
v0: Option<String>,
|
||||
v1: Option<i64>,
|
||||
}
|
||||
|
||||
impl DataRow {
|
||||
fn new_with_string(key: Option<i64>, ts: i64, v0: Option<String>, v1: Option<i64>) -> Self {
|
||||
DataRow {
|
||||
key,
|
||||
ts: ts.into(),
|
||||
v0,
|
||||
v1,
|
||||
}
|
||||
}
|
||||
|
||||
fn new(key: Option<i64>, ts: i64, v0: Option<i64>, v1: Option<i64>) -> Self {
|
||||
Self::new_with_string(key, ts, v0.map(|s| s.to_string()), v1)
|
||||
}
|
||||
}
|
||||
|
||||
fn new_put_data(data: &[DataRow]) -> HashMap<String, VectorRef> {
|
||||
let keys = Int64Vector::from(data.iter().map(|v| v.key).collect::<Vec<_>>());
|
||||
let timestamps = TimestampMillisecondVector::from(
|
||||
data.iter()
|
||||
.map(|v| Some(v.ts.into_native()))
|
||||
.collect::<Vec<_>>(),
|
||||
);
|
||||
let values1 = StringVector::from(data.iter().map(|v| v.v0.clone()).collect::<Vec<_>>());
|
||||
let values2 = Int64Vector::from(data.iter().map(|kv| kv.v1).collect::<Vec<_>>());
|
||||
|
||||
HashMap::from([
|
||||
("k0".to_string(), Arc::new(keys) as VectorRef),
|
||||
(
|
||||
test_util::TIMESTAMP_NAME.to_string(),
|
||||
Arc::new(timestamps) as VectorRef,
|
||||
),
|
||||
("v0".to_string(), Arc::new(values1) as VectorRef),
|
||||
("v1".to_string(), Arc::new(values2) as VectorRef),
|
||||
])
|
||||
}
|
||||
|
||||
impl AlterTester {
|
||||
async fn new(store_dir: &str) -> AlterTester {
|
||||
let region = create_region_for_alter(store_dir).await;
|
||||
|
||||
AlterTester {
|
||||
base: Some(FileTesterBase::with_region(region)),
|
||||
store_dir: store_dir.to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
async fn reopen(&mut self) {
|
||||
// Close the old region.
|
||||
if let Some(base) = self.base.as_ref() {
|
||||
base.close().await;
|
||||
}
|
||||
self.base = None;
|
||||
// Reopen the region.
|
||||
let store_config =
|
||||
config_util::new_store_config(REGION_NAME, &self.store_dir, EngineConfig::default())
|
||||
.await;
|
||||
let opts = OpenOptions::default();
|
||||
let region = RegionImpl::open(REGION_NAME.to_string(), store_config, &opts)
|
||||
.await
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
self.base = Some(FileTesterBase::with_region(region));
|
||||
}
|
||||
|
||||
async fn flush(&self, wait: Option<bool>) {
|
||||
let ctx = wait
|
||||
.map(|wait| FlushContext {
|
||||
wait,
|
||||
reason: FlushReason::Manually,
|
||||
..Default::default()
|
||||
})
|
||||
.unwrap_or_default();
|
||||
self.base().region.flush(&ctx).await.unwrap();
|
||||
}
|
||||
|
||||
async fn checkpoint_manifest(&self) {
|
||||
self.base().checkpoint_manifest().await
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn base(&self) -> &FileTesterBase {
|
||||
self.base.as_ref().unwrap()
|
||||
}
|
||||
|
||||
fn schema(&self) -> SchemaRef {
|
||||
let metadata = self.base().region.in_memory_metadata();
|
||||
metadata.schema().clone()
|
||||
}
|
||||
|
||||
// Put with schema k0, ts, v0, v1
|
||||
async fn put(&self, data: &[DataRow]) {
|
||||
let mut batch = self.base().region.write_request();
|
||||
let put_data = new_put_data(data);
|
||||
batch.put(put_data).unwrap();
|
||||
|
||||
assert!(self
|
||||
.base()
|
||||
.region
|
||||
.write(&self.base().write_ctx, batch)
|
||||
.await
|
||||
.is_ok());
|
||||
}
|
||||
|
||||
/// Put data with initial schema.
|
||||
async fn put_with_init_schema(&self, data: &[(i64, Option<i64>)]) {
|
||||
// put of FileTesterBase always use initial schema version.
|
||||
let data = data
|
||||
.iter()
|
||||
.map(|(ts, v0)| (*ts, v0.map(|v| v.to_string())))
|
||||
.collect::<Vec<_>>();
|
||||
let _ = self.base().put(&data).await;
|
||||
}
|
||||
|
||||
/// Put data to inner writer with initial schema.
|
||||
async fn put_inner_with_init_schema(&self, data: &[(i64, Option<i64>)]) {
|
||||
let data = data
|
||||
.iter()
|
||||
.map(|(ts, v0)| (*ts, v0.map(|v| v.to_string())))
|
||||
.collect::<Vec<_>>();
|
||||
// put of FileTesterBase always use initial schema version.
|
||||
let _ = self.base().put_inner(&data).await;
|
||||
}
|
||||
|
||||
async fn alter(&self, mut req: AlterRequest) {
|
||||
let version = self.version();
|
||||
req.version = version;
|
||||
|
||||
self.base().region.alter(req).await.unwrap();
|
||||
}
|
||||
|
||||
fn version(&self) -> u32 {
|
||||
let metadata = self.base().region.in_memory_metadata();
|
||||
metadata.version()
|
||||
}
|
||||
|
||||
async fn full_scan_with_init_schema(&self) -> Vec<(i64, Option<String>)> {
|
||||
self.base().full_scan().await
|
||||
}
|
||||
|
||||
async fn full_scan(&self) -> Vec<DataRow> {
|
||||
let read_ctx = &self.base().read_ctx;
|
||||
let snapshot = self.base().region.snapshot(read_ctx).unwrap();
|
||||
|
||||
let resp = snapshot
|
||||
.scan(read_ctx, ScanRequest::default())
|
||||
.await
|
||||
.unwrap();
|
||||
let mut reader = resp.reader;
|
||||
|
||||
let metadata = self.base().region.in_memory_metadata();
|
||||
assert_eq!(metadata.schema(), reader.user_schema());
|
||||
|
||||
let mut dst = Vec::new();
|
||||
while let Some(chunk) = reader.next_chunk().await.unwrap() {
|
||||
let chunk = reader.project_chunk(chunk);
|
||||
append_chunk_to(&chunk, &mut dst);
|
||||
}
|
||||
|
||||
dst
|
||||
}
|
||||
}
|
||||
|
||||
fn append_chunk_to(chunk: &Chunk, dst: &mut Vec<DataRow>) {
|
||||
assert_eq!(4, chunk.columns.len());
|
||||
|
||||
let k0_vector = chunk.columns[0]
|
||||
.as_any()
|
||||
.downcast_ref::<Int64Vector>()
|
||||
.unwrap();
|
||||
let ts_vector = chunk.columns[1]
|
||||
.as_any()
|
||||
.downcast_ref::<TimestampMillisecondVector>()
|
||||
.unwrap();
|
||||
let v0_vector = chunk.columns[2]
|
||||
.as_any()
|
||||
.downcast_ref::<StringVector>()
|
||||
.unwrap();
|
||||
let v1_vector = chunk.columns[3]
|
||||
.as_any()
|
||||
.downcast_ref::<Int64Vector>()
|
||||
.unwrap();
|
||||
for i in 0..k0_vector.len() {
|
||||
dst.push(DataRow::new_with_string(
|
||||
k0_vector.get_data(i),
|
||||
ts_vector.get_data(i).unwrap().into(),
|
||||
v0_vector.get_data(i).map(|s| s.to_string()),
|
||||
v1_vector.get_data(i),
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
fn new_column_desc(id: ColumnId, name: &str) -> ColumnDescriptor {
|
||||
ColumnDescriptorBuilder::new(id, name, ConcreteDataType::int64_datatype())
|
||||
.is_nullable(true)
|
||||
.build()
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
fn add_column_req(desc_and_is_key: &[(ColumnDescriptor, bool)]) -> AlterRequest {
|
||||
let columns = desc_and_is_key
|
||||
.iter()
|
||||
.map(|(desc, is_key)| AddColumn {
|
||||
desc: desc.clone(),
|
||||
is_key: *is_key,
|
||||
})
|
||||
.collect();
|
||||
let operation = AlterOperation::AddColumns { columns };
|
||||
|
||||
AlterRequest {
|
||||
operation,
|
||||
version: 0,
|
||||
}
|
||||
}
|
||||
|
||||
fn drop_column_req(names: &[&str]) -> AlterRequest {
|
||||
let names = names.iter().map(|s| s.to_string()).collect();
|
||||
let operation = AlterOperation::DropColumns { names };
|
||||
|
||||
AlterRequest {
|
||||
operation,
|
||||
version: 0,
|
||||
}
|
||||
}
|
||||
|
||||
fn check_schema_names(schema: &SchemaRef, names: &[&str]) {
|
||||
assert_eq!(names.len(), schema.num_columns());
|
||||
|
||||
for (idx, name) in names.iter().enumerate() {
|
||||
assert_eq!(*name, schema.column_name_by_index(idx));
|
||||
let _ = schema.column_schema_by_name(name).unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_alter_region_with_reopen() {
|
||||
test_alter_region_with_reopen0(true).await;
|
||||
test_alter_region_with_reopen0(false).await;
|
||||
}
|
||||
|
||||
async fn test_alter_region_with_reopen0(flush_and_checkpoint: bool) {
|
||||
common_telemetry::init_default_ut_logging();
|
||||
|
||||
let dir = create_temp_dir("alter-region");
|
||||
let store_dir = dir.path().to_str().unwrap();
|
||||
let mut tester = AlterTester::new(store_dir).await;
|
||||
|
||||
let data = vec![(1000, Some(100)), (1001, Some(101)), (1002, Some(102))];
|
||||
tester.put_with_init_schema(&data).await;
|
||||
assert_eq!(3, tester.full_scan_with_init_schema().await.len());
|
||||
|
||||
let req = add_column_req(&[
|
||||
(new_column_desc(4, "k0"), true), // key column k0
|
||||
(new_column_desc(5, "v1"), false), // value column v1
|
||||
]);
|
||||
tester.alter(req).await;
|
||||
|
||||
let schema = tester.schema();
|
||||
check_schema_names(&schema, &["k0", "timestamp", "v0", "v1"]);
|
||||
|
||||
// Put data after schema altered.
|
||||
let data = vec![
|
||||
DataRow::new(Some(10000), 1003, Some(103), Some(201)),
|
||||
DataRow::new(Some(10001), 1004, Some(104), Some(202)),
|
||||
DataRow::new(Some(10002), 1005, Some(105), Some(203)),
|
||||
];
|
||||
tester.put(&data).await;
|
||||
|
||||
if flush_and_checkpoint {
|
||||
tester.flush(None).await;
|
||||
tester.checkpoint_manifest().await;
|
||||
}
|
||||
|
||||
// Scan with new schema before reopen.
|
||||
let mut expect = vec![
|
||||
DataRow::new(None, 1000, Some(100), None),
|
||||
DataRow::new(None, 1001, Some(101), None),
|
||||
DataRow::new(None, 1002, Some(102), None),
|
||||
];
|
||||
expect.extend_from_slice(&data);
|
||||
let scanned = tester.full_scan().await;
|
||||
assert_eq!(expect, scanned);
|
||||
|
||||
// Reopen and put more data.
|
||||
tester.reopen().await;
|
||||
let data = vec![
|
||||
DataRow::new(Some(10003), 1006, Some(106), Some(204)),
|
||||
DataRow::new(Some(10004), 1007, Some(107), Some(205)),
|
||||
DataRow::new(Some(10005), 1008, Some(108), Some(206)),
|
||||
];
|
||||
tester.put(&data).await;
|
||||
// Extend expected result.
|
||||
expect.extend_from_slice(&data);
|
||||
|
||||
// add columns,then remove them without writing data.
|
||||
let req = add_column_req(&[
|
||||
(new_column_desc(6, "v2"), false), // key column k0
|
||||
(new_column_desc(7, "v3"), false), // value column v1
|
||||
]);
|
||||
tester.alter(req).await;
|
||||
|
||||
let req = drop_column_req(&["v2", "v3"]);
|
||||
tester.alter(req).await;
|
||||
|
||||
if flush_and_checkpoint {
|
||||
tester.flush(None).await;
|
||||
tester.checkpoint_manifest().await;
|
||||
}
|
||||
|
||||
// reopen and write again
|
||||
tester.reopen().await;
|
||||
let schema = tester.schema();
|
||||
check_schema_names(&schema, &["k0", "timestamp", "v0", "v1"]);
|
||||
|
||||
let data = vec![DataRow::new(Some(10006), 1009, Some(109), Some(207))];
|
||||
tester.put(&data).await;
|
||||
expect.extend_from_slice(&data);
|
||||
|
||||
// Scan with new schema after reopen and write.
|
||||
let scanned = tester.full_scan().await;
|
||||
assert_eq!(expect, scanned);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_alter_region() {
|
||||
let dir = create_temp_dir("alter-region");
|
||||
let store_dir = dir.path().to_str().unwrap();
|
||||
let tester = AlterTester::new(store_dir).await;
|
||||
|
||||
let data = vec![(1000, Some(100)), (1001, Some(101)), (1002, Some(102))];
|
||||
|
||||
tester.put_with_init_schema(&data).await;
|
||||
|
||||
let schema = tester.schema();
|
||||
check_schema_names(&schema, &["timestamp", "v0"]);
|
||||
|
||||
let req = add_column_req(&[
|
||||
(new_column_desc(4, "k0"), true), // key column k0
|
||||
(new_column_desc(5, "v1"), false), // value column v1
|
||||
]);
|
||||
tester.alter(req).await;
|
||||
|
||||
let schema = tester.schema();
|
||||
check_schema_names(&schema, &["k0", "timestamp", "v0", "v1"]);
|
||||
|
||||
let req = add_column_req(&[
|
||||
(new_column_desc(6, "v2"), false),
|
||||
(new_column_desc(7, "v3"), false),
|
||||
]);
|
||||
tester.alter(req).await;
|
||||
|
||||
let schema = tester.schema();
|
||||
check_schema_names(&schema, &["k0", "timestamp", "v0", "v1", "v2", "v3"]);
|
||||
|
||||
// Remove v0, v1
|
||||
let req = drop_column_req(&["v0", "v1"]);
|
||||
tester.alter(req).await;
|
||||
|
||||
let schema = tester.schema();
|
||||
check_schema_names(&schema, &["k0", "timestamp", "v2", "v3"]);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_put_old_schema_after_alter() {
|
||||
let dir = create_temp_dir("put-old");
|
||||
let store_dir = dir.path().to_str().unwrap();
|
||||
let tester = AlterTester::new(store_dir).await;
|
||||
|
||||
let data = vec![(1000, Some(100)), (1001, Some(101)), (1002, Some(102))];
|
||||
|
||||
tester.put_with_init_schema(&data).await;
|
||||
|
||||
let req = add_column_req(&[
|
||||
(new_column_desc(4, "k0"), true), // key column k0
|
||||
(new_column_desc(5, "v1"), false), // value column v1
|
||||
]);
|
||||
tester.alter(req).await;
|
||||
|
||||
// Put with old schema.
|
||||
let data = vec![(1005, Some(105)), (1006, Some(106))];
|
||||
tester.put_with_init_schema(&data).await;
|
||||
|
||||
// Put data with old schema directly to the inner writer, to check that the region
|
||||
// writer could compat the schema of write batch.
|
||||
let data = vec![(1003, Some(103)), (1004, Some(104))];
|
||||
tester.put_inner_with_init_schema(&data).await;
|
||||
|
||||
let expect = vec![
|
||||
DataRow::new(None, 1000, Some(100), None),
|
||||
DataRow::new(None, 1001, Some(101), None),
|
||||
DataRow::new(None, 1002, Some(102), None),
|
||||
DataRow::new(None, 1003, Some(103), None),
|
||||
DataRow::new(None, 1004, Some(104), None),
|
||||
DataRow::new(None, 1005, Some(105), None),
|
||||
DataRow::new(None, 1006, Some(106), None),
|
||||
];
|
||||
let scanned = tester.full_scan().await;
|
||||
assert_eq!(expect, scanned);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_replay_metadata_after_open() {
|
||||
let dir = create_temp_dir("replay-metadata-after-open");
|
||||
let store_dir = dir.path().to_str().unwrap();
|
||||
let mut tester = AlterTester::new(store_dir).await;
|
||||
|
||||
let data = vec![(1000, Some(100)), (1001, Some(101)), (1002, Some(102))];
|
||||
|
||||
tester.put_with_init_schema(&data).await;
|
||||
|
||||
tester.reopen().await;
|
||||
|
||||
let committed_sequence = tester.base().committed_sequence();
|
||||
let manifest_version = tester.base().region.current_manifest_version();
|
||||
let version = tester.version();
|
||||
|
||||
let desc = RegionDescBuilder::new(REGION_NAME)
|
||||
.push_key_column(("k1", LogicalTypeId::Int32, false))
|
||||
.push_field_column(("v0", LogicalTypeId::Float32, true))
|
||||
.build();
|
||||
let metadata: &RegionMetadata = &desc.try_into().unwrap();
|
||||
let mut raw_metadata: RawRegionMetadata = metadata.into();
|
||||
raw_metadata.version = version + 1;
|
||||
|
||||
let recovered_metadata =
|
||||
BTreeMap::from([(committed_sequence, (manifest_version + 1, raw_metadata))]);
|
||||
|
||||
tester.base().replay_inner(recovered_metadata).await;
|
||||
let schema = tester.schema();
|
||||
check_schema_names(&schema, &["k1", "timestamp", "v0"]);
|
||||
}
|
||||
@@ -1,288 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//! Region read/write tests.
|
||||
|
||||
use common_telemetry::info;
|
||||
use common_test_util::temp_dir::create_temp_dir;
|
||||
use log_store::raft_engine::log_store::RaftEngineLogStore;
|
||||
use store_api::storage::{OpenOptions, SequenceNumber};
|
||||
|
||||
use crate::config::EngineConfig;
|
||||
use crate::error::Result;
|
||||
use crate::region::tests::{self, FileTesterBase};
|
||||
use crate::region::RegionImpl;
|
||||
use crate::test_util::config_util;
|
||||
|
||||
const REGION_NAME: &str = "region-basic-0";
|
||||
|
||||
/// Create a new region for basic tests.
|
||||
async fn create_region_for_basic(
|
||||
region_name: &str,
|
||||
store_dir: &str,
|
||||
) -> RegionImpl<RaftEngineLogStore> {
|
||||
let metadata = tests::new_metadata(region_name);
|
||||
let store_config =
|
||||
config_util::new_store_config(region_name, store_dir, EngineConfig::default()).await;
|
||||
RegionImpl::create(metadata, store_config).await.unwrap()
|
||||
}
|
||||
|
||||
/// Tester for basic tests.
|
||||
struct Tester {
|
||||
region_name: String,
|
||||
store_dir: String,
|
||||
base: Option<FileTesterBase>,
|
||||
}
|
||||
|
||||
impl Tester {
|
||||
async fn new(region_name: &str, store_dir: &str) -> Tester {
|
||||
let region = create_region_for_basic(region_name, store_dir).await;
|
||||
|
||||
Tester {
|
||||
region_name: region_name.to_string(),
|
||||
store_dir: store_dir.to_string(),
|
||||
base: Some(FileTesterBase::with_region(region)),
|
||||
}
|
||||
}
|
||||
|
||||
async fn empty(region_name: &str, store_dir: &str) -> Tester {
|
||||
Tester {
|
||||
region_name: region_name.to_string(),
|
||||
store_dir: store_dir.to_string(),
|
||||
base: None,
|
||||
}
|
||||
}
|
||||
|
||||
async fn reopen(&mut self) {
|
||||
let _ = self.try_reopen().await.unwrap();
|
||||
}
|
||||
|
||||
async fn try_reopen(&mut self) -> Result<bool> {
|
||||
// Close the old region.
|
||||
if let Some(base) = self.base.as_ref() {
|
||||
info!("Reopen tester base");
|
||||
base.close().await;
|
||||
}
|
||||
|
||||
self.base = None;
|
||||
// Reopen the region.
|
||||
let store_config = config_util::new_store_config(
|
||||
&self.region_name,
|
||||
&self.store_dir,
|
||||
EngineConfig::default(),
|
||||
)
|
||||
.await;
|
||||
let opts = OpenOptions::default();
|
||||
let region = RegionImpl::open(self.region_name.clone(), store_config, &opts).await?;
|
||||
match region {
|
||||
None => Ok(false),
|
||||
Some(region) => {
|
||||
let base = FileTesterBase::with_region(region);
|
||||
self.base = Some(base);
|
||||
Ok(true)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn base(&self) -> &FileTesterBase {
|
||||
self.base.as_ref().unwrap()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn set_batch_size(&mut self, batch_size: usize) {
|
||||
self.base.as_mut().unwrap().read_ctx.batch_size = batch_size;
|
||||
}
|
||||
|
||||
async fn put(&self, data: &[(i64, Option<String>)]) {
|
||||
let _ = self.base().put(data).await;
|
||||
}
|
||||
|
||||
async fn full_scan(&self) -> Vec<(i64, Option<String>)> {
|
||||
self.base().full_scan().await
|
||||
}
|
||||
|
||||
fn committed_sequence(&self) -> SequenceNumber {
|
||||
self.base().committed_sequence()
|
||||
}
|
||||
|
||||
async fn delete(&self, keys: &[i64]) {
|
||||
let _ = self.base().delete(keys).await;
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_simple_put_scan() {
|
||||
let dir = create_temp_dir("put-scan");
|
||||
let store_dir = dir.path().to_str().unwrap();
|
||||
let tester = Tester::new(REGION_NAME, store_dir).await;
|
||||
|
||||
let data = vec![
|
||||
(1000, Some(100.to_string())),
|
||||
(1001, Some(101.to_string())),
|
||||
(1002, None),
|
||||
(1003, Some(103.to_string())),
|
||||
(1004, Some(104.to_string())),
|
||||
];
|
||||
|
||||
tester.put(&data).await;
|
||||
|
||||
let output = tester.full_scan().await;
|
||||
assert_eq!(data, output);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_sequence_increase() {
|
||||
let dir = create_temp_dir("sequence");
|
||||
let store_dir = dir.path().to_str().unwrap();
|
||||
let tester = Tester::new(REGION_NAME, store_dir).await;
|
||||
|
||||
let mut committed_sequence = tester.committed_sequence();
|
||||
for i in 0..100 {
|
||||
tester.put(&[(i, Some(1234.to_string()))]).await;
|
||||
committed_sequence += 1;
|
||||
|
||||
assert_eq!(committed_sequence, tester.committed_sequence());
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_reopen() {
|
||||
common_telemetry::logging::init_default_ut_logging();
|
||||
|
||||
let dir = create_temp_dir("reopen");
|
||||
let store_dir = dir.path().to_str().unwrap();
|
||||
let mut tester = Tester::new(REGION_NAME, store_dir).await;
|
||||
|
||||
let mut all_data = Vec::new();
|
||||
// Reopen region multiple times.
|
||||
for i in 0..5 {
|
||||
let data = (i, Some(i.to_string()));
|
||||
tester.put(&[data.clone()]).await;
|
||||
all_data.push(data.clone());
|
||||
|
||||
let output = tester.full_scan().await;
|
||||
assert_eq!(all_data, output);
|
||||
|
||||
tester.reopen().await;
|
||||
|
||||
// Scan after reopen.
|
||||
let output = tester.full_scan().await;
|
||||
assert_eq!(all_data, output);
|
||||
|
||||
// Check committed sequence.
|
||||
assert_eq!(i + 1, tester.committed_sequence() as i64);
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_open_empty() {
|
||||
let dir = create_temp_dir("open-empty");
|
||||
let store_dir = dir.path().to_str().unwrap();
|
||||
let mut tester = Tester::empty(REGION_NAME, store_dir).await;
|
||||
|
||||
let ret = tester.try_reopen().await;
|
||||
assert!(!ret.unwrap());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_scan_different_batch() {
|
||||
let dir = create_temp_dir("different-batch");
|
||||
let store_dir = dir.path().to_str().unwrap();
|
||||
let mut tester = Tester::new(REGION_NAME, store_dir).await;
|
||||
|
||||
let data: Vec<_> = (0..=2000).map(|i| (i, Some(i.to_string()))).collect();
|
||||
|
||||
for chunk in data.chunks(100) {
|
||||
tester.put(chunk).await;
|
||||
}
|
||||
|
||||
let batch_sizes = [1, 2, 4, 16, 64, 128, 256, 512];
|
||||
for batch_size in batch_sizes {
|
||||
tester.set_batch_size(batch_size);
|
||||
|
||||
let output = tester.full_scan().await;
|
||||
assert_eq!(data, output);
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_put_delete_scan() {
|
||||
common_telemetry::init_default_ut_logging();
|
||||
let dir = create_temp_dir("put-delete-scan");
|
||||
let store_dir = dir.path().to_str().unwrap();
|
||||
let mut tester = Tester::new(REGION_NAME, store_dir).await;
|
||||
|
||||
let data = vec![
|
||||
(1000, Some(100.to_string())),
|
||||
(1001, Some(101.to_string())),
|
||||
(1002, None),
|
||||
(1003, None),
|
||||
(1004, Some(104.to_string())),
|
||||
];
|
||||
|
||||
tester.put(&data).await;
|
||||
|
||||
let keys = [1001, 1003];
|
||||
|
||||
tester.delete(&keys).await;
|
||||
|
||||
let output = tester.full_scan().await;
|
||||
let expect = vec![
|
||||
(1000, Some(100.to_string())),
|
||||
(1002, None),
|
||||
(1004, Some(104.to_string())),
|
||||
];
|
||||
assert_eq!(expect, output);
|
||||
|
||||
// Deletion is also persistent.
|
||||
let _ = tester.try_reopen().await.unwrap();
|
||||
let output = tester.full_scan().await;
|
||||
assert_eq!(expect, output);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_put_delete_absent_key() {
|
||||
let dir = create_temp_dir("put-delete-scan");
|
||||
let store_dir = dir.path().to_str().unwrap();
|
||||
let mut tester = Tester::new(REGION_NAME, store_dir).await;
|
||||
|
||||
let data = vec![
|
||||
(1000, Some(100.to_string())),
|
||||
(1001, Some(101.to_string())),
|
||||
(1002, None),
|
||||
(1003, None),
|
||||
(1004, Some(104.to_string())),
|
||||
];
|
||||
|
||||
tester.put(&data).await;
|
||||
|
||||
// 999 and 1006 is absent.
|
||||
let keys = [999, 1002, 1004, 1006];
|
||||
|
||||
tester.delete(&keys).await;
|
||||
|
||||
let output = tester.full_scan().await;
|
||||
let expect = vec![
|
||||
(1000, Some(100.to_string())),
|
||||
(1001, Some(101.to_string())),
|
||||
(1003, None),
|
||||
];
|
||||
assert_eq!(expect, output);
|
||||
|
||||
// Deletion is also persistent.
|
||||
let _ = tester.try_reopen().await.unwrap();
|
||||
let output = tester.full_scan().await;
|
||||
assert_eq!(expect, output);
|
||||
}
|
||||
@@ -1,168 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//! Region close tests.
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use common_test_util::temp_dir::create_temp_dir;
|
||||
use log_store::raft_engine::log_store::RaftEngineLogStore;
|
||||
use store_api::storage::{
|
||||
AlterOperation, AlterRequest, CloseContext, Region, RegionMeta, WriteResponse,
|
||||
};
|
||||
|
||||
use crate::config::EngineConfig;
|
||||
use crate::engine;
|
||||
use crate::error::Error;
|
||||
use crate::flush::FlushStrategyRef;
|
||||
use crate::region::tests::{self, FileTesterBase};
|
||||
use crate::region::RegionImpl;
|
||||
use crate::test_util::config_util;
|
||||
use crate::test_util::flush_switch::{has_parquet_file, FlushSwitch};
|
||||
|
||||
const REGION_NAME: &str = "region-close-0";
|
||||
|
||||
/// Tester for region close
|
||||
struct CloseTester {
|
||||
base: Option<FileTesterBase>,
|
||||
}
|
||||
|
||||
/// Create a new region for close test
|
||||
async fn create_region_for_close(
|
||||
store_dir: &str,
|
||||
flush_strategy: FlushStrategyRef,
|
||||
) -> RegionImpl<RaftEngineLogStore> {
|
||||
let metadata = tests::new_metadata(REGION_NAME);
|
||||
|
||||
let mut store_config =
|
||||
config_util::new_store_config(REGION_NAME, store_dir, EngineConfig::default()).await;
|
||||
store_config.flush_strategy = flush_strategy;
|
||||
|
||||
RegionImpl::create(metadata, store_config).await.unwrap()
|
||||
}
|
||||
|
||||
impl CloseTester {
|
||||
async fn new(store_dir: &str, flush_strategy: FlushStrategyRef) -> CloseTester {
|
||||
let region = create_region_for_close(store_dir, flush_strategy.clone()).await;
|
||||
|
||||
CloseTester {
|
||||
base: Some(FileTesterBase::with_region(region)),
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn base(&self) -> &FileTesterBase {
|
||||
self.base.as_ref().unwrap()
|
||||
}
|
||||
|
||||
async fn put(&self, data: &[(i64, Option<i64>)]) {
|
||||
let data = data
|
||||
.iter()
|
||||
.map(|(ts, v0)| (*ts, v0.map(|v| v.to_string())))
|
||||
.collect::<Vec<_>>();
|
||||
let _ = self.base().put(&data).await;
|
||||
}
|
||||
|
||||
async fn try_put(&self, data: &[(i64, Option<i64>)]) -> Result<WriteResponse, Error> {
|
||||
let data = data
|
||||
.iter()
|
||||
.map(|(ts, v0)| (*ts, v0.map(|v| v.to_string())))
|
||||
.collect::<Vec<_>>();
|
||||
self.base().try_put(&data).await
|
||||
}
|
||||
|
||||
async fn try_alter(&self, mut req: AlterRequest) -> Result<(), Error> {
|
||||
let version = self.version();
|
||||
req.version = version;
|
||||
|
||||
self.base().region.alter(req).await
|
||||
}
|
||||
|
||||
fn version(&self) -> u32 {
|
||||
let metadata = self.base().region.in_memory_metadata();
|
||||
metadata.version()
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_close_basic() {
|
||||
common_telemetry::init_default_ut_logging();
|
||||
let dir = create_temp_dir("close-basic");
|
||||
let store_dir = dir.path().to_str().unwrap();
|
||||
|
||||
let flush_switch = Arc::new(FlushSwitch::default());
|
||||
let tester = CloseTester::new(store_dir, flush_switch).await;
|
||||
|
||||
tester
|
||||
.base()
|
||||
.region
|
||||
.close(&CloseContext::default())
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let data = [(1000, Some(100))];
|
||||
|
||||
let closed_region_error = "Try to write the closed region".to_string();
|
||||
// Put one element should return ClosedRegion error
|
||||
assert_eq!(
|
||||
tester.try_put(&data).await.unwrap_err().to_string(),
|
||||
closed_region_error
|
||||
);
|
||||
|
||||
// Alter table should return ClosedRegion error
|
||||
assert_eq!(
|
||||
tester
|
||||
.try_alter(AlterRequest {
|
||||
operation: AlterOperation::AddColumns {
|
||||
columns: Vec::new(),
|
||||
},
|
||||
version: 0,
|
||||
})
|
||||
.await
|
||||
.unwrap_err()
|
||||
.to_string(),
|
||||
closed_region_error
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_close_wait_flush_done() {
|
||||
common_telemetry::init_default_ut_logging();
|
||||
let dir = create_temp_dir("close-basic");
|
||||
let store_dir = dir.path().to_str().unwrap();
|
||||
|
||||
let flush_switch = Arc::new(FlushSwitch::default());
|
||||
let tester = CloseTester::new(store_dir, flush_switch.clone()).await;
|
||||
|
||||
let data = [(1000, Some(100))];
|
||||
|
||||
// Now set should flush to true to trigger flush.
|
||||
flush_switch.set_should_flush(true);
|
||||
|
||||
// Put one element so we have content to flush.
|
||||
tester.put(&data).await;
|
||||
|
||||
let sst_dir = format!("{}/{}", store_dir, engine::region_sst_dir("", REGION_NAME));
|
||||
assert!(!has_parquet_file(&sst_dir));
|
||||
|
||||
// Close should cancel the flush.
|
||||
tester
|
||||
.base()
|
||||
.region
|
||||
.close(&CloseContext::default())
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
assert!(!has_parquet_file(&sst_dir));
|
||||
}
|
||||
@@ -1,458 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//! Region compaction tests.
|
||||
|
||||
use std::env;
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
use std::sync::Arc;
|
||||
|
||||
use common_telemetry::logging;
|
||||
use common_test_util::temp_dir::create_temp_dir;
|
||||
use log_store::raft_engine::log_store::RaftEngineLogStore;
|
||||
use object_store::services::{Fs, S3};
|
||||
use object_store::ObjectStore;
|
||||
use store_api::storage::{FlushContext, FlushReason, OpenOptions, Region};
|
||||
use tokio::sync::{Notify, RwLock};
|
||||
|
||||
use crate::compaction::CompactionHandler;
|
||||
use crate::config::EngineConfig;
|
||||
use crate::error::Result;
|
||||
use crate::file_purger::{FilePurgeHandler, FilePurgeRequest};
|
||||
use crate::region::tests::{self, FileTesterBase};
|
||||
use crate::region::{CompactContext, FlushStrategyRef, RegionImpl};
|
||||
use crate::scheduler::rate_limit::BoxedRateLimitToken;
|
||||
use crate::scheduler::{Handler, LocalScheduler, SchedulerConfig};
|
||||
use crate::test_util::config_util;
|
||||
use crate::test_util::flush_switch::FlushSwitch;
|
||||
|
||||
const REGION_NAME: &str = "region-compact-0";
|
||||
|
||||
fn new_object_store(store_dir: &str, s3_bucket: Option<String>) -> ObjectStore {
|
||||
if let Some(bucket) = s3_bucket {
|
||||
if !bucket.is_empty() {
|
||||
logging::info!("Use S3 object store");
|
||||
|
||||
let root = uuid::Uuid::new_v4().to_string();
|
||||
|
||||
let mut builder = S3::default();
|
||||
let _ = builder
|
||||
.root(&root)
|
||||
.access_key_id(&env::var("GT_S3_ACCESS_KEY_ID").unwrap())
|
||||
.secret_access_key(&env::var("GT_S3_ACCESS_KEY").unwrap())
|
||||
.region(&env::var("GT_S3_REGION").unwrap())
|
||||
.bucket(&bucket);
|
||||
|
||||
return ObjectStore::new(builder).unwrap().finish();
|
||||
}
|
||||
}
|
||||
|
||||
logging::info!("Use local fs object store");
|
||||
|
||||
let mut builder = Fs::default();
|
||||
let _ = builder.root(store_dir);
|
||||
ObjectStore::new(builder).unwrap().finish()
|
||||
}
|
||||
|
||||
/// Create a new region for compaction test
|
||||
async fn create_region_for_compaction<
|
||||
H: Handler<Request = FilePurgeRequest> + Send + Sync + 'static,
|
||||
>(
|
||||
store_dir: &str,
|
||||
engine_config: EngineConfig,
|
||||
purge_handler: H,
|
||||
flush_strategy: FlushStrategyRef,
|
||||
s3_bucket: Option<String>,
|
||||
) -> (
|
||||
RegionImpl<RaftEngineLogStore>,
|
||||
ObjectStore,
|
||||
Arc<tokio::sync::RwLock<Vec<tokio::task::JoinHandle<()>>>>,
|
||||
) {
|
||||
let metadata = tests::new_metadata(REGION_NAME);
|
||||
|
||||
let object_store = new_object_store(store_dir, s3_bucket);
|
||||
|
||||
let (mut store_config, _) = config_util::new_store_config_with_object_store(
|
||||
REGION_NAME,
|
||||
store_dir,
|
||||
object_store.clone(),
|
||||
EngineConfig::default(),
|
||||
)
|
||||
.await;
|
||||
store_config.engine_config = Arc::new(engine_config);
|
||||
store_config.flush_strategy = flush_strategy;
|
||||
|
||||
let pending_compaction_tasks = Arc::new(RwLock::new(vec![]));
|
||||
let handler = CompactionHandler::new_with_pending_tasks(pending_compaction_tasks.clone());
|
||||
let config = SchedulerConfig::default();
|
||||
// Overwrite test compaction scheduler and file purger.
|
||||
store_config.compaction_scheduler = Arc::new(LocalScheduler::new(config, handler));
|
||||
store_config.file_purger = Arc::new(LocalScheduler::new(
|
||||
SchedulerConfig {
|
||||
max_inflight_tasks: store_config.engine_config.max_purge_tasks,
|
||||
},
|
||||
purge_handler,
|
||||
));
|
||||
|
||||
(
|
||||
RegionImpl::create(metadata, store_config).await.unwrap(),
|
||||
object_store,
|
||||
pending_compaction_tasks,
|
||||
)
|
||||
}
|
||||
|
||||
#[derive(Debug, Default, Clone)]
|
||||
struct MockFilePurgeHandler {
|
||||
num_deleted: Arc<AtomicUsize>,
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl Handler for MockFilePurgeHandler {
|
||||
type Request = FilePurgeRequest;
|
||||
|
||||
async fn handle_request(
|
||||
&self,
|
||||
req: Self::Request,
|
||||
token: BoxedRateLimitToken,
|
||||
finish_notifier: Arc<Notify>,
|
||||
) -> Result<()> {
|
||||
logging::info!(
|
||||
"Try to delete file: {:?}, num_deleted: {:?}",
|
||||
req.file_id,
|
||||
self.num_deleted
|
||||
);
|
||||
|
||||
let handler = FilePurgeHandler;
|
||||
handler
|
||||
.handle_request(req, token, finish_notifier)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let _ = self.num_deleted.fetch_add(1, Ordering::Relaxed);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl MockFilePurgeHandler {
|
||||
fn num_deleted(&self) -> usize {
|
||||
self.num_deleted.load(Ordering::Relaxed)
|
||||
}
|
||||
}
|
||||
|
||||
/// Tester for region compaction.
|
||||
struct CompactionTester {
|
||||
base: Option<FileTesterBase>,
|
||||
purge_handler: MockFilePurgeHandler,
|
||||
object_store: ObjectStore,
|
||||
store_dir: String,
|
||||
engine_config: EngineConfig,
|
||||
flush_strategy: FlushStrategyRef,
|
||||
pending_tasks: Arc<RwLock<Vec<tokio::task::JoinHandle<()>>>>,
|
||||
}
|
||||
|
||||
impl CompactionTester {
|
||||
async fn new(
|
||||
store_dir: &str,
|
||||
engine_config: EngineConfig,
|
||||
flush_strategy: FlushStrategyRef,
|
||||
s3_bucket: Option<String>,
|
||||
) -> CompactionTester {
|
||||
let purge_handler = MockFilePurgeHandler::default();
|
||||
let (region, object_store, pending_tasks) = create_region_for_compaction(
|
||||
store_dir,
|
||||
engine_config.clone(),
|
||||
purge_handler.clone(),
|
||||
flush_strategy.clone(),
|
||||
s3_bucket,
|
||||
)
|
||||
.await;
|
||||
|
||||
CompactionTester {
|
||||
base: Some(FileTesterBase::with_region(region)),
|
||||
purge_handler,
|
||||
object_store,
|
||||
store_dir: store_dir.to_string(),
|
||||
engine_config,
|
||||
flush_strategy,
|
||||
pending_tasks,
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn base(&self) -> &FileTesterBase {
|
||||
self.base.as_ref().unwrap()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn base_mut(&mut self) -> &mut FileTesterBase {
|
||||
self.base.as_mut().unwrap()
|
||||
}
|
||||
|
||||
async fn put(&self, data: &[(i64, Option<i64>)]) {
|
||||
let data = data
|
||||
.iter()
|
||||
.map(|(ts, v0)| (*ts, v0.map(|v| v.to_string())))
|
||||
.collect::<Vec<_>>();
|
||||
let _ = self.base().put(&data).await;
|
||||
}
|
||||
|
||||
async fn flush(&self, wait: Option<bool>) {
|
||||
let ctx = wait
|
||||
.map(|wait| FlushContext {
|
||||
wait,
|
||||
reason: FlushReason::Manually,
|
||||
..Default::default()
|
||||
})
|
||||
.unwrap_or_default();
|
||||
self.base().region.flush(&ctx).await.unwrap();
|
||||
}
|
||||
|
||||
async fn compact(&self) {
|
||||
// Trigger compaction and wait until it is done.
|
||||
self.base()
|
||||
.region
|
||||
.compact(&CompactContext::default())
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
/// Close region and clean up files.
|
||||
async fn clean_up(mut self) {
|
||||
self.base = None;
|
||||
|
||||
self.object_store.remove_all("/").await.unwrap();
|
||||
}
|
||||
|
||||
async fn reopen(&mut self) -> Result<bool> {
|
||||
// Close the old region.
|
||||
if let Some(base) = self.base.take() {
|
||||
let _ = futures::future::join_all(self.pending_tasks.write().await.drain(..)).await;
|
||||
base.close().await;
|
||||
}
|
||||
|
||||
// Reopen the region.
|
||||
let object_store = new_object_store(&self.store_dir, None);
|
||||
let (mut store_config, _) = config_util::new_store_config_with_object_store(
|
||||
REGION_NAME,
|
||||
&self.store_dir,
|
||||
object_store.clone(),
|
||||
EngineConfig {
|
||||
max_files_in_l0: usize::MAX,
|
||||
..Default::default()
|
||||
},
|
||||
)
|
||||
.await;
|
||||
store_config.engine_config = Arc::new(self.engine_config.clone());
|
||||
store_config.flush_strategy = self.flush_strategy.clone();
|
||||
|
||||
let handler = CompactionHandler::new_with_pending_tasks(Arc::new(Default::default()));
|
||||
let config = SchedulerConfig::default();
|
||||
// Overwrite test compaction scheduler and file purger.
|
||||
store_config.compaction_scheduler = Arc::new(LocalScheduler::new(config, handler));
|
||||
store_config.file_purger = Arc::new(LocalScheduler::new(
|
||||
SchedulerConfig {
|
||||
max_inflight_tasks: store_config.engine_config.max_purge_tasks,
|
||||
},
|
||||
MockFilePurgeHandler::default(),
|
||||
));
|
||||
|
||||
let Some(region) = RegionImpl::open(
|
||||
REGION_NAME.to_string(),
|
||||
store_config,
|
||||
&OpenOptions::default(),
|
||||
)
|
||||
.await?
|
||||
else {
|
||||
return Ok(false);
|
||||
};
|
||||
self.base = Some(FileTesterBase::with_region(region));
|
||||
Ok(true)
|
||||
}
|
||||
}
|
||||
|
||||
async fn compact_during_read(s3_bucket: Option<String>) {
|
||||
let dir = create_temp_dir("compact_read");
|
||||
let store_dir = dir.path().to_str().unwrap();
|
||||
|
||||
// Use a large max_files_in_l0 to avoid compaction automatically.
|
||||
let mut tester = CompactionTester::new(
|
||||
store_dir,
|
||||
EngineConfig {
|
||||
max_files_in_l0: 100,
|
||||
..Default::default()
|
||||
},
|
||||
// Disable auto-flush.
|
||||
Arc::new(FlushSwitch::default()),
|
||||
s3_bucket,
|
||||
)
|
||||
.await;
|
||||
|
||||
let expect: Vec<_> = (0..200).map(|v| (v, Some(v))).collect();
|
||||
// Put elements so we have content to flush (In SST1).
|
||||
tester.put(&expect[0..100]).await;
|
||||
|
||||
// Flush content to SST1.
|
||||
tester.flush(None).await;
|
||||
|
||||
// Put element (In SST2).
|
||||
tester.put(&expect[100..200]).await;
|
||||
|
||||
// Flush content to SST2.
|
||||
tester.flush(None).await;
|
||||
|
||||
tester.base_mut().read_ctx.batch_size = 1;
|
||||
// Create a reader.
|
||||
let reader = tester.base().full_scan_reader().await;
|
||||
|
||||
assert_eq!(0, tester.purge_handler.num_deleted());
|
||||
|
||||
// Trigger compaction.
|
||||
tester.compact().await;
|
||||
|
||||
// The files are still referenced.
|
||||
assert_eq!(0, tester.purge_handler.num_deleted());
|
||||
|
||||
// Read from the reader.
|
||||
let output = tester.base().collect_reader(reader).await;
|
||||
|
||||
assert_eq!(expect.len(), output.len());
|
||||
|
||||
tester.clean_up().await;
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_compact_during_read_on_fs() {
|
||||
common_telemetry::init_default_ut_logging();
|
||||
|
||||
compact_during_read(None).await;
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_compact_during_read_on_s3() {
|
||||
common_telemetry::init_default_ut_logging();
|
||||
|
||||
if let Ok(bucket) = env::var("GT_S3_BUCKET") {
|
||||
if !bucket.is_empty() {
|
||||
compact_during_read(Some(bucket)).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_persist_region_compaction_time_window() {
|
||||
common_telemetry::init_default_ut_logging();
|
||||
let dir = create_temp_dir("put-delete-scan");
|
||||
let store_dir = dir.path().to_str().unwrap();
|
||||
let mut tester = CompactionTester::new(
|
||||
store_dir,
|
||||
EngineConfig {
|
||||
max_files_in_l0: 100,
|
||||
..Default::default()
|
||||
},
|
||||
// Disable auto-flush.
|
||||
Arc::new(FlushSwitch::default()),
|
||||
None,
|
||||
)
|
||||
.await;
|
||||
|
||||
// initially the time window is not present since no compaction ever happened.
|
||||
assert_eq!(
|
||||
None,
|
||||
tester
|
||||
.base
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
.region
|
||||
.inner
|
||||
.shared
|
||||
.version_control
|
||||
.current()
|
||||
.ssts()
|
||||
.compaction_time_window()
|
||||
);
|
||||
|
||||
// write some data with one hour span
|
||||
for idx in 0..10 {
|
||||
tester
|
||||
.put(&[(idx * 1000, Some(idx)), ((idx + 360) * 1000, Some(idx))])
|
||||
.await;
|
||||
tester.flush(Some(true)).await;
|
||||
}
|
||||
|
||||
tester.compact().await;
|
||||
// the inferred and persisted compaction time window should be 3600 seconds.
|
||||
assert_eq!(
|
||||
3600,
|
||||
tester
|
||||
.base
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
.region
|
||||
.inner
|
||||
.shared
|
||||
.version_control
|
||||
.current()
|
||||
.ssts()
|
||||
.compaction_time_window()
|
||||
.unwrap()
|
||||
);
|
||||
|
||||
// try write data with a larger time window
|
||||
for idx in 0..10 {
|
||||
tester
|
||||
.put(&[
|
||||
(idx * 1000, Some(idx)),
|
||||
((idx + 2 * 60 * 60) * 1000, Some(idx)),
|
||||
])
|
||||
.await;
|
||||
tester.flush(Some(true)).await;
|
||||
}
|
||||
tester.compact().await;
|
||||
|
||||
// but we won't changed persisted compaction window for now, so it remains unchanged.
|
||||
assert_eq!(
|
||||
3600,
|
||||
tester
|
||||
.base
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
.region
|
||||
.inner
|
||||
.shared
|
||||
.version_control
|
||||
.current()
|
||||
.ssts()
|
||||
.compaction_time_window()
|
||||
.unwrap()
|
||||
);
|
||||
|
||||
let reopened = tester.reopen().await.unwrap();
|
||||
assert!(reopened);
|
||||
assert_eq!(
|
||||
3600,
|
||||
tester
|
||||
.base
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
.region
|
||||
.inner
|
||||
.shared
|
||||
.version_control
|
||||
.current()
|
||||
.ssts()
|
||||
.compaction_time_window()
|
||||
.unwrap()
|
||||
);
|
||||
}
|
||||
@@ -1,192 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//! Region drop tests.
|
||||
|
||||
use std::path::Path;
|
||||
use std::sync::Arc;
|
||||
|
||||
use common_telemetry::info;
|
||||
use common_test_util::temp_dir::create_temp_dir;
|
||||
use log_store::raft_engine::log_store::RaftEngineLogStore;
|
||||
use store_api::manifest::{Manifest, MetaAction};
|
||||
use store_api::storage::{FlushContext, OpenOptions, Region};
|
||||
|
||||
use crate::config::EngineConfig;
|
||||
use crate::engine;
|
||||
use crate::flush::FlushStrategyRef;
|
||||
use crate::manifest::action::{RegionMetaAction, RegionMetaActionList, RegionRemove};
|
||||
use crate::region::tests::{self, FileTesterBase};
|
||||
use crate::region::RegionImpl;
|
||||
use crate::test_util::config_util;
|
||||
use crate::test_util::flush_switch::{has_parquet_file, FlushSwitch};
|
||||
|
||||
const REGION_NAME: &str = "region-drop-0";
|
||||
|
||||
/// Create a new region for drop tests.
|
||||
async fn create_region_for_drop(
|
||||
store_dir: &str,
|
||||
flush_strategy: FlushStrategyRef,
|
||||
) -> RegionImpl<RaftEngineLogStore> {
|
||||
let metadata = tests::new_metadata(REGION_NAME);
|
||||
|
||||
let mut store_config =
|
||||
config_util::new_store_config(REGION_NAME, store_dir, EngineConfig::default()).await;
|
||||
store_config.flush_strategy = flush_strategy;
|
||||
|
||||
RegionImpl::create(metadata, store_config).await.unwrap()
|
||||
}
|
||||
|
||||
/// Tester for drop tests.
|
||||
struct DropTester {
|
||||
base: Option<FileTesterBase>,
|
||||
}
|
||||
|
||||
impl DropTester {
|
||||
async fn new(store_dir: &str, flush_strategy: FlushStrategyRef) -> DropTester {
|
||||
let region = create_region_for_drop(store_dir, flush_strategy).await;
|
||||
DropTester {
|
||||
base: Some(FileTesterBase::with_region(region)),
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn base(&self) -> &FileTesterBase {
|
||||
self.base.as_ref().unwrap()
|
||||
}
|
||||
|
||||
async fn put(&self, data: &[(i64, Option<i64>)]) {
|
||||
let data = data
|
||||
.iter()
|
||||
.map(|(ts, v0)| (*ts, v0.map(|v| v.to_string())))
|
||||
.collect::<Vec<_>>();
|
||||
let _ = self.base().put(&data).await;
|
||||
}
|
||||
|
||||
async fn flush(&self) {
|
||||
let ctx = FlushContext::default();
|
||||
self.base().region.flush(&ctx).await.unwrap();
|
||||
}
|
||||
|
||||
async fn close(&mut self) {
|
||||
if let Some(base) = self.base.take() {
|
||||
base.close().await;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Recursively collects the paths of all regular files below `path`.
///
/// Directories are walked depth first in the order reported by
/// [`std::fs::read_dir`]; entries that are neither a file nor a directory are
/// ignored. File names that are not valid UTF-8 are converted lossily instead
/// of aborting the walk.
///
/// # Panics
///
/// Panics if a directory cannot be read.
fn get_all_files(path: &str) -> Vec<String> {
    let mut files = Vec::new();
    collect_files(Path::new(path), &mut files);
    files
}

/// Appends the paths of all regular files below `dir` to `files`.
fn collect_files(dir: &Path, files: &mut Vec<String>) {
    let entries = std::fs::read_dir(dir)
        .unwrap_or_else(|e| panic!("failed to read directory {}: {e}", dir.display()));
    for entry in entries {
        let path = entry
            .unwrap_or_else(|e| panic!("failed to read an entry of {}: {e}", dir.display()))
            .path();
        if path.is_file() {
            // Lossy conversion: a non-UTF-8 name must not panic the walk.
            files.push(path.to_string_lossy().into_owned());
        } else if path.is_dir() {
            // Recurse on the `Path` itself rather than round-tripping through `&str`.
            collect_files(&path, files);
        }
    }
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_drop_basic() {
|
||||
let dir = create_temp_dir("drop-basic");
|
||||
common_telemetry::init_default_ut_logging();
|
||||
let store_dir = dir.path().to_str().unwrap();
|
||||
|
||||
let sst_dir = format!("{}/{}", store_dir, engine::region_sst_dir("", REGION_NAME));
|
||||
let manifest_dir = format!(
|
||||
"{}/{}",
|
||||
store_dir,
|
||||
engine::region_manifest_dir("", REGION_NAME)
|
||||
);
|
||||
let flush_switch = Arc::new(FlushSwitch::default());
|
||||
let mut tester = DropTester::new(store_dir, flush_switch.clone()).await;
|
||||
|
||||
let data = [(1000, Some(100))];
|
||||
|
||||
// Put one element so we have content to flush.
|
||||
tester.put(&data).await;
|
||||
|
||||
// Manually trigger flush.
|
||||
tester.flush().await;
|
||||
|
||||
assert!(has_parquet_file(&sst_dir));
|
||||
|
||||
tester.base().checkpoint_manifest().await;
|
||||
let manifest_files = get_all_files(&manifest_dir);
|
||||
info!("manifest_files: {:?}", manifest_files);
|
||||
|
||||
tester.base().region.drop_region().await.unwrap();
|
||||
tester.close().await;
|
||||
|
||||
assert!(!Path::new(&manifest_dir).exists());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_drop_reopen() {
|
||||
let dir = create_temp_dir("drop-basic");
|
||||
common_telemetry::init_default_ut_logging();
|
||||
let store_dir = dir.path().to_str().unwrap();
|
||||
|
||||
let sst_dir = format!("{}/{}", store_dir, engine::region_sst_dir("", REGION_NAME));
|
||||
let manifest_dir = format!(
|
||||
"{}/{}",
|
||||
store_dir,
|
||||
engine::region_manifest_dir("", REGION_NAME)
|
||||
);
|
||||
let flush_switch = Arc::new(FlushSwitch::default());
|
||||
let mut tester = DropTester::new(store_dir, flush_switch.clone()).await;
|
||||
|
||||
let data = [(1000, Some(100))];
|
||||
|
||||
// Put one element so we have content to flush.
|
||||
tester.put(&data).await;
|
||||
// Manually trigger flush.
|
||||
tester.flush().await;
|
||||
|
||||
assert!(has_parquet_file(&sst_dir));
|
||||
|
||||
tester.base().checkpoint_manifest().await;
|
||||
let version_control = tester.base().region.version_control();
|
||||
|
||||
let mut action_list =
|
||||
RegionMetaActionList::with_action(RegionMetaAction::Remove(RegionRemove {
|
||||
region_id: tester.base().region.id(),
|
||||
}));
|
||||
let prev_version = version_control.current_manifest_version();
|
||||
action_list.set_prev_version(prev_version);
|
||||
let manifest = &tester.base().region.inner.manifest;
|
||||
let _ = manifest.update(action_list).await.unwrap();
|
||||
tester.close().await;
|
||||
|
||||
// Reopen the region.
|
||||
let store_config = config_util::new_store_config(
|
||||
REGION_NAME,
|
||||
store_dir,
|
||||
EngineConfig {
|
||||
max_files_in_l0: usize::MAX,
|
||||
..Default::default()
|
||||
},
|
||||
)
|
||||
.await;
|
||||
|
||||
let opts = OpenOptions::default();
|
||||
let region = RegionImpl::open(REGION_NAME.to_string(), store_config, &opts)
|
||||
.await
|
||||
.unwrap();
|
||||
assert!(region.is_none());
|
||||
assert!(!Path::new(&manifest_dir).exists());
|
||||
}
|
||||
@@ -1,462 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//! Region flush tests.
|
||||
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use arrow::compute::SortOptions;
|
||||
use common_query::prelude::Expr;
|
||||
use common_recordbatch::OrderOption;
|
||||
use common_test_util::temp_dir::create_temp_dir;
|
||||
use common_time::timestamp::TimeUnit;
|
||||
use datafusion_common::Column;
|
||||
use datatypes::value::timestamp_to_scalar_value;
|
||||
use log_store::raft_engine::log_store::RaftEngineLogStore;
|
||||
use store_api::storage::{FlushContext, FlushReason, OpenOptions, Region, ScanRequest};
|
||||
|
||||
use crate::config::EngineConfig;
|
||||
use crate::engine::{self, RegionMap};
|
||||
use crate::flush::{FlushStrategyRef, FlushType};
|
||||
use crate::region::tests::{self, FileTesterBase};
|
||||
use crate::region::RegionImpl;
|
||||
use crate::test_util::config_util;
|
||||
use crate::test_util::flush_switch::{has_parquet_file, FlushSwitch};
|
||||
|
||||
const REGION_NAME: &str = "region-flush-0";
|
||||
|
||||
/// Create a new region for flush test
|
||||
async fn create_region_for_flush(
|
||||
store_dir: &str,
|
||||
flush_strategy: FlushStrategyRef,
|
||||
) -> (
|
||||
RegionImpl<RaftEngineLogStore>,
|
||||
Arc<RegionMap<RaftEngineLogStore>>,
|
||||
) {
|
||||
let metadata = tests::new_metadata(REGION_NAME);
|
||||
|
||||
let (mut store_config, regions) = config_util::new_store_config_and_region_map(
|
||||
REGION_NAME,
|
||||
store_dir,
|
||||
EngineConfig {
|
||||
max_files_in_l0: usize::MAX,
|
||||
..Default::default()
|
||||
},
|
||||
)
|
||||
.await;
|
||||
store_config.flush_strategy = flush_strategy;
|
||||
|
||||
(
|
||||
RegionImpl::create(metadata, store_config).await.unwrap(),
|
||||
regions,
|
||||
)
|
||||
}
|
||||
|
||||
/// Tester for region flush.
|
||||
struct FlushTester {
|
||||
base: Option<FileTesterBase>,
|
||||
store_dir: String,
|
||||
flush_strategy: FlushStrategyRef,
|
||||
regions: Arc<RegionMap<RaftEngineLogStore>>,
|
||||
}
|
||||
|
||||
impl FlushTester {
|
||||
async fn new(store_dir: &str, flush_strategy: FlushStrategyRef) -> FlushTester {
|
||||
let (region, regions) = create_region_for_flush(store_dir, flush_strategy.clone()).await;
|
||||
|
||||
FlushTester {
|
||||
base: Some(FileTesterBase::with_region(region)),
|
||||
store_dir: store_dir.to_string(),
|
||||
flush_strategy: flush_strategy.clone(),
|
||||
regions,
|
||||
}
|
||||
}
|
||||
|
||||
async fn reopen(&mut self) {
|
||||
self.regions.clear();
|
||||
// Close the old region.
|
||||
if let Some(base) = self.base.take() {
|
||||
base.close().await;
|
||||
}
|
||||
// Reopen the region.
|
||||
let mut store_config = config_util::new_store_config(
|
||||
REGION_NAME,
|
||||
&self.store_dir,
|
||||
EngineConfig {
|
||||
max_files_in_l0: usize::MAX,
|
||||
..Default::default()
|
||||
},
|
||||
)
|
||||
.await;
|
||||
store_config.flush_strategy = self.flush_strategy.clone();
|
||||
let opts = OpenOptions::default();
|
||||
let region = RegionImpl::open(REGION_NAME.to_string(), store_config, &opts)
|
||||
.await
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
self.base = Some(FileTesterBase::with_region(region));
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn base(&self) -> &FileTesterBase {
|
||||
self.base.as_ref().unwrap()
|
||||
}
|
||||
|
||||
async fn put(&self, data: &[(i64, Option<i64>)]) {
|
||||
let data = data
|
||||
.iter()
|
||||
.map(|(ts, v0)| (*ts, v0.map(|v| v.to_string())))
|
||||
.collect::<Vec<_>>();
|
||||
let _ = self.base().put(&data).await;
|
||||
}
|
||||
|
||||
async fn full_scan(&self) -> Vec<(i64, Option<String>)> {
|
||||
self.base().full_scan().await
|
||||
}
|
||||
|
||||
async fn scan(&self, req: ScanRequest) -> Vec<(i64, Option<String>)> {
|
||||
self.base().scan(req).await
|
||||
}
|
||||
|
||||
async fn flush(&self, wait: Option<bool>) {
|
||||
let ctx = wait
|
||||
.map(|wait| FlushContext {
|
||||
wait,
|
||||
reason: FlushReason::Manually,
|
||||
..Default::default()
|
||||
})
|
||||
.unwrap_or_default();
|
||||
self.base().region.flush(&ctx).await.unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for FlushTester {
|
||||
fn drop(&mut self) {
|
||||
self.regions.clear();
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_flush_and_stall() {
|
||||
common_telemetry::init_default_ut_logging();
|
||||
|
||||
let dir = create_temp_dir("flush-stall");
|
||||
let store_dir = dir.path().to_str().unwrap();
|
||||
|
||||
let flush_switch = Arc::new(FlushSwitch::default());
|
||||
let tester = FlushTester::new(store_dir, flush_switch.clone()).await;
|
||||
|
||||
let data = [(1000, Some(100))];
|
||||
// Put one element so we have content to flush.
|
||||
tester.put(&data).await;
|
||||
|
||||
// Now set should flush to true to trigger flush.
|
||||
flush_switch.set_should_flush(true);
|
||||
// Put element to trigger flush.
|
||||
tester.put(&data).await;
|
||||
|
||||
// Now put another data to trigger write stall and wait until last flush done to
|
||||
// ensure at least one parquet file is generated.
|
||||
tester.put(&data).await;
|
||||
|
||||
// Check parquet files.
|
||||
let sst_dir = format!("{}/{}", store_dir, engine::region_sst_dir("", REGION_NAME));
|
||||
assert!(has_parquet_file(&sst_dir));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_manual_flush() {
|
||||
common_telemetry::init_default_ut_logging();
|
||||
let dir = create_temp_dir("manual_flush");
|
||||
|
||||
let store_dir = dir.path().to_str().unwrap();
|
||||
|
||||
let flush_switch = Arc::new(FlushSwitch::default());
|
||||
let tester = FlushTester::new(store_dir, flush_switch.clone()).await;
|
||||
|
||||
let data = [(1000, Some(100))];
|
||||
// Put one element so we have content to flush.
|
||||
tester.put(&data).await;
|
||||
|
||||
// No parquet file should be flushed.
|
||||
let sst_dir = format!("{}/{}", store_dir, engine::region_sst_dir("", REGION_NAME));
|
||||
assert!(!has_parquet_file(&sst_dir));
|
||||
|
||||
tester.flush(None).await;
|
||||
|
||||
assert!(has_parquet_file(&sst_dir));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_flush_and_reopen() {
|
||||
common_telemetry::init_default_ut_logging();
|
||||
let dir = create_temp_dir("manual_flush");
|
||||
let store_dir = dir.path().to_str().unwrap();
|
||||
let flush_switch = Arc::new(FlushSwitch::default());
|
||||
let mut tester = FlushTester::new(store_dir, flush_switch.clone()).await;
|
||||
|
||||
tester.put(&[(1000, Some(100))]).await;
|
||||
tester.flush(Some(true)).await;
|
||||
tester.reopen().await;
|
||||
let i = tester
|
||||
.base()
|
||||
.region
|
||||
.inner
|
||||
.shared
|
||||
.version_control
|
||||
.committed_sequence();
|
||||
|
||||
// we wrote a request and flushed the region (involving writing a manifest), thus
|
||||
// committed_sequence should be 2.
|
||||
assert_eq!(2, i);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_flush_empty() {
|
||||
let dir = create_temp_dir("flush-empty");
|
||||
let store_dir = dir.path().to_str().unwrap();
|
||||
|
||||
let flush_switch = Arc::new(FlushSwitch::default());
|
||||
let tester = FlushTester::new(store_dir, flush_switch.clone()).await;
|
||||
|
||||
// Flush empty table.
|
||||
tester.flush(None).await;
|
||||
let data = [(1000, Some(100))];
|
||||
// Put element to trigger flush.
|
||||
tester.put(&data).await;
|
||||
|
||||
// Put again.
|
||||
let data = [(2000, Some(200))];
|
||||
tester.put(&data).await;
|
||||
|
||||
// No parquet file should be flushed.
|
||||
let sst_dir = format!("{}/{}", store_dir, engine::region_sst_dir("", REGION_NAME));
|
||||
assert!(!has_parquet_file(&sst_dir));
|
||||
|
||||
let expect = vec![(1000, Some(100.to_string())), (2000, Some(200.to_string()))];
|
||||
|
||||
let output = tester.full_scan().await;
|
||||
assert_eq!(expect, output);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_read_after_flush_across_window() {
|
||||
common_telemetry::init_default_ut_logging();
|
||||
|
||||
let dir = create_temp_dir("read-flush");
|
||||
let store_dir = dir.path().to_str().unwrap();
|
||||
|
||||
let flush_switch = Arc::new(FlushSwitch::default());
|
||||
let tester = FlushTester::new(store_dir, flush_switch.clone()).await;
|
||||
|
||||
// Put elements so we have content to flush.
|
||||
tester.put(&[(1000, Some(100))]).await;
|
||||
tester.put(&[(2000, Some(200))]).await;
|
||||
|
||||
// Flush.
|
||||
tester.flush(None).await;
|
||||
|
||||
// Put element again.
|
||||
tester.put(&[(3000, Some(300))]).await;
|
||||
|
||||
let expect = vec![
|
||||
(1000, Some(100.to_string())),
|
||||
(2000, Some(200.to_string())),
|
||||
(3000, Some(300.to_string())),
|
||||
];
|
||||
|
||||
let output = tester.full_scan().await;
|
||||
assert_eq!(expect, output);
|
||||
|
||||
// Reopen
|
||||
let mut tester = tester;
|
||||
tester.reopen().await;
|
||||
|
||||
// Scan after reopen.
|
||||
let output = tester.full_scan().await;
|
||||
assert_eq!(expect, output);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_read_after_flush_same_window() {
|
||||
common_telemetry::init_default_ut_logging();
|
||||
|
||||
let dir = create_temp_dir("read-flush");
|
||||
let store_dir = dir.path().to_str().unwrap();
|
||||
|
||||
let flush_switch = Arc::new(FlushSwitch::default());
|
||||
let tester = FlushTester::new(store_dir, flush_switch.clone()).await;
|
||||
|
||||
// Put elements so we have content to flush.
|
||||
tester.put(&[(1000, Some(100))]).await;
|
||||
tester.put(&[(2000, Some(200))]).await;
|
||||
|
||||
// Flush.
|
||||
tester.flush(None).await;
|
||||
|
||||
// Put element again.
|
||||
tester.put(&[(1003, Some(300))]).await;
|
||||
|
||||
let expect = vec![
|
||||
(1000, Some(100.to_string())),
|
||||
(1003, Some(300.to_string())),
|
||||
(2000, Some(200.to_string())),
|
||||
];
|
||||
|
||||
let output = tester.full_scan().await;
|
||||
assert_eq!(expect, output);
|
||||
|
||||
// Reopen
|
||||
let mut tester = tester;
|
||||
tester.reopen().await;
|
||||
|
||||
// Scan after reopen.
|
||||
let output = tester.full_scan().await;
|
||||
assert_eq!(expect, output);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_merge_read_after_flush() {
|
||||
let dir = create_temp_dir("merge-read-flush");
|
||||
let store_dir = dir.path().to_str().unwrap();
|
||||
|
||||
let flush_switch = Arc::new(FlushSwitch::default());
|
||||
let tester = FlushTester::new(store_dir, flush_switch.clone()).await;
|
||||
|
||||
// Put elements so we have content to flush (In SST1).
|
||||
tester.put(&[(3000, Some(300))]).await;
|
||||
tester.put(&[(2000, Some(200))]).await;
|
||||
|
||||
// Flush content to SST1.
|
||||
tester.flush(None).await;
|
||||
|
||||
// Put element (In SST2).
|
||||
tester.put(&[(2000, Some(201))]).await;
|
||||
|
||||
// In SST2.
|
||||
tester.put(&[(2000, Some(202))]).await;
|
||||
tester.put(&[(1000, Some(100))]).await;
|
||||
|
||||
// Trigger flush.
|
||||
tester.flush(None).await;
|
||||
|
||||
// Overwrite row (In memtable).
|
||||
tester.put(&[(2000, Some(203))]).await;
|
||||
|
||||
let expect = vec![
|
||||
(1000, Some(100.to_string())),
|
||||
(2000, Some(203.to_string())),
|
||||
(3000, Some(300.to_string())),
|
||||
];
|
||||
|
||||
let output = tester.full_scan().await;
|
||||
assert_eq!(expect, output);
|
||||
|
||||
// Reopen
|
||||
let mut tester = tester;
|
||||
tester.reopen().await;
|
||||
|
||||
// Scan after reopen.
|
||||
let output = tester.full_scan().await;
|
||||
assert_eq!(expect, output);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_schedule_engine_flush() {
|
||||
common_telemetry::init_default_ut_logging();
|
||||
|
||||
let dir = create_temp_dir("engine-flush");
|
||||
let store_dir = dir.path().to_str().unwrap();
|
||||
|
||||
let flush_switch = Arc::new(FlushSwitch::default());
|
||||
let tester = FlushTester::new(store_dir, flush_switch.clone()).await;
|
||||
assert_eq!(0, tester.base().region.last_flush_millis());
|
||||
|
||||
// Insert the region to the region map.
|
||||
let _ = tester.regions.get_or_occupy_slot(
|
||||
REGION_NAME,
|
||||
engine::RegionSlot::Ready(tester.base().region.clone()),
|
||||
);
|
||||
|
||||
// Put elements so we have content to flush.
|
||||
tester.put(&[(1000, Some(100))]).await;
|
||||
tester.put(&[(2000, Some(200))]).await;
|
||||
|
||||
flush_switch.set_flush_type(FlushType::Engine);
|
||||
|
||||
// Put element and trigger an engine level flush.
|
||||
tester.put(&[(3000, Some(300))]).await;
|
||||
|
||||
// Wait for flush.
|
||||
let mut count = 0;
|
||||
while tester.base().region.last_flush_millis() == 0 && count < 50 {
|
||||
tokio::time::sleep(Duration::from_millis(100)).await;
|
||||
count += 1;
|
||||
}
|
||||
|
||||
// Check parquet files.
|
||||
let sst_dir = format!("{}/{}", store_dir, engine::region_sst_dir("", REGION_NAME));
|
||||
assert!(has_parquet_file(&sst_dir));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_flush_and_query_empty() {
|
||||
common_telemetry::init_default_ut_logging();
|
||||
let dir = create_temp_dir("flush_and_query_empty_range");
|
||||
let store_dir = dir.path().to_str().unwrap();
|
||||
let flush_switch = Arc::new(FlushSwitch::default());
|
||||
let tester = FlushTester::new(store_dir, flush_switch.clone()).await;
|
||||
|
||||
tester
|
||||
.put(
|
||||
&(20000..30000)
|
||||
.map(|v| (v as i64, Some(v as i64)))
|
||||
.collect::<Vec<_>>(),
|
||||
)
|
||||
.await;
|
||||
tester.flush(Some(true)).await;
|
||||
|
||||
tester
|
||||
.put(
|
||||
&(20100..20200)
|
||||
.map(|v| (v as i64, Some(v as i64)))
|
||||
.collect::<Vec<_>>(),
|
||||
)
|
||||
.await;
|
||||
tester.flush(Some(true)).await;
|
||||
|
||||
use datafusion_expr::Expr as DfExpr;
|
||||
let req = ScanRequest {
|
||||
sequence: None,
|
||||
projection: None,
|
||||
filters: vec![Expr::from(datafusion_expr::binary_expr(
|
||||
DfExpr::Column(Column::from("timestamp")),
|
||||
datafusion_expr::Operator::GtEq,
|
||||
datafusion_expr::lit(timestamp_to_scalar_value(
|
||||
TimeUnit::Millisecond,
|
||||
Some(20000),
|
||||
)),
|
||||
))],
|
||||
output_ordering: Some(vec![OrderOption {
|
||||
name: "timestamp".to_string(),
|
||||
options: SortOptions {
|
||||
descending: true,
|
||||
nulls_first: true,
|
||||
},
|
||||
}]),
|
||||
limit: Some(1),
|
||||
};
|
||||
let _ = tester.scan(req).await;
|
||||
}
|
||||
@@ -1,206 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
|
||||
use common_test_util::temp_dir::create_temp_dir;
|
||||
use datatypes::data_type::ConcreteDataType;
|
||||
use datatypes::prelude::ScalarVector;
|
||||
use datatypes::type_id::LogicalTypeId;
|
||||
use datatypes::vectors::{Int64Vector, TimestampMillisecondVector, VectorRef};
|
||||
use log_store::raft_engine::log_store::RaftEngineLogStore;
|
||||
use store_api::logstore::LogStore;
|
||||
use store_api::storage::{
|
||||
Chunk, ChunkReader, ReadContext, Region, ScanRequest, Snapshot, WriteContext, WriteRequest,
|
||||
};
|
||||
|
||||
use crate::config::EngineConfig;
|
||||
use crate::region::{RegionImpl, RegionMetadata};
|
||||
use crate::test_util::{self, config_util, descriptor_util, write_batch_util};
|
||||
use crate::write_batch::WriteBatch;
|
||||
|
||||
/// Create metadata with schema (k0, timestamp, v0, v1)
|
||||
fn new_metadata(region_name: &str) -> RegionMetadata {
|
||||
let desc = descriptor_util::desc_with_field_columns(region_name, 2);
|
||||
desc.try_into().unwrap()
|
||||
}
|
||||
|
||||
fn new_write_batch_for_test() -> WriteBatch {
|
||||
write_batch_util::new_write_batch(
|
||||
&[
|
||||
("k0", LogicalTypeId::Int64, false),
|
||||
(
|
||||
test_util::TIMESTAMP_NAME,
|
||||
LogicalTypeId::TimestampMillisecond,
|
||||
false,
|
||||
),
|
||||
("v0", LogicalTypeId::Int64, true),
|
||||
("v1", LogicalTypeId::Int64, true),
|
||||
],
|
||||
Some(1),
|
||||
2,
|
||||
)
|
||||
}
|
||||
|
||||
/// Build put data
|
||||
///
|
||||
/// ```text
|
||||
/// k0: [key_start, key_start + 1, ... key_start + len - 1]
|
||||
/// timestamp: [ts_start, ts_start + 1, ... ts_start + len - 1]
|
||||
/// v0: [initial_value, ...., initial_value]
|
||||
/// v1: [initial_value, ..., initial_value + len - 1]
|
||||
/// ```
|
||||
fn new_put_data(
|
||||
len: usize,
|
||||
key_start: i64,
|
||||
ts_start: i64,
|
||||
initial_value: i64,
|
||||
) -> HashMap<String, VectorRef> {
|
||||
let k0 = Arc::new(Int64Vector::from_values(
|
||||
(0..len).map(|v| key_start + v as i64),
|
||||
)) as VectorRef;
|
||||
let ts = Arc::new(TimestampMillisecondVector::from_values(
|
||||
(0..len).map(|v| ts_start + v as i64),
|
||||
)) as VectorRef;
|
||||
let v0 = Arc::new(Int64Vector::from_values(
|
||||
std::iter::repeat(initial_value).take(len),
|
||||
)) as VectorRef;
|
||||
let v1 = Arc::new(Int64Vector::from_values(
|
||||
(0..len).map(|v| initial_value + v as i64),
|
||||
)) as VectorRef;
|
||||
|
||||
HashMap::from([
|
||||
("k0".to_string(), k0),
|
||||
(test_util::TIMESTAMP_NAME.to_string(), ts),
|
||||
("v0".to_string(), v0),
|
||||
("v1".to_string(), v1),
|
||||
])
|
||||
}
|
||||
|
||||
fn append_chunk_to(chunk: &Chunk, dst: &mut Vec<Vec<i64>>) {
|
||||
if chunk.columns.is_empty() {
|
||||
return;
|
||||
}
|
||||
let num_rows = chunk.columns[0].len();
|
||||
dst.resize(num_rows, Vec::new());
|
||||
for (i, row) in dst.iter_mut().enumerate() {
|
||||
for col in &chunk.columns {
|
||||
match col.data_type() {
|
||||
ConcreteDataType::Int64(_) => {
|
||||
let val = col
|
||||
.as_any()
|
||||
.downcast_ref::<Int64Vector>()
|
||||
.unwrap()
|
||||
.get_data(i)
|
||||
.unwrap();
|
||||
row.push(val);
|
||||
}
|
||||
ConcreteDataType::Timestamp(_) => {
|
||||
let val = col
|
||||
.as_any()
|
||||
.downcast_ref::<TimestampMillisecondVector>()
|
||||
.unwrap()
|
||||
.get_data(i)
|
||||
.unwrap();
|
||||
row.push(val.into());
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct ProjectionTester<S: LogStore> {
|
||||
region: RegionImpl<S>,
|
||||
write_ctx: WriteContext,
|
||||
read_ctx: ReadContext,
|
||||
}
|
||||
|
||||
impl<S: LogStore> ProjectionTester<S> {
|
||||
fn with_region(region: RegionImpl<S>) -> ProjectionTester<S> {
|
||||
ProjectionTester {
|
||||
region,
|
||||
write_ctx: WriteContext::default(),
|
||||
read_ctx: ReadContext::default(),
|
||||
}
|
||||
}
|
||||
|
||||
async fn put(&self, len: usize, key_start: i64, ts_start: i64, initial_value: i64) {
|
||||
let mut batch = new_write_batch_for_test();
|
||||
let put_data = new_put_data(len, key_start, ts_start, initial_value);
|
||||
batch.put(put_data).unwrap();
|
||||
|
||||
let _ = self.region.write(&self.write_ctx, batch).await.unwrap();
|
||||
}
|
||||
|
||||
async fn scan(&self, projection: Option<Vec<usize>>) -> Vec<Vec<i64>> {
|
||||
let snapshot = self.region.snapshot(&self.read_ctx).unwrap();
|
||||
|
||||
let request = ScanRequest {
|
||||
projection,
|
||||
..Default::default()
|
||||
};
|
||||
let resp = snapshot.scan(&self.read_ctx, request).await.unwrap();
|
||||
let mut reader = resp.reader;
|
||||
|
||||
let mut dst = Vec::new();
|
||||
while let Some(chunk) = reader.next_chunk().await.unwrap() {
|
||||
let chunk = reader.project_chunk(chunk);
|
||||
append_chunk_to(&chunk, &mut dst);
|
||||
}
|
||||
|
||||
dst
|
||||
}
|
||||
}
|
||||
|
||||
/// Name of the region created by `new_tester` for the projection tests.
const REGION_NAME: &str = "region-projection-0";
|
||||
|
||||
async fn new_tester(store_dir: &str) -> ProjectionTester<RaftEngineLogStore> {
|
||||
let metadata = new_metadata(REGION_NAME);
|
||||
|
||||
let store_config =
|
||||
config_util::new_store_config(REGION_NAME, store_dir, EngineConfig::default()).await;
|
||||
let region = RegionImpl::create(metadata, store_config).await.unwrap();
|
||||
|
||||
ProjectionTester::with_region(region)
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_projection_ordered() {
|
||||
let dir = create_temp_dir("projection-ordered");
|
||||
let store_dir = dir.path().to_str().unwrap();
|
||||
|
||||
let tester = new_tester(store_dir).await;
|
||||
tester.put(4, 1, 10, 100).await;
|
||||
|
||||
// timestamp, v1
|
||||
let output = tester.scan(Some(vec![1, 3])).await;
|
||||
let expect = vec![vec![10, 100], vec![11, 101], vec![12, 102], vec![13, 103]];
|
||||
assert_eq!(expect, output);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_projection_unordered() {
|
||||
let dir = create_temp_dir("projection-unordered");
|
||||
let store_dir = dir.path().to_str().unwrap();
|
||||
|
||||
let tester = new_tester(store_dir).await;
|
||||
tester.put(4, 1, 10, 100).await;
|
||||
|
||||
// v1, k0
|
||||
let output = tester.scan(Some(vec![3, 0])).await;
|
||||
let expect = vec![vec![100, 1], vec![101, 2], vec![102, 3], vec![103, 4]];
|
||||
assert_eq!(expect, output);
|
||||
}
|
||||
@@ -1,242 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//! Region truncate tests.
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use common_test_util::temp_dir::create_temp_dir;
|
||||
use log_store::raft_engine::log_store::RaftEngineLogStore;
|
||||
use store_api::manifest::{Manifest, MetaAction};
|
||||
use store_api::storage::{FlushContext, OpenOptions, Region};
|
||||
|
||||
use crate::config::EngineConfig;
|
||||
use crate::engine;
|
||||
use crate::flush::FlushStrategyRef;
|
||||
use crate::manifest::action::{RegionMetaAction, RegionMetaActionList, RegionTruncate};
|
||||
use crate::region::tests::{self, FileTesterBase};
|
||||
use crate::region::RegionImpl;
|
||||
use crate::test_util::config_util;
|
||||
use crate::test_util::flush_switch::{has_parquet_file, FlushSwitch};
|
||||
|
||||
/// Name of the region that the truncate tests create and reopen.
const REGION_NAME: &str = "region-truncate-0";
|
||||
|
||||
/// Create a new region for truncate tests.
|
||||
async fn create_region_for_truncate(
|
||||
store_dir: &str,
|
||||
flush_strategy: FlushStrategyRef,
|
||||
) -> RegionImpl<RaftEngineLogStore> {
|
||||
let metadata = tests::new_metadata(REGION_NAME);
|
||||
|
||||
let mut store_config =
|
||||
config_util::new_store_config(REGION_NAME, store_dir, EngineConfig::default()).await;
|
||||
store_config.flush_strategy = flush_strategy;
|
||||
|
||||
RegionImpl::create(metadata, store_config).await.unwrap()
|
||||
}
|
||||
|
||||
/// Tester for truncate tests.
|
||||
struct TruncateTester {
|
||||
store_dir: String,
|
||||
base: Option<FileTesterBase>,
|
||||
}
|
||||
|
||||
impl TruncateTester {
|
||||
async fn new(store_dir: &str, flush_strategy: FlushStrategyRef) -> TruncateTester {
|
||||
let region = create_region_for_truncate(store_dir, flush_strategy).await;
|
||||
TruncateTester {
|
||||
store_dir: store_dir.to_string(),
|
||||
base: Some(FileTesterBase::with_region(region)),
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn base(&self) -> &FileTesterBase {
|
||||
self.base.as_ref().unwrap()
|
||||
}
|
||||
|
||||
async fn flush(&self) {
|
||||
let ctx = FlushContext::default();
|
||||
self.base().region.flush(&ctx).await.unwrap();
|
||||
}
|
||||
|
||||
async fn truncate(&self) {
|
||||
self.base().region.truncate().await.unwrap();
|
||||
}
|
||||
|
||||
async fn reopen(&mut self) {
|
||||
// Close the old region.
|
||||
if let Some(base) = self.base.as_ref() {
|
||||
base.close().await;
|
||||
}
|
||||
self.base = None;
|
||||
// Reopen the region.
|
||||
let store_config = config_util::new_store_config(
|
||||
REGION_NAME,
|
||||
&self.store_dir,
|
||||
EngineConfig {
|
||||
max_files_in_l0: usize::MAX,
|
||||
..Default::default()
|
||||
},
|
||||
)
|
||||
.await;
|
||||
|
||||
let opts = OpenOptions::default();
|
||||
let region = RegionImpl::open(REGION_NAME.to_string(), store_config, &opts)
|
||||
.await
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
|
||||
self.base = Some(FileTesterBase::with_region(region));
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_truncate_basic() {
|
||||
let dir = create_temp_dir("truncate-basic");
|
||||
common_telemetry::init_default_ut_logging();
|
||||
let store_dir = dir.path().to_str().unwrap();
|
||||
|
||||
let flush_switch = Arc::new(FlushSwitch::default());
|
||||
let tester = TruncateTester::new(store_dir, flush_switch.clone()).await;
|
||||
|
||||
let data = [
|
||||
(1000, Some("1000".to_string())),
|
||||
(1001, Some("1001".to_string())),
|
||||
(1002, Some("1002".to_string())),
|
||||
(1003, Some("1003".to_string())),
|
||||
];
|
||||
|
||||
// Data in Memtable
|
||||
tester.base().put(&data).await;
|
||||
let res = tester.base().full_scan().await;
|
||||
assert_eq!(4, res.len());
|
||||
|
||||
// Truncate region.
|
||||
tester.truncate().await;
|
||||
|
||||
let res = tester.base().full_scan().await;
|
||||
assert_eq!(0, res.len());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_put_data_after_truncate() {
|
||||
let dir = create_temp_dir("put_data_after_truncate");
|
||||
common_telemetry::init_default_ut_logging();
|
||||
let store_dir = dir.path().to_str().unwrap();
|
||||
|
||||
let sst_dir = format!("{}/{}", store_dir, engine::region_sst_dir("", REGION_NAME));
|
||||
let flush_switch = Arc::new(FlushSwitch::default());
|
||||
let tester = TruncateTester::new(store_dir, flush_switch.clone()).await;
|
||||
|
||||
let data = [
|
||||
(1000, Some("1000".to_string())),
|
||||
(1001, Some("1001".to_string())),
|
||||
(1002, None),
|
||||
(1003, Some("1003".to_string())),
|
||||
];
|
||||
|
||||
tester.base().put(&data).await;
|
||||
|
||||
// Manually trigger flush.
|
||||
tester.flush().await;
|
||||
assert!(has_parquet_file(&sst_dir));
|
||||
|
||||
let data = [
|
||||
(1002, Some("1002".to_string())),
|
||||
(1004, Some("1004".to_string())),
|
||||
(1005, Some("1005".to_string())),
|
||||
];
|
||||
tester.base().put(&data).await;
|
||||
|
||||
// Truncate region.
|
||||
tester.truncate().await;
|
||||
let res = tester.base().full_scan().await;
|
||||
assert_eq!(0, res.len());
|
||||
|
||||
let new_data = [
|
||||
(1010, Some("0".to_string())),
|
||||
(1011, Some("1".to_string())),
|
||||
(1012, Some("2".to_string())),
|
||||
(1013, Some("3".to_string())),
|
||||
];
|
||||
tester.base().put(&new_data).await;
|
||||
|
||||
let res = tester.base().full_scan().await;
|
||||
assert_eq!(new_data, res.as_slice());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_truncate_reopen() {
|
||||
let dir = create_temp_dir("put_data_after_truncate");
|
||||
common_telemetry::init_default_ut_logging();
|
||||
let store_dir = dir.path().to_str().unwrap();
|
||||
|
||||
let flush_switch = Arc::new(FlushSwitch::default());
|
||||
let mut tester = TruncateTester::new(store_dir, flush_switch.clone()).await;
|
||||
|
||||
let data = [
|
||||
(1000, Some("1000".to_string())),
|
||||
(1001, Some("1001".to_string())),
|
||||
(1002, None),
|
||||
(1003, Some("1003".to_string())),
|
||||
];
|
||||
|
||||
tester.base().put(&data).await;
|
||||
|
||||
// Manually trigger flush.
|
||||
tester.flush().await;
|
||||
|
||||
let data = [
|
||||
(1002, Some("1002".to_string())),
|
||||
(1004, Some("1004".to_string())),
|
||||
(1005, Some("1005".to_string())),
|
||||
];
|
||||
tester.base().put(&data).await;
|
||||
|
||||
let manifest = &tester.base().region.inner.manifest;
|
||||
let manifest_version = tester
|
||||
.base()
|
||||
.region
|
||||
.version_control()
|
||||
.current_manifest_version();
|
||||
|
||||
let committed_sequence = tester.base().committed_sequence();
|
||||
let mut action_list =
|
||||
RegionMetaActionList::with_action(RegionMetaAction::Truncate(RegionTruncate {
|
||||
region_id: 0.into(),
|
||||
committed_sequence,
|
||||
}));
|
||||
|
||||
// Persist the meta action.
|
||||
let prev_version = manifest_version;
|
||||
action_list.set_prev_version(prev_version);
|
||||
manifest.update(action_list).await.unwrap();
|
||||
|
||||
// Reopen and put data.
|
||||
tester.reopen().await;
|
||||
let res = tester.base().full_scan().await;
|
||||
assert_eq!(0, res.len());
|
||||
|
||||
let new_data = [
|
||||
(0, Some("0".to_string())),
|
||||
(1, Some("1".to_string())),
|
||||
(2, Some("2".to_string())),
|
||||
(3, Some("3".to_string())),
|
||||
];
|
||||
|
||||
tester.base().put(&new_data).await;
|
||||
let res = tester.base().full_scan().await;
|
||||
assert_eq!(new_data, res.as_slice());
|
||||
}
|
||||
@@ -1,984 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use common_base::readable_size::ReadableSize;
|
||||
use common_telemetry::logging;
|
||||
use futures::TryStreamExt;
|
||||
use snafu::{ensure, ResultExt};
|
||||
use store_api::logstore::LogStore;
|
||||
use store_api::manifest::{Manifest, ManifestLogStorage, ManifestVersion, MetaAction};
|
||||
use store_api::storage::{
|
||||
AlterRequest, FlushContext, FlushReason, SequenceNumber, WriteContext, WriteResponse,
|
||||
};
|
||||
use tokio::sync::{oneshot, Mutex};
|
||||
|
||||
use crate::compaction::{CompactionPickerRef, CompactionRequestImpl, CompactionSchedulerRef};
|
||||
use crate::config::EngineConfig;
|
||||
use crate::error::{self, Result};
|
||||
use crate::flush::{
|
||||
FlushHandle, FlushRegionRequest, FlushSchedulerRef, FlushStrategyRef, FlushType, RegionStatus,
|
||||
};
|
||||
use crate::manifest::action::{
|
||||
RawRegionMetadata, RegionChange, RegionEdit, RegionMetaAction, RegionMetaActionList,
|
||||
RegionRemove, RegionTruncate,
|
||||
};
|
||||
use crate::memtable::{Inserter, MemtableBuilderRef, MemtableId, MemtableRef, MemtableVersion};
|
||||
use crate::metadata::RegionMetadataRef;
|
||||
use crate::metrics::{FLUSH_REQUESTS_TOTAL, PREPROCESS_ELAPSED};
|
||||
use crate::proto::wal::WalHeader;
|
||||
use crate::region::{
|
||||
CompactContext, RecoveredMetadata, RecoveredMetadataMap, RegionManifest, SharedDataRef,
|
||||
};
|
||||
use crate::schema::compat::CompatWrite;
|
||||
use crate::sst::{AccessLayerRef, LevelMetas};
|
||||
use crate::version::{VersionControl, VersionControlRef, VersionEdit};
|
||||
use crate::wal::Wal;
|
||||
use crate::write_batch::WriteBatch;
|
||||
|
||||
/// Reference-counted (`Arc`) handle to a [`RegionWriter`].
pub type RegionWriterRef<S> = Arc<RegionWriter<S>>;
|
||||
|
||||
// TODO(yingwen): Add benches for write and support group commit to improve write throughput.
|
||||
|
||||
/// Region writer manages all write operations to the region.
|
||||
#[derive(Debug)]
|
||||
pub struct RegionWriter<S: LogStore> {
|
||||
// To avoid deadlock, we need to ensure the lock order is: inner -> version_mutex.
|
||||
/// Inner writer guarded by write lock, the write lock is used to ensure
|
||||
/// all write operations are serialized.
|
||||
inner: Mutex<WriterInner>,
|
||||
/// Version lock, protects read-write-update to region `Version`.
|
||||
///
|
||||
/// Increasing committed sequence should be guarded by this lock.
|
||||
version_mutex: Mutex<()>,
|
||||
|
|
||||
/// Compaction scheduler handed to the writer at construction.
compaction_scheduler: CompactionSchedulerRef<S>,
|
||||
/// Compaction picker handed to the writer at construction.
compaction_picker: CompactionPickerRef<S>,
|
||||
}
|
||||
|
||||
impl<S> RegionWriter<S>
|
||||
where
|
||||
S: LogStore,
|
||||
{
|
||||
pub fn new(
|
||||
memtable_builder: MemtableBuilderRef,
|
||||
config: Arc<EngineConfig>,
|
||||
ttl: Option<Duration>,
|
||||
write_buffer_size: usize,
|
||||
compaction_scheduler: CompactionSchedulerRef<S>,
|
||||
compaction_picker: CompactionPickerRef<S>,
|
||||
) -> RegionWriter<S> {
|
||||
RegionWriter {
|
||||
inner: Mutex::new(WriterInner::new(
|
||||
memtable_builder,
|
||||
config,
|
||||
ttl,
|
||||
write_buffer_size,
|
||||
)),
|
||||
version_mutex: Mutex::new(()),
|
||||
compaction_scheduler,
|
||||
compaction_picker,
|
||||
}
|
||||
}
|
||||
|
||||
/// Write to region in the write lock.
|
||||
pub async fn write(
|
||||
&self,
|
||||
ctx: &WriteContext,
|
||||
request: WriteBatch,
|
||||
writer_ctx: WriterContext<'_, S>,
|
||||
) -> Result<WriteResponse> {
|
||||
let mut inner = self.inner.lock().await;
|
||||
|
||||
ensure!(!inner.is_closed(), error::ClosedRegionSnafu);
|
||||
|
||||
inner
|
||||
.write(&self.version_mutex, ctx, request, writer_ctx)
|
||||
.await
|
||||
}
|
||||
|
||||
/// Replay data to memtables.
|
||||
pub async fn replay(
|
||||
&self,
|
||||
recovered_metadata: RecoveredMetadataMap,
|
||||
writer_ctx: WriterContext<'_, S>,
|
||||
) -> Result<()> {
|
||||
let mut inner = self.inner.lock().await;
|
||||
inner
|
||||
.replay(&self.version_mutex, recovered_metadata, writer_ctx)
|
||||
.await
|
||||
}
|
||||
|
||||
/// Write and apply the region edit.
|
||||
pub(crate) async fn write_edit_and_apply(
|
||||
&self,
|
||||
wal: &Wal<S>,
|
||||
shared: &SharedDataRef,
|
||||
manifest: &RegionManifest,
|
||||
edit: RegionEdit,
|
||||
max_memtable_id: Option<MemtableId>,
|
||||
) -> Result<()> {
|
||||
let _lock = self.version_mutex.lock().await;
|
||||
// HACK: We won't acquire the write lock here because write stall would hold
|
||||
// write lock thus we have no chance to get the lock and apply the version edit.
|
||||
// So we add a version lock to ensure modification to `VersionControl` is
|
||||
// serialized.
|
||||
let version_control = &shared.version_control;
|
||||
let prev_version = version_control.current_manifest_version();
|
||||
|
||||
logging::debug!(
|
||||
"Write region edit: {:?} to manifest, prev_version: {}.",
|
||||
edit,
|
||||
prev_version,
|
||||
);
|
||||
|
||||
let files_to_add = edit.files_to_add.clone();
|
||||
let files_to_remove = edit.files_to_remove.clone();
|
||||
let flushed_sequence = edit.flushed_sequence;
|
||||
let compaction_time_window = edit.compaction_time_window;
|
||||
// Persist the meta action.
|
||||
let mut action_list = RegionMetaActionList::with_action(RegionMetaAction::Edit(edit));
|
||||
action_list.set_prev_version(prev_version);
|
||||
let manifest_version = manifest.update(action_list).await?;
|
||||
|
||||
// Notify checkpointer the flushed manifest version after flushing memtable
|
||||
if flushed_sequence.is_some() {
|
||||
manifest.set_flushed_manifest_version(manifest_version);
|
||||
}
|
||||
|
||||
let version_edit = VersionEdit {
|
||||
files_to_add,
|
||||
files_to_remove,
|
||||
flushed_sequence,
|
||||
manifest_version,
|
||||
max_memtable_id,
|
||||
compaction_time_window,
|
||||
};
|
||||
|
||||
// We could tolerate failure during persisting manifest version to the WAL, since it won't
|
||||
// affect how we applying the edit to the version.
|
||||
version_control.apply_edit(version_edit);
|
||||
// TODO(yingwen): We should set the flush handle to `None`, but we can't acquire
|
||||
// write lock here.
|
||||
|
||||
// Persist the manifest version to notify subscriber of the wal that the manifest has been
|
||||
// updated. This should be done at the end of the method.
|
||||
self.persist_manifest_version(wal, version_control, manifest_version)
|
||||
.await
|
||||
}
|
||||
|
||||
/// Alter schema of the region.
|
||||
pub async fn alter(&self, alter_ctx: AlterContext<'_, S>, request: AlterRequest) -> Result<()> {
|
||||
// To alter the schema, we need to acquire the write lock first, so we could
|
||||
// avoid other writers write to the region and switch the memtable safely.
|
||||
// Another potential benefit is that the write lock also protect against concurrent
|
||||
// alter request to the region.
|
||||
let inner = self.inner.lock().await;
|
||||
|
||||
ensure!(!inner.is_closed(), error::ClosedRegionSnafu);
|
||||
|
||||
let version_control = alter_ctx.version_control();
|
||||
|
||||
let old_metadata = version_control.metadata();
|
||||
old_metadata
|
||||
.validate_alter(&request)
|
||||
.context(error::InvalidAlterRequestSnafu)?;
|
||||
|
||||
// The write lock protects us against other alter request, so we could build the new
|
||||
// metadata struct outside of the version mutex.
|
||||
let new_metadata = old_metadata
|
||||
.alter(&request)
|
||||
.context(error::AlterMetadataSnafu)?;
|
||||
|
||||
let raw = RawRegionMetadata::from(&new_metadata);
|
||||
|
||||
// Acquire the version lock before altering the metadata.
|
||||
let _lock = self.version_mutex.lock().await;
|
||||
|
||||
let committed_sequence = version_control.committed_sequence();
|
||||
let mut action_list =
|
||||
RegionMetaActionList::with_action(RegionMetaAction::Change(RegionChange {
|
||||
metadata: raw,
|
||||
committed_sequence,
|
||||
}));
|
||||
let new_metadata = Arc::new(new_metadata);
|
||||
|
||||
// Persist the meta action.
|
||||
let prev_version = version_control.current_manifest_version();
|
||||
action_list.set_prev_version(prev_version);
|
||||
|
||||
logging::debug!(
|
||||
"Try to alter schema of region {}, region_id: {}, action_list: {:?}",
|
||||
new_metadata.name(),
|
||||
new_metadata.id(),
|
||||
action_list
|
||||
);
|
||||
|
||||
let manifest_version = alter_ctx.manifest.update(action_list).await?;
|
||||
|
||||
// Now we could switch memtables and apply the new metadata to the version.
|
||||
let new_mutable = inner.memtable_builder.build(new_metadata.schema().clone());
|
||||
version_control.freeze_mutable_and_apply_metadata(
|
||||
new_metadata,
|
||||
manifest_version,
|
||||
new_mutable,
|
||||
);
|
||||
|
||||
self.persist_manifest_version(alter_ctx.wal, version_control, manifest_version)
|
||||
.await
|
||||
}
|
||||
|
||||
/// Allocate a sequence and persist the manifest version using that sequence to the wal.
|
||||
///
|
||||
/// This method should be protected by the `version_mutex`.
|
||||
async fn persist_manifest_version(
|
||||
&self,
|
||||
wal: &Wal<S>,
|
||||
version_control: &VersionControlRef,
|
||||
manifest_version: ManifestVersion,
|
||||
) -> Result<()> {
|
||||
// We always bump the committed sequence regardless whether persisting the manifest version
|
||||
// to wal is success, to avoid RegionMetaAction use same committed sequence in accident.
|
||||
let next_sequence = version_control.committed_sequence() + 1;
|
||||
version_control.set_committed_sequence(next_sequence);
|
||||
|
||||
let header = WalHeader::with_last_manifest_version(manifest_version);
|
||||
let _ = wal.write_to_wal(next_sequence, header, None).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Closes the writer.
///
/// Steps:
/// 1. Acquire the write lock.
/// 2. Set the in-memory closed flag so later writes are rejected.
/// 3. Wait for the pending flush task, if any.
///
/// Closing an already closed writer is a no-op that returns `Ok(())`.
pub async fn close(&self) -> Result<()> {
    let already_closed = {
        let mut inner = self.inner.lock().await;
        let was_closed = inner.is_closed();
        if !was_closed {
            inner.mark_closed();
        }
        was_closed
    };
    // The write lock is released here, right after the flag is set, so that following
    // write requests are rejected immediately instead of queueing behind the flush wait.
    if already_closed {
        return Ok(());
    }

    // TODO: cancel the compaction task
    self.wait_flush().await
}
|
||||
|
||||
pub async fn on_drop(&self, drop_ctx: DropContext<'_, S>) -> Result<()> {
|
||||
// 1. Acquires the write lock.
|
||||
// 2. Close writer reject any potential writing.
|
||||
// 3. Waits or cancels the flush job.
|
||||
// 4. Add `RegionMetaAction::Remove` to recover from manifest in case of failure.
|
||||
// The main task is to restore the cleaning of sst files. If there is a failure
|
||||
// in the previous stops, it can be restored through the `Procedure` framework.
|
||||
// 5. Mark all data obsolete in the WAL.
|
||||
// 6. Delete the namespace of the region from the WAL.
|
||||
// 7. Mark all SSTs deleted.
|
||||
// 8. Remove all manifests.
|
||||
let mut inner = self.inner.lock().await;
|
||||
inner.mark_closed();
|
||||
|
||||
if let Some(handle) = inner.flush_handle.take() {
|
||||
handle.wait().await?;
|
||||
}
|
||||
|
||||
let version_control = drop_ctx.version_control();
|
||||
|
||||
let _lock = self.version_mutex.lock().await;
|
||||
let committed_sequence = version_control.committed_sequence();
|
||||
let current_version = version_control.current();
|
||||
|
||||
let mut action_list =
|
||||
RegionMetaActionList::with_action(RegionMetaAction::Remove(RegionRemove {
|
||||
region_id: drop_ctx.shared.id,
|
||||
}));
|
||||
|
||||
// Persist the meta action.
|
||||
let prev_version = version_control.current_manifest_version();
|
||||
action_list.set_prev_version(prev_version);
|
||||
|
||||
logging::info!(
|
||||
"Try to remove region {}, action_list: {:?}",
|
||||
drop_ctx.shared.id(),
|
||||
action_list
|
||||
);
|
||||
|
||||
let remove_action_version = drop_ctx.manifest.update(action_list).await?;
|
||||
|
||||
// Mark all data obsolete and delete the namespace in the WAL
|
||||
drop_ctx.wal.obsolete(committed_sequence).await?;
|
||||
drop_ctx.wal.delete_namespace().await?;
|
||||
logging::info!(
|
||||
"Remove WAL entries in region: {}, committed sequence: {}",
|
||||
drop_ctx.shared.id(),
|
||||
committed_sequence
|
||||
);
|
||||
|
||||
// Mark all SSTs deleted
|
||||
let files = current_version.ssts().mark_all_files_deleted();
|
||||
logging::info!(
|
||||
"Try to remove all SSTs, region: {}, files: {:?}",
|
||||
drop_ctx.shared.id(),
|
||||
files
|
||||
);
|
||||
|
||||
drop_ctx
|
||||
.manifest
|
||||
.manifest_store()
|
||||
.delete_all(remove_action_version)
|
||||
.await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Flush task manually
|
||||
pub async fn flush(&self, writer_ctx: WriterContext<'_, S>, ctx: &FlushContext) -> Result<()> {
|
||||
let mut inner = self.inner.lock().await;
|
||||
|
||||
if !ctx.force {
|
||||
ensure!(!inner.is_closed(), error::ClosedRegionSnafu);
|
||||
}
|
||||
|
||||
inner.manual_flush(writer_ctx, ctx.reason).await?;
|
||||
|
||||
if ctx.wait {
|
||||
if let Some(handle) = inner.flush_handle.take() {
|
||||
handle.wait().await?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Compact manually.
|
||||
pub async fn compact(&self, request: WriterCompactRequest<S>) -> Result<()> {
|
||||
let mut inner = self.inner.lock().await;
|
||||
|
||||
ensure!(!inner.is_closed(), error::ClosedRegionSnafu);
|
||||
let sst_write_buffer_size = ReadableSize::mb(8); // deprecated usage
|
||||
|
||||
inner
|
||||
.manual_compact(
|
||||
request,
|
||||
self.compaction_picker.clone(),
|
||||
self.compaction_scheduler.clone(),
|
||||
sst_write_buffer_size,
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
/// Wait flush task if any
|
||||
async fn wait_flush(&self) -> Result<()> {
|
||||
let mut inner = self.inner.lock().await;
|
||||
|
||||
if let Some(handle) = inner.flush_handle.take() {
|
||||
handle.wait().await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn truncate(&self, ctx: &TruncateContext<'_, S>) -> Result<()> {
|
||||
// Acquires the write lock.
|
||||
let mut inner = self.inner.lock().await;
|
||||
ensure!(!inner.is_closed(), error::ClosedRegionSnafu);
|
||||
|
||||
if let Some(handle) = inner.flush_handle.take() {
|
||||
handle.wait().await?;
|
||||
}
|
||||
|
||||
let version_control = ctx.version_control();
|
||||
let _lock = self.version_mutex.lock().await;
|
||||
let committed_sequence = version_control.committed_sequence();
|
||||
|
||||
// Add `RegionMetaAction::Truncate` to recover from manifest in case of failure.
|
||||
let mut action_list =
|
||||
RegionMetaActionList::with_action(RegionMetaAction::Truncate(RegionTruncate {
|
||||
region_id: ctx.shared.id,
|
||||
committed_sequence,
|
||||
}));
|
||||
|
||||
// Persist the meta action.
|
||||
let current_version = version_control.current();
|
||||
let manifest_version = version_control.current_manifest_version();
|
||||
let prev_version = manifest_version;
|
||||
action_list.set_prev_version(prev_version);
|
||||
ctx.manifest.update(action_list).await?;
|
||||
|
||||
// Mark all data obsolete
|
||||
ctx.wal.obsolete(committed_sequence).await?;
|
||||
|
||||
// Mark all SSTs deleted
|
||||
let files = current_version.ssts().mark_all_files_deleted();
|
||||
logging::info!(
|
||||
"Try to remove all SSTs, region: {}, files: {:?}",
|
||||
ctx.shared.id(),
|
||||
files
|
||||
);
|
||||
|
||||
// Reset version
|
||||
let memtables = Arc::new(MemtableVersion::new(inner.alloc_memtable(version_control)));
|
||||
let ssts = Arc::new(LevelMetas::new(
|
||||
ctx.sst_layer.clone(),
|
||||
current_version.ssts().file_purger(),
|
||||
));
|
||||
version_control.reset_version(manifest_version + 1, memtables, ssts);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
// Test-only helpers.
#[cfg(test)]
impl<S: LogStore> RegionWriter<S> {
    /// Returns the `write_buffer_size` stored in the inner writer state.
    pub(crate) async fn write_buffer_size(&self) -> usize {
        let inner = self.inner.lock().await;
        inner.write_buffer_size
    }
}
|
||||
|
||||
/// Structs needed by triggering a compaction.
|
||||
pub struct WriterCompactRequest<S: LogStore> {
|
||||
pub shared_data: SharedDataRef,
|
||||
pub sst_layer: AccessLayerRef,
|
||||
pub manifest: RegionManifest,
|
||||
pub wal: Wal<S>,
|
||||
pub region_writer: RegionWriterRef<S>,
|
||||
pub compact_ctx: CompactContext,
|
||||
}
|
||||
|
||||
pub struct WriterContext<'a, S: LogStore> {
|
||||
pub shared: &'a SharedDataRef,
|
||||
pub flush_strategy: &'a FlushStrategyRef,
|
||||
pub flush_scheduler: &'a FlushSchedulerRef<S>,
|
||||
pub compaction_scheduler: &'a CompactionSchedulerRef<S>,
|
||||
pub sst_layer: &'a AccessLayerRef,
|
||||
pub wal: &'a Wal<S>,
|
||||
pub writer: &'a RegionWriterRef<S>,
|
||||
pub manifest: &'a RegionManifest,
|
||||
pub compaction_picker: CompactionPickerRef<S>,
|
||||
}
|
||||
|
||||
impl<'a, S: LogStore> WriterContext<'a, S> {
|
||||
#[inline]
|
||||
fn version_control(&self) -> &VersionControlRef {
|
||||
&self.shared.version_control
|
||||
}
|
||||
}
|
||||
|
||||
pub struct AlterContext<'a, S: LogStore> {
|
||||
pub shared: &'a SharedDataRef,
|
||||
pub wal: &'a Wal<S>,
|
||||
pub manifest: &'a RegionManifest,
|
||||
}
|
||||
|
||||
impl<'a, S: LogStore> AlterContext<'a, S> {
|
||||
#[inline]
|
||||
fn version_control(&self) -> &VersionControlRef {
|
||||
&self.shared.version_control
|
||||
}
|
||||
}
|
||||
|
||||
pub struct DropContext<'a, S: LogStore> {
|
||||
pub shared: &'a SharedDataRef,
|
||||
pub wal: &'a Wal<S>,
|
||||
pub manifest: &'a RegionManifest,
|
||||
pub flush_scheduler: &'a FlushSchedulerRef<S>,
|
||||
pub compaction_scheduler: &'a CompactionSchedulerRef<S>,
|
||||
pub sst_layer: &'a AccessLayerRef,
|
||||
}
|
||||
|
||||
impl<'a, S: LogStore> DropContext<'a, S> {
|
||||
#[inline]
|
||||
fn version_control(&self) -> &VersionControlRef {
|
||||
&self.shared.version_control
|
||||
}
|
||||
}
|
||||
|
||||
pub struct TruncateContext<'a, S: LogStore> {
|
||||
pub shared: &'a SharedDataRef,
|
||||
pub wal: &'a Wal<S>,
|
||||
pub manifest: &'a RegionManifest,
|
||||
pub sst_layer: &'a AccessLayerRef,
|
||||
}
|
||||
|
||||
impl<'a, S: LogStore> TruncateContext<'a, S> {
|
||||
#[inline]
|
||||
fn version_control(&self) -> &VersionControlRef {
|
||||
&self.shared.version_control
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct WriterInner {
|
||||
memtable_builder: MemtableBuilderRef,
|
||||
flush_handle: Option<FlushHandle>,
|
||||
|
||||
/// `WriterInner` will reject any future writing, if the closed flag is set.
|
||||
///
|
||||
/// It should protected by upper mutex
|
||||
closed: bool,
|
||||
engine_config: Arc<EngineConfig>,
|
||||
ttl: Option<Duration>,
|
||||
/// Size in bytes to freeze the mutable memtable.
|
||||
write_buffer_size: usize,
|
||||
}
|
||||
|
||||
impl WriterInner {
|
||||
fn new(
|
||||
memtable_builder: MemtableBuilderRef,
|
||||
engine_config: Arc<EngineConfig>,
|
||||
ttl: Option<Duration>,
|
||||
write_buffer_size: usize,
|
||||
) -> WriterInner {
|
||||
WriterInner {
|
||||
memtable_builder,
|
||||
flush_handle: None,
|
||||
engine_config,
|
||||
closed: false,
|
||||
ttl,
|
||||
write_buffer_size,
|
||||
}
|
||||
}
|
||||
|
||||
/// Write `WriteBatch` to region, now the schema of batch needs to be validated outside.
|
||||
///
|
||||
/// Mutable reference of writer ensure no other reference of this writer can modify the
|
||||
/// version control (write is exclusive).
|
||||
async fn write<S: LogStore>(
|
||||
&mut self,
|
||||
version_mutex: &Mutex<()>,
|
||||
_ctx: &WriteContext,
|
||||
mut request: WriteBatch,
|
||||
writer_ctx: WriterContext<'_, S>,
|
||||
) -> Result<WriteResponse> {
|
||||
self.preprocess_write(&writer_ctx).await?;
|
||||
let version_control = writer_ctx.version_control();
|
||||
|
||||
let _lock = version_mutex.lock().await;
|
||||
|
||||
let metadata = version_control.metadata();
|
||||
// We need to check the schema again since it might has been altered. We need
|
||||
// to compat request's schema before writing it into the WAL otherwise some
|
||||
// default constraint like `current_timestamp()` would yield different value
|
||||
// during replay.
|
||||
request.compat_write(metadata.schema().user_schema())?;
|
||||
|
||||
let committed_sequence = version_control.committed_sequence();
|
||||
// Sequence for current write batch.
|
||||
let next_sequence = committed_sequence + 1;
|
||||
|
||||
let version = version_control.current();
|
||||
let wal_header = WalHeader::with_last_manifest_version(version.manifest_version());
|
||||
let _ = writer_ctx
|
||||
.wal
|
||||
.write_to_wal(next_sequence, wal_header, Some(request.payload()))
|
||||
.await?;
|
||||
|
||||
// Insert batch into memtable.
|
||||
let mut inserter = Inserter::new(next_sequence);
|
||||
inserter.insert_memtable(request.payload(), version.mutable_memtable())?;
|
||||
|
||||
// Update committed_sequence to make current batch visible. The `&mut self` of WriterInner
|
||||
// guarantees the writer is exclusive.
|
||||
version_control.set_committed_sequence(next_sequence);
|
||||
|
||||
Ok(WriteResponse {})
|
||||
}
|
||||
|
||||
async fn replay<S: LogStore>(
|
||||
&mut self,
|
||||
version_mutex: &Mutex<()>,
|
||||
mut recovered_metadata: RecoveredMetadataMap,
|
||||
writer_ctx: WriterContext<'_, S>,
|
||||
) -> Result<()> {
|
||||
let version_control = writer_ctx.version_control();
|
||||
|
||||
let (flushed_sequence, mut last_sequence);
|
||||
let mut num_requests = 0;
|
||||
let mut num_recovered_metadata = 0;
|
||||
let mut next_apply_metadata = recovered_metadata.pop_first();
|
||||
{
|
||||
let _lock = version_mutex.lock().await;
|
||||
|
||||
// Data after flushed sequence need to be recovered.
|
||||
flushed_sequence = version_control.current().flushed_sequence();
|
||||
last_sequence = flushed_sequence;
|
||||
// Read starts from the first entry after last flushed entry, so the start sequence
|
||||
// should be flushed_sequence + 1.
|
||||
let mut stream = writer_ctx.wal.read_from_wal(flushed_sequence + 1).await?;
|
||||
while let Some((req_sequence, _header, payload)) = stream.try_next().await? {
|
||||
while let Some((sequence_before_alter, _)) = next_apply_metadata {
|
||||
// There might be multiple metadata changes to be applied, so a loop is necessary.
|
||||
if req_sequence > sequence_before_alter {
|
||||
// This is the first request that use the new metadata.
|
||||
self.apply_metadata(
|
||||
&writer_ctx,
|
||||
sequence_before_alter,
|
||||
next_apply_metadata,
|
||||
version_control,
|
||||
)?;
|
||||
|
||||
num_recovered_metadata += 1;
|
||||
next_apply_metadata = recovered_metadata.pop_first();
|
||||
} else {
|
||||
// Keep the next_apply_metadata until req_sequence > sequence_before_alter
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if req_sequence > last_sequence {
|
||||
last_sequence = req_sequence;
|
||||
} else {
|
||||
logging::error!(
|
||||
"Sequence should not decrease during replay, found {} <= {}, \
|
||||
region_id: {}, region_name: {}, flushed_sequence: {}, num_requests: {}",
|
||||
req_sequence,
|
||||
last_sequence,
|
||||
writer_ctx.shared.id,
|
||||
writer_ctx.shared.name,
|
||||
flushed_sequence,
|
||||
num_requests,
|
||||
);
|
||||
|
||||
error::SequenceNotMonotonicSnafu {
|
||||
prev: last_sequence,
|
||||
given: req_sequence,
|
||||
}
|
||||
.fail()?;
|
||||
}
|
||||
|
||||
if let Some(payload) = payload {
|
||||
num_requests += 1;
|
||||
// Note that memtables of `Version` may be updated during replay.
|
||||
let version = version_control.current();
|
||||
// TODO(yingwen): Trigger flush if the size of memtables reach the flush threshold to avoid
|
||||
// out of memory during replay, but we need to do it carefully to avoid dead lock.
|
||||
let mut inserter = Inserter::new(last_sequence);
|
||||
inserter.insert_memtable(&payload, version.mutable_memtable())?;
|
||||
}
|
||||
}
|
||||
|
||||
// Apply metadata after last WAL entry
|
||||
while let Some((sequence_before_alter, _)) = next_apply_metadata {
|
||||
assert!(
|
||||
sequence_before_alter >= last_sequence,
|
||||
"The sequence in metadata after last WAL entry is less than last sequence, \
|
||||
metadata sequence: {}, last_sequence: {}, region_id: {}, region_name: {}",
|
||||
sequence_before_alter,
|
||||
last_sequence,
|
||||
writer_ctx.shared.id,
|
||||
writer_ctx.shared.name
|
||||
);
|
||||
|
||||
self.apply_metadata(
|
||||
&writer_ctx,
|
||||
sequence_before_alter,
|
||||
next_apply_metadata,
|
||||
version_control,
|
||||
)?;
|
||||
|
||||
num_recovered_metadata += 1;
|
||||
next_apply_metadata = recovered_metadata.pop_first();
|
||||
}
|
||||
|
||||
version_control.set_committed_sequence(last_sequence);
|
||||
}
|
||||
|
||||
logging::info!(
|
||||
"Region replay finished, region_id: {}, region_name: {}, flushed_sequence: {}, last_sequence: {}, num_requests: {}, num_recovered_metadata: {}",
|
||||
writer_ctx.shared.id,
|
||||
writer_ctx.shared.name,
|
||||
flushed_sequence,
|
||||
last_sequence,
|
||||
num_requests,
|
||||
num_recovered_metadata,
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn apply_metadata<S: LogStore>(
|
||||
&self,
|
||||
writer_ctx: &WriterContext<'_, S>,
|
||||
sequence: SequenceNumber,
|
||||
mut metadata: Option<RecoveredMetadata>,
|
||||
version_control: &VersionControl,
|
||||
) -> Result<()> {
|
||||
// It's safe to unwrap here, it's checked outside.
|
||||
// Move out metadata to avoid cloning it.
|
||||
|
||||
let (_, (manifest_version, metadata)) = metadata.take().unwrap();
|
||||
let region_metadata: RegionMetadataRef =
|
||||
Arc::new(metadata.try_into().context(error::InvalidRawRegionSnafu {
|
||||
region: &writer_ctx.shared.name,
|
||||
})?);
|
||||
let new_mutable = self
|
||||
.memtable_builder
|
||||
.build(region_metadata.schema().clone());
|
||||
version_control.freeze_mutable_and_apply_metadata(
|
||||
region_metadata,
|
||||
manifest_version,
|
||||
new_mutable,
|
||||
);
|
||||
logging::debug!(
|
||||
"Applied metadata to region: {} when replaying WAL: sequence={} manifest={} ",
|
||||
writer_ctx.shared.name,
|
||||
sequence,
|
||||
manifest_version
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Preprocess before write.
|
||||
///
|
||||
/// Creates needed mutable memtables, ensures there is enough capacity in memtable and trigger
|
||||
/// flush if necessary. Returns time ranges of the input write batch.
|
||||
async fn preprocess_write<S: LogStore>(
|
||||
&mut self,
|
||||
writer_ctx: &WriterContext<'_, S>,
|
||||
) -> Result<()> {
|
||||
let _timer = PREPROCESS_ELAPSED.start_timer();
|
||||
|
||||
let version_control = writer_ctx.version_control();
|
||||
// Check whether memtable is full or flush should be triggered. We need to do this first since
|
||||
// switching memtables will clear all mutable memtables.
|
||||
if let Some(flush_type) = self.should_flush(
|
||||
writer_ctx.shared,
|
||||
version_control,
|
||||
writer_ctx.flush_strategy,
|
||||
) {
|
||||
// Trigger flush according to the flush type.
|
||||
match flush_type {
|
||||
FlushType::Region => {
|
||||
// Trigger flush for current region.
|
||||
self.trigger_flush(writer_ctx, FlushReason::MemtableFull)
|
||||
.await?;
|
||||
}
|
||||
FlushType::Engine => {
|
||||
// Trigger engine level flush. This wakeup the flush handler
|
||||
// to pick region to flush.
|
||||
writer_ctx.flush_scheduler.schedule_engine_flush()?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Create a new mutable memtable.
|
||||
fn alloc_memtable(&self, version_control: &VersionControlRef) -> MemtableRef {
|
||||
let memtable_schema = version_control.current().schema().clone();
|
||||
self.memtable_builder.build(memtable_schema)
|
||||
}
|
||||
|
||||
fn should_flush(
|
||||
&self,
|
||||
shared: &SharedDataRef,
|
||||
version_control: &VersionControlRef,
|
||||
flush_strategy: &FlushStrategyRef,
|
||||
) -> Option<FlushType> {
|
||||
let current = version_control.current();
|
||||
let memtables = current.memtables();
|
||||
let status = RegionStatus {
|
||||
region_id: shared.id(),
|
||||
bytes_mutable: memtables.mutable_bytes_allocated(),
|
||||
write_buffer_size: self.write_buffer_size,
|
||||
};
|
||||
flush_strategy.should_flush(status)
|
||||
}
|
||||
|
||||
async fn trigger_flush<S: LogStore>(
|
||||
&mut self,
|
||||
ctx: &WriterContext<'_, S>,
|
||||
reason: FlushReason,
|
||||
) -> Result<()> {
|
||||
let version_control = &ctx.shared.version_control;
|
||||
let new_mutable = self.alloc_memtable(version_control);
|
||||
// Freeze all mutable memtables so we can flush them later.
|
||||
version_control.freeze_mutable(new_mutable);
|
||||
|
||||
FLUSH_REQUESTS_TOTAL
|
||||
.with_label_values(&[reason.as_str()])
|
||||
.inc();
|
||||
|
||||
if let Some(flush_handle) = self.flush_handle.take() {
|
||||
// Previous flush job is incomplete, wait util it is finished.
|
||||
// However the last flush job may fail, in which case, we just return error
|
||||
// and abort current write request. The flush handle is left empty, so the next
|
||||
// time we still have chance to trigger a new flush.
|
||||
// TODO(yingwen): We should release the write lock during waiting flush done, which
|
||||
// needs something like async condvar.
|
||||
flush_handle.wait().await.map_err(|e| {
|
||||
logging::error!(e; "Previous flush job failed, region: {}", ctx.shared.name);
|
||||
e
|
||||
})?;
|
||||
}
|
||||
|
||||
let current_version = version_control.current();
|
||||
let (max_memtable_id, mem_to_flush) = current_version.memtables().memtables_to_flush();
|
||||
|
||||
if max_memtable_id.is_none() {
|
||||
// We still update the flush time to avoid the picker picks this region again.
|
||||
ctx.shared.update_flush_millis();
|
||||
|
||||
logging::info!("No memtables to flush in region: {}", ctx.shared.name);
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let flush_req = FlushRegionRequest {
|
||||
max_memtable_id: max_memtable_id.unwrap(),
|
||||
memtables: mem_to_flush,
|
||||
// In write thread, safe to use current committed sequence.
|
||||
flush_sequence: version_control.committed_sequence(),
|
||||
shared: ctx.shared.clone(),
|
||||
sst_layer: ctx.sst_layer.clone(),
|
||||
writer: ctx.writer.clone(),
|
||||
wal: ctx.wal.clone(),
|
||||
manifest: ctx.manifest.clone(),
|
||||
engine_config: self.engine_config.clone(),
|
||||
ttl: self.ttl,
|
||||
compaction_time_window: current_version.ssts().compaction_time_window(),
|
||||
compaction_picker: ctx.compaction_picker.clone(),
|
||||
};
|
||||
|
||||
let flush_handle = ctx
|
||||
.flush_scheduler
|
||||
.schedule_region_flush(flush_req)
|
||||
.map_err(|e| {
|
||||
logging::error!(e; "Failed to schedule flush request");
|
||||
e
|
||||
})?;
|
||||
self.flush_handle = Some(flush_handle);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn manual_compact<S: LogStore>(
|
||||
&mut self,
|
||||
request: WriterCompactRequest<S>,
|
||||
compaction_picker: CompactionPickerRef<S>,
|
||||
compaction_scheduler: CompactionSchedulerRef<S>,
|
||||
sst_write_buffer_size: ReadableSize,
|
||||
) -> Result<()> {
|
||||
let region_id = request.shared_data.id();
|
||||
let compaction_time_window = request
|
||||
.shared_data
|
||||
.version_control
|
||||
.current()
|
||||
.ssts()
|
||||
.compaction_time_window();
|
||||
let mut compaction_request = CompactionRequestImpl {
|
||||
region_id,
|
||||
sst_layer: request.sst_layer,
|
||||
writer: request.region_writer,
|
||||
shared: request.shared_data.clone(),
|
||||
manifest: request.manifest,
|
||||
wal: request.wal,
|
||||
ttl: self.ttl,
|
||||
compaction_time_window,
|
||||
sender: None,
|
||||
picker: compaction_picker,
|
||||
sst_write_buffer_size,
|
||||
// manual compaction does not reschedule itself.
|
||||
reschedule_on_finish: false,
|
||||
};
|
||||
|
||||
let compaction_scheduler = compaction_scheduler.clone();
|
||||
logging::info!(
|
||||
"Manual compact, region_id: {}, compact_ctx: {:?}",
|
||||
region_id,
|
||||
request.compact_ctx
|
||||
);
|
||||
|
||||
if request.compact_ctx.wait {
|
||||
let (sender, receiver) = oneshot::channel();
|
||||
compaction_request.sender = Some(sender);
|
||||
|
||||
if schedule_compaction(
|
||||
request.shared_data,
|
||||
compaction_scheduler,
|
||||
compaction_request,
|
||||
) {
|
||||
receiver
|
||||
.await
|
||||
.context(error::CompactTaskCancelSnafu { region_id })??;
|
||||
}
|
||||
} else {
|
||||
let _ = schedule_compaction(
|
||||
request.shared_data,
|
||||
compaction_scheduler,
|
||||
compaction_request,
|
||||
);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn manual_flush<S: LogStore>(
|
||||
&mut self,
|
||||
writer_ctx: WriterContext<'_, S>,
|
||||
reason: FlushReason,
|
||||
) -> Result<()> {
|
||||
self.trigger_flush(&writer_ctx, reason).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn is_closed(&self) -> bool {
|
||||
self.closed
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn mark_closed(&mut self) {
|
||||
self.closed = true;
|
||||
}
|
||||
}
|
||||
|
||||
/// Schedule compaction task, returns whether the task is scheduled.
|
||||
pub(crate) fn schedule_compaction<S: LogStore>(
|
||||
shared_data: SharedDataRef,
|
||||
compaction_scheduler: CompactionSchedulerRef<S>,
|
||||
compaction_request: CompactionRequestImpl<S>,
|
||||
) -> bool {
|
||||
let region_id = shared_data.id();
|
||||
|
||||
match compaction_scheduler.schedule(compaction_request) {
|
||||
Ok(scheduled) => {
|
||||
logging::info!(
|
||||
"Schedule region {} compaction request result: {}",
|
||||
region_id,
|
||||
scheduled
|
||||
);
|
||||
|
||||
scheduled
|
||||
}
|
||||
Err(e) => {
|
||||
logging::error!(e;"Failed to schedule region compaction request {}", region_id);
|
||||
|
||||
false
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,652 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::fmt::{Debug, Formatter};
|
||||
use std::hash::Hash;
|
||||
use std::sync::atomic::{AtomicU8, Ordering};
|
||||
use std::sync::{Arc, Mutex, RwLock};
|
||||
|
||||
use async_trait::async_trait;
|
||||
use common_telemetry::{debug, error, info};
|
||||
use snafu::{ensure, ResultExt};
|
||||
use tokio::sync::Notify;
|
||||
use tokio::task::JoinHandle;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
|
||||
use crate::error::{IllegalSchedulerStateSnafu, Result, StopSchedulerSnafu};
|
||||
use crate::scheduler::dedup_deque::DedupDeque;
|
||||
use crate::scheduler::rate_limit::{
|
||||
BoxedRateLimitToken, CascadeRateLimiter, MaxInflightTaskLimiter, RateLimiter,
|
||||
};
|
||||
|
||||
pub mod dedup_deque;
|
||||
pub mod rate_limit;
|
||||
|
||||
/// Request that can be scheduled.
|
||||
/// It must contain a key for deduplication.
|
||||
pub trait Request: Send + Sync + 'static {
|
||||
/// Type of request key.
|
||||
type Key: Eq + Hash + Clone + Debug + Send + Sync;
|
||||
|
||||
/// Returns the request key.
|
||||
fn key(&self) -> Self::Key;
|
||||
|
||||
/// Notify the request result.
|
||||
fn complete(self, result: Result<()>);
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
pub trait Handler {
|
||||
type Request;
|
||||
|
||||
async fn handle_request(
|
||||
&self,
|
||||
req: Self::Request,
|
||||
token: BoxedRateLimitToken,
|
||||
finish_notifier: Arc<Notify>,
|
||||
) -> Result<()>;
|
||||
}
|
||||
|
||||
/// [Scheduler] defines a set of API to schedule requests.
|
||||
#[async_trait]
|
||||
pub trait Scheduler: Debug {
|
||||
type Request;
|
||||
|
||||
/// Schedules a request.
|
||||
/// Returns true if request is scheduled. Returns false if task queue already
|
||||
/// contains the request with same key.
|
||||
fn schedule(&self, request: Self::Request) -> Result<bool>;
|
||||
|
||||
/// Stops scheduler. If `await_termination` is set to true, the scheduler will
|
||||
/// wait until all queued requests are processed.
|
||||
async fn stop(&self, await_termination: bool) -> Result<()>;
|
||||
}
|
||||
|
||||
/// Configuration of a scheduler.
#[derive(Debug)]
pub struct SchedulerConfig {
    /// Value passed to the max-inflight-task rate limiter of the scheduler.
    pub max_inflight_tasks: usize,
}

impl Default for SchedulerConfig {
    /// Returns a config with `max_inflight_tasks` set to 4.
    fn default() -> Self {
        let max_inflight_tasks = 4;
        SchedulerConfig { max_inflight_tasks }
    }
}
|
||||
|
||||
/// The scheduler is running and accepts new requests.
const STATE_RUNNING: u8 = 0;
/// The scheduler has been stopped (set on drop and by `stop(false)`).
const STATE_STOP: u8 = 1;
/// The scheduler has been stopped with `await_termination` set (`stop(true)`).
const STATE_AWAIT_TERMINATION: u8 = 2;
|
||||
|
||||
/// Request scheduler based on local state.
|
||||
pub struct LocalScheduler<R: Request> {
|
||||
/// Request FIFO with key deduplication.
|
||||
request_queue: Arc<RwLock<DedupDeque<R::Key, R>>>,
|
||||
/// Token used to halt the scheduler.
|
||||
cancel_token: CancellationToken,
|
||||
/// Tasks use a cooperative manner to notify scheduler that another request can be scheduled.
|
||||
task_notifier: Arc<Notify>,
|
||||
/// Join handle of spawned request handling loop.
|
||||
join_handle: Mutex<Option<JoinHandle<()>>>,
|
||||
/// State of scheduler.
|
||||
state: Arc<AtomicU8>,
|
||||
}
|
||||
|
||||
impl<R> Debug for LocalScheduler<R>
|
||||
where
|
||||
R: Request + Send + Sync,
|
||||
{
|
||||
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("LocalScheduler")
|
||||
.field("state", &self.state)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
impl<R> Drop for LocalScheduler<R>
|
||||
where
|
||||
R: Request,
|
||||
{
|
||||
fn drop(&mut self) {
|
||||
self.state.store(STATE_STOP, Ordering::Relaxed);
|
||||
|
||||
self.cancel_token.cancel();
|
||||
|
||||
// Clear all requests
|
||||
self.request_queue.write().unwrap().clear();
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl<R> Scheduler for LocalScheduler<R>
|
||||
where
|
||||
R: Request + Send,
|
||||
{
|
||||
type Request = R;
|
||||
|
||||
fn schedule(&self, request: Self::Request) -> Result<bool> {
|
||||
ensure!(self.running(), IllegalSchedulerStateSnafu);
|
||||
debug!(
|
||||
"Schedule request: {:?}, queue size: {}",
|
||||
request.key(),
|
||||
self.remaining_requests()
|
||||
);
|
||||
let mut queue = self.request_queue.write().unwrap();
|
||||
let res = queue.push_back(request.key(), request);
|
||||
self.task_notifier.notify_one();
|
||||
Ok(res)
|
||||
}
|
||||
|
||||
async fn stop(&self, await_termination: bool) -> Result<()> {
|
||||
let state = if await_termination {
|
||||
STATE_AWAIT_TERMINATION
|
||||
} else {
|
||||
STATE_STOP
|
||||
};
|
||||
self.state.store(state, Ordering::Relaxed);
|
||||
|
||||
self.cancel_token.cancel();
|
||||
let handle = { self.join_handle.lock().unwrap().take() };
|
||||
if let Some(handle) = handle {
|
||||
handle.await.context(StopSchedulerSnafu)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl<R> LocalScheduler<R>
|
||||
where
|
||||
R: Request,
|
||||
{
|
||||
/// Creates a new scheduler instance with given config and request handler.
|
||||
pub fn new<H>(config: SchedulerConfig, handler: H) -> Self
|
||||
where
|
||||
H: Handler<Request = R> + Send + Sync + 'static,
|
||||
{
|
||||
let request_queue = Arc::new(RwLock::new(DedupDeque::default()));
|
||||
let cancel_token = CancellationToken::new();
|
||||
let task_notifier = Arc::new(Notify::new());
|
||||
let state = Arc::new(AtomicU8::new(STATE_RUNNING));
|
||||
let handle_loop = HandlerLoop {
|
||||
task_notifier: task_notifier.clone(),
|
||||
req_queue: request_queue.clone(),
|
||||
cancel_token: cancel_token.child_token(),
|
||||
limiter: Arc::new(CascadeRateLimiter::new(vec![Box::new(
|
||||
MaxInflightTaskLimiter::new(config.max_inflight_tasks),
|
||||
)])),
|
||||
request_handler: handler,
|
||||
state: state.clone(),
|
||||
};
|
||||
let join_handle = common_runtime::spawn_bg(async move {
|
||||
debug!("Task handler loop spawned");
|
||||
handle_loop.run().await;
|
||||
});
|
||||
Self {
|
||||
join_handle: Mutex::new(Some(join_handle)),
|
||||
request_queue,
|
||||
cancel_token,
|
||||
task_notifier,
|
||||
state,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns remaining requests number.
|
||||
#[inline]
|
||||
fn remaining_requests(&self) -> usize {
|
||||
self.request_queue.read().unwrap().len()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn running(&self) -> bool {
|
||||
self.state.load(Ordering::Relaxed) == STATE_RUNNING
|
||||
}
|
||||
}
|
||||
|
||||
pub struct HandlerLoop<R: Request, H: Handler> {
|
||||
pub req_queue: Arc<RwLock<DedupDeque<R::Key, R>>>,
|
||||
pub cancel_token: CancellationToken,
|
||||
pub task_notifier: Arc<Notify>,
|
||||
pub request_handler: H,
|
||||
pub limiter: Arc<CascadeRateLimiter<R>>,
|
||||
pub state: Arc<AtomicU8>,
|
||||
}
|
||||
|
||||
impl<R, H> HandlerLoop<R, H>
|
||||
where
|
||||
R: Request,
|
||||
H: Handler<Request = R>,
|
||||
{
|
||||
/// Runs scheduled requests dispatch loop.
|
||||
pub async fn run(&self) {
|
||||
let limiter = self.limiter.clone();
|
||||
while self.running() {
|
||||
tokio::select! {
|
||||
_ = self.task_notifier.notified() => {
|
||||
debug!("Notified, queue size: {:?}",self.req_queue.read().unwrap().len());
|
||||
self.poll_and_execute(&limiter).await;
|
||||
}
|
||||
_ = self.cancel_token.cancelled() => {
|
||||
info!("Task scheduler cancelled.");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
// For correctness, we need to poll requests from fifo again.
|
||||
if self.state.load(Ordering::Relaxed) == STATE_AWAIT_TERMINATION {
|
||||
info!("Waiting for all pending tasks to finish.");
|
||||
self.poll_and_execute(&limiter).await;
|
||||
self.state.store(STATE_STOP, Ordering::Relaxed);
|
||||
}
|
||||
info!("Task scheduler stopped");
|
||||
}
|
||||
|
||||
/// Polls and executes requests as many as possible until rate limited.
|
||||
async fn poll_and_execute(&self, limiter: &Arc<CascadeRateLimiter<R>>) {
|
||||
while let Some((task_key, req)) = self.poll_task().await {
|
||||
if let Ok(token) = limiter.acquire_token(&req) {
|
||||
debug!("Executing request: {:?}", task_key);
|
||||
if let Err(e) = self
|
||||
.handle_request(req, token, self.task_notifier.clone())
|
||||
.await
|
||||
{
|
||||
error!(e; "Failed to submit request: {:?}", task_key);
|
||||
} else {
|
||||
info!("Submitted task: {:?}", task_key);
|
||||
}
|
||||
} else {
|
||||
// rate limited, put back to req queue to wait for next schedule
|
||||
debug!(
|
||||
"Put back request {:?}, queue size: {}",
|
||||
task_key,
|
||||
self.req_queue.read().unwrap().len()
|
||||
);
|
||||
self.put_back_req(task_key, req).await;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
async fn poll_task(&self) -> Option<(R::Key, R)> {
|
||||
let mut queue = self.req_queue.write().unwrap();
|
||||
queue.pop_front()
|
||||
}
|
||||
|
||||
/// Puts request back to the front of request queue.
|
||||
#[inline]
|
||||
async fn put_back_req(&self, key: R::Key, req: R) {
|
||||
let mut queue = self.req_queue.write().unwrap();
|
||||
let _ = queue.push_front(key, req);
|
||||
}
|
||||
|
||||
// Handles request, submit task to bg runtime.
|
||||
async fn handle_request(
|
||||
&self,
|
||||
req: R,
|
||||
token: BoxedRateLimitToken,
|
||||
finish_notifier: Arc<Notify>,
|
||||
) -> Result<()> {
|
||||
self.request_handler
|
||||
.handle_request(req, token, finish_notifier)
|
||||
.await
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn running(&self) -> bool {
|
||||
self.state.load(Ordering::Relaxed) == STATE_RUNNING
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use std::sync::atomic::{AtomicBool, AtomicI32};
    use std::time::Duration;

    use futures_util::future::BoxFuture;
    use store_api::storage::RegionId;

    use super::*;
    use crate::scheduler::dedup_deque::DedupDeque;
    use crate::scheduler::rate_limit::{
        BoxedRateLimitToken, CascadeRateLimiter, MaxInflightTaskLimiter,
    };
    use crate::scheduler::{HandlerLoop, LocalScheduler, Scheduler, SchedulerConfig};

    /// Test latch: notifies its waiter once the counter has been brought down to zero.
    struct CountdownLatch {
        counter: std::sync::Mutex<usize>,
        notify: Notify,
    }

    impl CountdownLatch {
        fn new(size: usize) -> Self {
            let counter = std::sync::Mutex::new(size);
            let notify = Notify::new();
            Self { counter, notify }
        }

        /// Decrements the counter unless it is already zero; notifies when it reaches zero.
        fn countdown(&self) {
            let mut remaining = self.counter.lock().unwrap();
            if *remaining == 0 {
                return;
            }
            *remaining -= 1;
            if *remaining == 0 {
                self.notify.notify_one();
            }
        }

        /// Users should only call this once.
        async fn wait(&self) {
            self.notify.notified().await
        }
    }

    #[tokio::test]
    async fn test_schedule_handler() {
        common_telemetry::init_default_ut_logging();
        let queue = Arc::new(std::sync::RwLock::new(DedupDeque::default()));
        let latch = Arc::new(CountdownLatch::new(2));
        let latch_for_handler = latch.clone();
        let handler = Arc::new(HandlerLoop {
            req_queue: queue.clone(),
            cancel_token: Default::default(),
            task_notifier: Arc::new(Default::default()),
            request_handler: MockHandler {
                cb: move || {
                    latch_for_handler.countdown();
                },
            },
            limiter: Arc::new(CascadeRateLimiter::new(vec![Box::new(
                MaxInflightTaskLimiter::new(3),
            )])),
            state: Arc::new(AtomicU8::default()),
        });

        let loop_handler = handler.clone();
        let _handle = common_runtime::spawn_bg(async move { loop_handler.run().await });

        // Push two requests under distinct keys, waking the loop after each one.
        for id in [1u64, 2] {
            let _ = queue
                .write()
                .unwrap()
                .push_back(id.into(), MockRequest::default());
            handler.task_notifier.notify_one();
        }

        tokio::time::timeout(Duration::from_secs(1), latch.wait())
            .await
            .unwrap();
    }

    /// Request carrying nothing but the region id it uses as its key.
    #[derive(Default, Debug)]
    struct MockRequest {
        region_id: RegionId,
    }

    /// Handler that runs a synchronous callback for every request.
    struct MockHandler<F> {
        cb: F,
    }

    #[async_trait::async_trait]
    impl<F> Handler for MockHandler<F>
    where
        F: Fn() + Send + Sync,
    {
        type Request = MockRequest;

        async fn handle_request(
            &self,
            _req: Self::Request,
            token: BoxedRateLimitToken,
            finish_notifier: Arc<Notify>,
        ) -> Result<()> {
            let callback = &self.cb;
            callback();
            token.try_release();
            finish_notifier.notify_one();
            Ok(())
        }
    }

    impl Request for MockRequest {
        type Key = RegionId;

        fn key(&self) -> Self::Key {
            self.region_id
        }

        fn complete(self, _result: Result<()>) {}
    }

    #[tokio::test]
    async fn test_scheduler() {
        let latch = Arc::new(CountdownLatch::new(2));
        let latch_for_handler = latch.clone();

        let handler = MockHandler {
            cb: move || {
                latch_for_handler.countdown();
            },
        };
        let scheduler: LocalScheduler<MockRequest> = LocalScheduler::new(
            SchedulerConfig {
                max_inflight_tasks: 3,
            },
            handler,
        );

        for id in [1u64, 2] {
            let _ = scheduler
                .schedule(MockRequest {
                    region_id: id.into(),
                })
                .unwrap();
        }

        tokio::time::timeout(Duration::from_secs(1), latch.wait())
            .await
            .unwrap();
    }

    #[tokio::test]
    async fn test_scheduler_many() {
        common_telemetry::init_default_ut_logging();
        let task_size = 100;

        let latch = Arc::new(CountdownLatch::new(task_size));
        let latch_for_handler = latch.clone();

        let handler = MockHandler {
            cb: move || {
                latch_for_handler.countdown();
            },
        };

        let config = SchedulerConfig {
            max_inflight_tasks: 3,
        };
        let scheduler = LocalScheduler::new(config, handler);

        for i in 0..task_size {
            let request = MockRequest {
                region_id: RegionId::from(i as u64),
            };
            assert!(scheduler.schedule(request).is_ok());
        }

        tokio::time::timeout(Duration::from_secs(3), latch.wait())
            .await
            .unwrap();
    }

    #[tokio::test]
    async fn test_scheduler_interval() {
        common_telemetry::init_default_ut_logging();
        let task_size = 100;
        let latch = Arc::new(CountdownLatch::new(task_size));
        let latch_for_handler = latch.clone();

        let handler = MockHandler {
            cb: move || {
                latch_for_handler.countdown();
            },
        };

        let config = SchedulerConfig {
            max_inflight_tasks: 3,
        };
        let scheduler = LocalScheduler::new(config, handler);

        // First half of the requests.
        let half = task_size / 2;
        for i in 0..half {
            let request = MockRequest {
                region_id: RegionId::from(i as u64),
            };
            assert!(scheduler.schedule(request).is_ok());
        }

        tokio::time::sleep(Duration::from_millis(100)).await;

        // Second half, scheduled after the pause.
        for i in half..task_size {
            let request = MockRequest {
                region_id: RegionId::from(i as u64),
            };
            assert!(scheduler.schedule(request).is_ok());
        }

        tokio::time::timeout(Duration::from_secs(6), latch.wait())
            .await
            .unwrap();
    }

    /// Handler that awaits the future produced by its callback for every request.
    struct MockAsyncHandler<F> {
        cb: F,
    }

    #[async_trait::async_trait]
    impl<F> Handler for MockAsyncHandler<F>
    where
        F: Fn() -> BoxFuture<'static, ()> + Send + Sync,
    {
        type Request = MockRequest;

        async fn handle_request(
            &self,
            _req: Self::Request,
            token: BoxedRateLimitToken,
            finish_notifier: Arc<Notify>,
        ) -> Result<()> {
            let pending = (self.cb)();
            pending.await;
            token.try_release();
            finish_notifier.notify_one();
            Ok(())
        }
    }

    #[tokio::test]
    async fn test_schedule_duplicate_tasks() {
        common_telemetry::init_default_ut_logging();
        let (tx, rx) = tokio::sync::watch::channel(false);
        let handler = MockAsyncHandler {
            cb: move || {
                let mut rx = rx.clone();
                Box::pin(async move {
                    // Block the handler so it can't handle more requests.
                    loop {
                        rx.changed().await.unwrap();
                        let released = *rx.borrow();
                        if released {
                            break;
                        }
                    }
                }) as _ // Casts the Pin<Box<async block>> to Pin<Box<dyn Future>>
            },
        };
        let config = SchedulerConfig {
            max_inflight_tasks: 30,
        };
        let scheduler = LocalScheduler::new(config, handler);

        // Schedule the same key ten times and count how many attempts were accepted.
        let scheduled_task = (0..10)
            .filter(|_| {
                scheduler
                    .schedule(MockRequest {
                        region_id: 1.into(),
                    })
                    .unwrap()
            })
            .count();
        tx.send(true).unwrap();
        scheduler.stop(true).await.unwrap();
        debug!("Schedule tasks: {}", scheduled_task);
        assert!(scheduled_task < 10);
    }

    #[tokio::test]
    async fn test_await_termination() {
        common_telemetry::init_default_ut_logging();

        let finished = Arc::new(AtomicI32::new(0));
        let finished_in_handler = finished.clone();
        let handler = MockHandler {
            cb: move || {
                let _ = finished_in_handler.fetch_add(1, Ordering::Relaxed);
            },
        };

        let config = SchedulerConfig {
            max_inflight_tasks: 3,
        };
        let scheduler = Arc::new(LocalScheduler::new(config, handler));
        let producer_scheduler = scheduler.clone();
        let task_scheduled = Arc::new(AtomicI32::new(0));
        let producer_scheduled = task_scheduled.clone();

        let scheduling = Arc::new(AtomicBool::new(true));
        let producer_flag = scheduling.clone();
        let handle = common_runtime::spawn_write(async move {
            for i in 0..10000 {
                let outcome = producer_scheduler.schedule(MockRequest {
                    region_id: RegionId::from(i as u64),
                });
                if let Ok(true) = outcome {
                    let _ = producer_scheduled.fetch_add(1, Ordering::Relaxed);
                }

                if !producer_flag.load(Ordering::Relaxed) {
                    break;
                }
            }
        });

        scheduler.stop(true).await.unwrap();
        scheduling.store(false, Ordering::Relaxed);

        let finished_count = finished.load(Ordering::Relaxed);
        handle.await.unwrap();

        assert_eq!(finished_count, task_scheduled.load(Ordering::Relaxed));
    }
}
|
||||
@@ -1,124 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::collections::hash_map::Entry;
|
||||
use std::collections::{HashMap, VecDeque};
|
||||
use std::fmt::{Debug, Formatter};
|
||||
use std::hash::Hash;
|
||||
|
||||
/// Deque with key deduplication.
///
/// `deque` keeps the keys in queue order and `existing` maps every queued key to its value.
/// Both collections are expected to hold the same number of entries at all times.
pub struct DedupDeque<K, V> {
    deque: VecDeque<K>,
    existing: HashMap<K, V>,
}

impl<K, V> Default for DedupDeque<K, V> {
    /// Creates an empty deque.
    fn default() -> Self {
        Self {
            deque: VecDeque::new(),
            existing: HashMap::new(),
        }
    }
}

impl<K: Eq + Hash + Clone, V> DedupDeque<K, V> {
    /// Pushes a key value to the back of deque.
    /// Returns true if the deque does not already contain value with the same key, otherwise
    /// returns false.
    pub fn push_back(&mut self, key: K, value: V) -> bool {
        debug_assert_eq!(self.deque.len(), self.existing.len());
        if let Entry::Vacant(entry) = self.existing.entry(key.clone()) {
            let _ = entry.insert(value);
            self.deque.push_back(key);
            return true;
        }
        false
    }

    /// Pushes a key value to the front of deque.
    /// Returns true if the deque does not already contain value with the same key, otherwise
    /// returns false.
    pub fn push_front(&mut self, key: K, value: V) -> bool {
        // Same invariant check as the other accessors.
        debug_assert_eq!(self.deque.len(), self.existing.len());
        if let Entry::Vacant(entry) = self.existing.entry(key.clone()) {
            let _ = entry.insert(value);
            self.deque.push_front(key);
            return true;
        }
        false
    }

    /// Pops a pair from the front of deque. Returns [None] if the deque is empty.
    pub fn pop_front(&mut self) -> Option<(K, V)> {
        debug_assert_eq!(self.deque.len(), self.existing.len());
        let key = self.deque.pop_front()?;
        let value = self.existing.remove(&key)?;
        Some((key, value))
    }

    /// Returns the number of queued entries.
    #[inline]
    pub fn len(&self) -> usize {
        debug_assert_eq!(self.deque.len(), self.existing.len());
        self.deque.len()
    }

    /// Returns true if nothing is queued.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.deque.is_empty()
    }

    /// Removes every queued key and value.
    #[inline]
    pub fn clear(&mut self) {
        self.deque.clear();
        self.existing.clear();
    }
}

impl<K, V> Debug for DedupDeque<K, V>
where
    K: Debug,
    V: Debug,
{
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("DedupDeque")
            .field("deque", &self.deque)
            .field("existing", &self.existing)
            .finish()
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_dedup_deque() {
        let hello = || "hello".to_string();
        let world = || "world".to_string();

        let mut deque = DedupDeque::default();

        // Distinct keys are accepted and come back out in insertion order.
        assert!(deque.push_back(1, hello()));
        assert_eq!(1, deque.len());
        assert!(deque.push_back(2, world()));
        assert_eq!(2, deque.len());
        assert_eq!((1, hello()), deque.pop_front().unwrap());
        assert_eq!(1, deque.len());
        assert_eq!((2, world()), deque.pop_front().unwrap());
        assert_eq!(0, deque.len());

        // insert duplicated item
        assert!(deque.push_back(1, hello()));
        assert!(!deque.push_back(1, world()));
        assert_eq!((1, hello()), deque.pop_front().unwrap());

        deque.clear();
        assert!(deque.is_empty());
    }
}
|
||||
@@ -1,185 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::marker::PhantomData;
|
||||
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
|
||||
use std::sync::Arc;
|
||||
|
||||
use crate::error::{RateLimitedSnafu, Result};
|
||||
|
||||
/// Token handed out by a rate limiter.
pub trait RateLimitToken {
    /// Releases the token.
    /// ### Note
    /// Implementation should guarantee the idempotency.
    fn try_release(&self);
}

/// Heap-allocated, thread-safe [RateLimitToken] trait object.
pub type BoxedRateLimitToken = Box<dyn RateLimitToken + Send + Sync>;

/// A boxed token releases by delegating to the token it wraps.
impl<T: RateLimitToken + ?Sized> RateLimitToken for Box<T> {
    fn try_release(&self) {
        T::try_release(&**self)
    }
}
|
||||
|
||||
/// Rate limiter
|
||||
pub trait RateLimiter {
|
||||
type Request;
|
||||
|
||||
/// Acquires a token from rate limiter. Returns `Err` on failure.
|
||||
fn acquire_token(&self, req: &Self::Request) -> Result<BoxedRateLimitToken>;
|
||||
}
|
||||
|
||||
pub type BoxedRateLimiter<R> = Box<dyn RateLimiter<Request = R> + Send + Sync>;
|
||||
|
||||
/// Limits max inflight tasks number.
|
||||
pub struct MaxInflightTaskLimiter<R> {
|
||||
max_inflight_tasks: usize,
|
||||
inflight_tasks: Arc<AtomicUsize>,
|
||||
_phantom_data: PhantomData<R>,
|
||||
}
|
||||
|
||||
impl<R> MaxInflightTaskLimiter<R> {
|
||||
pub fn new(max_inflight_tasks: usize) -> Self {
|
||||
Self {
|
||||
max_inflight_tasks,
|
||||
inflight_tasks: Arc::new(AtomicUsize::new(0)),
|
||||
_phantom_data: Default::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<R> RateLimiter for MaxInflightTaskLimiter<R> {
|
||||
type Request = R;
|
||||
|
||||
fn acquire_token(&self, _: &Self::Request) -> Result<BoxedRateLimitToken> {
|
||||
if self.inflight_tasks.fetch_add(1, Ordering::Relaxed) >= self.max_inflight_tasks {
|
||||
let _ = self.inflight_tasks.fetch_sub(1, Ordering::Relaxed);
|
||||
return RateLimitedSnafu {
|
||||
msg: format!(
|
||||
"Max inflight task num exceeds, current: {}, max: {}",
|
||||
self.inflight_tasks.load(Ordering::Relaxed),
|
||||
self.max_inflight_tasks
|
||||
),
|
||||
}
|
||||
.fail();
|
||||
}
|
||||
|
||||
Ok(Box::new(MaxInflightLimiterToken::new(
|
||||
self.inflight_tasks.clone(),
|
||||
)))
|
||||
}
|
||||
}
|
||||
|
||||
pub struct MaxInflightLimiterToken {
|
||||
counter: Arc<AtomicUsize>,
|
||||
released: AtomicBool,
|
||||
}
|
||||
|
||||
impl MaxInflightLimiterToken {
|
||||
pub fn new(counter: Arc<AtomicUsize>) -> Self {
|
||||
Self {
|
||||
counter,
|
||||
released: AtomicBool::new(false),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl RateLimitToken for MaxInflightLimiterToken {
|
||||
fn try_release(&self) {
|
||||
if self
|
||||
.released
|
||||
.compare_exchange(false, true, Ordering::Acquire, Ordering::Relaxed)
|
||||
.is_ok()
|
||||
{
|
||||
let _ = self.counter.fetch_sub(1, Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A composite rate limiter that allows token acquisition only when all internal limiters allow.
|
||||
pub struct CascadeRateLimiter<T> {
|
||||
limits: Vec<BoxedRateLimiter<T>>,
|
||||
}
|
||||
|
||||
impl<T> CascadeRateLimiter<T> {
|
||||
pub fn new(limits: Vec<BoxedRateLimiter<T>>) -> Self {
|
||||
Self { limits }
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> RateLimiter for CascadeRateLimiter<T> {
|
||||
type Request = T;
|
||||
|
||||
fn acquire_token(&self, req: &Self::Request) -> Result<BoxedRateLimitToken> {
|
||||
let mut res = vec![];
|
||||
for limit in &self.limits {
|
||||
match limit.acquire_token(req) {
|
||||
Ok(token) => {
|
||||
res.push(token);
|
||||
}
|
||||
Err(e) => {
|
||||
res.iter().for_each(RateLimitToken::try_release);
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(Box::new(CompositeToken { tokens: res }))
|
||||
}
|
||||
}
|
||||
|
||||
/// Composite token that releases all acquired token when released.
|
||||
pub struct CompositeToken {
|
||||
tokens: Vec<BoxedRateLimitToken>,
|
||||
}
|
||||
|
||||
impl RateLimitToken for CompositeToken {
|
||||
fn try_release(&self) {
|
||||
for token in &self.tokens {
|
||||
token.try_release();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_max_inflight_limiter() {
        let limiter = MaxInflightTaskLimiter::new(3);
        let inflight = || limiter.inflight_tasks.load(Ordering::Relaxed);

        // Three tokens fit, each one bumping the inflight counter.
        let t1 = limiter.acquire_token(&1).unwrap();
        assert_eq!(1, inflight());
        let _t2 = limiter.acquire_token(&1).unwrap();
        assert_eq!(2, inflight());
        let _t3 = limiter.acquire_token(&1).unwrap();
        assert_eq!(3, inflight());

        // The fourth is refused until one token is released.
        assert!(limiter.acquire_token(&1).is_err());
        t1.try_release();
        assert_eq!(2, inflight());
        let _t4 = limiter.acquire_token(&1).unwrap();
    }

    #[test]
    fn test_cascade_limiter() {
        let inner = MaxInflightTaskLimiter::new(3);
        let limiter: CascadeRateLimiter<usize> = CascadeRateLimiter::new(vec![Box::new(inner)]);

        let t1 = limiter.acquire_token(&1).unwrap();
        let _t2 = limiter.acquire_token(&1).unwrap();
        let _t3 = limiter.acquire_token(&1).unwrap();
        assert!(limiter.acquire_token(&1).is_err());

        // Releasing one token frees a slot again.
        t1.try_release();
        let _t4 = limiter.acquire_token(&1).unwrap();
    }
}
|
||||
@@ -1,59 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
pub mod compat;
|
||||
mod projected;
|
||||
mod region;
|
||||
mod store;
|
||||
|
||||
pub use crate::schema::projected::{ProjectedSchema, ProjectedSchemaRef};
|
||||
pub use crate::schema::region::{RegionSchema, RegionSchemaRef};
|
||||
pub use crate::schema::store::{StoreSchema, StoreSchemaRef};
|
||||
|
||||
#[cfg(test)]
mod tests {
    use std::sync::Arc;

    use datatypes::vectors::{
        Int64Vector, TimestampMillisecondVector, UInt64Vector, UInt8Vector, VectorRef,
    };

    use crate::read::Batch;

    pub const REGION_NAME: &str = "test";

    /// Builds a three-row test batch with a single field column.
    pub(crate) fn new_batch() -> Batch {
        new_batch_with_num_values(1)
    }

    /// Builds a three-row test batch. Columns are, in order: one `Int64` key, one
    /// millisecond timestamp, `num_field_columns` `Int64` fields, then the sequence and
    /// op type columns.
    pub(crate) fn new_batch_with_num_values(num_field_columns: usize) -> Batch {
        let key = Int64Vector::from_slice([1, 2, 3]);
        let ts = TimestampMillisecondVector::from_vec(vec![4, 5, 6]);

        let mut columns: Vec<VectorRef> = vec![Arc::new(key), Arc::new(ts)];

        // Field column `i` holds the value `i` in every row.
        columns.extend(
            (0..num_field_columns)
                .map(|i| Arc::new(Int64Vector::from_slice([i as i64; 3])) as VectorRef),
        );

        // Internal columns go last: sequences first, then op types.
        let sequences: VectorRef = Arc::new(UInt64Vector::from_slice([100, 100, 100]));
        let op_types: VectorRef = Arc::new(UInt8Vector::from_slice([0, 0, 0]));
        columns.push(sequences);
        columns.push(op_types);

        Batch::new(columns)
    }
}
|
||||
@@ -1,611 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//! Utilities for resolving schema compatibility problems.
|
||||
|
||||
use datatypes::arrow::record_batch::RecordBatch;
|
||||
use datatypes::schema::SchemaRef;
|
||||
use datatypes::vectors::{Helper, VectorRef};
|
||||
use snafu::{ensure, OptionExt, ResultExt};
|
||||
|
||||
use crate::error::{self, Result};
|
||||
use crate::metadata::ColumnMetadata;
|
||||
use crate::read::Batch;
|
||||
use crate::schema::{ProjectedSchemaRef, StoreSchemaRef};
|
||||
|
||||
/// Make schema compatible to write to target with another schema.
|
||||
pub trait CompatWrite {
|
||||
/// Makes the schema of `self` compatible with `dest_schema`.
|
||||
///
|
||||
/// For column in `dest_schema` but not in `self`, this method would insert a
|
||||
/// vector with default value.
|
||||
///
|
||||
/// If there are columns not in `dest_schema`, an error would be returned.
|
||||
fn compat_write(&mut self, dest_schema: &SchemaRef) -> Result<()>;
|
||||
}
|
||||
|
||||
/// Checks whether column with `source_column` could be read as a column with `dest_column`.
|
||||
///
|
||||
/// Returns
|
||||
/// - `Ok(true)` if `source_column` is compatible to read using `dest_column` as schema.
|
||||
/// - `Ok(false)` if they are considered different columns.
|
||||
/// - `Err` if there is incompatible issue that could not be resolved.
|
||||
fn is_source_column_compatible(
|
||||
source_column: &ColumnMetadata,
|
||||
dest_column: &ColumnMetadata,
|
||||
) -> Result<bool> {
|
||||
ensure!(
|
||||
source_column.name() == dest_column.name(),
|
||||
error::CompatReadSnafu {
|
||||
reason: format!(
|
||||
"try to use column in {} for column {}",
|
||||
source_column.name(),
|
||||
dest_column.name()
|
||||
),
|
||||
}
|
||||
);
|
||||
|
||||
if source_column.id() != dest_column.id() {
|
||||
return Ok(false);
|
||||
}
|
||||
|
||||
ensure!(
|
||||
source_column.desc.data_type == dest_column.desc.data_type,
|
||||
error::CompatReadSnafu {
|
||||
reason: format!(
|
||||
"could not read column {} from {:?} type as {:?} type",
|
||||
dest_column.name(),
|
||||
source_column.desc.data_type,
|
||||
dest_column.desc.data_type
|
||||
),
|
||||
}
|
||||
);
|
||||
|
||||
ensure!(
|
||||
dest_column.desc.is_nullable() || !source_column.desc.is_nullable(),
|
||||
error::CompatReadSnafu {
|
||||
reason: format!(
|
||||
"unable to read nullable data for non null column {}",
|
||||
dest_column.name()
|
||||
),
|
||||
}
|
||||
);
|
||||
|
||||
Ok(true)
|
||||
}
|
||||
|
||||
/// Adapter to help reading data with source schema as data with dest schema.
|
||||
#[derive(Debug)]
|
||||
pub struct ReadAdapter {
|
||||
/// Schema of data source.
|
||||
source_schema: StoreSchemaRef,
|
||||
/// Schema user expects to read.
|
||||
dest_schema: ProjectedSchemaRef,
|
||||
/// For each column in dest schema, stores the index in read result for
|
||||
/// this column, or None if the column is not in result.
|
||||
///
|
||||
/// This vec would be left empty if `source_version == dest_version`.
|
||||
indices_in_result: Vec<Option<usize>>,
|
||||
/// For each column in source schema, stores whether we need to read that column. All
|
||||
/// columns are needed by default.
|
||||
is_source_needed: Vec<bool>,
|
||||
}
|
||||
|
||||
impl ReadAdapter {
|
||||
/// Creates a new [ReadAdapter] that could convert data with `source_schema` into data
|
||||
/// with `dest_schema`.
|
||||
pub fn new(
|
||||
source_schema: StoreSchemaRef,
|
||||
dest_schema: ProjectedSchemaRef,
|
||||
) -> Result<ReadAdapter> {
|
||||
if source_schema.version() == dest_schema.schema_to_read().version() {
|
||||
ReadAdapter::from_same_version(source_schema, dest_schema)
|
||||
} else {
|
||||
ReadAdapter::from_different_version(source_schema, dest_schema)
|
||||
}
|
||||
}
|
||||
|
||||
fn from_same_version(
|
||||
source_schema: StoreSchemaRef,
|
||||
dest_schema: ProjectedSchemaRef,
|
||||
) -> Result<ReadAdapter> {
|
||||
let mut is_source_needed = vec![true; source_schema.num_columns()];
|
||||
if source_schema.num_columns() != dest_schema.schema_to_read().num_columns() {
|
||||
// `dest_schema` might be projected, so we need to find out value columns that not be read
|
||||
// by the `dest_schema`.
|
||||
|
||||
for (offset, field_column) in source_schema.field_columns().iter().enumerate() {
|
||||
// Iterate value columns in source and mark those not in destination as unneeded.
|
||||
if !dest_schema.is_needed(field_column.id()) {
|
||||
is_source_needed[source_schema.field_column_index_by_offset(offset)] = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(ReadAdapter {
|
||||
source_schema,
|
||||
dest_schema,
|
||||
indices_in_result: Vec::new(),
|
||||
is_source_needed,
|
||||
})
|
||||
}
|
||||
|
||||
fn from_different_version(
|
||||
source_schema: StoreSchemaRef,
|
||||
dest_schema: ProjectedSchemaRef,
|
||||
) -> Result<ReadAdapter> {
|
||||
let schema_to_read = dest_schema.schema_to_read();
|
||||
let mut indices_in_result = vec![None; schema_to_read.num_columns()];
|
||||
let mut is_source_needed = vec![true; source_schema.num_columns()];
|
||||
// Number of columns in result from source data.
|
||||
let mut num_columns_in_result = 0;
|
||||
|
||||
for (idx, source_column) in source_schema.columns().iter().enumerate() {
|
||||
// For each column in source schema, check whether we need to read it.
|
||||
if let Some(dest_idx) = schema_to_read
|
||||
.schema()
|
||||
.column_index_by_name(source_column.name())
|
||||
{
|
||||
let dest_column = &schema_to_read.columns()[dest_idx];
|
||||
// Check whether we could read this column.
|
||||
if is_source_column_compatible(source_column, dest_column)? {
|
||||
// Mark that this column could be read from source data, since some
|
||||
// columns in source schema would be skipped, we should not use
|
||||
// the source column's index directly.
|
||||
indices_in_result[dest_idx] = Some(num_columns_in_result);
|
||||
num_columns_in_result += 1;
|
||||
} else {
|
||||
// This column is not the same column in dest schema, should be fill by default value
|
||||
// instead of reading from source data.
|
||||
is_source_needed[idx] = false;
|
||||
}
|
||||
} else {
|
||||
// The column is not in `dest_schema`, we don't need to read it.
|
||||
is_source_needed[idx] = false;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(ReadAdapter {
|
||||
source_schema,
|
||||
dest_schema,
|
||||
indices_in_result,
|
||||
is_source_needed,
|
||||
})
|
||||
}
|
||||
|
||||
/// Returns a bool slice to denote which key column in source is needed.
|
||||
#[inline]
|
||||
pub fn source_key_needed(&self) -> &[bool] {
|
||||
&self.is_source_needed[..self.source_schema.row_key_end()]
|
||||
}
|
||||
|
||||
/// Returns a bool slice to denote which value column in source is needed.
|
||||
#[inline]
|
||||
pub fn source_value_needed(&self) -> &[bool] {
|
||||
&self.is_source_needed
|
||||
[self.source_schema.row_key_end()..self.source_schema.user_column_end()]
|
||||
}
|
||||
|
||||
/// Construct a new [Batch] from row key, value, sequence and op_type.
|
||||
///
|
||||
/// # Panics
|
||||
/// Panics if input `VectorRef` is empty.
|
||||
pub fn batch_from_parts(
|
||||
&self,
|
||||
row_key_columns: Vec<VectorRef>,
|
||||
mut field_columns: Vec<VectorRef>,
|
||||
sequences: VectorRef,
|
||||
op_types: VectorRef,
|
||||
) -> Result<Batch> {
|
||||
// Each vector should has same length, so here we just use the length of `sequence`.
|
||||
let num_rows = sequences.len();
|
||||
|
||||
let mut source = row_key_columns;
|
||||
// Reserve space for value, sequence and op_type
|
||||
source.reserve(field_columns.len() + 2);
|
||||
source.append(&mut field_columns);
|
||||
// Internal columns are push in sequence, op_type order.
|
||||
source.push(sequences);
|
||||
source.push(op_types);
|
||||
|
||||
if !self.need_compat() {
|
||||
return Ok(Batch::new(source));
|
||||
}
|
||||
|
||||
self.source_columns_to_batch(source, num_rows)
|
||||
}
|
||||
|
||||
/// Returns list of fields indices need to read from the parquet file.
|
||||
pub fn fields_to_read(&self) -> Vec<usize> {
|
||||
self.is_source_needed
|
||||
.iter()
|
||||
.enumerate()
|
||||
.filter_map(|(idx, needed)| if *needed { Some(idx) } else { None })
|
||||
.collect::<Vec<_>>()
|
||||
}
|
||||
|
||||
/// Convert [RecordBatch] read from the parquet file into [Batch].
|
||||
///
|
||||
/// The [RecordBatch] should have the same schema as [`ReadAdapter::fields_to_read()`].
|
||||
pub fn arrow_record_batch_to_batch(&self, record_batch: &RecordBatch) -> Result<Batch> {
|
||||
let names = self
|
||||
.source_schema
|
||||
.schema()
|
||||
.column_schemas()
|
||||
.iter()
|
||||
.zip(self.is_source_needed.iter())
|
||||
.filter_map(|(column_schema, is_needed)| {
|
||||
if *is_needed {
|
||||
Some(&column_schema.name)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
});
|
||||
let source = record_batch
|
||||
.columns()
|
||||
.iter()
|
||||
.zip(names)
|
||||
.map(|(column, name)| {
|
||||
Helper::try_into_vector(column.clone()).context(error::ConvertChunkSnafu { name })
|
||||
})
|
||||
.collect::<Result<_>>()?;
|
||||
|
||||
if !self.need_compat() || record_batch.num_rows() == 0 {
|
||||
return Ok(Batch::new(source));
|
||||
}
|
||||
|
||||
let num_rows = record_batch.num_rows();
|
||||
self.source_columns_to_batch(source, num_rows)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn need_compat(&self) -> bool {
|
||||
self.source_schema.version() != self.dest_schema.schema_to_read().version()
|
||||
}
|
||||
|
||||
fn source_columns_to_batch(&self, source: Vec<VectorRef>, num_rows: usize) -> Result<Batch> {
|
||||
let column_schemas = self.dest_schema.schema_to_read().schema().column_schemas();
|
||||
let columns = self
|
||||
.indices_in_result
|
||||
.iter()
|
||||
.zip(column_schemas)
|
||||
.map(|(index_opt, column_schema)| {
|
||||
if let Some(idx) = index_opt {
|
||||
Ok(source[*idx].clone())
|
||||
} else {
|
||||
let vector = column_schema
|
||||
.create_default_vector(num_rows)
|
||||
.context(error::CreateDefaultToReadSnafu {
|
||||
column: &column_schema.name,
|
||||
})?
|
||||
.context(error::NoDefaultToReadSnafu {
|
||||
column: &column_schema.name,
|
||||
})?;
|
||||
Ok(vector)
|
||||
}
|
||||
})
|
||||
.collect::<Result<Vec<_>>>()?;
|
||||
|
||||
Ok(Batch::new(columns))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use std::sync::Arc;

    use datatypes::data_type::ConcreteDataType;
    use datatypes::schema::Schema;
    use store_api::storage::ColumnDescriptorBuilder;

    use super::*;
    use crate::error::Error;
    use crate::metadata::RegionMetadata;
    use crate::schema::{tests, ProjectedSchema, RegionSchema};
    use crate::test_util::{descriptor_util, schema_util};

    /// Splits `batch` into its parts and feeds them to `ReadAdapter::batch_from_parts`.
    ///
    /// The batch is assumed to be laid out as two key columns, `num_field_columns`
    /// field columns, then the sequence and op_type columns.
    fn call_batch_from_parts(
        adapter: &ReadAdapter,
        batch: &Batch,
        num_field_columns: usize,
    ) -> Batch {
        let key = batch.columns()[0..2].to_vec();
        let value = batch.columns()[2..2 + num_field_columns].to_vec();
        let sequence = batch.column(2 + num_field_columns).clone();
        let op_type = batch.column(2 + num_field_columns + 1).clone();

        adapter
            .batch_from_parts(key, value, sequence, op_type)
            .unwrap()
    }

    /// Asserts that `batch_from_parts` reproduces `batch` unchanged.
    fn check_batch_from_parts_without_padding(
        adapter: &ReadAdapter,
        batch: &Batch,
        num_field_columns: usize,
    ) {
        let new_batch = call_batch_from_parts(adapter, batch, num_field_columns);
        assert_eq!(*batch, new_batch);
    }

    /// Wraps the columns of `batch` into a `RecordBatch` whose schema consists of the
    /// needed source columns, then converts it back via `arrow_record_batch_to_batch`.
    fn call_arrow_chunk_to_batch(adapter: &ReadAdapter, batch: &Batch) -> Batch {
        let columns_schema = adapter
            .source_schema
            .columns()
            .iter()
            .zip(adapter.is_source_needed.iter())
            .filter_map(|(field, is_needed)| {
                if *is_needed {
                    Some(field.to_column_schema().unwrap())
                } else {
                    None
                }
            })
            .collect::<Vec<_>>();
        let arrow_schema = Schema::try_new(columns_schema)
            .unwrap()
            .arrow_schema()
            .clone();
        let arrays = batch.columns().iter().map(|v| v.to_arrow_array()).collect();
        let chunk = RecordBatch::try_new(arrow_schema, arrays).unwrap();
        adapter.arrow_record_batch_to_batch(&chunk).unwrap()
    }

    /// Asserts that `arrow_record_batch_to_batch` reproduces `batch` unchanged.
    fn check_arrow_chunk_to_batch_without_padding(adapter: &ReadAdapter, batch: &Batch) {
        let new_batch = call_arrow_chunk_to_batch(adapter, batch);
        assert_eq!(*batch, new_batch);
    }

    /// Asserts that `new_batch` equals `batch` with extra all-null columns inserted
    /// at the positions listed in `null_columns`.
    fn check_batch_with_null_padding(batch: &Batch, new_batch: &Batch, null_columns: &[usize]) {
        assert_eq!(
            batch.num_columns() + null_columns.len(),
            new_batch.num_columns()
        );

        // Drop the padded columns; what remains must match the source batch.
        let columns_from_source = new_batch
            .columns()
            .iter()
            .enumerate()
            .filter_map(|(i, v)| {
                if null_columns.contains(&i) {
                    None
                } else {
                    Some(v.clone())
                }
            })
            .collect::<Vec<_>>();

        assert_eq!(batch.columns(), &columns_from_source);

        for idx in null_columns {
            assert!(new_batch.column(*idx).only_null());
        }
    }

    #[test]
    fn test_compat_same_schema() {
        // (k0, timestamp, v0, v1) with version 0.
        let region_schema = Arc::new(schema_util::new_region_schema(0, 2));
        let projected_schema = Arc::new(ProjectedSchema::no_projection(region_schema.clone()));
        let source_schema = region_schema.store_schema().clone();
        let adapter = ReadAdapter::new(source_schema, projected_schema).unwrap();

        assert_eq!(&[true, true], adapter.source_key_needed());
        assert_eq!(&[true, true], adapter.source_value_needed());

        let batch = tests::new_batch_with_num_values(2);
        check_batch_from_parts_without_padding(&adapter, &batch, 2);

        assert_eq!(&adapter.fields_to_read(), &[0, 1, 2, 3, 4, 5],);

        check_arrow_chunk_to_batch_without_padding(&adapter, &batch);
    }

    #[test]
    fn test_compat_same_version_with_projection() {
        // (k0, timestamp, v0, v1) with version 0.
        let region_schema = Arc::new(schema_util::new_region_schema(0, 2));
        // Just read v0, k0.
        let projected_schema =
            Arc::new(ProjectedSchema::new(region_schema.clone(), Some(vec![2, 0])).unwrap());

        let source_schema = region_schema.store_schema().clone();
        let adapter = ReadAdapter::new(source_schema, projected_schema).unwrap();

        assert_eq!(&[true, true], adapter.source_key_needed());
        assert_eq!(&[true, false], adapter.source_value_needed());

        // One value column has been filtered out, so the result batch should only
        // contain one value column.
        let batch = tests::new_batch_with_num_values(1);
        check_batch_from_parts_without_padding(&adapter, &batch, 1);

        assert_eq!(&adapter.fields_to_read(), &[0, 1, 2, 4, 5]);

        check_arrow_chunk_to_batch_without_padding(&adapter, &batch);
    }

    #[test]
    fn test_compat_old_column() {
        // (k0, timestamp, v0) with version 0.
        let region_schema_old = Arc::new(schema_util::new_region_schema(0, 1));
        // (k0, timestamp, v0) with version 1: same single value column as the old
        // schema, only the version differs.
        let region_schema_new = Arc::new(schema_util::new_region_schema(1, 1));

        // Just read v0, k0
        let projected_schema =
            Arc::new(ProjectedSchema::new(region_schema_new, Some(vec![2, 0])).unwrap());

        let source_schema = region_schema_old.store_schema().clone();
        let adapter = ReadAdapter::new(source_schema, projected_schema).unwrap();

        assert_eq!(&[true, true], adapter.source_key_needed());
        assert_eq!(&[true], adapter.source_value_needed());

        let batch = tests::new_batch_with_num_values(1);
        check_batch_from_parts_without_padding(&adapter, &batch, 1);

        assert_eq!(&adapter.fields_to_read(), &[0, 1, 2, 3, 4],);

        check_arrow_chunk_to_batch_without_padding(&adapter, &batch);
    }

    #[test]
    fn test_compat_new_column() {
        // (k0, timestamp, v0, v1) with version 0.
        let region_schema_old = Arc::new(schema_util::new_region_schema(0, 2));
        // (k0, timestamp, v0, v1, v2) with version 1.
        let region_schema_new = Arc::new(schema_util::new_region_schema(1, 3));

        // Just read v2, v0, k0
        let projected_schema =
            Arc::new(ProjectedSchema::new(region_schema_new, Some(vec![4, 2, 0])).unwrap());

        let source_schema = region_schema_old.store_schema().clone();
        let adapter = ReadAdapter::new(source_schema, projected_schema).unwrap();

        assert_eq!(&[true, true], adapter.source_key_needed());
        assert_eq!(&[true, false], adapter.source_value_needed());

        // Only read one value column from source.
        let batch = tests::new_batch_with_num_values(1);
        // New batch should contain k0, timestamp, v0, sequence, op_type.
        let new_batch = call_batch_from_parts(&adapter, &batch, 1);
        // v2 is filled by null.
        check_batch_with_null_padding(&batch, &new_batch, &[3]);

        assert_eq!(&adapter.fields_to_read(), &[0, 1, 2, 4, 5],);

        let new_batch = call_arrow_chunk_to_batch(&adapter, &batch);
        check_batch_with_null_padding(&batch, &new_batch, &[3]);
    }

    #[test]
    fn test_compat_different_column() {
        // (k0, timestamp, v0, v1) with version 0.
        let region_schema_old = Arc::new(schema_util::new_region_schema(0, 2));

        let mut descriptor = descriptor_util::desc_with_field_columns(tests::REGION_NAME, 2);
        // Assign a much larger column id to v0.
        descriptor.default_cf.columns[0].id = descriptor.default_cf.columns.last().unwrap().id + 10;
        let metadata: RegionMetadata = descriptor.try_into().unwrap();
        let columns = metadata.columns;
        // (k0, timestamp, v0, v1) with version 2, and v0 has different column id.
        let region_schema_new = Arc::new(RegionSchema::new(columns, 2).unwrap());

        let projected_schema = Arc::new(ProjectedSchema::no_projection(region_schema_new));
        let source_schema = region_schema_old.store_schema().clone();
        let adapter = ReadAdapter::new(source_schema, projected_schema).unwrap();

        assert_eq!(&[true, true], adapter.source_key_needed());
        // v0 is discarded as it has different column id than new schema's.
        assert_eq!(&[false, true], adapter.source_value_needed());

        // New batch should contain k0, timestamp, v1, sequence, op_type, so we need to
        // remove v0 from the created batch.
        let batch = tests::new_batch_with_num_values(2);
        let mut columns = batch.columns().to_vec();
        // Remove v0.
        let _ = columns.remove(2);
        let batch = Batch::new(columns);

        let new_batch = call_batch_from_parts(&adapter, &batch, 1);
        // v0 is filled by null.
        check_batch_with_null_padding(&batch, &new_batch, &[2]);

        assert_eq!(&adapter.fields_to_read(), &[0, 1, 3, 4, 5],);

        let new_batch = call_arrow_chunk_to_batch(&adapter, &batch);
        check_batch_with_null_padding(&batch, &new_batch, &[2]);
    }

    /// Returns a builder for an int32 column named "test" with id 10.
    #[inline]
    fn new_column_desc_builder() -> ColumnDescriptorBuilder {
        ColumnDescriptorBuilder::new(10, "test", ConcreteDataType::int32_datatype())
    }

    #[test]
    fn test_is_source_column_compatible() {
        let desc = new_column_desc_builder().build().unwrap();
        let source = ColumnMetadata { cf_id: 1, desc };

        // Same column is always compatible, also tests read nullable column
        // as a nullable column.
        assert!(is_source_column_compatible(&source, &source).unwrap());

        // Different id.
        let desc = new_column_desc_builder()
            .id(source.desc.id + 1)
            .build()
            .unwrap();
        let dest = ColumnMetadata { cf_id: 1, desc };
        assert!(!is_source_column_compatible(&source, &dest).unwrap());
    }

    #[test]
    fn test_nullable_column_read_by_not_null() {
        let desc = new_column_desc_builder().build().unwrap();
        assert!(desc.is_nullable());
        let source = ColumnMetadata { cf_id: 1, desc };

        let desc = new_column_desc_builder()
            .is_nullable(false)
            .build()
            .unwrap();
        let dest = ColumnMetadata { cf_id: 1, desc };

        let err = is_source_column_compatible(&source, &dest).unwrap_err();
        assert!(
            matches!(err, Error::CompatRead { .. }),
            "{err:?} is not CompatRead",
        );
    }

    #[test]
    fn test_read_not_null_column() {
        let desc = new_column_desc_builder()
            .is_nullable(false)
            .build()
            .unwrap();
        let source = ColumnMetadata { cf_id: 1, desc };

        // A not-null source column can be read as a not-null column ...
        let desc = new_column_desc_builder()
            .is_nullable(false)
            .build()
            .unwrap();
        let not_null_dest = ColumnMetadata { cf_id: 1, desc };
        assert!(is_source_column_compatible(&source, &not_null_dest).unwrap());

        // ... and as a nullable column.
        let desc = new_column_desc_builder().build().unwrap();
        let null_dest = ColumnMetadata { cf_id: 1, desc };
        assert!(is_source_column_compatible(&source, &null_dest).unwrap());
    }

    #[test]
    fn test_read_column_with_different_name() {
        let desc = new_column_desc_builder().build().unwrap();
        let source = ColumnMetadata { cf_id: 1, desc };

        let desc = new_column_desc_builder()
            .name(format!("{}_other", source.desc.name))
            .build()
            .unwrap();
        let dest = ColumnMetadata { cf_id: 1, desc };

        let err = is_source_column_compatible(&source, &dest).unwrap_err();
        assert!(
            matches!(err, Error::CompatRead { .. }),
            "{err:?} is not CompatRead",
        );
    }
}
|
||||
@@ -1,590 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::{BTreeSet, HashMap};
|
||||
use std::sync::Arc;
|
||||
|
||||
use api::v1::OpType;
|
||||
use common_base::BitVec;
|
||||
use datatypes::prelude::ScalarVector;
|
||||
use datatypes::schema::{SchemaBuilder, SchemaRef};
|
||||
use datatypes::vectors::{BooleanVector, UInt8Vector};
|
||||
use snafu::{ensure, ResultExt};
|
||||
use store_api::storage::{Chunk, ColumnId};
|
||||
|
||||
use crate::error;
|
||||
use crate::metadata::{self, Result};
|
||||
use crate::read::{Batch, BatchOp};
|
||||
use crate::schema::{RegionSchema, RegionSchemaRef, StoreSchema, StoreSchemaRef};
|
||||
|
||||
/// Metadata about projection.
|
||||
#[derive(Debug, Default)]
|
||||
struct Projection {
|
||||
/// Column indices of projection.
|
||||
projected_columns: Vec<usize>,
|
||||
/// Sorted and deduplicated indices of columns to read, includes all row key columns
|
||||
/// and internal columns.
|
||||
///
|
||||
/// We use these indices to read from data sources.
|
||||
columns_to_read: Vec<usize>,
|
||||
/// Maps column id to its index in `columns_to_read`.
|
||||
///
|
||||
/// Used to ask whether the column with given column id is needed in projection.
|
||||
id_to_read_idx: HashMap<ColumnId, usize>,
|
||||
/// Maps index of `projected_columns` to index of the column in `columns_to_read`.
|
||||
///
|
||||
/// Invariant:
|
||||
/// - `projected_idx_to_read_idx.len() == projected_columns.len()`
|
||||
projected_idx_to_read_idx: Vec<usize>,
|
||||
/// Number of user columns to read.
|
||||
num_user_columns: usize,
|
||||
}
|
||||
|
||||
impl Projection {
|
||||
fn new(region_schema: &RegionSchema, projected_columns: Vec<usize>) -> Projection {
|
||||
// Get a sorted list of column indices to read.
|
||||
let mut column_indices: BTreeSet<_> = projected_columns.iter().cloned().collect();
|
||||
column_indices.extend(region_schema.row_key_indices());
|
||||
let num_user_columns = column_indices.len();
|
||||
// Now insert internal columns.
|
||||
column_indices.extend([
|
||||
region_schema.sequence_index(),
|
||||
region_schema.op_type_index(),
|
||||
]);
|
||||
let columns_to_read: Vec<_> = column_indices.into_iter().collect();
|
||||
|
||||
// The region schema ensure that last two column must be internal columns.
|
||||
assert_eq!(
|
||||
region_schema.sequence_index(),
|
||||
columns_to_read[num_user_columns]
|
||||
);
|
||||
assert_eq!(
|
||||
region_schema.op_type_index(),
|
||||
columns_to_read[num_user_columns + 1]
|
||||
);
|
||||
|
||||
// Mapping: <column id> => <index in `columns_to_read`>
|
||||
let id_to_read_idx: HashMap<_, _> = columns_to_read
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(idx, col_idx)| (region_schema.column_metadata(*col_idx).id(), idx))
|
||||
.collect();
|
||||
// Use column id to find index in `columns_to_read` of a column in `projected_columns`.
|
||||
let projected_idx_to_read_idx = projected_columns
|
||||
.iter()
|
||||
.map(|col_idx| {
|
||||
let column_id = region_schema.column_metadata(*col_idx).id();
|
||||
// This unwrap() should be safe since `columns_to_read` must contains all columns in `projected_columns`.
|
||||
let read_idx = id_to_read_idx.get(&column_id).unwrap();
|
||||
*read_idx
|
||||
})
|
||||
.collect();
|
||||
|
||||
Projection {
|
||||
projected_columns,
|
||||
columns_to_read,
|
||||
id_to_read_idx,
|
||||
projected_idx_to_read_idx,
|
||||
num_user_columns,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Schema with projection info.
|
||||
#[derive(Debug)]
|
||||
pub struct ProjectedSchema {
|
||||
/// Projection info, `None` means don't need to do projection.
|
||||
projection: Option<Projection>,
|
||||
/// Schema used to read from data sources.
|
||||
schema_to_read: StoreSchemaRef,
|
||||
/// User schema after projection.
|
||||
projected_user_schema: SchemaRef,
|
||||
}
|
||||
|
||||
pub type ProjectedSchemaRef = Arc<ProjectedSchema>;
|
||||
|
||||
impl ProjectedSchema {
|
||||
/// Create a new `ProjectedSchema` with given `projected_columns`.
|
||||
///
|
||||
/// If `projected_columns` is None, then all columns would be read. If `projected_columns` is
|
||||
/// `Some`, then the `Vec` in it contains the indices of columns need to be read.
|
||||
///
|
||||
/// If the `Vec` is empty or contains invalid index, `Err` would be returned.
|
||||
pub fn new(
|
||||
region_schema: RegionSchemaRef,
|
||||
projected_columns: Option<Vec<usize>>,
|
||||
) -> Result<ProjectedSchema> {
|
||||
match projected_columns {
|
||||
Some(indices) => {
|
||||
Self::validate_projection(®ion_schema, &indices)?;
|
||||
|
||||
let projection = Projection::new(®ion_schema, indices);
|
||||
|
||||
let schema_to_read = Self::build_schema_to_read(®ion_schema, &projection)?;
|
||||
let projected_user_schema =
|
||||
Self::build_projected_user_schema(®ion_schema, &projection)?;
|
||||
|
||||
Ok(ProjectedSchema {
|
||||
projection: Some(projection),
|
||||
schema_to_read,
|
||||
projected_user_schema,
|
||||
})
|
||||
}
|
||||
None => Ok(ProjectedSchema::no_projection(region_schema)),
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a `ProjectedSchema` that read all columns.
|
||||
pub fn no_projection(region_schema: RegionSchemaRef) -> ProjectedSchema {
|
||||
// We could just reuse the StoreSchema and user schema.
|
||||
ProjectedSchema {
|
||||
projection: None,
|
||||
schema_to_read: region_schema.store_schema().clone(),
|
||||
projected_user_schema: region_schema.user_schema().clone(),
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn projected_user_schema(&self) -> &SchemaRef {
|
||||
&self.projected_user_schema
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn schema_to_read(&self) -> &StoreSchemaRef {
|
||||
&self.schema_to_read
|
||||
}
|
||||
|
||||
/// Convert [Batch] into [Chunk].
|
||||
///
|
||||
/// This will remove all internal columns. The input `batch` should has the
|
||||
/// same schema as [`self.schema_to_read()`](ProjectedSchema::schema_to_read).
|
||||
/// The output [Chunk] has the same schema as
|
||||
/// [`self.projected_user_schema()`](ProjectedSchema::projected_user_schema).
|
||||
pub fn batch_to_chunk(&self, batch: &Batch) -> Chunk {
|
||||
let columns = match &self.projection {
|
||||
Some(projection) => projection
|
||||
.projected_idx_to_read_idx
|
||||
.iter()
|
||||
.map(|col_idx| batch.column(*col_idx))
|
||||
.cloned()
|
||||
.collect(),
|
||||
None => {
|
||||
let num_user_columns = self.projected_user_schema.num_columns();
|
||||
batch
|
||||
.columns()
|
||||
.iter()
|
||||
.take(num_user_columns)
|
||||
.cloned()
|
||||
.collect()
|
||||
}
|
||||
};
|
||||
Chunk::new(columns)
|
||||
}
|
||||
|
||||
/// Returns true if column with given `column_id` is needed (in projection).
|
||||
pub fn is_needed(&self, column_id: ColumnId) -> bool {
|
||||
self.projection
|
||||
.as_ref()
|
||||
.map(|p| p.id_to_read_idx.contains_key(&column_id))
|
||||
.unwrap_or(true)
|
||||
}
|
||||
|
||||
fn build_schema_to_read(
|
||||
region_schema: &RegionSchema,
|
||||
projection: &Projection,
|
||||
) -> Result<StoreSchemaRef> {
|
||||
// Reorder columns according to the projection.
|
||||
let columns: Vec<_> = projection
|
||||
.columns_to_read
|
||||
.iter()
|
||||
.map(|col_idx| region_schema.column_metadata(*col_idx))
|
||||
.cloned()
|
||||
.collect();
|
||||
// All row key columns are reserved in this schema, so we can use the row_key_end
|
||||
// and timestamp_key_index from region schema.
|
||||
let store_schema = StoreSchema::new(
|
||||
columns,
|
||||
region_schema.version(),
|
||||
region_schema.row_key_end(),
|
||||
projection.num_user_columns,
|
||||
)?;
|
||||
|
||||
Ok(Arc::new(store_schema))
|
||||
}
|
||||
|
||||
fn build_projected_user_schema(
|
||||
region_schema: &RegionSchema,
|
||||
projection: &Projection,
|
||||
) -> Result<SchemaRef> {
|
||||
let column_schemas: Vec<_> = projection
|
||||
.projected_columns
|
||||
.iter()
|
||||
.map(|col_idx| {
|
||||
region_schema
|
||||
.column_metadata(*col_idx)
|
||||
.desc
|
||||
.to_column_schema()
|
||||
})
|
||||
.collect();
|
||||
|
||||
let schema = SchemaBuilder::try_from(column_schemas)
|
||||
.context(metadata::ConvertSchemaSnafu)?
|
||||
.version(region_schema.version())
|
||||
.build()
|
||||
.context(metadata::InvalidSchemaSnafu)?;
|
||||
|
||||
Ok(Arc::new(schema))
|
||||
}
|
||||
|
||||
fn validate_projection(region_schema: &RegionSchema, indices: &[usize]) -> Result<()> {
|
||||
// The projection indices should not be empty, at least the timestamp column
|
||||
// should be always read, and the `StoreSchema` also requires the timestamp column.
|
||||
ensure!(
|
||||
!indices.is_empty(),
|
||||
metadata::InvalidProjectionSnafu {
|
||||
msg: "at least one column should be read",
|
||||
}
|
||||
);
|
||||
|
||||
// Now only allowed to read user columns.
|
||||
let user_schema = region_schema.user_schema();
|
||||
for i in indices {
|
||||
ensure!(
|
||||
*i < user_schema.num_columns(),
|
||||
metadata::InvalidProjectionSnafu {
|
||||
msg: format!(
|
||||
"index {} out of bound, only contains {} columns",
|
||||
i,
|
||||
user_schema.num_columns()
|
||||
),
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl BatchOp for ProjectedSchema {
|
||||
fn compare_row(&self, left: &Batch, i: usize, right: &Batch, j: usize) -> Ordering {
|
||||
// Ordered by (row_key asc, sequence desc, op_type desc).
|
||||
let indices = self.schema_to_read.row_key_indices();
|
||||
for idx in indices {
|
||||
let (left_col, right_col) = (left.column(idx), right.column(idx));
|
||||
// Comparison of vector is done by virtual method calls currently. Consider using
|
||||
// enum dispatch if this becomes bottleneck.
|
||||
let order = left_col.get_ref(i).cmp(&right_col.get_ref(j));
|
||||
if order != Ordering::Equal {
|
||||
return order;
|
||||
}
|
||||
}
|
||||
let (sequence_index, op_type_index) = (
|
||||
self.schema_to_read.sequence_index(),
|
||||
self.schema_to_read.op_type_index(),
|
||||
);
|
||||
right
|
||||
.column(sequence_index)
|
||||
.get_ref(j)
|
||||
.cmp(&left.column(sequence_index).get_ref(i))
|
||||
.then_with(|| {
|
||||
right
|
||||
.column(op_type_index)
|
||||
.get_ref(j)
|
||||
.cmp(&left.column(op_type_index).get_ref(i))
|
||||
})
|
||||
}
|
||||
|
||||
fn find_unique(&self, batch: &Batch, selected: &mut BitVec, prev: Option<&Batch>) {
|
||||
if let Some(prev) = prev {
|
||||
assert_eq!(batch.num_columns(), prev.num_columns());
|
||||
}
|
||||
let indices = self.schema_to_read.row_key_indices();
|
||||
for idx in indices {
|
||||
let (current, prev_col) = (
|
||||
batch.column(idx),
|
||||
prev.map(|prev| prev.column(idx).as_ref()),
|
||||
);
|
||||
current.find_unique(selected, prev_col);
|
||||
}
|
||||
}
|
||||
|
||||
fn filter(&self, batch: &Batch, filter: &BooleanVector) -> error::Result<Batch> {
|
||||
let columns = batch
|
||||
.columns()
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(i, v)| {
|
||||
v.filter(filter).context(error::FilterColumnSnafu {
|
||||
name: self.schema_to_read.column_name(i),
|
||||
})
|
||||
})
|
||||
.collect::<error::Result<Vec<_>>>()?;
|
||||
|
||||
Ok(Batch::new(columns))
|
||||
}
|
||||
|
||||
fn unselect_deleted(&self, batch: &Batch, selected: &mut BitVec) {
|
||||
let op_types = batch.column(self.schema_to_read.op_type_index());
|
||||
// Safety: We expect the batch has the same schema as `self.schema_to_read`. The
|
||||
// read procedure should guarantee this, otherwise this is a critical bug and it
|
||||
// should be fine to panic.
|
||||
let op_types = op_types
|
||||
.as_any()
|
||||
.downcast_ref::<UInt8Vector>()
|
||||
.unwrap_or_else(|| {
|
||||
panic!(
|
||||
"Expect op_type (UInt8) column at index {}, given {:?}",
|
||||
self.schema_to_read.op_type_index(),
|
||||
op_types.data_type()
|
||||
);
|
||||
});
|
||||
|
||||
for (i, op_type) in op_types.iter_data().enumerate() {
|
||||
if op_type == Some(OpType::Delete as u8) {
|
||||
selected.set(i, false);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use api::v1::OpType;
|
||||
use datatypes::prelude::ScalarVector;
|
||||
use datatypes::type_id::LogicalTypeId;
|
||||
use datatypes::vectors::{TimestampMillisecondVector, VectorRef};
|
||||
|
||||
use super::*;
|
||||
use crate::metadata::Error;
|
||||
use crate::schema::tests;
|
||||
use crate::test_util::{read_util, schema_util};
|
||||
|
||||
/// Checks the metadata computed by `Projection::new` for an ordered, an
/// unordered and an empty projection.
#[test]
fn test_projection() {
    // Build a region schema with 2 value columns. So the final user schema is
    // (k0, timestamp, v0, v1)
    let region_schema = schema_util::new_region_schema(0, 2);

    // Projection, but still keep column order.
    // After projection: (timestamp, v0)
    let projected_columns = vec![1, 2];
    let projection = Projection::new(&region_schema, projected_columns.clone());
    assert_eq!(projected_columns, projection.projected_columns);
    // Need to read (k0, timestamp, v0, sequence, op_type)
    assert_eq!(&[0, 1, 2, 4, 5], &projection.columns_to_read[..]);
    assert_eq!(5, projection.id_to_read_idx.len());
    // Index of timestamp, v0 in `columns_to_read`
    assert_eq!(&[1, 2], &projection.projected_idx_to_read_idx[..]);
    // 3 columns: k0, timestamp, v0
    assert_eq!(3, projection.num_user_columns);

    // Projection, unordered.
    // After projection: (timestamp, v1, k0)
    let projected_columns = vec![1, 3, 0];
    let projection = Projection::new(&region_schema, projected_columns.clone());
    assert_eq!(projected_columns, projection.projected_columns);
    // Need to read (k0, timestamp, v1, sequence, op_type)
    assert_eq!(&[0, 1, 3, 4, 5], &projection.columns_to_read[..]);
    assert_eq!(5, projection.id_to_read_idx.len());
    // Index of timestamp, v1, k0 in `columns_to_read`
    assert_eq!(&[1, 2, 0], &projection.projected_idx_to_read_idx[..]);
    // 3 columns: k0, timestamp, v1
    assert_eq!(3, projection.num_user_columns);

    // Empty projection.
    let projection = Projection::new(&region_schema, Vec::new());
    assert!(projection.projected_columns.is_empty());
    // Still need to read row keys.
    assert_eq!(&[0, 1, 4, 5], &projection.columns_to_read[..]);
    assert_eq!(4, projection.id_to_read_idx.len());
    assert!(projection.projected_idx_to_read_idx.is_empty());
    assert_eq!(2, projection.num_user_columns);
}
|
||||
|
||||
/// Exercises `ProjectedSchema` with a projection: the projected user schema,
/// `is_needed`, the schema to read, and `batch_to_chunk`.
#[test]
fn test_projected_schema_with_projection() {
    // Region schema: (k0, timestamp, v0, v1, v2)
    let region_schema = Arc::new(schema_util::new_region_schema(123, 3));

    // Project to (v1, timestamp).
    let projected_schema =
        ProjectedSchema::new(region_schema.clone(), Some(vec![3, 1])).unwrap();
    let expect_user = schema_util::new_schema_with_version(
        &[
            ("v1", LogicalTypeId::Int64, true),
            ("timestamp", LogicalTypeId::TimestampMillisecond, false),
        ],
        Some(1),
        123,
    );
    assert_eq!(expect_user, **projected_schema.projected_user_schema());

    // Collect the indices of all region columns reported as needed.
    let needed: Vec<_> = region_schema
        .columns()
        .iter()
        .enumerate()
        .filter(|(_, column_meta)| projected_schema.is_needed(column_meta.id()))
        .map(|(idx, _)| idx)
        .collect();
    // (k0, timestamp, v1, sequence, op_type)
    assert_eq!(&[0, 1, 3, 5, 6], &needed[..]);

    // Switch to another projection: (v0, timestamp).
    let projected_schema = ProjectedSchema::new(region_schema, Some(vec![2, 1])).unwrap();

    // The schema to read should equal the store schema of a region with
    // (k0, timestamp, v0). `new_schema_with_version()` can't be used here because
    // the StoreSchema also stores metadata that `new_schema_with_version()` can't
    // store.
    let expect_schema = schema_util::new_region_schema(123, 1);
    assert_eq!(
        expect_schema.store_schema(),
        projected_schema.schema_to_read()
    );

    // Batch layout: (k0, timestamp, v0, sequence, op_type)
    let batch = tests::new_batch();
    // Converting the batch must yield a chunk laid out as (v0, timestamp).
    let chunk = projected_schema.batch_to_chunk(&batch);
    assert_eq!(2, chunk.columns.len());
    let v0_column = batch.column(2);
    let timestamp_column = batch.column(1);
    assert_eq!(&chunk.columns[0], v0_column);
    assert_eq!(&chunk.columns[1], timestamp_column);
}
|
||||
|
||||
/// Without a projection the projected schema must expose the region's user
/// schema and store schema unchanged and need every column.
#[test]
fn test_projected_schema_no_projection() {
    // Region columns: (k0, timestamp, v0)
    let region_schema = Arc::new(schema_util::new_region_schema(123, 1));
    let projected_schema = ProjectedSchema::no_projection(region_schema.clone());

    let user_schema = region_schema.user_schema();
    assert_eq!(user_schema, projected_schema.projected_user_schema());
    let store_schema = region_schema.store_schema();
    assert_eq!(store_schema, projected_schema.schema_to_read());

    // Every column of the region is needed.
    region_schema
        .columns()
        .iter()
        .for_each(|column| assert!(projected_schema.is_needed(column.id())));

    // Batch columns: (k0, timestamp, v0, sequence, op_type)
    let batch = tests::new_batch();
    // The resulting chunk keeps the user columns (k0, timestamp, v0).
    let chunk = projected_schema.batch_to_chunk(&batch);
    assert_eq!(3, chunk.columns.len());
}
|
||||
|
||||
/// An explicitly empty projection is rejected with `InvalidProjection`.
#[test]
fn test_projected_schema_empty_projection() {
    // Region columns: (k0, timestamp, v0)
    let region_schema = Arc::new(schema_util::new_region_schema(123, 1));

    let empty_projection = Some(Vec::new());
    let result = ProjectedSchema::new(region_schema, empty_projection);
    let err = result.err().unwrap();
    assert!(matches!(err, Error::InvalidProjection { .. }));
}
|
||||
|
||||
/// Compares the single row of `left` against each row of `right` and checks
/// the ordering reported by `compare_row`.
#[test]
fn test_compare_batch() {
    let schema = read_util::new_projected_schema();
    let left = read_util::new_full_kv_batch(&[(1000, 1, 1000, OpType::Put)]);
    let right = read_util::new_full_kv_batch(&[
        (999, 1, 1000, OpType::Put),
        (1000, 1, 999, OpType::Put),
        (1000, 1, 1000, OpType::Put),
    ]);

    // Expected ordering of left row 0 versus right rows 0, 1 and 2.
    let expected = [Ordering::Greater, Ordering::Less, Ordering::Equal];
    for (right_idx, ordering) in expected.iter().copied().enumerate() {
        assert_eq!(ordering, schema.compare_row(&left, 0, &right, right_idx));
    }
}
|
||||
|
||||
/// Checks which rows `find_unique` selects, first without and then with a
/// previous batch.
#[test]
fn test_batch_find_unique() {
    let schema = read_util::new_projected_schema();
    let batch = read_util::new_kv_batch(&[(1000, Some(1)), (2000, Some(2)), (2000, Some(2))]);

    // No previous batch: the first two rows are selected, the duplicate is not.
    let mut selected = BitVec::repeat(false, 3);
    schema.find_unique(&batch, &mut selected, None);
    assert_eq!(BitVec::from_iter([true, true, false]), selected);

    // With a previous batch holding key 1000, row 0 is no longer selected.
    let mut selected = BitVec::repeat(false, 3);
    let prev = read_util::new_kv_batch(&[(1000, Some(1))]);
    schema.find_unique(&batch, &mut selected, Some(&prev));
    assert_eq!(BitVec::from_iter([false, true, false]), selected);
}
|
||||
|
||||
/// Checks `find_unique` on a batch where the same timestamp appears with
/// different op types.
#[test]
fn test_find_unique_with_op() {
    let schema = read_util::new_projected_schema();
    let rows = [
        (1001, 1, 3, OpType::Put),
        (1000, 1, 2, OpType::Delete),
        (1000, 1, 1, OpType::Put),
    ];
    let batch = read_util::new_full_kv_batch(&rows);

    let mut selected = BitVec::repeat(false, 3);
    schema.find_unique(&batch, &mut selected, None);
    // Rows 0 and 1 are selected; row 2 is not.
    assert_eq!(BitVec::from_iter([true, true, false]), selected);
}
|
||||
|
||||
/// Filters a three-row batch with a boolean mask and checks the first column
/// of the result.
#[test]
fn test_filter_batch() {
    let schema = read_util::new_projected_schema();
    let batch = read_util::new_kv_batch(&[(1000, Some(1)), (2000, Some(2)), (3000, Some(3))]);
    // Keep rows 0 and 2.
    let filter = BooleanVector::from_slice(&[true, false, true]);

    let res = schema.filter(&batch, &filter).unwrap();

    let kept_timestamps = TimestampMillisecondVector::from_values([1000, 3000]);
    let expect: VectorRef = Arc::new(kept_timestamps);
    assert_eq!(expect, *res.column(0));
}
|
||||
|
||||
/// `unselect_deleted` must clear the selection bit of every `Delete` row and
/// leave `Put` rows selected.
#[test]
fn test_unselect_deleted() {
    let schema = read_util::new_projected_schema();
    let rows = [
        (100, 1, 1000, OpType::Put),
        (101, 1, 999, OpType::Delete),
        (102, 1, 1000, OpType::Put),
        (103, 1, 999, OpType::Put),
        (104, 1, 1000, OpType::Delete),
    ];
    let batch = read_util::new_full_kv_batch(&rows);

    // Start with every row selected.
    let mut selected = BitVec::repeat(true, batch.num_rows());
    schema.unselect_deleted(&batch, &mut selected);

    let expect = BitVec::from_iter([true, false, true, true, false]);
    assert_eq!(expect, selected);
}
|
||||
}
|
||||
@@ -1,214 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::fmt;
|
||||
use std::sync::Arc;
|
||||
|
||||
use datatypes::schema::{Schema, SchemaBuilder, SchemaRef};
|
||||
use snafu::ResultExt;
|
||||
|
||||
use crate::metadata::{self, ColumnMetadata, ColumnsMetadata, ColumnsMetadataRef, Result};
|
||||
use crate::schema::{StoreSchema, StoreSchemaRef};
|
||||
|
||||
/// Schema of region.
|
||||
///
|
||||
/// The `RegionSchema` has the knowledge of reserved and internal columns.
|
||||
/// Reserved columns are columns that their names, ids are reserved by the storage
|
||||
/// engine, and could not be used by the user. Reserved columns usually have
|
||||
/// special usage. Reserved columns expect the version columns are also
|
||||
/// called internal columns (though the version could also be thought as a
|
||||
/// special kind of internal column), are not visible to user, such as our
|
||||
/// internal sequence, op_type columns.
|
||||
///
|
||||
/// The user schema is the schema that only contains columns that user could visit,
|
||||
/// as well as what the schema user created.
|
||||
#[derive(PartialEq, Eq)]
|
||||
pub struct RegionSchema {
|
||||
/// Schema that only contains columns that user defined, excluding internal columns
|
||||
/// that are reserved and used by the storage engine.
|
||||
///
|
||||
/// Holding a [SchemaRef] to allow converting into `SchemaRef`/`arrow::SchemaRef`
|
||||
/// conveniently. The fields order in `SchemaRef` **must** be consistent with
|
||||
/// columns order in [ColumnsMetadata] to ensure the projection index of a field
|
||||
/// is correct.
|
||||
user_schema: SchemaRef,
|
||||
/// store schema contains all columns of the region, including all internal columns.
|
||||
store_schema: StoreSchemaRef,
|
||||
/// Metadata of columns.
|
||||
columns: ColumnsMetadataRef,
|
||||
}
|
||||
|
||||
impl fmt::Debug for RegionSchema {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
f.debug_struct("RegionSchema")
|
||||
.field("columns", &self.columns)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
impl RegionSchema {
|
||||
pub fn new(columns: ColumnsMetadataRef, version: u32) -> Result<RegionSchema> {
|
||||
let user_schema = Arc::new(build_user_schema(&columns, version)?);
|
||||
let store_schema = Arc::new(StoreSchema::from_columns_metadata(&columns, version)?);
|
||||
|
||||
debug_assert_eq!(user_schema.version(), store_schema.version());
|
||||
debug_assert_eq!(version, user_schema.version());
|
||||
|
||||
Ok(RegionSchema {
|
||||
user_schema,
|
||||
store_schema,
|
||||
columns,
|
||||
})
|
||||
}
|
||||
|
||||
/// Returns the schema of the region, excluding internal columns that used by
|
||||
/// the storage engine.
|
||||
#[inline]
|
||||
pub fn user_schema(&self) -> &SchemaRef {
|
||||
&self.user_schema
|
||||
}
|
||||
|
||||
/// Returns the schema actually stores, which would also contains all internal columns.
|
||||
#[inline]
|
||||
pub fn store_schema(&self) -> &StoreSchemaRef {
|
||||
&self.store_schema
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn row_key_columns(&self) -> impl Iterator<Item = &ColumnMetadata> {
|
||||
self.columns.iter_row_key_columns()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn field_columns(&self) -> impl Iterator<Item = &ColumnMetadata> {
|
||||
self.columns.iter_field_columns()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn num_row_key_columns(&self) -> usize {
|
||||
self.columns.num_row_key_columns()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn num_field_columns(&self) -> usize {
|
||||
self.columns.num_field_columns()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn version(&self) -> u32 {
|
||||
self.user_schema.version()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub(crate) fn row_key_end(&self) -> usize {
|
||||
self.columns.row_key_end()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub(crate) fn sequence_index(&self) -> usize {
|
||||
self.store_schema.sequence_index()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub(crate) fn op_type_index(&self) -> usize {
|
||||
self.store_schema.op_type_index()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub(crate) fn row_key_indices(&self) -> impl Iterator<Item = usize> {
|
||||
self.store_schema.row_key_indices()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn timestamp_index(&self) -> usize {
|
||||
self.store_schema.timestamp_index()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub(crate) fn timestamp_column_name(&self) -> &str {
|
||||
self.store_schema.column_name(self.timestamp_index())
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub(crate) fn value_indices(&self) -> impl Iterator<Item = usize> {
|
||||
self.store_schema.value_indices()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn column_metadata(&self, idx: usize) -> &ColumnMetadata {
|
||||
self.columns.column_metadata(idx)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) fn columns(&self) -> &[ColumnMetadata] {
|
||||
self.columns.columns()
|
||||
}
|
||||
}
|
||||
|
||||
/// Reference-counted ([Arc]) handle to a [RegionSchema].
pub type RegionSchemaRef = Arc<RegionSchema>;
|
||||
|
||||
// Now user schema don't have extra metadata like store schema.
|
||||
fn build_user_schema(columns: &ColumnsMetadata, version: u32) -> Result<Schema> {
|
||||
let column_schemas: Vec<_> = columns
|
||||
.iter_user_columns()
|
||||
.map(|col| col.desc.to_column_schema())
|
||||
.collect();
|
||||
|
||||
SchemaBuilder::try_from(column_schemas)
|
||||
.context(metadata::ConvertSchemaSnafu)?
|
||||
.version(version)
|
||||
.build()
|
||||
.context(metadata::InvalidSchemaSnafu)
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use datatypes::type_id::LogicalTypeId;

    use super::*;
    use crate::test_util::schema_util;

    /// Verifies the user schema, row key columns, field columns and version
    /// of a region schema with one value column.
    #[test]
    fn test_region_schema() {
        let region_schema = Arc::new(schema_util::new_region_schema(123, 1));

        let expected_fields = [
            ("k0", LogicalTypeId::Int64, false),
            ("timestamp", LogicalTypeId::TimestampMillisecond, false),
            ("v0", LogicalTypeId::Int64, true),
        ];
        let expect_schema = schema_util::new_schema_with_version(&expected_fields, Some(1), 123);
        assert_eq!(expect_schema, **region_schema.user_schema());

        // Row key columns, in order.
        let row_key_names: Vec<_> = region_schema
            .row_key_columns()
            .map(|column| column.desc.name.as_str())
            .collect();
        assert_eq!(vec!["k0", "timestamp"], row_key_names);
        assert_eq!(2, region_schema.num_row_key_columns());

        // Value (field) columns, in order.
        let field_names: Vec<_> = region_schema
            .field_columns()
            .map(|column| column.desc.name.as_str())
            .collect();
        assert_eq!(vec!["v0"], field_names);
        assert_eq!(1, region_schema.num_field_columns());

        // Version.
        assert_eq!(123, region_schema.version());
    }
}
|
||||
@@ -1,323 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
|
||||
use datatypes::arrow::datatypes::Schema as ArrowSchema;
|
||||
use datatypes::arrow::record_batch::RecordBatch;
|
||||
use datatypes::schema::{Schema, SchemaBuilder, SchemaRef};
|
||||
use snafu::{ensure, OptionExt, ResultExt};
|
||||
use store_api::storage::consts;
|
||||
|
||||
use crate::error::NewRecordBatchSnafu;
|
||||
use crate::metadata::{self, ColumnMetadata, ColumnsMetadata, Error, Result};
|
||||
use crate::read::Batch;
|
||||
|
||||
/// Schema metadata key under which the row key end index is stored.
const ROW_KEY_END_KEY: &str = "greptime:storage:row_key_end";
|
||||
/// Schema metadata key under which the user column end index is stored.
const USER_COLUMN_END_KEY: &str = "greptime:storage:user_column_end";
|
||||
|
||||
/// Schema that contains storage engine specific metadata, such as internal columns.
|
||||
///
|
||||
/// Used internally, contains all row key columns, internal columns and a sub set of
|
||||
/// value columns in a region. The columns are organized in `key, value, internal` order.
|
||||
#[derive(Debug, PartialEq, Eq)]
|
||||
pub struct StoreSchema {
|
||||
columns: Vec<ColumnMetadata>,
|
||||
schema: SchemaRef,
|
||||
row_key_end: usize,
|
||||
user_column_end: usize,
|
||||
}
|
||||
|
||||
/// Reference-counted ([Arc]) handle to a [StoreSchema].
pub type StoreSchemaRef = Arc<StoreSchema>;
|
||||
|
||||
impl StoreSchema {
|
||||
#[inline]
|
||||
pub fn version(&self) -> u32 {
|
||||
self.schema.version()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn schema(&self) -> &SchemaRef {
|
||||
&self.schema
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn arrow_schema(&self) -> &Arc<ArrowSchema> {
|
||||
self.schema.arrow_schema()
|
||||
}
|
||||
|
||||
// TODO(yingwen): Remove this method.
|
||||
pub fn batch_to_arrow_record_batch(
|
||||
&self,
|
||||
batch: &Batch,
|
||||
) -> std::result::Result<RecordBatch, crate::error::Error> {
|
||||
assert_eq!(self.schema.num_columns(), batch.num_columns(),);
|
||||
RecordBatch::try_new(
|
||||
self.schema.arrow_schema().clone(),
|
||||
batch.columns().iter().map(|v| v.to_arrow_array()).collect(),
|
||||
)
|
||||
.context(NewRecordBatchSnafu)
|
||||
}
|
||||
|
||||
/// Returns the ending index of row key columns.
|
||||
///
|
||||
/// The ending index has the same value as the number of the row key columns.
|
||||
#[inline]
|
||||
pub fn row_key_end(&self) -> usize {
|
||||
self.row_key_end
|
||||
}
|
||||
|
||||
/// Returns the index of timestamp column.
|
||||
/// We always assume that timestamp is the last column in [StoreSchema].
|
||||
#[inline]
|
||||
pub fn timestamp_index(&self) -> usize {
|
||||
self.row_key_end - 1
|
||||
}
|
||||
|
||||
pub(crate) fn contains_column(&self, name: &str) -> bool {
|
||||
self.schema.column_schema_by_name(name).is_some()
|
||||
}
|
||||
|
||||
pub(crate) fn is_key_column(&self, name: &str) -> bool {
|
||||
self.schema
|
||||
.column_index_by_name(name)
|
||||
.map(|idx| idx < self.row_key_end)
|
||||
.unwrap_or(false)
|
||||
}
|
||||
|
||||
pub(crate) fn is_user_column(&self, name: &str) -> bool {
|
||||
self.schema
|
||||
.column_index_by_name(name)
|
||||
.map(|idx| idx < self.user_column_end)
|
||||
.unwrap_or(false)
|
||||
}
|
||||
|
||||
pub(crate) fn from_columns_metadata(
|
||||
columns: &ColumnsMetadata,
|
||||
version: u32,
|
||||
) -> Result<StoreSchema> {
|
||||
StoreSchema::new(
|
||||
columns.columns().to_vec(),
|
||||
version,
|
||||
columns.row_key_end(),
|
||||
columns.user_column_end(),
|
||||
)
|
||||
}
|
||||
|
||||
pub(crate) fn new(
|
||||
columns: Vec<ColumnMetadata>,
|
||||
version: u32,
|
||||
row_key_end: usize,
|
||||
user_column_end: usize,
|
||||
) -> Result<StoreSchema> {
|
||||
let column_schemas = columns
|
||||
.iter()
|
||||
.map(|meta| meta.to_column_schema())
|
||||
.collect::<Result<Vec<_>>>()?;
|
||||
|
||||
let schema = SchemaBuilder::try_from(column_schemas)
|
||||
.context(metadata::ConvertSchemaSnafu)?
|
||||
.version(version)
|
||||
.add_metadata(ROW_KEY_END_KEY, row_key_end.to_string())
|
||||
.add_metadata(USER_COLUMN_END_KEY, user_column_end.to_string())
|
||||
.build()
|
||||
.context(metadata::InvalidSchemaSnafu)?;
|
||||
|
||||
assert_eq!(
|
||||
consts::SEQUENCE_COLUMN_NAME,
|
||||
schema.column_schemas()[user_column_end].name
|
||||
);
|
||||
assert_eq!(
|
||||
consts::OP_TYPE_COLUMN_NAME,
|
||||
schema.column_schemas()[user_column_end + 1].name
|
||||
);
|
||||
|
||||
Ok(StoreSchema {
|
||||
columns,
|
||||
schema: Arc::new(schema),
|
||||
row_key_end,
|
||||
user_column_end,
|
||||
})
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub(crate) fn sequence_index(&self) -> usize {
|
||||
self.user_column_end
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub(crate) fn op_type_index(&self) -> usize {
|
||||
self.user_column_end + 1
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub(crate) fn row_key_indices(&self) -> impl Iterator<Item = usize> {
|
||||
0..self.row_key_end
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub(crate) fn value_indices(&self) -> impl Iterator<Item = usize> {
|
||||
self.row_key_end..self.user_column_end
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub(crate) fn column_name(&self, idx: usize) -> &str {
|
||||
&self.schema.column_schemas()[idx].name
|
||||
}
|
||||
|
||||
/// # Panic
|
||||
/// Panics if `name` is not a valid column name.
|
||||
#[inline]
|
||||
pub(crate) fn column_index(&self, name: &str) -> usize {
|
||||
self.schema.column_index_by_name(name).unwrap()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub(crate) fn num_columns(&self) -> usize {
|
||||
self.schema.num_columns()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub(crate) fn user_column_end(&self) -> usize {
|
||||
self.user_column_end
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub(crate) fn field_columns(&self) -> &[ColumnMetadata] {
|
||||
&self.columns[self.row_key_end..self.user_column_end]
|
||||
}
|
||||
|
||||
/// Returns the index of the value column according its `offset`.
|
||||
#[inline]
|
||||
pub(crate) fn field_column_index_by_offset(&self, offset: usize) -> usize {
|
||||
self.row_key_end + offset
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub(crate) fn columns(&self) -> &[ColumnMetadata] {
|
||||
&self.columns
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<Arc<ArrowSchema>> for StoreSchema {
|
||||
type Error = Error;
|
||||
|
||||
fn try_from(arrow_schema: Arc<ArrowSchema>) -> std::result::Result<Self, Self::Error> {
|
||||
let schema = Schema::try_from(arrow_schema).context(metadata::ConvertArrowSchemaSnafu)?;
|
||||
// Recover other metadata from schema.
|
||||
let row_key_end = parse_index_from_metadata(schema.metadata(), ROW_KEY_END_KEY)?;
|
||||
let user_column_end = parse_index_from_metadata(schema.metadata(), USER_COLUMN_END_KEY)?;
|
||||
|
||||
// There should be sequence and op_type columns.
|
||||
ensure!(
|
||||
consts::SEQUENCE_COLUMN_NAME == schema.column_schemas()[user_column_end].name,
|
||||
metadata::InvalidIndexSnafu
|
||||
);
|
||||
ensure!(
|
||||
consts::OP_TYPE_COLUMN_NAME == schema.column_schemas()[user_column_end + 1].name,
|
||||
metadata::InvalidIndexSnafu
|
||||
);
|
||||
|
||||
// Recover ColumnMetadata from schema.
|
||||
let columns = schema
|
||||
.column_schemas()
|
||||
.iter()
|
||||
.map(ColumnMetadata::from_column_schema)
|
||||
.collect::<Result<_>>()?;
|
||||
|
||||
Ok(StoreSchema {
|
||||
columns,
|
||||
schema: Arc::new(schema),
|
||||
row_key_end,
|
||||
user_column_end,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<ArrowSchema> for StoreSchema {
|
||||
type Error = Error;
|
||||
|
||||
fn try_from(arrow_schema: ArrowSchema) -> std::result::Result<StoreSchema, Self::Error> {
|
||||
StoreSchema::try_from(Arc::new(arrow_schema))
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_index_from_metadata(metadata: &HashMap<String, String>, key: &str) -> Result<usize> {
|
||||
let value = metadata
|
||||
.get(key)
|
||||
.context(metadata::MetaNotFoundSnafu { key })?;
|
||||
value.parse().with_context(|_| metadata::ParseMetaIntSnafu {
|
||||
key_value: format!("{key}={value}"),
|
||||
})
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::read::Batch;
    use crate::schema::tests;
    use crate::test_util::schema_util;

    /// Asserts that `record_batch` has 5 columns and 3 rows and that each
    /// column equals the arrow array of the matching column in `batch`.
    fn check_chunk_batch(record_batch: &RecordBatch, batch: &Batch) {
        assert_eq!(5, record_batch.num_columns());
        assert_eq!(3, record_batch.num_rows());

        (0..5).for_each(|idx| {
            assert_eq!(record_batch.column(idx), &batch.column(idx).to_arrow_array());
        });
    }

    /// Verifies the arrow round trip, column schemas, index accessors and
    /// batch conversion of a store schema.
    #[test]
    fn test_store_schema() {
        let region_schema = Arc::new(schema_util::new_region_schema(123, 1));

        // The store schema survives a round trip through its arrow schema.
        let store_schema = region_schema.store_schema();
        assert_eq!(123, store_schema.version());
        let sst_arrow_schema = store_schema.arrow_schema();
        let owned_arrow_schema = ArrowSchema::clone(sst_arrow_schema);
        let converted_store_schema = StoreSchema::try_from(owned_arrow_schema).unwrap();
        assert_eq!(**store_schema, converted_store_schema);

        let column_schemas: Vec<_> = region_schema
            .columns()
            .iter()
            .map(|meta| meta.to_column_schema().unwrap())
            .collect();
        let expect_schema = SchemaBuilder::try_from(column_schemas)
            .unwrap()
            .version(123)
            .build()
            .unwrap();
        // Only column schemas are compared, since the SchemaRef in StoreSchema
        // also contains other metadata that is only used by StoreSchema.
        assert_eq!(
            expect_schema.column_schemas(),
            store_schema.schema().column_schemas(),
        );

        // Index accessors.
        assert_eq!(3, store_schema.sequence_index());
        assert_eq!(4, store_schema.op_type_index());
        let row_key_indices = store_schema.row_key_indices().collect::<Vec<_>>();
        assert_eq!([0, 1], &row_key_indices[..]);
        let value_indices = store_schema.value_indices().collect::<Vec<_>>();
        assert_eq!([2], &value_indices[..]);

        // Batch to arrow record batch conversion.
        let batch = tests::new_batch();
        let chunk = store_schema.batch_to_arrow_record_batch(&batch).unwrap();
        check_chunk_batch(&chunk, &batch);
    }
}
|
||||
@@ -1,103 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::cmp;
|
||||
|
||||
use async_trait::async_trait;
|
||||
use store_api::storage::{
|
||||
GetRequest, GetResponse, ReadContext, ScanRequest, ScanResponse, SchemaRef, SequenceNumber,
|
||||
Snapshot,
|
||||
};
|
||||
|
||||
use crate::chunk::{ChunkReaderBuilder, ChunkReaderImpl};
|
||||
use crate::error::{Error, Result};
|
||||
use crate::sst::AccessLayerRef;
|
||||
use crate::version::VersionRef;
|
||||
|
||||
/// [Snapshot] implementation.
|
||||
pub struct SnapshotImpl {
|
||||
version: VersionRef,
|
||||
/// Max sequence number (inclusive) visible to user.
|
||||
visible_sequence: SequenceNumber,
|
||||
sst_layer: AccessLayerRef,
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl Snapshot for SnapshotImpl {
|
||||
type Error = Error;
|
||||
type Reader = ChunkReaderImpl;
|
||||
|
||||
fn schema(&self) -> &SchemaRef {
|
||||
self.version.user_schema()
|
||||
}
|
||||
|
||||
async fn scan(
|
||||
&self,
|
||||
ctx: &ReadContext,
|
||||
request: ScanRequest,
|
||||
) -> Result<ScanResponse<ChunkReaderImpl>> {
|
||||
let visible_sequence = self.sequence_to_read(request.sequence);
|
||||
let memtable_version = self.version.memtables();
|
||||
|
||||
let mutables = memtable_version.mutable_memtable();
|
||||
let immutables = memtable_version.immutable_memtables();
|
||||
|
||||
let mut builder = ChunkReaderBuilder::new(
|
||||
self.version.metadata().id(),
|
||||
self.version.schema().clone(),
|
||||
self.sst_layer.clone(),
|
||||
)
|
||||
.reserve_num_memtables(memtable_version.num_memtables())
|
||||
.projection(request.projection)
|
||||
.filters(request.filters)
|
||||
.batch_size(ctx.batch_size)
|
||||
.output_ordering(request.output_ordering)
|
||||
.visible_sequence(visible_sequence)
|
||||
.pick_memtables(mutables.clone())
|
||||
.use_chain_reader(true);
|
||||
|
||||
for memtable in immutables {
|
||||
builder = builder.pick_memtables(memtable.clone());
|
||||
}
|
||||
|
||||
let reader = builder.pick_all_ssts(self.version.ssts())?.build().await?;
|
||||
|
||||
Ok(ScanResponse { reader })
|
||||
}
|
||||
|
||||
async fn get(&self, _ctx: &ReadContext, _request: GetRequest) -> Result<GetResponse> {
|
||||
unimplemented!()
|
||||
}
|
||||
}
|
||||
|
||||
impl SnapshotImpl {
|
||||
pub fn new(
|
||||
version: VersionRef,
|
||||
visible_sequence: SequenceNumber,
|
||||
sst_layer: AccessLayerRef,
|
||||
) -> SnapshotImpl {
|
||||
SnapshotImpl {
|
||||
version,
|
||||
visible_sequence,
|
||||
sst_layer,
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn sequence_to_read(&self, request_sequence: Option<SequenceNumber>) -> SequenceNumber {
|
||||
request_sequence
|
||||
.map(|s| cmp::min(s, self.visible_sequence))
|
||||
.unwrap_or(self.visible_sequence)
|
||||
}
|
||||
}
|
||||
@@ -1,830 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
pub(crate) mod parquet;
|
||||
mod pruning;
|
||||
mod stream_writer;
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::fmt;
|
||||
use std::fmt::{Debug, Formatter};
|
||||
use std::str::FromStr;
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
use std::sync::Arc;
|
||||
|
||||
use async_trait::async_trait;
|
||||
use common_base::readable_size::ReadableSize;
|
||||
use common_recordbatch::SendableRecordBatchStream;
|
||||
use common_telemetry::{debug, error};
|
||||
use common_time::range::TimestampRange;
|
||||
use common_time::Timestamp;
|
||||
use datatypes::schema::SchemaRef;
|
||||
use futures_util::StreamExt;
|
||||
use object_store::{util, ObjectStore};
|
||||
use serde::{Deserialize, Deserializer, Serialize};
|
||||
use snafu::{ResultExt, Snafu};
|
||||
use store_api::storage::{ChunkReader, RegionId};
|
||||
use table::predicate::Predicate;
|
||||
use uuid::Uuid;
|
||||
|
||||
use crate::chunk::ChunkReaderImpl;
|
||||
use crate::error;
|
||||
use crate::error::{DeleteSstSnafu, Result};
|
||||
use crate::file_purger::{FilePurgeRequest, FilePurgerRef};
|
||||
use crate::memtable::BoxedBatchIterator;
|
||||
use crate::read::{Batch, BatchReader, BoxedBatchReader};
|
||||
use crate::scheduler::Scheduler;
|
||||
use crate::schema::ProjectedSchemaRef;
|
||||
use crate::sst::parquet::{ChunkStream, ParquetReader, ParquetWriter};
|
||||
|
||||
/// Number of SST levels; valid levels are `0..MAX_LEVEL`.
|
||||
pub const MAX_LEVEL: u8 = 2;
|
||||
|
||||
/// Level of an SST file; used as the index into the per-level metadata array.
pub type Level = u8;
|
||||
|
||||
pub use crate::sst::stream_writer::BufferedWriter;
|
||||
|
||||
// We only have a fixed number of levels, so we use an array to hold the elements. This
// implementation detail of LevelMetaVec should not be exposed to the users of [LevelMetas].
|
||||
type LevelMetaVec = [LevelMeta; MAX_LEVEL as usize];
|
||||
|
||||
/// Metadata of all SSTs under a region.
|
||||
///
|
||||
/// Files are organized into multiple level, though there may be only one level.
|
||||
#[derive(Clone)]
|
||||
pub struct LevelMetas {
|
||||
levels: LevelMetaVec,
|
||||
sst_layer: AccessLayerRef,
|
||||
file_purger: FilePurgerRef,
|
||||
/// Compaction time window in seconds
|
||||
compaction_time_window: Option<i64>,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for LevelMetas {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("LevelMetas")
|
||||
.field("levels", &self.levels)
|
||||
.field("compaction_time_window", &self.compaction_time_window)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
impl LevelMetas {
|
||||
/// Create a new LevelMetas and initialized each level.
|
||||
pub fn new(sst_layer: AccessLayerRef, file_purger: FilePurgerRef) -> LevelMetas {
|
||||
LevelMetas {
|
||||
levels: new_level_meta_vec(),
|
||||
sst_layer,
|
||||
file_purger,
|
||||
compaction_time_window: Default::default(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns total level number.
|
||||
#[inline]
|
||||
pub fn level_num(&self) -> usize {
|
||||
self.levels.len()
|
||||
}
|
||||
|
||||
pub fn compaction_time_window(&self) -> Option<i64> {
|
||||
self.compaction_time_window
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn level(&self, level: Level) -> &LevelMeta {
|
||||
&self.levels[level as usize]
|
||||
}
|
||||
|
||||
/// Merge `self` with files to add/remove to create a new [LevelMetas].
|
||||
///
|
||||
/// # Panics
|
||||
/// Panics if level of [FileHandle] is greater than [MAX_LEVEL].
|
||||
pub fn merge(
|
||||
&self,
|
||||
files_to_add: impl Iterator<Item = FileMeta>,
|
||||
files_to_remove: impl Iterator<Item = FileMeta>,
|
||||
compaction_time_window: Option<i64>,
|
||||
) -> LevelMetas {
|
||||
let mut merged = self.clone();
|
||||
for file in files_to_add {
|
||||
let level = file.level;
|
||||
let handle = FileHandle::new(file, self.sst_layer.clone(), self.file_purger.clone());
|
||||
merged.levels[level as usize].add_file(handle);
|
||||
}
|
||||
|
||||
for file in files_to_remove {
|
||||
let level = file.level;
|
||||
if let Some(removed_file) = merged.levels[level as usize].remove_file(file.file_id) {
|
||||
removed_file.mark_deleted();
|
||||
}
|
||||
}
|
||||
// we only update region's compaction time window iff region's window is not set and VersionEdit's
|
||||
// compaction time window is present.
|
||||
if let Some(window) = compaction_time_window {
|
||||
let _ = merged.compaction_time_window.get_or_insert(window);
|
||||
}
|
||||
merged
|
||||
}
|
||||
|
||||
pub fn mark_all_files_deleted(&self) -> Vec<FileId> {
|
||||
self.levels().iter().fold(vec![], |mut files, level| {
|
||||
files.extend(level.files().map(|f| {
|
||||
f.mark_deleted();
|
||||
f.file_id()
|
||||
}));
|
||||
files
|
||||
})
|
||||
}
|
||||
|
||||
pub fn levels(&self) -> &[LevelMeta] {
|
||||
&self.levels
|
||||
}
|
||||
|
||||
pub fn file_purger(&self) -> FilePurgerRef {
|
||||
self.file_purger.clone()
|
||||
}
|
||||
}
|
||||
|
||||
/// Metadata of files in same SST level.
|
||||
#[derive(Default, Clone)]
|
||||
pub struct LevelMeta {
|
||||
level: Level,
|
||||
/// Handles to the files in this level.
|
||||
// TODO(yingwen): Now for simplicity, files are unordered, maybe sort the files by time range
|
||||
// or use another structure to hold them.
|
||||
files: HashMap<FileId, FileHandle>,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for LevelMeta {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("LevelMeta")
|
||||
.field("level", &self.level)
|
||||
.field("files", &self.files.keys())
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
impl LevelMeta {
|
||||
pub fn new(level: Level) -> Self {
|
||||
Self {
|
||||
level,
|
||||
files: HashMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
fn add_file(&mut self, file: FileHandle) {
|
||||
let _ = self.files.insert(file.file_id(), file);
|
||||
}
|
||||
|
||||
fn remove_file(&mut self, file_to_remove: FileId) -> Option<FileHandle> {
|
||||
self.files.remove(&file_to_remove)
|
||||
}
|
||||
|
||||
/// Returns the level of level meta.
|
||||
#[inline]
|
||||
pub fn level(&self) -> Level {
|
||||
self.level
|
||||
}
|
||||
|
||||
/// Returns number of SST files in level.
|
||||
#[inline]
|
||||
pub fn file_num(&self) -> usize {
|
||||
self.files.len()
|
||||
}
|
||||
|
||||
/// Returns expired SSTs from current level.
|
||||
pub fn get_expired_files(&self, expire_time: &Timestamp) -> Vec<FileHandle> {
|
||||
self.files
|
||||
.iter()
|
||||
.filter_map(|(_, v)| {
|
||||
let Some((_, end)) = v.time_range() else {
|
||||
return None;
|
||||
};
|
||||
if end < expire_time {
|
||||
Some(v.clone())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
pub fn files(&self) -> impl Iterator<Item = &FileHandle> {
|
||||
self.files.values()
|
||||
}
|
||||
}
|
||||
|
||||
fn new_level_meta_vec() -> LevelMetaVec {
|
||||
(0u8..MAX_LEVEL)
|
||||
.map(LevelMeta::new)
|
||||
.collect::<Vec<_>>()
|
||||
.try_into()
|
||||
.unwrap() // safety: LevelMetaVec is a fixed length array with length MAX_LEVEL
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct FileHandle {
|
||||
inner: Arc<FileHandleInner>,
|
||||
}
|
||||
|
||||
impl Debug for FileHandle {
|
||||
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
|
||||
f.debug_struct("FileHandle")
|
||||
.field("file_id", &self.inner.meta.file_id)
|
||||
.field("region_id", &self.inner.meta.region_id)
|
||||
.field("time_range", &self.inner.meta.time_range)
|
||||
.field("size", &self.inner.meta.file_size)
|
||||
.field("level", &self.inner.meta.level)
|
||||
.field("compacting", &self.inner.compacting)
|
||||
.field("deleted", &self.inner.deleted)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
impl FileHandle {
|
||||
pub fn new(
|
||||
meta: FileMeta,
|
||||
sst_layer: AccessLayerRef,
|
||||
file_purger: FilePurgerRef,
|
||||
) -> FileHandle {
|
||||
FileHandle {
|
||||
inner: Arc::new(FileHandleInner::new(meta, sst_layer, file_purger)),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns level as usize so it can be used as index.
|
||||
#[inline]
|
||||
pub fn level(&self) -> Level {
|
||||
self.inner.meta.level
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn file_name(&self) -> String {
|
||||
self.inner.meta.file_id.as_parquet()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn file_path(&self) -> String {
|
||||
self.inner
|
||||
.sst_layer
|
||||
.sst_file_path(&self.inner.meta.file_id.as_parquet())
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn file_id(&self) -> FileId {
|
||||
self.inner.meta.file_id
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn time_range(&self) -> &Option<(Timestamp, Timestamp)> {
|
||||
&self.inner.meta.time_range
|
||||
}
|
||||
|
||||
/// Returns true if current file is under compaction.
|
||||
#[inline]
|
||||
pub fn compacting(&self) -> bool {
|
||||
self.inner.compacting.load(Ordering::Relaxed)
|
||||
}
|
||||
|
||||
/// Sets the compacting flag.
|
||||
#[inline]
|
||||
pub fn mark_compacting(&self, compacting: bool) {
|
||||
self.inner.compacting.store(compacting, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn deleted(&self) -> bool {
|
||||
self.inner.deleted.load(Ordering::Relaxed)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn mark_deleted(&self) {
|
||||
self.inner.deleted.store(true, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn meta(&self) -> FileMeta {
|
||||
self.inner.meta.clone()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn file_size(&self) -> u64 {
|
||||
self.inner.meta.file_size
|
||||
}
|
||||
}
|
||||
|
||||
/// Actually data of [FileHandle].
|
||||
///
|
||||
/// Contains meta of the file, and other mutable info like metrics.
|
||||
struct FileHandleInner {
|
||||
meta: FileMeta,
|
||||
compacting: AtomicBool,
|
||||
deleted: AtomicBool,
|
||||
sst_layer: AccessLayerRef,
|
||||
file_purger: FilePurgerRef,
|
||||
}
|
||||
|
||||
impl fmt::Debug for FileHandleInner {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
f.debug_struct("FileHandleInner")
|
||||
.field("meta", &self.meta)
|
||||
.field("compacting", &self.compacting)
|
||||
.field("deleted", &self.deleted)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for FileHandleInner {
|
||||
fn drop(&mut self) {
|
||||
if self.deleted.load(Ordering::Relaxed) {
|
||||
let request = FilePurgeRequest {
|
||||
sst_layer: self.sst_layer.clone(),
|
||||
file_id: self.meta.file_id,
|
||||
region_id: self.meta.region_id,
|
||||
};
|
||||
match self.file_purger.schedule(request) {
|
||||
Ok(res) => {
|
||||
debug!(
|
||||
"Scheduled SST purge task, region: {}, name: {}, res: {}",
|
||||
self.meta.region_id,
|
||||
self.meta.file_id.as_parquet(),
|
||||
res
|
||||
);
|
||||
}
|
||||
Err(e) => {
|
||||
error!(e; "Failed to schedule SST purge task, region: {}, name: {}",
|
||||
self.meta.region_id, self.meta.file_id.as_parquet());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl FileHandleInner {
|
||||
fn new(
|
||||
meta: FileMeta,
|
||||
sst_layer: AccessLayerRef,
|
||||
file_purger: FilePurgerRef,
|
||||
) -> FileHandleInner {
|
||||
FileHandleInner {
|
||||
meta,
|
||||
compacting: AtomicBool::new(false),
|
||||
deleted: AtomicBool::new(false),
|
||||
sst_layer,
|
||||
file_purger,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Snafu, PartialEq)]
|
||||
pub struct ParseIdError {
|
||||
source: uuid::Error,
|
||||
}
|
||||
|
||||
/// Unique id for [SST File].
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
|
||||
pub struct FileId(Uuid);
|
||||
|
||||
impl FileId {
|
||||
/// Returns a new unique [FileId] randomly.
|
||||
pub fn random() -> FileId {
|
||||
FileId(Uuid::new_v4())
|
||||
}
|
||||
|
||||
/// Parses id from string.
|
||||
pub fn parse_str(input: &str) -> std::result::Result<FileId, ParseIdError> {
|
||||
Uuid::parse_str(input).map(FileId).context(ParseIdSnafu)
|
||||
}
|
||||
|
||||
/// Append `.parquet` to file id to make a complete file name
|
||||
pub fn as_parquet(&self) -> String {
|
||||
format!("{}{}", self.0.hyphenated(), ".parquet")
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for FileId {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
write!(f, "{}", self.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl FromStr for FileId {
|
||||
type Err = ParseIdError;
|
||||
|
||||
fn from_str(s: &str) -> std::result::Result<FileId, ParseIdError> {
|
||||
FileId::parse_str(s)
|
||||
}
|
||||
}
|
||||
|
||||
/// Immutable metadata of a sst file.
|
||||
#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
|
||||
#[serde(default)]
|
||||
pub struct FileMeta {
|
||||
/// Region of file.
|
||||
pub region_id: RegionId,
|
||||
/// Compared to normal file names, FileId ignore the extension
|
||||
#[serde(deserialize_with = "deserialize_from_string")]
|
||||
#[serde(alias = "file_name")]
|
||||
pub file_id: FileId,
|
||||
/// Timestamp range of file.
|
||||
pub time_range: Option<(Timestamp, Timestamp)>,
|
||||
/// SST level of the file.
|
||||
pub level: Level,
|
||||
/// Size of the file.
|
||||
pub file_size: u64,
|
||||
}
|
||||
|
||||
fn deserialize_from_string<'de, D>(deserializer: D) -> std::result::Result<FileId, D::Error>
|
||||
where
|
||||
D: Deserializer<'de>,
|
||||
{
|
||||
let s: &str = Deserialize::deserialize(deserializer)?;
|
||||
let stripped = s.strip_suffix(".parquet").unwrap_or(s); // strip parquet suffix if needed.
|
||||
FileId::from_str(stripped).map_err(<D::Error as serde::de::Error>::custom)
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct WriteOptions {
|
||||
// TODO(yingwen): [flush] row group size.
|
||||
pub sst_write_buffer_size: ReadableSize,
|
||||
}
|
||||
|
||||
impl Default for WriteOptions {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
sst_write_buffer_size: ReadableSize::mb(8),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct ReadOptions {
|
||||
/// Suggested size of each batch.
|
||||
pub batch_size: usize,
|
||||
/// The schema that user expected to read, might not the same as the
|
||||
/// schema of the SST file.
|
||||
pub projected_schema: ProjectedSchemaRef,
|
||||
|
||||
pub predicate: Predicate,
|
||||
pub time_range: TimestampRange,
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq)]
|
||||
pub struct SstInfo {
|
||||
pub time_range: Option<(Timestamp, Timestamp)>,
|
||||
pub file_size: u64,
|
||||
pub num_rows: usize,
|
||||
}
|
||||
|
||||
/// SST access layer.
|
||||
#[async_trait]
|
||||
pub trait AccessLayer: Send + Sync + std::fmt::Debug {
|
||||
/// Returns the sst file path.
|
||||
fn sst_file_path(&self, file_name: &str) -> String;
|
||||
|
||||
/// Writes SST file with given `file_id` and returns the SST info.
|
||||
/// If source does not contain any data, `write_sst` will return `Ok(None)`.
|
||||
async fn write_sst(
|
||||
&self,
|
||||
file_id: FileId,
|
||||
source: Source,
|
||||
opts: &WriteOptions,
|
||||
) -> Result<Option<SstInfo>>;
|
||||
|
||||
/// Read SST file with given `file_handle` and schema.
|
||||
async fn read_sst(
|
||||
&self,
|
||||
file_handle: FileHandle,
|
||||
opts: &ReadOptions,
|
||||
) -> Result<BoxedBatchReader>;
|
||||
|
||||
/// Deletes a SST file with given name.
|
||||
async fn delete_sst(&self, file_id: FileId) -> Result<()>;
|
||||
}
|
||||
|
||||
/// Shared, reference-counted pointer to an [AccessLayer] implementation.
pub type AccessLayerRef = Arc<dyn AccessLayer>;
|
||||
|
||||
/// Parquet writer data source.
|
||||
pub enum Source {
|
||||
/// Writes rows from memtable to parquet
|
||||
Iter(BoxedBatchIterator),
|
||||
/// Writes row from ChunkReaderImpl (maybe a set of SSTs) to parquet.
|
||||
Reader(ChunkReaderImpl),
|
||||
/// Record batch stream yielded by table scan
|
||||
Stream(SendableRecordBatchStream),
|
||||
}
|
||||
|
||||
impl Source {
|
||||
async fn next_batch(&mut self) -> Result<Option<Batch>> {
|
||||
match self {
|
||||
Source::Iter(iter) => iter.next().transpose(),
|
||||
Source::Reader(reader) => reader
|
||||
.next_chunk()
|
||||
.await
|
||||
.map(|p| p.map(|chunk| Batch::new(chunk.columns))),
|
||||
Source::Stream(stream) => stream
|
||||
.next()
|
||||
.await
|
||||
.transpose()
|
||||
.map(|r| r.map(|r| Batch::new(r.columns().to_vec())))
|
||||
.context(error::CreateRecordBatchSnafu),
|
||||
}
|
||||
}
|
||||
|
||||
fn schema(&self) -> SchemaRef {
|
||||
match self {
|
||||
Source::Iter(iter) => {
|
||||
let projected_schema = iter.schema();
|
||||
projected_schema.schema_to_read().schema().clone()
|
||||
}
|
||||
Source::Reader(reader) => reader.projected_schema().schema_to_read().schema().clone(),
|
||||
Source::Stream(stream) => stream.schema(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Sst access layer.
|
||||
pub struct FsAccessLayer {
|
||||
sst_dir: String,
|
||||
object_store: ObjectStore,
|
||||
}
|
||||
|
||||
impl fmt::Debug for FsAccessLayer {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
f.debug_struct("FsAccessLayer")
|
||||
.field("sst_dir", &self.sst_dir)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
impl FsAccessLayer {
|
||||
pub fn new(sst_dir: &str, object_store: ObjectStore) -> FsAccessLayer {
|
||||
FsAccessLayer {
|
||||
sst_dir: util::normalize_dir(sst_dir),
|
||||
object_store,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl AccessLayer for FsAccessLayer {
|
||||
fn sst_file_path(&self, file_name: &str) -> String {
|
||||
format!("{}{}", self.sst_dir, file_name)
|
||||
}
|
||||
|
||||
/// Writes SST file with given `file_id`.
|
||||
async fn write_sst(
|
||||
&self,
|
||||
file_id: FileId,
|
||||
source: Source,
|
||||
opts: &WriteOptions,
|
||||
) -> Result<Option<SstInfo>> {
|
||||
// Now we only supports parquet format. We may allow caller to specific SST format in
|
||||
// WriteOptions in the future.
|
||||
let file_path = self.sst_file_path(&file_id.as_parquet());
|
||||
let writer = ParquetWriter::new(&file_path, source, self.object_store.clone());
|
||||
writer.write_sst(opts).await
|
||||
}
|
||||
|
||||
/// Read SST file with given `file_handle` and schema.
|
||||
async fn read_sst(
|
||||
&self,
|
||||
file_handle: FileHandle,
|
||||
opts: &ReadOptions,
|
||||
) -> Result<BoxedBatchReader> {
|
||||
let reader = ParquetReader::new(
|
||||
file_handle,
|
||||
self.object_store.clone(),
|
||||
opts.projected_schema.clone(),
|
||||
opts.predicate.clone(),
|
||||
opts.time_range,
|
||||
);
|
||||
|
||||
Ok(Box::new(LazyParquetBatchReader::new(reader)))
|
||||
}
|
||||
|
||||
/// Deletes a SST file with given file id.
|
||||
async fn delete_sst(&self, file_id: FileId) -> Result<()> {
|
||||
let path = self.sst_file_path(&file_id.as_parquet());
|
||||
self.object_store
|
||||
.delete(&path)
|
||||
.await
|
||||
.context(DeleteSstSnafu)
|
||||
}
|
||||
}
|
||||
|
||||
struct LazyParquetBatchReader {
|
||||
inner: ParquetReader,
|
||||
stream: Option<ChunkStream>,
|
||||
}
|
||||
|
||||
impl LazyParquetBatchReader {
|
||||
fn new(inner: ParquetReader) -> Self {
|
||||
Self {
|
||||
inner,
|
||||
stream: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl BatchReader for LazyParquetBatchReader {
|
||||
async fn next_batch(&mut self) -> Result<Option<Batch>> {
|
||||
if let Some(s) = &mut self.stream {
|
||||
s.next_batch().await
|
||||
} else {
|
||||
let mut stream = self.inner.chunk_stream().await?;
|
||||
let res = stream.next_batch().await;
|
||||
self.stream = Some(stream);
|
||||
res
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use std::collections::HashSet;

    use super::*;
    use crate::file_purger::noop::NoopFilePurgeHandler;
    use crate::scheduler::{LocalScheduler, SchedulerConfig};

    /// File id shared by the JSON deserialization tests.
    const SAMPLE_ID: &str = "bc5896ec-e4d8-4017-a80d-f2de73188d55";

    /// Builds a [FileMeta] in region 0 with no time range and a zero file size.
    fn create_file_meta(file_id: FileId, level: Level) -> FileMeta {
        FileMeta {
            region_id: 0.into(),
            file_id,
            time_range: None,
            level,
            file_size: 0,
        }
    }

    /// Asserts that `json` deserializes to the level-0 file meta of [SAMPLE_ID].
    fn assert_decodes_to_sample(json: &str) {
        let expected = create_file_meta(FileId::from_str(SAMPLE_ID).unwrap(), 0);
        let decoded: FileMeta = serde_json::from_str(json).unwrap();
        assert_eq!(expected, decoded);
    }

    /// Collects the ids of all files stored in `level` of `metas`.
    fn file_ids_at(metas: &LevelMetas, level: Level) -> HashSet<FileId> {
        metas.level(level).files().map(|f| f.file_id()).collect()
    }

    #[test]
    fn test_file_id() {
        let id = FileId::random();
        let uuid_str = id.to_string();
        assert_eq!(id.0.to_string(), uuid_str);

        let via_parse_str = FileId::parse_str(&uuid_str).unwrap();
        assert_eq!(id, via_parse_str);
        let via_from_str: FileId = uuid_str.parse().unwrap();
        assert_eq!(id, via_from_str);
    }

    #[test]
    fn test_file_id_serialization() {
        let id = FileId::random();
        let json = serde_json::to_string(&id).unwrap();
        assert_eq!(format!("\"{id}\""), json);

        let parsed: FileId = serde_json::from_str(&json).unwrap();
        assert_eq!(id, parsed);
    }

    #[test]
    fn test_deserialize_file_meta() {
        let file_meta = create_file_meta(FileId::random(), 0);
        let encoded = serde_json::to_string(&file_meta).unwrap();
        let decoded: FileMeta = serde_json::from_str(&encoded).unwrap();
        assert_eq!(file_meta, decoded);
    }

    #[test]
    fn test_deserialize_from_string() {
        assert_decodes_to_sample(
            r#"{"region_id":0,"file_id":"bc5896ec-e4d8-4017-a80d-f2de73188d55","time_range":null,"level":0}"#,
        );
    }

    #[test]
    fn test_deserialize_from_string_parquet() {
        assert_decodes_to_sample(
            r#"{"region_id":0,"file_id":"bc5896ec-e4d8-4017-a80d-f2de73188d55.parquet","time_range":null,"level":0}"#,
        );
    }

    #[test]
    fn test_deserialize_from_string_parquet_file_name() {
        assert_decodes_to_sample(
            r#"{"region_id":0,"file_name":"bc5896ec-e4d8-4017-a80d-f2de73188d55.parquet","time_range":null,"level":0}"#,
        );
    }

    #[test]
    fn test_file_id_as_parquet() {
        let id = FileId::from_str("67e55044-10b1-426f-9247-bb680e5fe0c8").unwrap();
        assert_eq!(
            "67e55044-10b1-426f-9247-bb680e5fe0c8.parquet",
            id.as_parquet()
        );
    }

    #[test]
    fn test_level_metas_add_and_remove() {
        let layer = Arc::new(crate::test_util::access_layer_util::MockAccessLayer {});
        let purger = Arc::new(LocalScheduler::new(
            SchedulerConfig::default(),
            NoopFilePurgeHandler,
        ));
        let file_ids = [
            FileId::random(),
            FileId::random(),
            FileId::random(),
            FileId::random(),
        ];

        // Merges that only add, respectively only remove, files.
        let add = |base: &LevelMetas, files: Vec<FileMeta>| {
            base.merge(files.into_iter(), std::iter::empty(), None)
        };
        let remove = |base: &LevelMetas, files: Vec<FileMeta>| {
            base.merge(std::iter::empty(), files.into_iter(), None)
        };

        let metas = LevelMetas::new(layer, purger);

        // Add two files to level 0.
        let merged = add(
            &metas,
            vec![
                create_file_meta(file_ids[0], 0),
                create_file_meta(file_ids[1], 0),
            ],
        );
        assert_eq!(
            HashSet::from([file_ids[0], file_ids[1]]),
            file_ids_at(&merged, 0)
        );

        // Add two files to level 1; level 0 stays untouched.
        let merged1 = add(
            &merged,
            vec![
                create_file_meta(file_ids[2], 1),
                create_file_meta(file_ids[3], 1),
            ],
        );
        assert_eq!(
            HashSet::from([file_ids[0], file_ids[1]]),
            file_ids_at(&merged1, 0)
        );
        assert_eq!(
            HashSet::from([file_ids[2], file_ids[3]]),
            file_ids_at(&merged1, 1)
        );

        // file_ids[2] lives in level 1, so removing it with level 0 has no effect.
        let removed1 = remove(
            &merged1,
            vec![
                create_file_meta(file_ids[0], 0),
                create_file_meta(file_ids[2], 0),
            ],
        );
        assert_eq!(HashSet::from([file_ids[1]]), file_ids_at(&removed1, 0));
        assert_eq!(
            HashSet::from([file_ids[2], file_ids[3]]),
            file_ids_at(&removed1, 1)
        );

        // Removing with the matching level empties level 1.
        let removed2 = remove(
            &removed1,
            vec![
                create_file_meta(file_ids[2], 1),
                create_file_meta(file_ids[3], 1),
            ],
        );
        assert_eq!(HashSet::from([file_ids[1]]), file_ids_at(&removed2, 0));
        assert_eq!(HashSet::new(), file_ids_at(&removed2, 1));
    }
}
|
||||
@@ -1,819 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//! Parquet sst format.
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::pin::Pin;
|
||||
use std::sync::Arc;
|
||||
|
||||
use async_compat::CompatExt;
|
||||
use async_stream::try_stream;
|
||||
use async_trait::async_trait;
|
||||
use common_telemetry::{debug, error};
|
||||
use common_time::range::TimestampRange;
|
||||
use common_time::Timestamp;
|
||||
use datatypes::arrow::record_batch::RecordBatch;
|
||||
use datatypes::prelude::ConcreteDataType;
|
||||
use futures_util::{Stream, StreamExt, TryStreamExt};
|
||||
use object_store::ObjectStore;
|
||||
use parquet::arrow::{ParquetRecordBatchStreamBuilder, ProjectionMask};
|
||||
use parquet::basic::{Compression, Encoding, ZstdLevel};
|
||||
use parquet::file::metadata::KeyValue;
|
||||
use parquet::file::properties::WriterProperties;
|
||||
use parquet::format::FileMetaData;
|
||||
use parquet::schema::types::ColumnPath;
|
||||
use snafu::{OptionExt, ResultExt};
|
||||
use store_api::storage::consts::SEQUENCE_COLUMN_NAME;
|
||||
use table::predicate::Predicate;
|
||||
use tokio::io::BufReader;
|
||||
|
||||
use crate::error::{self, DecodeParquetTimeRangeSnafu, ReadObjectSnafu, ReadParquetSnafu, Result};
|
||||
use crate::read::{Batch, BatchReader};
|
||||
use crate::schema::compat::ReadAdapter;
|
||||
use crate::schema::{ProjectedSchemaRef, StoreSchema};
|
||||
use crate::sst;
|
||||
use crate::sst::pruning::build_row_filter;
|
||||
use crate::sst::stream_writer::BufferedWriter;
|
||||
use crate::sst::{FileHandle, Source, SstInfo};
|
||||
|
||||
/// Parquet sst writer.
|
||||
pub struct ParquetWriter<'a> {
|
||||
file_path: &'a str,
|
||||
source: Source,
|
||||
object_store: ObjectStore,
|
||||
max_row_group_size: usize,
|
||||
}
|
||||
|
||||
impl<'a> ParquetWriter<'a> {
|
||||
pub fn new(file_path: &'a str, source: Source, object_store: ObjectStore) -> ParquetWriter {
|
||||
ParquetWriter {
|
||||
file_path,
|
||||
source,
|
||||
object_store,
|
||||
max_row_group_size: 4096, // TODO(hl): make this configurable
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn write_sst(self, opts: &sst::WriteOptions) -> Result<Option<SstInfo>> {
|
||||
self.write_rows(None, opts).await
|
||||
}
|
||||
|
||||
/// Iterates memtable and writes rows to Parquet file.
|
||||
/// A chunk of records yielded from each iteration with a size given
|
||||
/// in config will be written to a single row group.
|
||||
async fn write_rows(
|
||||
mut self,
|
||||
extra_meta: Option<HashMap<String, String>>,
|
||||
opts: &sst::WriteOptions,
|
||||
) -> Result<Option<SstInfo>> {
|
||||
let schema = self.source.schema();
|
||||
|
||||
let mut props_builder = WriterProperties::builder()
|
||||
.set_compression(Compression::ZSTD(ZstdLevel::default()))
|
||||
.set_encoding(Encoding::PLAIN)
|
||||
.set_max_row_group_size(self.max_row_group_size)
|
||||
.set_key_value_metadata(extra_meta.map(|map| {
|
||||
map.iter()
|
||||
.map(|(k, v)| KeyValue::new(k.clone(), v.clone()))
|
||||
.collect::<Vec<_>>()
|
||||
}))
|
||||
.set_column_encoding(
|
||||
ColumnPath::new(vec![SEQUENCE_COLUMN_NAME.to_string()]),
|
||||
Encoding::DELTA_BINARY_PACKED,
|
||||
)
|
||||
.set_column_dictionary_enabled(
|
||||
ColumnPath::new(vec![SEQUENCE_COLUMN_NAME.to_string()]),
|
||||
false,
|
||||
);
|
||||
|
||||
if let Some(ts_col) = schema.timestamp_column() {
|
||||
props_builder = props_builder.set_column_encoding(
|
||||
ColumnPath::new(vec![ts_col.name.clone()]),
|
||||
Encoding::DELTA_BINARY_PACKED,
|
||||
);
|
||||
}
|
||||
|
||||
let writer_props = props_builder.build();
|
||||
|
||||
let mut buffered_writer = BufferedWriter::try_new(
|
||||
self.file_path.to_string(),
|
||||
self.object_store.clone(),
|
||||
&schema,
|
||||
Some(writer_props),
|
||||
opts.sst_write_buffer_size.as_bytes() as usize,
|
||||
)
|
||||
.await?;
|
||||
let mut rows_written = 0;
|
||||
|
||||
while let Some(batch) = self.source.next_batch().await? {
|
||||
buffered_writer.write(&batch).await?;
|
||||
rows_written += batch.num_rows();
|
||||
}
|
||||
|
||||
if rows_written == 0 {
|
||||
debug!("No data written, try abort writer: {}", self.file_path);
|
||||
let _ = buffered_writer.close().await?;
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let (file_meta, file_size) = buffered_writer.close().await?;
|
||||
let time_range = decode_timestamp_range(&file_meta, &schema).ok().flatten();
|
||||
|
||||
// object_store.write will make sure all bytes are written or an error is raised.
|
||||
Ok(Some(SstInfo {
|
||||
time_range,
|
||||
file_size,
|
||||
num_rows: rows_written,
|
||||
}))
|
||||
}
|
||||
}
|
||||
|
||||
fn decode_timestamp_range(
|
||||
file_meta: &FileMetaData,
|
||||
schema: &datatypes::schema::SchemaRef,
|
||||
) -> Result<Option<(Timestamp, Timestamp)>> {
|
||||
let (Some(ts_col_idx), Some(ts_col)) = (schema.timestamp_index(), schema.timestamp_column())
|
||||
else {
|
||||
return Ok(None);
|
||||
};
|
||||
let ts_datatype = &ts_col.data_type;
|
||||
decode_timestamp_range_inner(file_meta, ts_col_idx, ts_datatype)
|
||||
}
|
||||
|
||||
fn decode_timestamp_range_inner(
|
||||
file_meta: &FileMetaData,
|
||||
ts_index: usize,
|
||||
ts_datatype: &ConcreteDataType,
|
||||
) -> Result<Option<(Timestamp, Timestamp)>> {
|
||||
let mut start = i64::MAX;
|
||||
let mut end = i64::MIN;
|
||||
|
||||
let unit = match ts_datatype {
|
||||
ConcreteDataType::Timestamp(type_) => type_.unit(),
|
||||
_ => {
|
||||
return DecodeParquetTimeRangeSnafu {
|
||||
msg: format!("Unexpected timestamp column datatype: {ts_datatype:?}"),
|
||||
}
|
||||
.fail();
|
||||
}
|
||||
};
|
||||
|
||||
for rg in &file_meta.row_groups {
|
||||
let Some(ref metadata) = rg
|
||||
.columns
|
||||
.get(ts_index)
|
||||
.context(DecodeParquetTimeRangeSnafu {
|
||||
msg: format!("Cannot find ts column by index: {ts_index}"),
|
||||
})?
|
||||
.meta_data
|
||||
else {
|
||||
return Ok(None);
|
||||
};
|
||||
let Some(stats) = &metadata.statistics else {
|
||||
return Ok(None);
|
||||
};
|
||||
let (Some(min_value), Some(max_value)) = (&stats.min_value, &stats.max_value) else {
|
||||
return Ok(None);
|
||||
};
|
||||
|
||||
// according to [parquet's spec](https://parquet.apache.org/docs/file-format/data-pages/encodings/), min/max value in stats uses plain encoding with little endian.
|
||||
// also see https://github.com/apache/arrow-rs/blob/5fb337db04a1a19f7d40da46f19b7b5fd4051593/parquet/src/file/statistics.rs#L172
|
||||
let min = i64::from_le_bytes(min_value[..8].try_into().map_err(|e| {
|
||||
error!(
|
||||
"Failed to decode min value from stats, bytes: {:?}, source: {:?}",
|
||||
min_value, e
|
||||
);
|
||||
DecodeParquetTimeRangeSnafu {
|
||||
msg: "decode min value",
|
||||
}
|
||||
.build()
|
||||
})?);
|
||||
let max = i64::from_le_bytes(max_value[..8].try_into().map_err(|e| {
|
||||
error!(
|
||||
"Failed to decode max value from stats, bytes: {:?}, source: {:?}",
|
||||
max_value, e
|
||||
);
|
||||
DecodeParquetTimeRangeSnafu {
|
||||
msg: "decode max value",
|
||||
}
|
||||
.build()
|
||||
})?);
|
||||
start = start.min(min);
|
||||
end = end.max(max);
|
||||
}
|
||||
|
||||
assert!(
|
||||
start <= end,
|
||||
"Illegal timestamp range decoded from SST file {:?}, start: {}, end: {}",
|
||||
file_meta,
|
||||
start,
|
||||
end
|
||||
);
|
||||
Ok(Some((
|
||||
Timestamp::new(start, unit),
|
||||
Timestamp::new(end, unit),
|
||||
)))
|
||||
}
|
||||
|
||||
pub struct ParquetReader {
|
||||
// Holds the file handle to avoid the file purge purge it.
|
||||
file_handle: FileHandle,
|
||||
object_store: ObjectStore,
|
||||
projected_schema: ProjectedSchemaRef,
|
||||
predicate: Predicate,
|
||||
time_range: TimestampRange,
|
||||
}
|
||||
|
||||
impl ParquetReader {
|
||||
pub fn new(
|
||||
file_handle: FileHandle,
|
||||
object_store: ObjectStore,
|
||||
projected_schema: ProjectedSchemaRef,
|
||||
predicate: Predicate,
|
||||
time_range: TimestampRange,
|
||||
) -> ParquetReader {
|
||||
ParquetReader {
|
||||
file_handle,
|
||||
object_store,
|
||||
projected_schema,
|
||||
predicate,
|
||||
time_range,
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn chunk_stream(&self) -> Result<ChunkStream> {
|
||||
let file_path = self.file_handle.file_path();
|
||||
let operator = self.object_store.clone();
|
||||
|
||||
let reader = operator
|
||||
.reader(&file_path)
|
||||
.await
|
||||
.context(ReadObjectSnafu { path: &file_path })?
|
||||
.compat();
|
||||
let buf_reader = BufReader::new(reader);
|
||||
let builder = ParquetRecordBatchStreamBuilder::new(buf_reader)
|
||||
.await
|
||||
.context(ReadParquetSnafu { file: &file_path })?;
|
||||
let arrow_schema = builder.schema().clone();
|
||||
|
||||
let store_schema = Arc::new(
|
||||
StoreSchema::try_from(arrow_schema)
|
||||
.context(error::ConvertStoreSchemaSnafu { file: &file_path })?,
|
||||
);
|
||||
|
||||
let adapter = ReadAdapter::new(store_schema.clone(), self.projected_schema.clone())?;
|
||||
|
||||
let pruned_row_groups = self
|
||||
.predicate
|
||||
.prune_row_groups(
|
||||
builder.metadata().row_groups(),
|
||||
store_schema.schema().clone(),
|
||||
)
|
||||
.into_iter()
|
||||
.enumerate()
|
||||
.filter_map(|(idx, valid)| if valid { Some(idx) } else { None })
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let parquet_schema_desc = builder.metadata().file_metadata().schema_descr_ptr();
|
||||
|
||||
let projection_mask = ProjectionMask::roots(&parquet_schema_desc, adapter.fields_to_read());
|
||||
let mut builder = builder
|
||||
.with_projection(projection_mask.clone())
|
||||
.with_row_groups(pruned_row_groups);
|
||||
|
||||
if let Some(row_filter) = build_row_filter(
|
||||
self.time_range,
|
||||
&self.predicate,
|
||||
&store_schema,
|
||||
&parquet_schema_desc,
|
||||
projection_mask,
|
||||
) {
|
||||
builder = builder.with_row_filter(row_filter);
|
||||
}
|
||||
|
||||
let mut stream = builder
|
||||
.build()
|
||||
.context(ReadParquetSnafu { file: &file_path })?;
|
||||
|
||||
let chunk_stream = try_stream!({
|
||||
while let Some(res) = stream.next().await {
|
||||
yield res.context(ReadParquetSnafu { file: &file_path })?
|
||||
}
|
||||
});
|
||||
|
||||
ChunkStream::new(self.file_handle.clone(), adapter, Box::pin(chunk_stream))
|
||||
}
|
||||
}
|
||||
|
||||
/// Boxed, pinned and sendable stream of record batch results.
pub type SendableChunkStream = Pin<Box<dyn Stream<Item = Result<RecordBatch>> + Send>>;
|
||||
|
||||
pub struct ChunkStream {
|
||||
// Holds the file handle in the stream to avoid the purger purge it.
|
||||
_file_handle: FileHandle,
|
||||
adapter: ReadAdapter,
|
||||
stream: SendableChunkStream,
|
||||
}
|
||||
|
||||
impl ChunkStream {
|
||||
pub fn new(
|
||||
file_handle: FileHandle,
|
||||
adapter: ReadAdapter,
|
||||
stream: SendableChunkStream,
|
||||
) -> Result<Self> {
|
||||
Ok(Self {
|
||||
_file_handle: file_handle,
|
||||
adapter,
|
||||
stream,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl BatchReader for ChunkStream {
|
||||
async fn next_batch(&mut self) -> Result<Option<Batch>> {
|
||||
self.stream
|
||||
.try_next()
|
||||
.await?
|
||||
.map(|rb| self.adapter.arrow_record_batch_to_batch(&rb))
|
||||
.transpose()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::ops::Range;
|
||||
use std::sync::Arc;
|
||||
|
||||
use api::v1::OpType;
|
||||
use common_base::readable_size::ReadableSize;
|
||||
use common_test_util::temp_dir::create_temp_dir;
|
||||
use common_time::timestamp::TimeUnit;
|
||||
use datatypes::arrow::array::{Array, UInt64Array, UInt8Array};
|
||||
use datatypes::prelude::{ScalarVector, Vector};
|
||||
use datatypes::types::{TimestampMillisecondType, TimestampType};
|
||||
use datatypes::vectors::TimestampMillisecondVector;
|
||||
use object_store::services::Fs;
|
||||
|
||||
use super::*;
|
||||
use crate::file_purger::noop::new_noop_file_purger;
|
||||
use crate::memtable::{
|
||||
tests as memtable_tests, DefaultMemtableBuilder, IterContext, MemtableBuilder,
|
||||
};
|
||||
use crate::schema::ProjectedSchema;
|
||||
use crate::sst::{FileId, FileMeta};
|
||||
|
||||
fn create_object_store(root: &str) -> ObjectStore {
|
||||
let mut builder = Fs::default();
|
||||
let _ = builder.root(root);
|
||||
ObjectStore::new(builder).unwrap().finish()
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_parquet_writer() {
|
||||
common_telemetry::init_default_ut_logging();
|
||||
let schema = memtable_tests::schema_for_test();
|
||||
let memtable = DefaultMemtableBuilder::default().build(schema);
|
||||
|
||||
memtable_tests::write_kvs(
|
||||
&*memtable,
|
||||
10, // sequence
|
||||
OpType::Put,
|
||||
&[1000, 1002, 2002, 2003, 2003, 1001], // keys
|
||||
&[
|
||||
(Some(1), Some(1234)),
|
||||
(Some(2), Some(1234)),
|
||||
(Some(7), Some(1234)),
|
||||
(Some(8), Some(1234)),
|
||||
(Some(9), Some(1234)),
|
||||
(Some(3), Some(1234)),
|
||||
], // values
|
||||
);
|
||||
|
||||
let dir = create_temp_dir("write_parquet");
|
||||
let path = dir.path().to_str().unwrap();
|
||||
|
||||
let object_store = create_object_store(path);
|
||||
let sst_file_name = "test-flush.parquet";
|
||||
let iter = memtable.iter(IterContext::default()).unwrap();
|
||||
let writer = ParquetWriter::new(sst_file_name, Source::Iter(iter), object_store.clone());
|
||||
|
||||
assert!(writer
|
||||
.write_sst(&sst::WriteOptions::default())
|
||||
.await
|
||||
.is_ok());
|
||||
|
||||
// verify parquet file
|
||||
let reader = BufReader::new(object_store.reader(sst_file_name).await.unwrap().compat());
|
||||
|
||||
let builder = ParquetRecordBatchStreamBuilder::new(reader).await.unwrap();
|
||||
|
||||
let mut stream = builder.build().unwrap();
|
||||
// chunk schema: timestamp, v1, __sequence, __op_type
|
||||
let chunk = stream.next().await.unwrap().unwrap();
|
||||
assert_eq!(5, chunk.columns().len());
|
||||
|
||||
// timestamp
|
||||
assert_eq!(
|
||||
&TimestampMillisecondVector::from_slice([
|
||||
1000.into(),
|
||||
1001.into(),
|
||||
1002.into(),
|
||||
2002.into(),
|
||||
2003.into(),
|
||||
])
|
||||
.to_arrow_array(),
|
||||
chunk.column(0)
|
||||
);
|
||||
|
||||
// v0
|
||||
assert_eq!(
|
||||
&(Arc::new(UInt64Array::from(vec![1, 3, 2, 7, 9])) as Arc<dyn Array>),
|
||||
chunk.column(1)
|
||||
);
|
||||
|
||||
// v1
|
||||
assert_eq!(
|
||||
&(Arc::new(UInt64Array::from(vec![1234; 5])) as Arc<dyn Array>),
|
||||
chunk.column(2)
|
||||
);
|
||||
|
||||
// sequence
|
||||
assert_eq!(
|
||||
&(Arc::new(UInt64Array::from(vec![10; 5])) as Arc<dyn Array>),
|
||||
chunk.column(3)
|
||||
);
|
||||
|
||||
// op_type
|
||||
assert_eq!(
|
||||
&(Arc::new(UInt8Array::from(vec![1; 5])) as Arc<dyn Array>),
|
||||
chunk.column(4)
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_write_large_data() {
|
||||
common_telemetry::init_default_ut_logging();
|
||||
let schema = memtable_tests::schema_for_test();
|
||||
let memtable = DefaultMemtableBuilder::default().build(schema);
|
||||
|
||||
let mut rows_written = 0;
|
||||
for i in 0..16 {
|
||||
let range: Range<i64> = i * 1024..(i + 1) * 1024;
|
||||
let keys = range.clone().collect::<Vec<_>>();
|
||||
let values = range
|
||||
.map(|idx| (Some(idx as u64), Some(idx as u64)))
|
||||
.collect::<Vec<_>>();
|
||||
memtable_tests::write_kvs(&*memtable, i as u64, OpType::Put, &keys, &values);
|
||||
rows_written += keys.len();
|
||||
}
|
||||
|
||||
let dir = create_temp_dir("write_large_parquet");
|
||||
let path = dir.path().to_str().unwrap();
|
||||
|
||||
let object_store = create_object_store(path);
|
||||
let sst_file_name = "test-large.parquet";
|
||||
let iter = memtable.iter(IterContext::default()).unwrap();
|
||||
let writer = ParquetWriter::new(sst_file_name, Source::Iter(iter), object_store.clone());
|
||||
|
||||
let sst_info = writer
|
||||
.write_sst(&sst::WriteOptions {
|
||||
sst_write_buffer_size: ReadableSize::kb(4),
|
||||
})
|
||||
.await
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
let file_meta = object_store.stat(sst_file_name).await.unwrap();
|
||||
assert!(file_meta.is_file());
|
||||
assert_eq!(sst_info.file_size, file_meta.content_length());
|
||||
assert_eq!(rows_written, sst_info.num_rows);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_parquet_read_large_batch() {
|
||||
common_telemetry::init_default_ut_logging();
|
||||
let schema = memtable_tests::schema_for_test();
|
||||
let memtable = DefaultMemtableBuilder::default().build(schema.clone());
|
||||
|
||||
let rows_total = 4096 * 4;
|
||||
let mut keys_vec = Vec::with_capacity(rows_total);
|
||||
let mut values_vec = Vec::with_capacity(rows_total);
|
||||
|
||||
for i in 0..rows_total {
|
||||
keys_vec.push(i as i64);
|
||||
values_vec.push((Some(i as u64), Some(i as u64)));
|
||||
}
|
||||
|
||||
memtable_tests::write_kvs(
|
||||
&*memtable,
|
||||
10, // sequence
|
||||
OpType::Put,
|
||||
&keys_vec, // keys
|
||||
&values_vec, // values
|
||||
);
|
||||
|
||||
let dir = create_temp_dir("write_parquet");
|
||||
let path = dir.path().to_str().unwrap();
|
||||
let object_store = create_object_store(path);
|
||||
let sst_file_handle = new_file_handle(FileId::random());
|
||||
let sst_file_name = sst_file_handle.file_name();
|
||||
let iter = memtable.iter(IterContext::default()).unwrap();
|
||||
let writer = ParquetWriter::new(&sst_file_name, Source::Iter(iter), object_store.clone());
|
||||
|
||||
let SstInfo {
|
||||
time_range,
|
||||
file_size,
|
||||
..
|
||||
} = writer
|
||||
.write_sst(&sst::WriteOptions::default())
|
||||
.await
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(
|
||||
Some((
|
||||
Timestamp::new_millisecond(0),
|
||||
Timestamp::new_millisecond((rows_total - 1) as i64)
|
||||
)),
|
||||
time_range
|
||||
);
|
||||
assert_ne!(file_size, 0);
|
||||
let operator = create_object_store(dir.path().to_str().unwrap());
|
||||
|
||||
let projected_schema = Arc::new(ProjectedSchema::new(schema, Some(vec![1])).unwrap());
|
||||
let reader = ParquetReader::new(
|
||||
sst_file_handle,
|
||||
operator,
|
||||
projected_schema,
|
||||
Predicate::empty(),
|
||||
TimestampRange::min_to_max(),
|
||||
);
|
||||
|
||||
let mut rows_fetched = 0;
|
||||
let mut stream = reader.chunk_stream().await.unwrap();
|
||||
while let Some(res) = stream.next_batch().await.unwrap() {
|
||||
rows_fetched += res.num_rows();
|
||||
}
|
||||
assert_eq!(rows_total, rows_fetched);
|
||||
}
|
||||
|
||||
fn new_file_handle(file_id: FileId) -> FileHandle {
|
||||
let file_purger = new_noop_file_purger();
|
||||
let layer = Arc::new(crate::test_util::access_layer_util::MockAccessLayer {});
|
||||
FileHandle::new(
|
||||
FileMeta {
|
||||
region_id: 0.into(),
|
||||
file_id,
|
||||
time_range: Some((
|
||||
Timestamp::new_millisecond(0),
|
||||
Timestamp::new_millisecond(1000),
|
||||
)),
|
||||
level: 0,
|
||||
file_size: 0,
|
||||
},
|
||||
layer,
|
||||
file_purger,
|
||||
)
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_parquet_reader() {
|
||||
common_telemetry::init_default_ut_logging();
|
||||
let schema = memtable_tests::schema_for_test();
|
||||
let memtable = DefaultMemtableBuilder::default().build(schema.clone());
|
||||
|
||||
memtable_tests::write_kvs(
|
||||
&*memtable,
|
||||
10, // sequence
|
||||
OpType::Put,
|
||||
&[1000, 1002, 2002, 2003, 2003, 1001], // keys
|
||||
&[
|
||||
(Some(1), Some(1234)),
|
||||
(Some(2), Some(1234)),
|
||||
(Some(7), Some(1234)),
|
||||
(Some(8), Some(1234)),
|
||||
(Some(9), Some(1234)),
|
||||
(Some(3), Some(1234)),
|
||||
], // values
|
||||
);
|
||||
|
||||
let dir = create_temp_dir("write_parquet");
|
||||
let path = dir.path().to_str().unwrap();
|
||||
|
||||
let object_store = create_object_store(path);
|
||||
let file_handle = new_file_handle(FileId::random());
|
||||
let sst_file_name = file_handle.file_name();
|
||||
let iter = memtable.iter(IterContext::default()).unwrap();
|
||||
let writer = ParquetWriter::new(&sst_file_name, Source::Iter(iter), object_store.clone());
|
||||
|
||||
let SstInfo {
|
||||
time_range,
|
||||
file_size,
|
||||
..
|
||||
} = writer
|
||||
.write_sst(&sst::WriteOptions::default())
|
||||
.await
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(
|
||||
Some((
|
||||
Timestamp::new_millisecond(1000),
|
||||
Timestamp::new_millisecond(2003)
|
||||
)),
|
||||
time_range
|
||||
);
|
||||
assert_ne!(file_size, 0);
|
||||
let operator = create_object_store(dir.path().to_str().unwrap());
|
||||
|
||||
let projected_schema = Arc::new(ProjectedSchema::new(schema, Some(vec![1])).unwrap());
|
||||
let reader = ParquetReader::new(
|
||||
file_handle,
|
||||
operator,
|
||||
projected_schema,
|
||||
Predicate::empty(),
|
||||
TimestampRange::min_to_max(),
|
||||
);
|
||||
|
||||
let mut stream = reader.chunk_stream().await.unwrap();
|
||||
assert_eq!(
|
||||
5,
|
||||
stream
|
||||
.next_batch()
|
||||
.await
|
||||
.transpose()
|
||||
.unwrap()
|
||||
.unwrap()
|
||||
.num_rows()
|
||||
);
|
||||
}
|
||||
|
||||
async fn check_range_read(
|
||||
file_handle: FileHandle,
|
||||
object_store: ObjectStore,
|
||||
schema: ProjectedSchemaRef,
|
||||
range: TimestampRange,
|
||||
expect: Vec<i64>,
|
||||
) {
|
||||
let reader =
|
||||
ParquetReader::new(file_handle, object_store, schema, Predicate::empty(), range);
|
||||
let mut stream = reader.chunk_stream().await.unwrap();
|
||||
let result = stream.next_batch().await;
|
||||
|
||||
let Some(batch) = result.unwrap() else {
|
||||
// if batch does not contain any row
|
||||
assert!(expect.is_empty());
|
||||
return;
|
||||
};
|
||||
|
||||
assert_eq!(
|
||||
ConcreteDataType::Timestamp(TimestampType::Millisecond(TimestampMillisecondType)),
|
||||
batch.column(0).data_type()
|
||||
);
|
||||
|
||||
let ts = batch
|
||||
.column(0)
|
||||
.as_any()
|
||||
.downcast_ref::<TimestampMillisecondVector>()
|
||||
.unwrap()
|
||||
.iter_data()
|
||||
.map(|t| t.unwrap().0.value())
|
||||
.collect::<Vec<_>>();
|
||||
assert_eq!(expect, ts);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_parquet_reader_with_time_range_filter() {
|
||||
common_telemetry::init_default_ut_logging();
|
||||
let schema = memtable_tests::schema_for_test();
|
||||
let memtable = DefaultMemtableBuilder::default().build(schema.clone());
|
||||
|
||||
memtable_tests::write_kvs(
|
||||
&*memtable,
|
||||
10, // sequence
|
||||
OpType::Put,
|
||||
&[1000, 1002, 2002, 2003, 2003, 1001, 3001], // keys
|
||||
&[
|
||||
(Some(1), Some(1234)),
|
||||
(Some(2), Some(1234)),
|
||||
(Some(7), Some(1234)),
|
||||
(Some(8), Some(1234)),
|
||||
(Some(9), Some(1234)),
|
||||
(Some(3), Some(1234)),
|
||||
(Some(7), Some(1234)),
|
||||
], // values
|
||||
);
|
||||
|
||||
let dir = create_temp_dir("read-parquet-by-range");
|
||||
let path = dir.path().to_str().unwrap();
|
||||
let object_store = create_object_store(path);
|
||||
let sst_file_handle = new_file_handle(FileId::random());
|
||||
let sst_file_name = sst_file_handle.file_name();
|
||||
let iter = memtable.iter(IterContext::default()).unwrap();
|
||||
let writer = ParquetWriter::new(&sst_file_name, Source::Iter(iter), object_store.clone());
|
||||
|
||||
let SstInfo {
|
||||
time_range,
|
||||
file_size,
|
||||
..
|
||||
} = writer
|
||||
.write_sst(&sst::WriteOptions::default())
|
||||
.await
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(
|
||||
Some((
|
||||
Timestamp::new_millisecond(1000),
|
||||
Timestamp::new_millisecond(3001)
|
||||
)),
|
||||
time_range
|
||||
);
|
||||
assert_ne!(file_size, 0);
|
||||
|
||||
let projected_schema = Arc::new(ProjectedSchema::new(schema, Some(vec![1, 0, 2])).unwrap());
|
||||
|
||||
check_range_read(
|
||||
sst_file_handle.clone(),
|
||||
object_store.clone(),
|
||||
projected_schema.clone(),
|
||||
TimestampRange::with_unit(1000, 2003, TimeUnit::Millisecond).unwrap(),
|
||||
vec![1000, 1001, 1002, 2002],
|
||||
)
|
||||
.await;
|
||||
|
||||
check_range_read(
|
||||
sst_file_handle.clone(),
|
||||
object_store.clone(),
|
||||
projected_schema.clone(),
|
||||
TimestampRange::with_unit(2002, 3001, TimeUnit::Millisecond).unwrap(),
|
||||
vec![2002, 2003],
|
||||
)
|
||||
.await;
|
||||
|
||||
// read a range without any rows.
|
||||
check_range_read(
|
||||
sst_file_handle.clone(),
|
||||
object_store.clone(),
|
||||
projected_schema.clone(),
|
||||
TimestampRange::with_unit(3002, 3003, TimeUnit::Millisecond).unwrap(),
|
||||
vec![],
|
||||
)
|
||||
.await;
|
||||
|
||||
//
|
||||
check_range_read(
|
||||
sst_file_handle.clone(),
|
||||
object_store.clone(),
|
||||
projected_schema.clone(),
|
||||
TimestampRange::with_unit(1000, 3000, TimeUnit::Millisecond).unwrap(),
|
||||
vec![1000, 1001, 1002, 2002, 2003],
|
||||
)
|
||||
.await;
|
||||
|
||||
// read full range
|
||||
check_range_read(
|
||||
sst_file_handle,
|
||||
object_store,
|
||||
projected_schema,
|
||||
TimestampRange::min_to_max(),
|
||||
vec![1000, 1001, 1002, 2002, 2003, 3001],
|
||||
)
|
||||
.await;
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_write_empty_file() {
|
||||
common_telemetry::init_default_ut_logging();
|
||||
let schema = memtable_tests::schema_for_test();
|
||||
let memtable = DefaultMemtableBuilder::default().build(schema.clone());
|
||||
|
||||
let dir = create_temp_dir("write-empty-file");
|
||||
let path = dir.path().to_str().unwrap();
|
||||
let mut builder = Fs::default();
|
||||
let _ = builder.root(path);
|
||||
let object_store = ObjectStore::new(builder).unwrap().finish();
|
||||
let sst_file_name = "test-empty.parquet";
|
||||
let iter = memtable.iter(IterContext::default()).unwrap();
|
||||
let writer = ParquetWriter::new(sst_file_name, Source::Iter(iter), object_store.clone());
|
||||
|
||||
let sst_info_opt = writer
|
||||
.write_sst(&sst::WriteOptions::default())
|
||||
.await
|
||||
.unwrap();
|
||||
assert!(sst_info_opt.is_none());
|
||||
// The file should not exist when no row has been written.
|
||||
assert!(!object_store.is_exist(sst_file_name).await.unwrap());
|
||||
}
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user