refactor: Remove usages of the old storage crate (#2777)

* chore: remove storage from some crates

* feat: remove storage config

* feat: remove storage from cmd

* feat: impl stream_to_parquet

* feat: remove storage from operator

* feat: remove stream writer from mito2

* feat: remove storage from project toml

* test: fix config api test

* docs: remove outdated configs

* refactor: remove storage directory
This commit is contained in:
Yingwen
2023-11-20 20:29:41 +08:00
committed by GitHub
parent 9558b3c201
commit b9146c88ff
118 changed files with 160 additions and 27554 deletions

119
Cargo.lock generated
View File

@@ -641,12 +641,6 @@ version = "0.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c59bdb34bc650a32731b31bd8f0829cc15d24a708ee31559e0bb34f2bc320cba"
[[package]]
name = "atomic_float"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "62af46d040ba9df09edc6528dae9d8e49f5f3e82f55b7d2ec31a733c38dbc49d"
[[package]]
name = "atty"
version = "0.2.14"
@@ -1205,7 +1199,6 @@ dependencies = [
"serde_json",
"session",
"snafu",
"storage",
"store-api",
"table",
"tokio",
@@ -1628,11 +1621,13 @@ dependencies = [
"common-runtime",
"common-test-util",
"datafusion",
"datatypes",
"derive_builder 0.12.0",
"futures",
"lazy_static",
"object-store",
"orc-rust",
"parquet",
"paste",
"regex",
"serde",
@@ -1722,7 +1717,7 @@ dependencies = [
"common-runtime",
"common-telemetry",
"common-time",
"criterion 0.4.0",
"criterion",
"dashmap",
"datafusion",
"datatypes",
@@ -2142,32 +2137,6 @@ dependencies = [
"cfg-if 1.0.0",
]
[[package]]
name = "criterion"
version = "0.3.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b01d6de93b2b6c65e17c634a26653a29d107b3c98c607c765bf38d041531cd8f"
dependencies = [
"atty",
"cast",
"clap 2.34.0",
"criterion-plot 0.4.5",
"csv",
"itertools 0.10.5",
"lazy_static",
"num-traits",
"oorandom",
"plotters",
"rayon",
"regex",
"serde",
"serde_cbor",
"serde_derive",
"serde_json",
"tinytemplate",
"walkdir",
]
[[package]]
name = "criterion"
version = "0.4.0"
@@ -2179,7 +2148,7 @@ dependencies = [
"cast",
"ciborium",
"clap 3.2.25",
"criterion-plot 0.5.0",
"criterion-plot",
"futures",
"itertools 0.10.5",
"lazy_static",
@@ -2196,16 +2165,6 @@ dependencies = [
"walkdir",
]
[[package]]
name = "criterion-plot"
version = "0.4.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2673cc8207403546f45f5fd319a974b1e6983ad1a3ee7e6041650013be041876"
dependencies = [
"cast",
"itertools 0.10.5",
]
[[package]]
name = "criterion-plot"
version = "0.5.0"
@@ -2681,7 +2640,6 @@ dependencies = [
"session",
"snafu",
"sql",
"storage",
"store-api",
"substrait 0.4.3",
"table",
@@ -3313,7 +3271,6 @@ dependencies = [
"snafu",
"sql",
"sqlparser 0.38.0 (git+https://github.com/GreptimeTeam/sqlparser-rs.git?rev=0fbae07d0c46dc18e3381c406d8b9b8abef6b1fd)",
"storage",
"store-api",
"strfmt",
"substrait 0.4.3",
@@ -5569,7 +5526,6 @@ dependencies = [
"snafu",
"sql",
"sqlparser 0.38.0 (git+https://github.com/GreptimeTeam/sqlparser-rs.git?rev=0fbae07d0c46dc18e3381c406d8b9b8abef6b1fd)",
"storage",
"store-api",
"substrait 0.4.3",
"table",
@@ -7966,7 +7922,7 @@ dependencies = [
"common-test-util",
"common-time",
"console",
"criterion 0.4.0",
"criterion",
"crossbeam-utils",
"datafusion",
"datafusion-common",
@@ -7998,7 +7954,6 @@ dependencies = [
"session",
"snafu",
"sql",
"storage",
"store-api",
"table",
"tokio",
@@ -8078,16 +8033,6 @@ dependencies = [
"serde_derive",
]
[[package]]
name = "serde_cbor"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2bef2ebfde456fb76bbcf9f59315333decc4fda0b2b44b420243c11e0f5ec1f5"
dependencies = [
"half 1.8.2",
"serde",
]
[[package]]
name = "serde_derive"
version = "1.0.190"
@@ -8829,60 +8774,6 @@ dependencies = [
"num-traits",
]
[[package]]
name = "storage"
version = "0.4.3"
dependencies = [
"api",
"arc-swap",
"arrow",
"arrow-array",
"async-compat",
"async-stream",
"async-trait",
"atomic_float",
"bytes",
"common-base",
"common-config",
"common-datasource",
"common-error",
"common-macro",
"common-query",
"common-recordbatch",
"common-runtime",
"common-telemetry",
"common-test-util",
"common-time",
"criterion 0.3.6",
"datafusion",
"datafusion-common",
"datafusion-expr",
"datafusion-physical-expr",
"datatypes",
"futures",
"futures-util",
"itertools 0.10.5",
"lazy_static",
"log-store",
"object-store",
"parquet",
"paste",
"prometheus",
"prost 0.12.1",
"rand",
"regex",
"serde",
"serde_json",
"snafu",
"store-api",
"table",
"tokio",
"tokio-util",
"tonic 0.10.2",
"tonic-build 0.9.2",
"uuid",
]
[[package]]
name = "store-api"
version = "0.4.3"

View File

@@ -49,7 +49,6 @@ members = [
"src/servers",
"src/session",
"src/sql",
"src/storage",
"src/store-api",
"src/table",
"tests-integration",
@@ -176,7 +175,6 @@ script = { path = "src/script" }
servers = { path = "src/servers" }
session = { path = "src/session" }
sql = { path = "src/sql" }
storage = { path = "src/storage" }
store-api = { path = "src/store-api" }
substrait = { path = "src/common/substrait" }
table = { path = "src/table" }

View File

@@ -53,33 +53,6 @@ type = "File"
# The local file cache capacity in bytes.
# cache_capacity = "256MB"
# Compaction options, see `standalone.example.toml`.
[storage.compaction]
max_inflight_tasks = 4
max_files_in_level0 = 8
max_purge_tasks = 32
# Storage manifest options
[storage.manifest]
# Region checkpoint actions margin.
# Create a checkpoint every <checkpoint_margin> actions.
checkpoint_margin = 10
# Region manifest logs and checkpoints gc execution duration
gc_duration = '10m'
# Storage flush options
[storage.flush]
# Max inflight flush tasks.
max_flush_tasks = 8
# Default write buffer size for a region.
region_write_buffer_size = "32MB"
# Interval to check whether a region needs flush.
picker_schedule_interval = "5m"
# Interval to auto flush a region if it has not flushed yet.
auto_flush_interval = "1h"
# Global write buffer size for all regions.
global_write_buffer_size = "1GB"
# Mito engine options
[[region_engine]]
[region_engine.mito]

View File

@@ -122,36 +122,6 @@ type = "File"
# The local file cache capacity in bytes.
# cache_capacity = "256MB"
# Compaction options.
[storage.compaction]
# Max task number that can concurrently run.
max_inflight_tasks = 4
# Max files in level 0 to trigger compaction.
max_files_in_level0 = 8
# Max task number for SST purge task after compaction.
max_purge_tasks = 32
# Storage manifest options
[storage.manifest]
# Region checkpoint actions margin.
# Create a checkpoint every <checkpoint_margin> actions.
checkpoint_margin = 10
# Region manifest logs and checkpoints gc execution duration
gc_duration = '10m'
# Storage flush options
[storage.flush]
# Max inflight flush tasks.
max_flush_tasks = 8
# Default write buffer size for a region.
region_write_buffer_size = "32MB"
# Interval to check whether a region needs flush.
picker_schedule_interval = "5m"
# Interval to auto flush a region if it has not flushed yet.
auto_flush_interval = "1h"
# Global write buffer size for all regions.
global_write_buffer_size = "1GB"
# Mito engine options
[[region_engine]]
[region_engine.mito]

View File

@@ -49,5 +49,4 @@ chrono.workspace = true
common-test-util.workspace = true
log-store.workspace = true
object-store.workspace = true
storage.workspace = true
tokio.workspace = true

View File

@@ -192,7 +192,7 @@ mod tests {
use std::time::Duration;
use common_test_util::temp_dir::create_named_temp_file;
use datanode::config::{CompactionConfig, FileConfig, ObjectStoreConfig, RegionManifestConfig};
use datanode::config::{FileConfig, ObjectStoreConfig};
use servers::heartbeat_options::HeartbeatOptions;
use servers::Mode;
@@ -232,16 +232,6 @@ mod tests {
type = "File"
data_home = "/tmp/greptimedb/"
[storage.compaction]
max_inflight_tasks = 3
max_files_in_level0 = 7
max_purge_tasks = 32
[storage.manifest]
checkpoint_margin = 9
gc_duration = '7s'
compress = true
[logging]
level = "debug"
dir = "/tmp/greptimedb/test/logs"
@@ -294,23 +284,6 @@ mod tests {
ObjectStoreConfig::File(FileConfig { .. })
));
assert_eq!(
CompactionConfig {
max_inflight_tasks: 3,
max_files_in_level0: 7,
max_purge_tasks: 32,
},
options.storage.compaction,
);
assert_eq!(
RegionManifestConfig {
checkpoint_margin: Some(9),
gc_duration: Some(Duration::from_secs(7)),
compress: true
},
options.storage.manifest,
);
assert_eq!("debug", options.logging.level.unwrap());
assert_eq!("/tmp/greptimedb/test/logs".to_string(), options.logging.dir);
}
@@ -387,18 +360,12 @@ mod tests {
file_size = "1GB"
purge_threshold = "50GB"
purge_interval = "10m"
read_batch_size = 128
sync_write = false
[storage]
type = "File"
data_home = "/tmp/greptimedb/"
[storage.compaction]
max_inflight_tasks = 3
max_files_in_level0 = 7
max_purge_tasks = 32
[logging]
level = "debug"
dir = "/tmp/greptimedb/test/logs"
@@ -409,26 +376,24 @@ mod tests {
temp_env::with_vars(
[
(
// storage.manifest.gc_duration = 9s
// wal.purge_interval = 1m
[
env_prefix.to_string(),
"storage".to_uppercase(),
"manifest".to_uppercase(),
"gc_duration".to_uppercase(),
"wal".to_uppercase(),
"purge_interval".to_uppercase(),
]
.join(ENV_VAR_SEP),
Some("9s"),
Some("1m"),
),
(
// storage.compaction.max_purge_tasks = 99
// wal.read_batch_size = 100
[
env_prefix.to_string(),
"storage".to_uppercase(),
"compaction".to_uppercase(),
"max_purge_tasks".to_uppercase(),
"wal".to_uppercase(),
"read_batch_size".to_uppercase(),
]
.join(ENV_VAR_SEP),
Some("99"),
Some("100"),
),
(
// meta_client.metasrv_addrs = 127.0.0.1:3001,127.0.0.1:3002,127.0.0.1:3003
@@ -456,10 +421,7 @@ mod tests {
};
// Should be read from env, env > default values.
assert_eq!(
opts.storage.manifest.gc_duration,
Some(Duration::from_secs(9))
);
assert_eq!(opts.wal.read_batch_size, 100,);
assert_eq!(
opts.meta_client.unwrap().metasrv_addrs,
vec![
@@ -470,19 +432,13 @@ mod tests {
);
// Should be read from config file, config file > env > default values.
assert_eq!(opts.storage.compaction.max_purge_tasks, 32);
assert_eq!(opts.wal.purge_interval, Duration::from_secs(60 * 10));
// Should be read from cli, cli > config file > env > default values.
assert_eq!(opts.wal.dir.unwrap(), "/other/wal/dir");
// Should be default value.
assert_eq!(
opts.storage.manifest.checkpoint_margin,
DatanodeOptions::default()
.storage
.manifest
.checkpoint_margin
);
assert_eq!(opts.http.addr, DatanodeOptions::default().http.addr);
},
);
}

View File

@@ -147,7 +147,6 @@ impl Options {
#[cfg(test)]
mod tests {
use std::io::Write;
use std::time::Duration;
use common_test_util::temp_dir::create_named_temp_file;
use datanode::config::{DatanodeOptions, ObjectStoreConfig};
@@ -179,11 +178,6 @@ mod tests {
read_batch_size = 128
sync_write = false
[storage.compaction]
max_inflight_tasks = 3
max_files_in_level0 = 7
max_purge_tasks = 32
[logging]
level = "debug"
dir = "/tmp/greptimedb/test/logs"
@@ -194,17 +188,6 @@ mod tests {
temp_env::with_vars(
// The following environment variables will be used to override the values in the config file.
[
(
// storage.manifest.checkpoint_margin = 99
[
env_prefix.to_string(),
"storage".to_uppercase(),
"manifest".to_uppercase(),
"checkpoint_margin".to_uppercase(),
]
.join(ENV_VAR_SEP),
Some("99"),
),
(
// storage.type = S3
[
@@ -225,17 +208,6 @@ mod tests {
.join(ENV_VAR_SEP),
Some("mybucket"),
),
(
// storage.manifest.gc_duration = 42s
[
env_prefix.to_string(),
"storage".to_uppercase(),
"manifest".to_uppercase(),
"gc_duration".to_uppercase(),
]
.join(ENV_VAR_SEP),
Some("42s"),
),
(
// wal.dir = /other/wal/dir
[
@@ -266,17 +238,12 @@ mod tests {
.unwrap();
// Check the configs from environment variables.
assert_eq!(opts.storage.manifest.checkpoint_margin, Some(99));
match opts.storage.store {
ObjectStoreConfig::S3(s3_config) => {
assert_eq!(s3_config.bucket, "mybucket".to_string());
}
_ => panic!("unexpected store type"),
}
assert_eq!(
opts.storage.manifest.gc_duration,
Some(Duration::from_secs(42))
);
assert_eq!(
opts.meta_client.unwrap().metasrv_addrs,
vec![

View File

@@ -21,11 +21,13 @@ common-error.workspace = true
common-macro.workspace = true
common-runtime.workspace = true
datafusion.workspace = true
datatypes.workspace = true
derive_builder.workspace = true
futures.workspace = true
lazy_static.workspace = true
object-store.workspace = true
orc-rust = "0.2"
parquet.workspace = true
paste = "1.0"
regex = "1.7"
serde.workspace = true

View File

@@ -166,6 +166,14 @@ pub enum Error {
#[snafu(display("Buffered writer closed"))]
BufferedWriterClosed { location: Location },
#[snafu(display("Failed to write parquet file, path: {}", path))]
WriteParquet {
path: String,
location: Location,
#[snafu(source)]
error: parquet::errors::ParquetError,
},
}
pub type Result<T> = std::result::Result<T, Error>;
@@ -178,7 +186,8 @@ impl ErrorExt for Error {
| ListObjects { .. }
| ReadObject { .. }
| WriteObject { .. }
| AsyncWrite { .. } => StatusCode::StorageUnavailable,
| AsyncWrite { .. }
| WriteParquet { .. } => StatusCode::StorageUnavailable,
UnsupportedBackendProtocol { .. }
| UnsupportedCompressionType { .. }
@@ -231,6 +240,7 @@ impl ErrorExt for Error {
InvalidConnection { location, .. } => Some(*location),
UnsupportedCompressionType { location, .. } => Some(*location),
UnsupportedFormat { location, .. } => Some(*location),
WriteParquet { location, .. } => Some(*location),
}
}
}

View File

@@ -12,11 +12,13 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::future::Future;
use std::pin::Pin;
use std::result;
use std::sync::Arc;
use arrow::record_batch::RecordBatch;
use arrow_schema::Schema;
use arrow_schema::{Schema, SchemaRef};
use async_trait::async_trait;
use datafusion::datasource::physical_plan::{FileMeta, ParquetFileReaderFactory};
use datafusion::error::Result as DatafusionResult;
@@ -26,11 +28,15 @@ use datafusion::parquet::errors::{ParquetError, Result as ParquetResult};
use datafusion::parquet::file::metadata::ParquetMetaData;
use datafusion::parquet::format::FileMetaData;
use datafusion::physical_plan::metrics::ExecutionPlanMetricsSet;
use datafusion::physical_plan::SendableRecordBatchStream;
use futures::future::BoxFuture;
use futures::StreamExt;
use object_store::{ObjectStore, Reader};
use parquet::basic::{Compression, ZstdLevel};
use parquet::file::properties::WriterProperties;
use snafu::ResultExt;
use crate::buffered_writer::{ArrowWriterCloser, DfRecordBatchEncoder};
use crate::buffered_writer::{ArrowWriterCloser, DfRecordBatchEncoder, LazyBufferedWriter};
use crate::error::{self, Result};
use crate::file_format::FileFormat;
use crate::share_buffer::SharedBuffer;
@@ -156,6 +162,103 @@ impl ArrowWriterCloser for ArrowWriter<SharedBuffer> {
}
}
/// Parquet writer that buffers row groups in memory and writes buffered data to an underlying
/// storage by chunks to reduce memory consumption.
pub struct BufferedWriter {
// Underlying lazy buffered writer; `write` and `close` delegate to it.
inner: InnerBufferedWriter,
}
/// Concrete [`LazyBufferedWriter`] instantiation wrapped by [`BufferedWriter`].
///
/// Type parameters, in order:
/// - `object_store::Writer`: the writer that receives the encoded bytes.
/// - `ArrowWriter<SharedBuffer>`: the parquet encoder, which writes into a [`SharedBuffer`].
/// - a boxed `FnMut(String)` factory that takes a path and returns a boxed, `Send` future
///   resolving to the `object_store::Writer` for that path.
type InnerBufferedWriter = LazyBufferedWriter<
object_store::Writer,
ArrowWriter<SharedBuffer>,
Box<
dyn FnMut(
String,
)
-> Pin<Box<dyn Future<Output = error::Result<object_store::Writer>> + Send>>
+ Send,
>,
>;
impl BufferedWriter {
/// Creates a buffered parquet writer that targets `path` in `store`.
///
/// `arrow_schema` and `props` are forwarded to [`ArrowWriter::try_new`].
/// `buffer_threshold` is used both as the capacity of the in-memory [`SharedBuffer`]
/// and as the first argument of [`LazyBufferedWriter::new`].
///
/// # Errors
///
/// Returns a `WriteParquet` error carrying `path` if the [`ArrowWriter`] cannot be created.
pub async fn try_new(
path: String,
store: ObjectStore,
arrow_schema: SchemaRef,
props: Option<WriterProperties>,
buffer_threshold: usize,
) -> error::Result<Self> {
let buffer = SharedBuffer::with_capacity(buffer_threshold);
// The arrow writer encodes into a clone of `buffer`; the original handle is given to the
// lazy writer below. NOTE(review): presumably both handles share the same bytes — confirm
// against `SharedBuffer`.
let arrow_writer = ArrowWriter::try_new(buffer.clone(), arrow_schema.clone(), props)
.context(error::WriteParquetSnafu { path: &path })?;
Ok(Self {
inner: LazyBufferedWriter::new(
buffer_threshold,
buffer,
arrow_writer,
&path,
// Factory that opens the object store writer for a given path.
// NOTE(review): presumably invoked lazily on the first flush — confirm against
// `LazyBufferedWriter`.
Box::new(move |path| {
// Clone the store so the returned future owns its own handle.
let store = store.clone();
Box::pin(async move {
store
.writer(&path)
.await
.context(error::WriteObjectSnafu { path })
})
}),
),
})
}
/// Write a record batch to stream writer.
///
/// Passes the batch to the inner writer and then calls `try_flush(false)` on it.
pub async fn write(&mut self, arrow_batch: &RecordBatch) -> error::Result<()> {
self.inner.write(arrow_batch).await?;
// NOTE(review): `false` presumably requests a non-forced flush that only happens once the
// buffer threshold is reached — confirm against `LazyBufferedWriter::try_flush`.
self.inner.try_flush(false).await?;
Ok(())
}
/// Close parquet writer.
///
/// Return file metadata and bytes written.
pub async fn close(self) -> error::Result<(FileMetaData, u64)> {
self.inner.close_with_arrow_writer().await
}
}
/// Output the stream to a parquet file.
///
/// Returns number of rows written.
pub async fn stream_to_parquet(
mut stream: SendableRecordBatchStream,
store: ObjectStore,
path: &str,
threshold: usize,
) -> Result<usize> {
let write_props = WriterProperties::builder()
.set_compression(Compression::ZSTD(ZstdLevel::default()))
.build();
let schema = stream.schema();
let mut buffered_writer = BufferedWriter::try_new(
path.to_string(),
store,
schema,
Some(write_props),
threshold,
)
.await?;
let mut rows_written = 0;
while let Some(batch) = stream.next().await {
let batch = batch.context(error::ReadRecordBatchSnafu)?;
buffered_writer.write(&batch).await?;
rows_written += batch.num_rows();
}
buffered_writer.close().await?;
Ok(rows_written)
}
#[cfg(test)]
mod tests {
use common_test_util::find_workspace_path;

View File

@@ -61,7 +61,6 @@ servers.workspace = true
session.workspace = true
snafu.workspace = true
sql.workspace = true
storage.workspace = true
store-api.workspace = true
substrait.workspace = true
table.workspace = true

View File

@@ -31,11 +31,6 @@ use serde::{Deserialize, Serialize};
use servers::heartbeat_options::HeartbeatOptions;
use servers::http::HttpOptions;
use servers::Mode;
use storage::config::{
EngineConfig as StorageEngineConfig, DEFAULT_AUTO_FLUSH_INTERVAL, DEFAULT_MAX_FLUSH_TASKS,
DEFAULT_PICKER_SCHEDULE_INTERVAL, DEFAULT_REGION_WRITE_BUFFER_SIZE,
};
use storage::scheduler::SchedulerConfig;
pub const DEFAULT_OBJECT_STORE_CACHE_SIZE: ReadableSize = ReadableSize::mb(256);
@@ -68,9 +63,6 @@ pub struct StorageConfig {
pub data_home: String,
#[serde(flatten)]
pub store: ObjectStoreConfig,
pub compaction: CompactionConfig,
pub manifest: RegionManifestConfig,
pub flush: FlushConfig,
}
impl Default for StorageConfig {
@@ -79,9 +71,6 @@ impl Default for StorageConfig {
global_ttl: None,
data_home: DEFAULT_DATA_HOME.to_string(),
store: ObjectStoreConfig::default(),
compaction: CompactionConfig::default(),
manifest: RegionManifestConfig::default(),
flush: FlushConfig::default(),
}
}
}
@@ -216,109 +205,6 @@ impl Default for ObjectStoreConfig {
}
}
/// Options for region manifest.
///
/// Defaults (see the `Default` impl below): `checkpoint_margin = Some(10)`,
/// `gc_duration = Some(600s)`, `compress = false`.
#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
#[serde(default)]
pub struct RegionManifestConfig {
/// Region manifest checkpoint actions margin.
/// The manifest service creates a checkpoint every `checkpoint_margin` actions.
pub checkpoint_margin: Option<u16>,
/// Region manifest logs and checkpoints gc task execution duration.
#[serde(with = "humantime_serde")]
pub gc_duration: Option<Duration>,
/// Whether to compress manifest and checkpoint files with gzip.
pub compress: bool,
}
impl Default for RegionManifestConfig {
/// Returns the default manifest options: margin of 10 actions, 600 seconds gc duration,
/// compression disabled.
fn default() -> Self {
Self {
checkpoint_margin: Some(10u16),
gc_duration: Some(Duration::from_secs(600)),
compress: false,
}
}
}
/// Options for table compaction.
///
/// Defaults (see the `Default` impl below): `max_inflight_tasks = 4`,
/// `max_files_in_level0 = 8`, `max_purge_tasks = 32`.
#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
#[serde(default)]
pub struct CompactionConfig {
/// Max task number that can concurrently run.
pub max_inflight_tasks: usize,
/// Max files in level 0 to trigger compaction.
pub max_files_in_level0: usize,
/// Max task number for SST purge task after compaction.
pub max_purge_tasks: usize,
}
impl Default for CompactionConfig {
/// Returns the default compaction options: 4 inflight tasks, 8 files in level 0,
/// 32 purge tasks.
fn default() -> Self {
Self {
max_inflight_tasks: 4,
max_files_in_level0: 8,
max_purge_tasks: 32,
}
}
}
/// Options for region flush.
///
/// Default values are taken from the `DEFAULT_*` constants used in the `Default` impl below;
/// `global_write_buffer_size` defaults to `None`.
#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
#[serde(default)]
pub struct FlushConfig {
/// Max inflight flush tasks.
pub max_flush_tasks: usize,
/// Default write buffer size for a region.
pub region_write_buffer_size: ReadableSize,
/// Interval to schedule auto flush picker to find region to flush.
#[serde(with = "humantime_serde")]
pub picker_schedule_interval: Duration,
/// Interval to auto flush a region if it has not flushed yet.
#[serde(with = "humantime_serde")]
pub auto_flush_interval: Duration,
/// Global write buffer size for all regions.
pub global_write_buffer_size: Option<ReadableSize>,
}
impl Default for FlushConfig {
/// Builds the default flush options from the `DEFAULT_*` constants.
fn default() -> Self {
Self {
max_flush_tasks: DEFAULT_MAX_FLUSH_TASKS,
region_write_buffer_size: DEFAULT_REGION_WRITE_BUFFER_SIZE,
// The interval constants are converted with `Duration::from_millis`, i.e. they are
// interpreted as milliseconds here.
picker_schedule_interval: Duration::from_millis(
DEFAULT_PICKER_SCHEDULE_INTERVAL.into(),
),
auto_flush_interval: Duration::from_millis(DEFAULT_AUTO_FLUSH_INTERVAL.into()),
global_write_buffer_size: None,
}
}
}
impl From<&DatanodeOptions> for SchedulerConfig {
    /// Derives the scheduler configuration from the datanode's compaction options.
    fn from(value: &DatanodeOptions) -> Self {
        let max_inflight_tasks = value.storage.compaction.max_inflight_tasks;
        Self { max_inflight_tasks }
    }
}
impl From<&DatanodeOptions> for StorageEngineConfig {
    /// Collects the manifest, compaction, flush and TTL settings of the datanode's
    /// storage options into a storage engine configuration.
    fn from(value: &DatanodeOptions) -> Self {
        let storage = &value.storage;
        let manifest = &storage.manifest;
        let compaction = &storage.compaction;
        let flush = &storage.flush;

        Self {
            // Manifest settings.
            compress_manifest: manifest.compress,
            manifest_checkpoint_margin: manifest.checkpoint_margin,
            manifest_gc_duration: manifest.gc_duration,
            // Compaction settings.
            max_files_in_l0: compaction.max_files_in_level0,
            max_purge_tasks: compaction.max_purge_tasks,
            // Flush settings.
            max_flush_tasks: flush.max_flush_tasks,
            region_write_buffer_size: flush.region_write_buffer_size,
            picker_schedule_interval: flush.picker_schedule_interval,
            auto_flush_interval: flush.auto_flush_interval,
            global_write_buffer_size: flush.global_write_buffer_size,
            // Storage-wide TTL.
            global_ttl: storage.global_ttl,
        }
    }
}
#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(default)]
pub struct DatanodeOptions {

View File

@@ -68,7 +68,6 @@ session.workspace = true
snafu.workspace = true
sql.workspace = true
sqlparser.workspace = true
storage.workspace = true
store-api.workspace = true
substrait.workspace = true
table.workspace = true

View File

@@ -121,14 +121,6 @@ pub enum Error {
source: common_datasource::error::Error,
},
#[snafu(display("Failed to write parquet file, path: {}", path))]
WriteParquet {
path: String,
location: Location,
#[snafu(source)]
error: parquet::errors::ParquetError,
},
#[snafu(display("Failed to read parquet file, path: {}", path))]
ReadParquet {
path: String,
@@ -428,7 +420,6 @@ impl ErrorExt for Error {
match self {
OpenDal { .. }
| WriteParquet { .. }
| ReadParquet { .. }
| WriteWal { .. }
| ReadWal { .. }

View File

@@ -17,5 +17,4 @@
pub mod file;
pub mod file_purger;
pub mod parquet;
mod stream_writer;
pub(crate) mod version;

View File

@@ -14,6 +14,7 @@
//! Parquet writer.
use common_datasource::file_format::parquet::BufferedWriter;
use common_telemetry::debug;
use common_time::Timestamp;
use object_store::ObjectStore;
@@ -25,11 +26,10 @@ use snafu::ResultExt;
use store_api::metadata::RegionMetadataRef;
use store_api::storage::consts::SEQUENCE_COLUMN_NAME;
use crate::error::{InvalidMetadataSnafu, Result};
use crate::error::{InvalidMetadataSnafu, Result, WriteBufferSnafu};
use crate::read::{Batch, Source};
use crate::sst::parquet::format::WriteFormat;
use crate::sst::parquet::{SstInfo, WriteOptions, PARQUET_METADATA_KEY};
use crate::sst::stream_writer::BufferedWriter;
/// Parquet SST writer.
pub struct ParquetWriter {
@@ -83,14 +83,18 @@ impl ParquetWriter {
Some(writer_props),
opts.write_buffer_size.as_bytes() as usize,
)
.await?;
.await
.context(WriteBufferSnafu)?;
let mut stats = SourceStats::default();
while let Some(batch) = self.source.next_batch().await? {
stats.update(&batch);
let arrow_batch = write_format.convert_batch(&batch)?;
buffered_writer.write(&arrow_batch).await?;
buffered_writer
.write(&arrow_batch)
.await
.context(WriteBufferSnafu)?;
}
if stats.num_rows == 0 {
@@ -99,11 +103,11 @@ impl ParquetWriter {
self.file_path
);
buffered_writer.close().await?;
buffered_writer.close().await.context(WriteBufferSnafu)?;
return Ok(None);
}
let (_file_meta, file_size) = buffered_writer.close().await?;
let (_file_meta, file_size) = buffered_writer.close().await.context(WriteBufferSnafu)?;
// Safety: num rows > 0 so we must have min/max.
let time_range = stats.time_range.unwrap();

View File

@@ -1,105 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::future::Future;
use std::pin::Pin;
use common_datasource::buffered_writer::LazyBufferedWriter;
use common_datasource::share_buffer::SharedBuffer;
use datatypes::arrow::datatypes::SchemaRef;
use datatypes::arrow::record_batch::RecordBatch;
use object_store::ObjectStore;
use parquet::arrow::ArrowWriter;
use parquet::file::properties::WriterProperties;
use parquet::format::FileMetaData;
use snafu::ResultExt;
use crate::error;
use crate::error::WriteParquetSnafu;
/// Parquet writer that buffers row groups in memory and writes buffered data to an underlying
/// storage by chunks to reduce memory consumption.
pub struct BufferedWriter {
// Underlying lazy buffered writer; `write` and `close` delegate to it.
inner: InnerBufferedWriter,
}
/// Concrete [`LazyBufferedWriter`] instantiation wrapped by [`BufferedWriter`].
///
/// Type parameters, in order:
/// - `object_store::Writer`: the writer that receives the encoded bytes.
/// - `ArrowWriter<SharedBuffer>`: the parquet encoder, which writes into a [`SharedBuffer`].
/// - a boxed `FnMut(String)` factory that takes a path and returns a boxed, `Send` future
///   resolving to the `object_store::Writer` for that path (using the
///   `common_datasource` error type).
type InnerBufferedWriter = LazyBufferedWriter<
object_store::Writer,
ArrowWriter<SharedBuffer>,
Box<
dyn FnMut(
String,
) -> Pin<
Box<
dyn Future<Output = common_datasource::error::Result<object_store::Writer>>
+ Send,
>,
> + Send,
>,
>;
impl BufferedWriter {
/// Creates a buffered parquet writer that targets `path` in `store`.
///
/// `arrow_schema` and `props` are forwarded to [`ArrowWriter::try_new`].
/// `buffer_threshold` is used both as the capacity of the in-memory [`SharedBuffer`]
/// and as the first argument of [`LazyBufferedWriter::new`].
///
/// # Errors
///
/// Returns a `WriteParquet` error carrying `path` if the [`ArrowWriter`] cannot be created.
pub async fn try_new(
path: String,
store: ObjectStore,
arrow_schema: SchemaRef,
props: Option<WriterProperties>,
buffer_threshold: usize,
) -> error::Result<Self> {
let buffer = SharedBuffer::with_capacity(buffer_threshold);
// The arrow writer encodes into a clone of `buffer`; the original handle is given to the
// lazy writer below. NOTE(review): presumably both handles share the same bytes — confirm
// against `SharedBuffer`.
let arrow_writer = ArrowWriter::try_new(buffer.clone(), arrow_schema.clone(), props)
.context(WriteParquetSnafu { path: &path })?;
Ok(Self {
inner: LazyBufferedWriter::new(
buffer_threshold,
buffer,
arrow_writer,
&path,
// Factory that opens the object store writer for a given path.
// NOTE(review): presumably invoked lazily on the first flush — confirm against
// `LazyBufferedWriter`.
Box::new(move |path| {
// Clone the store so the returned future owns its own handle.
let store = store.clone();
Box::pin(async move {
store
.writer(&path)
.await
.context(common_datasource::error::WriteObjectSnafu { path })
})
}),
),
})
}
/// Write a record batch to stream writer.
///
/// Passes the batch to the inner writer and then calls `try_flush(false)` on it; errors from
/// either step are wrapped with `WriteBufferSnafu`.
pub async fn write(&mut self, arrow_batch: &RecordBatch) -> error::Result<()> {
self.inner
.write(arrow_batch)
.await
.context(error::WriteBufferSnafu)?;
// NOTE(review): `false` presumably requests a non-forced flush that only happens once the
// buffer threshold is reached — confirm against `LazyBufferedWriter::try_flush`.
self.inner
.try_flush(false)
.await
.context(error::WriteBufferSnafu)?;
Ok(())
}
/// Close parquet writer.
///
/// Returns the `FileMetaData` and a `u64` produced by the inner writer's
/// `close_with_arrow_writer`; errors are wrapped with `WriteBufferSnafu`.
pub async fn close(self) -> error::Result<(FileMetaData, u64)> {
self.inner
.close_with_arrow_writer()
.await
.context(error::WriteBufferSnafu)
}
}

View File

@@ -50,7 +50,6 @@ session.workspace = true
snafu.workspace = true
sql.workspace = true
sqlparser.workspace = true
storage.workspace = true
store-api.workspace = true
substrait.workspace = true
table.workspace = true

View File

@@ -378,12 +378,6 @@ pub enum Error {
error: datafusion::error::DataFusionError,
},
#[snafu(display("Failed to write parquet file"))]
WriteParquet {
location: Location,
source: storage::error::Error,
},
#[snafu(display(
"Schema datatypes not match at index {}, expected table schema: {}, actual file schema: {}",
index,
@@ -594,7 +588,6 @@ impl ErrorExt for Error {
| Error::ParseUrl { source, .. }
| Error::BuildBackend { source, .. } => source.status_code(),
Error::WriteParquet { source, .. } => source.status_code(),
Error::ExecuteDdl { source, .. } => source.status_code(),
Error::InvalidCopyParameter { .. } => StatusCode::InvalidArguments,

View File

@@ -17,6 +17,7 @@ use std::sync::Arc;
use common_base::readable_size::ReadableSize;
use common_datasource::file_format::csv::stream_to_csv;
use common_datasource::file_format::json::stream_to_json;
use common_datasource::file_format::parquet::stream_to_parquet;
use common_datasource::file_format::Format;
use common_datasource::object_store::{build_backend, parse_url};
use common_datasource::util::find_dir_and_filename;
@@ -31,17 +32,17 @@ use object_store::ObjectStore;
use query::plan::LogicalPlan;
use session::context::QueryContextRef;
use snafu::{OptionExt, ResultExt};
use storage::sst::SstInfo;
use storage::{ParquetWriter, Source};
use table::engine::TableReference;
use table::requests::CopyTableRequest;
use table::table::adapter::DfTableProviderAdapter;
use crate::error::{
self, BuildDfLogicalPlanSnafu, ExecLogicalPlanSnafu, Result, WriteParquetSnafu,
};
use crate::error::{self, BuildDfLogicalPlanSnafu, ExecLogicalPlanSnafu, Result};
use crate::statement::StatementExecutor;
// The buffer size should be greater than 5MB (minimum multipart upload size).
/// Buffer size to flush data to object stores.
const WRITE_BUFFER_THRESHOLD: ReadableSize = ReadableSize::mb(8);
impl StatementExecutor {
async fn stream_to_file(
&self,
@@ -50,7 +51,7 @@ impl StatementExecutor {
object_store: ObjectStore,
path: &str,
) -> Result<usize> {
let threshold = ReadableSize::mb(4).as_bytes() as usize;
let threshold = WRITE_BUFFER_THRESHOLD.as_bytes() as usize;
match format {
Format::Csv(_) => stream_to_csv(
@@ -69,17 +70,14 @@ impl StatementExecutor {
)
.await
.context(error::WriteStreamToFileSnafu { path }),
Format::Parquet(_) => {
let writer = ParquetWriter::new(path, Source::Stream(stream), object_store);
let rows_copied = writer
.write_sst(&storage::sst::WriteOptions::default())
.await
.context(WriteParquetSnafu)?
.map(|SstInfo { num_rows, .. }| num_rows)
.unwrap_or(0);
Ok(rows_copied)
}
Format::Parquet(_) => stream_to_parquet(
Box::pin(DfRecordBatchStreamAdapter::new(stream)),
object_store,
path,
threshold,
)
.await
.context(error::WriteStreamToFileSnafu { path }),
_ => error::UnsupportedFormatSnafu { format: *format }.fail(),
}
}

View File

@@ -85,7 +85,6 @@ rayon = "1.0"
ron = "0.7"
serde = { version = "1.0", features = ["derive"] }
session = { workspace = true, features = ["testing"] }
storage.workspace = true
tokio-test = "0.4"
[[bench]]

View File

@@ -1,64 +0,0 @@
[package]
name = "storage"
version.workspace = true
edition.workspace = true
license.workspace = true
[dependencies]
api.workspace = true
arc-swap = "1.0"
arrow-array.workspace = true
arrow.workspace = true
async-compat = "0.2"
async-stream.workspace = true
async-trait = "0.1"
bytes = "1.1"
common-base.workspace = true
common-datasource.workspace = true
common-error.workspace = true
common-macro.workspace = true
common-query.workspace = true
common-recordbatch.workspace = true
common-runtime.workspace = true
common-telemetry.workspace = true
common-time.workspace = true
datafusion-common.workspace = true
datafusion-expr.workspace = true
datafusion-physical-expr.workspace = true
datafusion.workspace = true
datatypes.workspace = true
futures-util.workspace = true
futures.workspace = true
itertools.workspace = true
lazy_static.workspace = true
object-store.workspace = true
parquet = { workspace = true, features = ["async"] }
paste.workspace = true
prometheus.workspace = true
prost.workspace = true
regex = "1.5"
serde.workspace = true
serde_json = "1.0"
snafu.workspace = true
store-api.workspace = true
table.workspace = true
tokio-util.workspace = true
tokio.workspace = true
tonic.workspace = true
uuid.workspace = true
[dev-dependencies]
atomic_float = "0.1"
common-config.workspace = true
common-test-util.workspace = true
criterion = "0.3"
datatypes = { workspace = true, features = ["test"] }
log-store.workspace = true
rand.workspace = true
[build-dependencies]
tonic-build = "0.9"
[[bench]]
name = "bench_main"
harness = false

View File

@@ -1,27 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use criterion::criterion_main;
mod memtable;
mod wal;
// Expands to the `main` function of this benchmark binary and runs every
// benchmark group listed below (memtable groups first, then the WAL codec groups).
criterion_main! {
memtable::bench_memtable_read::benches,
memtable::bench_memtable_write::benches,
memtable::bench_memtable_read_write_ratio::benches,
wal::bench_wal::benches,
wal::bench_decode::benches,
wal::bench_encode::benches,
}

View File

@@ -1,33 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use criterion::{criterion_group, criterion_main, Criterion, Throughput};
use crate::memtable::generate_kvs;
use crate::memtable::util::bench_context::BenchContext;
/// Benchmarks reading from a memtable that has been pre-filled with generated data.
///
/// The memtable is populated once, outside the measured closure, so only
/// `BenchContext::read` (called with a batch size of 100) is timed.
fn bench_memtable_read(c: &mut Criterion) {
    // Every generated string value is 20 characters long.
    let batches = generate_kvs(10, 10000, 20);
    let ctx = BenchContext::new();
    for batch in &batches {
        ctx.write(batch);
    }

    let mut group = c.benchmark_group("memtable_read");
    let _ = group.throughput(Throughput::Elements(10 * 10000));
    let _ = group.bench_function("read", |b| b.iter(|| ctx.read(100)));
    group.finish();
}

criterion_group!(benches, bench_memtable_read);
criterion_main!(benches);

View File

@@ -1,151 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
use std::sync::Arc;
use std::thread;
use std::time::Instant;
use atomic_float::AtomicF64;
use criterion::{
criterion_group, criterion_main, BatchSize, Bencher, BenchmarkId, Criterion, Throughput,
};
use rand::Rng;
use crate::memtable::generate_kvs;
use crate::memtable::util::bench_context::BenchContext;
// Global totals updated by `memtable_round` and reset by
// `bench_memtable_read_write_ratio` before each ratio is benchmarked.
// Sum of the counts returned by timed reads.
static READ_NUM: AtomicUsize = AtomicUsize::new(0);
// Sum of `kv.len()` over all timed writes.
static WRITE_NUM: AtomicUsize = AtomicUsize::new(0);
// Seconds spent inside timed reads.
static READ_SECS: AtomicF64 = AtomicF64::new(0.0);
// Seconds spent inside timed writes.
static WRITE_SECS: AtomicF64 = AtomicF64::new(0.0);
/// Parameters of a single round executed by `memtable_round`.
struct Input {
// `true` selects a read round, `false` a write round.
ratio: bool,
// First argument passed to `generate_kvs` for a write round.
kv_size: usize,
// Batch size passed to `read`, or the number of batches generated for a write round.
batch_size: usize,
}
/// Executes one read or write round against the memtable and records its cost.
///
/// When `input.ratio` is `true` a single timed `read` is issued and its returned count is
/// added to `READ_NUM`; otherwise freshly generated batches are written one by one and each
/// write is timed individually. Elapsed time is accumulated in `READ_SECS` / `WRITE_SECS`.
fn memtable_round(ctx: &BenchContext, input: &Input) {
    if input.ratio {
        let started = Instant::now();
        let read_count = ctx.read(input.batch_size);
        let elapsed = started.elapsed();
        // `as_secs_f64` replaces the hand-rolled `secs + nanos * 1e-9` conversion.
        let _ = READ_SECS.fetch_add(elapsed.as_secs_f64(), Ordering::Relaxed);
        let _ = READ_NUM.fetch_add(read_count, Ordering::Relaxed);
    } else {
        // Data generation stays outside the timed section; only `write` is measured.
        let batches = generate_kvs(input.kv_size, input.batch_size, 20);
        for kv in &batches {
            let started = Instant::now();
            ctx.write(kv);
            let elapsed = started.elapsed();
            let _ = WRITE_SECS.fetch_add(elapsed.as_secs_f64(), Ordering::Relaxed);
            let _ = WRITE_NUM.fetch_add(kv.len(), Ordering::Relaxed);
        }
    }
}
fn bench_read_write_ctx_frac(b: &mut Bencher<'_>, frac: &usize) {
let frac = *frac;
let ctx = Arc::new(BenchContext::default());
let thread_ctx = ctx.clone();
let stop = Arc::new(AtomicBool::new(false));
let thread_stop = stop.clone();
let handle = thread::spawn(move || {
let mut rng = rand::thread_rng();
while !thread_stop.load(Ordering::Relaxed) {
let f = rng.gen_range(0..=10);
let input = Input {
ratio: f < frac,
kv_size: 100,
batch_size: 1000,
};
memtable_round(&thread_ctx, &input);
}
});
let mut rng = rand::thread_rng();
b.iter_batched_ref(
|| {
let f = rng.gen_range(0..=10);
Input {
ratio: f < frac,
kv_size: 100,
batch_size: 1000,
}
},
|input| {
memtable_round(&ctx, input);
},
BatchSize::SmallInput,
);
stop.store(true, Ordering::Relaxed);
handle.join().unwrap();
}
/// Benchmarks the memtable under read/write mixes from 0% to 100% reads in 10% steps.
///
/// After each mix the totals gathered by `memtable_round` are printed as an additional,
/// coarse throughput summary.
// Printing is intentional here: the summary is reported outside of criterion's own output.
#[allow(clippy::print_stdout)]
fn bench_memtable_read_write_ratio(c: &mut Criterion) {
    let mut group = c.benchmark_group("memtable_read_write_ratio");
    // The throughput has to be configured before a benchmark runs. It used to be chained
    // after `bench_with_input`, which left the first benchmark without any throughput.
    let _ = group.throughput(Throughput::Elements(100 * 1000));

    for i in 0..=10 {
        // Start every mix with clean counters.
        READ_NUM.store(0, Ordering::Relaxed);
        WRITE_NUM.store(0, Ordering::Relaxed);
        READ_SECS.store(0.0, Ordering::Relaxed);
        WRITE_SECS.store(0.0, Ordering::Relaxed);

        let _ = group.bench_with_input(
            BenchmarkId::from_parameter(format!(
                "read ratio: {:.2}% , write ratio: {:.2}%",
                i as f64 / 10_f64 * 100.0,
                (10 - i) as f64 / 10_f64 * 100.0,
            )),
            &i,
            bench_read_write_ctx_frac,
        );

        // The accumulated time differs slightly from the real wall-clock time.
        let read_num = READ_NUM.load(Ordering::Relaxed);
        let read_time = READ_SECS.load(Ordering::Relaxed);
        let read_tps = if read_time != 0.0 {
            read_num as f64 / read_time
        } else {
            0.0
        };

        let write_num = WRITE_NUM.load(Ordering::Relaxed);
        let write_time = WRITE_SECS.load(Ordering::Relaxed);
        let write_tps = if write_time != 0.0 {
            write_num as f64 / write_time
        } else {
            0.0
        };

        if read_num != 0 || write_num != 0 {
            println!(
                "\nread numbers: {read_num}, read thrpt: {read_tps}\nwrite numbers: {write_num}, write thrpt {write_tps}\n",
            );
        }
    }
    group.finish();
}

criterion_group!(benches, bench_memtable_read_write_ratio);
criterion_main!(benches);

View File

@@ -1,34 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use criterion::{criterion_group, criterion_main, Criterion, Throughput};
use crate::memtable::generate_kvs;
use crate::memtable::util::bench_context::BenchContext;
/// Benchmarks writing pre-generated batches into a memtable.
///
/// The memtable is created once per benchmark function, so it keeps growing across the
/// iterations criterion executes.
pub fn bench_memtable_write(c: &mut Criterion) {
    // Every generated string value is 20 characters long.
    let batches = generate_kvs(10, 1000, 20);

    let mut group = c.benchmark_group("memtable_write");
    let _ = group.throughput(Throughput::Elements(10 * 1000));
    let _ = group.bench_function("write", |b| {
        let ctx = BenchContext::new();
        b.iter(|| {
            for batch in &batches {
                ctx.write(batch);
            }
        })
    });
    group.finish();
}

criterion_group!(benches, bench_memtable_write);
criterion_main!(benches);

View File

@@ -1,121 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
pub mod bench_memtable_read;
pub mod bench_memtable_read_write_ratio;
pub mod bench_memtable_write;
pub mod util;
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Arc;
use api::v1::OpType;
use datatypes::prelude::ScalarVectorBuilder;
use datatypes::timestamp::TimestampMillisecond;
use datatypes::vectors::{
StringVectorBuilder, TimestampMillisecondVectorBuilder, UInt64VectorBuilder,
};
use rand::distributions::Alphanumeric;
use rand::prelude::ThreadRng;
use rand::Rng;
use storage::memtable::KeyValues;
use store_api::storage::SequenceNumber;
/// Counter backing `get_sequence`; starts at 0 and is shared by all callers.
static NEXT_SEQUENCE: AtomicU64 = AtomicU64::new(0);
/// Returns the current counter value and atomically advances the counter by one.
fn get_sequence() -> SequenceNumber {
NEXT_SEQUENCE.fetch_add(1, Ordering::Relaxed)
}
/// Generates one random key/value pair.
///
/// The key is `(value in 0..10000, random u64)`; the value is `(Some(random u64), random
/// alphanumeric string of `value_size` characters)`. All randomness is drawn from `rng`;
/// previously the string was produced from a second `thread_rng()` handle even though a
/// generator had been passed in.
fn random_kv(rng: &mut ThreadRng, value_size: usize) -> ((i64, u64), (Option<u64>, String)) {
    let key0 = rng.gen_range(0..10000);
    let key1 = rng.gen::<u64>();
    let value1 = Some(rng.gen::<u64>());
    // `sample_iter` takes the generator by value, so hand it a reborrow of `rng`.
    let value2 = (&mut *rng)
        .sample_iter(&Alphanumeric)
        .take(value_size)
        .map(char::from)
        .collect();
    ((key0, key1), (value1, value2))
}
/// Key of a generated row.
type KeyTuple = (i64, u64);
/// Value of a generated row.
type ValueTuple = (Option<u64>, String);

/// Generates `len` random rows and returns their keys and values as two parallel vectors.
fn random_kvs(len: usize, value_size: usize) -> (Vec<KeyTuple>, Vec<ValueTuple>) {
    (0..len)
        .map(|_| {
            let mut rng = rand::thread_rng();
            random_kv(&mut rng, value_size)
        })
        .unzip()
}
/// Packs the given keys and values into a `KeyValues` batch.
///
/// The first key component becomes the timestamp column, the second one the only row-key
/// column; the two value components become the two value columns.
fn kvs_with_index(
    sequence: SequenceNumber,
    op_type: OpType,
    start_index_in_batch: usize,
    keys: &[(i64, u64)],
    values: &[(Option<u64>, String)],
) -> KeyValues {
    let mut timestamp_builder = TimestampMillisecondVectorBuilder::with_capacity(keys.len());
    let mut key_builder = UInt64VectorBuilder::with_capacity(keys.len());
    for (timestamp, key) in keys {
        timestamp_builder.push(Some(TimestampMillisecond::from(*timestamp)));
        key_builder.push(Some(*key));
    }

    let mut number_builder = UInt64VectorBuilder::with_capacity(values.len());
    let mut string_builder = StringVectorBuilder::with_capacity(values.len());
    for (number, text) in values {
        number_builder.push(*number);
        string_builder.push(Some(text.as_str()));
    }

    KeyValues {
        sequence,
        op_type,
        start_index_in_batch,
        keys: vec![Arc::new(key_builder.finish()) as _],
        values: vec![
            Arc::new(number_builder.finish()) as _,
            Arc::new(string_builder.finish()) as _,
        ],
        timestamp: Some(Arc::new(timestamp_builder.finish()) as _),
    }
}
/// Generates one `Put` batch of `kv_size` random rows with a freshly allocated sequence.
fn generate_kv(kv_size: usize, start_index_in_batch: usize, value_size: usize) -> KeyValues {
    let (keys, values) = random_kvs(kv_size, value_size);
    let sequence = get_sequence();
    kvs_with_index(sequence, OpType::Put, start_index_in_batch, &keys, &values)
}
/// Generates `size` batches of `kv_size` rows each; the batch position is used as its
/// `start_index_in_batch`.
fn generate_kvs(kv_size: usize, size: usize, value_size: usize) -> Vec<KeyValues> {
    let mut batches = Vec::with_capacity(size);
    for index in 0..size {
        batches.push(generate_kv(kv_size, index, value_size));
    }
    batches
}

View File

@@ -1,51 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use storage::memtable::{IterContext, KeyValues, MemtableRef};
use crate::memtable::util::new_memtable;
/// Benchmark fixture wrapping the memtable that the read and write benchmarks operate on.
pub struct BenchContext {
// Memtable created by `new_memtable`.
memtable: MemtableRef,
}
impl Default for BenchContext {
fn default() -> Self {
BenchContext::new()
}
}
impl BenchContext {
    /// Creates a context backed by a new memtable from `new_memtable`.
    pub fn new() -> BenchContext {
        let memtable = new_memtable();
        BenchContext { memtable }
    }

    /// Writes one batch into the memtable, panicking if the write fails.
    pub fn write(&self, kvs: &KeyValues) {
        self.memtable.write(kvs).unwrap();
    }

    /// Iterates over the whole memtable with the given batch size and returns the number
    /// of batches read multiplied by `batch_size`.
    ///
    /// NOTE(review): every batch is counted as `batch_size` rows, so a final, shorter batch
    /// would be over-counted — confirm whether an exact row count is wanted.
    pub fn read(&self, batch_size: usize) -> usize {
        let iter_ctx = IterContext {
            batch_size,
            ..Default::default()
        };

        let mut total = 0;
        for batch in self.memtable.iter(iter_ctx).unwrap() {
            // Only the success of the read matters; the data itself is discarded.
            let _ = batch.unwrap();
            total += batch_size;
        }
        total
    }
}

View File

@@ -1,40 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
pub mod bench_context;
pub mod regiondesc_util;
pub mod schema_util;
use datatypes::type_id::LogicalTypeId;
use storage::memtable::{DefaultMemtableBuilder, MemtableBuilder, MemtableRef};
use storage::metadata::RegionMetadata;
use storage::schema::RegionSchemaRef;
use crate::memtable::util::regiondesc_util::RegionDescBuilder;
/// Name of the timestamp column used by the benchmark region descriptor.
pub const TIMESTAMP_NAME: &str = "timestamp";
/// Builds the region schema used by the memtable benchmarks: a region named `bench` with
/// two nullable field columns, `v1` (UInt64) and `v2` (String).
pub fn schema_for_test() -> RegionSchemaRef {
    let builder = RegionDescBuilder::new("bench");
    let builder = builder.push_field_column(("v1", LogicalTypeId::UInt64, true));
    let builder = builder.push_field_column(("v2", LogicalTypeId::String, true));

    let metadata: RegionMetadata = builder.build().try_into().unwrap();
    metadata.schema().clone()
}
/// Creates a memtable with the default builder and the schema from `schema_for_test`.
pub fn new_memtable() -> MemtableRef {
    let builder = DefaultMemtableBuilder::default();
    builder.build(schema_for_test())
}

View File

@@ -1,80 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use datatypes::prelude::ConcreteDataType;
use store_api::storage::{
ColumnDescriptor, ColumnDescriptorBuilder, ColumnFamilyDescriptorBuilder, ColumnId,
RegionDescriptor, RowKeyDescriptorBuilder,
};
use super::schema_util::ColumnDef;
use super::TIMESTAMP_NAME;
/// Builder for the `RegionDescriptor` used by the benchmarks.
pub struct RegionDescBuilder {
// Region name placed into the descriptor.
name: String,
// Highest column id handed out so far; new columns get the next id.
last_column_id: ColumnId,
// Builder of the row key, created with the timestamp column.
key_builder: RowKeyDescriptorBuilder,
// Builder of the default column family that receives the field columns.
default_cf_builder: ColumnFamilyDescriptorBuilder,
}
impl RegionDescBuilder {
    /// Creates a builder for a region called `name` whose row key is built around a
    /// non-nullable millisecond timestamp column with id 1.
    pub fn new<T: Into<String>>(name: T) -> Self {
        let timestamp_column = ColumnDescriptorBuilder::new(
            1,
            TIMESTAMP_NAME,
            ConcreteDataType::timestamp_millisecond_datatype(),
        )
        .is_nullable(false)
        .build()
        .unwrap();

        Self {
            name: name.into(),
            // Id 1 is taken by the timestamp column above.
            last_column_id: 1,
            key_builder: RowKeyDescriptorBuilder::new(timestamp_column),
            default_cf_builder: ColumnFamilyDescriptorBuilder::default(),
        }
    }

    /// Appends a field column to the default column family.
    pub fn push_field_column(mut self, column_def: ColumnDef) -> Self {
        let column = self.new_column(column_def);
        self.default_cf_builder = self.default_cf_builder.push_column(column);
        self
    }

    /// Consumes the builder and produces the region descriptor (region id 0, no extra
    /// column families).
    pub fn build(self) -> RegionDescriptor {
        let Self {
            name,
            key_builder,
            default_cf_builder,
            ..
        } = self;

        RegionDescriptor {
            id: 0.into(),
            name,
            row_key: key_builder.build().unwrap(),
            default_cf: default_cf_builder.build().unwrap(),
            extra_cfs: Vec::new(),
        }
    }

    /// Hands out the next unused column id.
    fn alloc_column_id(&mut self) -> ColumnId {
        self.last_column_id += 1;
        self.last_column_id
    }

    /// Turns a `(name, type, nullable)` definition into a column descriptor with a new id.
    fn new_column(&mut self, column_def: ColumnDef) -> ColumnDescriptor {
        let (name, type_id, nullable) = column_def;
        let datatype = type_id.data_type();
        let column_id = self.alloc_column_id();
        ColumnDescriptorBuilder::new(column_id, name, datatype)
            .is_nullable(nullable)
            .build()
            .unwrap()
    }
}

View File

@@ -1,46 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::sync::Arc;
use datatypes::prelude::*;
use datatypes::schema::{ColumnSchema, Schema, SchemaBuilder, SchemaRef};
/// Column definition: (name, datatype, is_nullable)
pub type ColumnDef<'a> = (&'a str, LogicalTypeId, bool);

/// Builds a [`Schema`] from the column definitions.
///
/// When `timestamp_index` is `Some`, every column is explicitly flagged with whether its
/// position equals that index; when it is `None`, the time-index flag is left untouched.
/// Panics if the schema cannot be built.
pub fn new_schema(column_defs: &[ColumnDef], timestamp_index: Option<usize>) -> Schema {
    let mut column_schemas = Vec::with_capacity(column_defs.len());
    for (index, column_def) in column_defs.iter().enumerate() {
        let column = ColumnSchema::new(column_def.0, column_def.1.data_type(), column_def.2);
        let column = match timestamp_index {
            Some(timestamp_index) => column.with_time_index(index == timestamp_index),
            None => column,
        };
        column_schemas.push(column);
    }

    SchemaBuilder::try_from(column_schemas)
        .unwrap()
        .build()
        .unwrap()
}
/// Same as [`new_schema`], but returns the schema wrapped in an `Arc`.
pub fn new_schema_ref(column_defs: &[ColumnDef], timestamp_index: Option<usize>) -> SchemaRef {
    let schema = new_schema(column_defs, timestamp_index);
    Arc::new(schema)
}

View File

@@ -1,73 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use criterion::{criterion_group, criterion_main, Criterion};
use storage::codec::{Decoder, Encoder};
use storage::write_batch::{codec, WriteBatch};
use crate::wal::util::gen_new_batch_and_types;
/*
-------------------------------------
decode |
-------------------------------------
rows | protobuf | arrow |
------------------------------------
10 | 8.6485 us | 8.8028 us |
------------------------------------
100 | 63.850 us | 46.174 us |
------------------------------------
10000| 654.46 us | 433.58 us |
------------------------------------
*/
/// Encodes the payload of `batch` with the arrow payload encoder, appending to `dst`.
fn encode_arrow(batch: &WriteBatch, dst: &mut Vec<u8>) {
    codec::PayloadEncoder::new()
        .encode(batch.payload(), dst)
        .unwrap();
}
/// Decodes an arrow-encoded payload and discards the result; panics if decoding fails.
fn decode_arrow(dst: &[u8], mutation_types: &[i32]) {
    let _ = codec::PayloadDecoder::new(mutation_types)
        .decode(dst)
        .unwrap();
}
/// Benchmarks decoding arrow-encoded write batch payloads of increasing size.
///
/// The payloads are encoded once up front so only decoding is measured. Each put built by
/// `gen_new_batch_and_types` holds 10 rows, so 1, 10 and 100 puts give batches of 10, 100
/// and 1000 rows. The largest case was previously labelled `10000`, overstating its size
/// tenfold; the benchmark id now states the real row count.
fn bench_wal_decode(c: &mut Criterion) {
    let (batch_10, types_10) = gen_new_batch_and_types(1);
    let (batch_100, types_100) = gen_new_batch_and_types(10);
    let (batch_1000, types_1000) = gen_new_batch_and_types(100);

    let mut dst_arrow_10 = vec![];
    let mut dst_arrow_100 = vec![];
    let mut dst_arrow_1000 = vec![];
    encode_arrow(&batch_10, &mut dst_arrow_10);
    encode_arrow(&batch_100, &mut dst_arrow_100);
    encode_arrow(&batch_1000, &mut dst_arrow_1000);

    let mut group = c.benchmark_group("wal_decode");
    let _ = group
        .bench_function("arrow_decode_with_10_num_rows", |b| {
            b.iter(|| decode_arrow(&dst_arrow_10, &types_10))
        })
        .bench_function("arrow_decode_with_100_num_rows", |b| {
            b.iter(|| decode_arrow(&dst_arrow_100, &types_100))
        })
        .bench_function("arrow_decode_with_1000_num_rows", |b| {
            b.iter(|| decode_arrow(&dst_arrow_1000, &types_1000))
        });
    group.finish();
}

criterion_group!(benches, bench_wal_decode);
criterion_main!(benches);

View File

@@ -1,61 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use criterion::{criterion_group, criterion_main, Criterion};
use storage::codec::Encoder;
use storage::write_batch::{codec, WriteBatch};
use crate::wal::util::gen_new_batch_and_types;
/*
-------------------------------------
encode |
-------------------------------------
rows | protobuf | arrow |
------------------------------------
10 | 4.8732 us | 5.7388 us |
------------------------------------
100 | 40.928 us | 24.988 us |
------------------------------------
10000| 425.69 us | 229.74 us |
------------------------------------
*/
/// Encodes the payload of `batch` into a throw-away buffer; panics if encoding fails.
fn encode_arrow(batch: &WriteBatch) {
    let mut buffer = vec![];
    codec::PayloadEncoder::new()
        .encode(batch.payload(), &mut buffer)
        .unwrap();
}
/// Benchmarks arrow encoding of write batch payloads of increasing size.
///
/// Each put built by `gen_new_batch_and_types` holds 10 rows, so 1, 10 and 100 puts give
/// batches of 10, 100 and 1000 rows. The largest case was previously labelled `10000`,
/// overstating its size tenfold; the benchmark id now states the real row count.
fn bench_wal_encode(c: &mut Criterion) {
    let (batch_10, _) = gen_new_batch_and_types(1);
    let (batch_100, _) = gen_new_batch_and_types(10);
    let (batch_1000, _) = gen_new_batch_and_types(100);

    let mut group = c.benchmark_group("wal_encode");
    let _ = group
        .bench_function("arrow_encode_with_10_num_rows", |b| {
            b.iter(|| encode_arrow(&batch_10))
        })
        .bench_function("arrow_encode_with_100_num_rows", |b| {
            b.iter(|| encode_arrow(&batch_100))
        })
        .bench_function("arrow_encode_with_1000_num_rows", |b| {
            b.iter(|| encode_arrow(&batch_1000))
        });
    group.finish();
}

criterion_group!(benches, bench_wal_encode);
criterion_main!(benches);

View File

@@ -1,64 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use criterion::{criterion_group, criterion_main, Criterion};
use storage::codec::{Decoder, Encoder};
use storage::write_batch::{codec, WriteBatch};
use crate::wal::util::gen_new_batch_and_types;
/*
-------------------------------------
encode & decode |
-------------------------------------
rows | protobuf | arrow |
------------------------------------
10 | 13.845 us | 15.093 us |
------------------------------------
100 | 106.70 us | 73.895 us |
------------------------------------
10000| 1.0860 ms | 680.12 us |
------------------------------------
*/
/// Encodes the payload of `batch` and immediately decodes it again, discarding the result.
fn codec_arrow(batch: &WriteBatch, mutation_types: &[i32]) {
    let mut encoded = vec![];
    codec::PayloadEncoder::new()
        .encode(batch.payload(), &mut encoded)
        .unwrap();

    let _ = codec::PayloadDecoder::new(mutation_types)
        .decode(&encoded)
        .unwrap();
}
/// Benchmarks an arrow encode + decode round trip for payloads of increasing size.
///
/// Each put built by `gen_new_batch_and_types` holds 10 rows, so 1, 10 and 100 puts give
/// batches of 10, 100 and 1000 rows. The largest case was previously labelled `10000`,
/// overstating its size tenfold; the benchmark id now states the real row count.
fn bench_wal_encode_decode(c: &mut Criterion) {
    let (batch_10, types_10) = gen_new_batch_and_types(1);
    let (batch_100, types_100) = gen_new_batch_and_types(10);
    let (batch_1000, types_1000) = gen_new_batch_and_types(100);

    let mut group = c.benchmark_group("wal_encode_decode");
    let _ = group
        .bench_function("arrow_encode_decode_with_10_num_rows", |b| {
            b.iter(|| codec_arrow(&batch_10, &types_10))
        })
        .bench_function("arrow_encode_decode_with_100_num_rows", |b| {
            b.iter(|| codec_arrow(&batch_100, &types_100))
        })
        .bench_function("arrow_encode_decode_with_1000_num_rows", |b| {
            b.iter(|| codec_arrow(&batch_1000, &types_1000))
        });
    group.finish();
}

criterion_group!(benches, bench_wal_encode_decode);
criterion_main!(benches);

View File

@@ -1,18 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
pub mod bench_decode;
pub mod bench_encode;
pub mod bench_wal;
pub mod util;

View File

@@ -1,94 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
pub mod write_batch_util;
use std::collections::HashMap;
use std::sync::Arc;
use datatypes::prelude::ScalarVector;
use datatypes::type_id::LogicalTypeId;
use datatypes::vectors::{
BooleanVector, Float64Vector, StringVector, TimestampMillisecondVector, UInt64Vector, VectorRef,
};
use rand::Rng;
use storage::proto;
use storage::write_batch::WriteBatch;
use store_api::storage::WriteRequest;
/// Creates an empty write batch with the ten-column schema used by the WAL benchmarks.
///
/// NOTE(review): the timestamp index passed below is `Some(2)`, which is the position of
/// `v1` (Boolean), while `ts` sits at index 1 — confirm that this is intended.
pub fn new_test_batch() -> WriteBatch {
    let column_defs = [
        ("k1", LogicalTypeId::UInt64, false),
        ("ts", LogicalTypeId::TimestampMillisecond, false),
        ("v1", LogicalTypeId::Boolean, true),
        ("4", LogicalTypeId::Float64, false),
        ("5", LogicalTypeId::Float64, false),
        ("6", LogicalTypeId::Float64, false),
        ("7", LogicalTypeId::Float64, false),
        ("8", LogicalTypeId::Float64, false),
        ("9", LogicalTypeId::Float64, false),
        ("10", LogicalTypeId::String, false),
    ];
    let timestamp_index = Some(2);
    let row_key_end = 3;
    write_batch_util::new_write_batch(&column_defs, timestamp_index, row_key_end)
}
/// Builds a write batch holding `putdate_nums` puts of 10 rows each and returns it together
/// with the mutation types derived from its payload.
///
/// Integer, boolean, timestamp and float columns are filled with random data; the six
/// float columns of a put share the same vector and the string column always carries the
/// same ten values.
pub fn gen_new_batch_and_types(putdate_nums: usize) -> (WriteBatch, Vec<i32>) {
    // Values of the string column, identical for every put.
    const STRINGS: [&str; 10] = [
        "value1_string",
        "value2_string",
        "value3_string",
        "value4_string",
        "value5_string",
        "value6_string",
        "value7_string",
        "value8_string",
        "value9_string",
        "value10_string",
    ];

    let mut batch = new_test_batch();
    let mut rng = rand::thread_rng();
    for _ in 0..putdate_nums {
        let mut ints = [0u64; 10];
        let mut bools = [true; 10];
        let mut timestamps = [0i64; 10];
        let mut floats = [0.0_f64; 10];
        rng.fill(&mut ints[..]);
        rng.fill(&mut bools[..]);
        rng.fill(&mut timestamps[..]);
        rng.fill(&mut floats[..]);

        let k1 = Arc::new(UInt64Vector::from_slice(ints)) as VectorRef;
        let v1 = Arc::new(BooleanVector::from(bools.to_vec())) as VectorRef;
        let ts = Arc::new(TimestampMillisecondVector::from_values(timestamps)) as VectorRef;
        let float_column = Arc::new(Float64Vector::from_slice(floats)) as VectorRef;
        let string_column = Arc::new(StringVector::from_slice(&STRINGS)) as VectorRef;

        let put_data = HashMap::from([
            ("k1".to_string(), k1),
            ("v1".to_string(), v1),
            ("ts".to_string(), ts),
            ("4".to_string(), float_column.clone()),
            ("5".to_string(), float_column.clone()),
            ("6".to_string(), float_column.clone()),
            ("7".to_string(), float_column.clone()),
            ("8".to_string(), float_column.clone()),
            ("9".to_string(), float_column),
            ("10".to_string(), string_column),
        ]);
        batch.put(put_data).unwrap();
    }

    let types = proto::wal::gen_mutation_types(batch.payload());
    (batch, types)
}

View File

@@ -1,27 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use storage::write_batch::WriteBatch;
use crate::memtable::util::schema_util::{self, ColumnDef};
/// Creates an empty [`WriteBatch`] whose schema is built from `column_defs` and
/// `timestamp_index`; `row_key_end` is forwarded unchanged to `WriteBatch::new`.
pub fn new_write_batch(
    column_defs: &[ColumnDef],
    timestamp_index: Option<usize>,
    row_key_end: usize,
) -> WriteBatch {
    WriteBatch::new(
        schema_util::new_schema_ref(column_defs, timestamp_index),
        row_key_end,
    )
}

View File

@@ -1,19 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/// Build script entry point: compiles `proto/wal.proto` with `tonic_build`, using the
/// current directory as the include path, and aborts the build if compilation fails.
fn main() {
    let protos = ["proto/wal.proto"];
    let includes = ["."];
    tonic_build::configure()
        .compile(&protos, &includes)
        .expect("compile proto");
}

View File

@@ -1,14 +0,0 @@
syntax = "proto3";
package greptime.storage.wal.v1;
// Header message of the WAL protocol defined in this package.
message WalHeader {
// Manifest version recorded in the header.
uint64 last_manifest_version = 1;
// Type of each mutation in payload, now only arrow payload uses this field.
repeated MutationType mutation_types = 2;
}
// Kind of a single mutation, as listed in `WalHeader.mutation_types`.
// DELETE is the zero value and therefore the proto3 default.
enum MutationType {
DELETE = 0;
PUT = 1;
}

View File

@@ -1,451 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::sync::Arc;
use async_trait::async_trait;
use common_query::logical_plan::Expr;
use common_recordbatch::OrderOption;
use common_telemetry::logging;
use common_time::range::TimestampRange;
use snafu::ResultExt;
use store_api::storage::{Chunk, ChunkReader, RegionId, SchemaRef, SequenceNumber};
use table::predicate::{Predicate, TimeRangePredicateBuilder};
use crate::error::{self, Error, Result};
use crate::memtable::{IterContext, MemtableRef};
use crate::read::{
Batch, BoxedBatchReader, ChainReader, DedupReader, MergeReaderBuilder, WindowedReader,
};
use crate::schema::{ProjectedSchema, ProjectedSchemaRef, RegionSchemaRef};
use crate::sst::{AccessLayerRef, FileHandle, LevelMetas, ReadOptions};
use crate::window_infer::{PlainWindowInference, WindowInfer};
/// Chunk reader implementation.
// The chunk reader is currently implemented with async-trait, which is easier than
// implementing `Stream`. Switching to `Stream` can be reconsidered if it proves to be more
// efficient and the change becomes necessary.
pub struct ChunkReaderImpl {
// Projected schema; provides the user schema and converts batches to chunks.
schema: ProjectedSchemaRef,
// Source of the batches returned by `next_chunk`.
batch_reader: BoxedBatchReader,
// Ordering reported by `output_ordering`, if any.
output_ordering: Option<Vec<OrderOption>>,
}
#[async_trait]
impl ChunkReader for ChunkReaderImpl {
    type Error = Error;

    /// Returns the projected user schema of this reader.
    fn user_schema(&self) -> &SchemaRef {
        self.schema.projected_user_schema()
    }

    /// Fetches the next batch from the underlying reader and wraps its columns into a
    /// chunk; returns `Ok(None)` once the reader is exhausted.
    async fn next_chunk(&mut self) -> Result<Option<Chunk>> {
        let next_batch = self.batch_reader.next_batch().await?;
        Ok(next_batch.map(|batch| Chunk::new(batch.columns)))
    }

    /// Converts a chunk produced by `next_chunk` into a chunk of the projected schema.
    fn project_chunk(&self, chunk: Chunk) -> Chunk {
        self.schema.batch_to_chunk(&Batch {
            columns: chunk.columns,
        })
    }

    /// Returns a copy of the output ordering this reader was created with.
    fn output_ordering(&self) -> Option<Vec<OrderOption>> {
        self.output_ordering.clone()
    }
}
impl ChunkReaderImpl {
pub fn new(
schema: ProjectedSchemaRef,
batch_reader: BoxedBatchReader,
output_ordering: Option<Vec<OrderOption>>,
) -> ChunkReaderImpl {
ChunkReaderImpl {
schema,
batch_reader,
output_ordering,
}
}
#[inline]
pub fn projected_schema(&self) -> &ProjectedSchemaRef {
&self.schema
}
}
/// Builder to create a new [ChunkReaderImpl] from scan request.
pub struct ChunkReaderBuilder {
// Id of the region, as passed to `new`.
region_id: RegionId,
// Schema of the region, as passed to `new`.
schema: RegionSchemaRef,
// Optional projection set via `projection`.
projection: Option<Vec<usize>>,
// Filter expressions set via `filters`.
filters: Vec<Expr>,
// SST access layer, as passed to `new`.
sst_layer: AccessLayerRef,
// Iteration context holding the batch size and the visible sequence.
iter_ctx: IterContext,
// Memtables added via `pick_memtables`.
memtables: Vec<MemtableRef>,
// SST files added via `pick_all_ssts` / `pick_ssts`.
files_to_read: Vec<FileHandle>,
// Requested output ordering set via `output_ordering`.
output_ordering: Option<Vec<OrderOption>>,
// Flag set via `use_chain_reader`; `false` by default.
use_chain_reader: bool,
}
impl ChunkReaderBuilder {
    /// Creates a builder for region `region_id` that reads SSTs through `sst_layer`.
    ///
    /// The builder starts without projection, filters, memtables, files or output
    /// ordering, with a default [IterContext] and with the chain reader disabled.
    pub fn new(region_id: RegionId, schema: RegionSchemaRef, sst_layer: AccessLayerRef) -> Self {
        ChunkReaderBuilder {
            region_id,
            schema,
            projection: None,
            filters: vec![],
            sst_layer,
            iter_ctx: IterContext::default(),
            memtables: Vec::new(),
            files_to_read: Vec::new(),
            output_ordering: None,
            use_chain_reader: false,
        }
    }

    /// Reserve space for iterating `num` memtables.
    pub fn reserve_num_memtables(mut self, num: usize) -> Self {
        self.memtables.reserve(num);
        self
    }

    /// Sets the projection that is handed to `ProjectedSchema::new()` in [Self::build].
    pub fn projection(mut self, projection: Option<Vec<usize>>) -> Self {
        self.projection = projection;
        self
    }

    /// Sets the filter expressions. They are used both as the SST read predicate and to
    /// derive the time range predicate.
    pub fn filters(mut self, filters: Vec<Expr>) -> Self {
        self.filters = filters;
        self
    }

    /// Sets the output ordering requested by the caller.
    pub fn output_ordering(mut self, ordering: Option<Vec<OrderOption>>) -> Self {
        self.output_ordering = ordering;
        self
    }

    /// Sets the batch size of the iteration context; it is also used for the merge
    /// reader and the SST read options.
    pub fn batch_size(mut self, batch_size: usize) -> Self {
        self.iter_ctx.batch_size = batch_size;
        self
    }

    /// Sets the visible sequence of the iteration context.
    pub fn visible_sequence(mut self, sequence: SequenceNumber) -> Self {
        self.iter_ctx.visible_sequence = sequence;
        self
    }

    /// Adds one memtable to the set of memtables to read.
    pub fn pick_memtables(mut self, memtables: MemtableRef) -> Self {
        self.memtables.push(memtables);
        self
    }

    /// Partition files and memtables according to their time windows and scan time windows
    /// one by one.
    ///
    /// Note that compaction should not enable this.
    pub fn use_chain_reader(mut self, use_chain_reader: bool) -> Self {
        self.use_chain_reader = use_chain_reader;
        self
    }

    /// Picks all SSTs in all levels.
    pub fn pick_all_ssts(mut self, ssts: &LevelMetas) -> Result<Self> {
        // We can't invoke async functions here, so we only collect the file handles and
        // create the batch readers later in `build_reader()`.
        //
        // `extend` is used instead of a manual `reserve` + `push` loop: the lower bound of
        // a `flat_map` size hint is usually 0, so reserving with it had no effect.
        self.files_to_read
            .extend(ssts.levels().iter().flat_map(|level| level.files()).cloned());
        Ok(self)
    }

    /// Picks given SSTs to read.
    pub fn pick_ssts(mut self, ssts: &[FileHandle]) -> Self {
        self.files_to_read.extend(ssts.iter().cloned());
        self
    }

    /// Try to infer time windows from the output ordering.
    ///
    /// Returns `None` when the output ordering can't be obeyed, i.e. it is empty or its
    /// first column is not the timestamp column. Otherwise returns the windows inferred
    /// from the picked files and the non-empty memtables, which means the output ordering
    /// is obeyed and is the same as the request.
    fn infer_time_windows(&self, output_ordering: &[OrderOption]) -> Option<Vec<TimestampRange>> {
        let OrderOption { name, options } = output_ordering.first()?;
        if name != self.schema.timestamp_column_name() {
            return None;
        }
        let memtable_stats = self
            .memtables
            .iter()
            .filter(|m| m.num_rows() > 0) // Skip empty memtables.
            .map(|m| m.stats())
            .collect::<Vec<_>>();
        let files = self
            .files_to_read
            .iter()
            .map(FileHandle::meta)
            .collect::<Vec<_>>();

        Some(PlainWindowInference {}.infer_window(&files, &memtable_stats, options.descending))
    }

    /// Builds one reader per window (each restricted to the intersection of the window
    /// and `time_range_predicate`) and combines them in a [WindowedReader].
    async fn build_windowed(
        self,
        schema: &ProjectedSchemaRef,
        time_range_predicate: &TimestampRange,
        windows: Vec<TimestampRange>,
        order_options: Vec<OrderOption>,
    ) -> Result<BoxedBatchReader> {
        let mut readers = Vec::with_capacity(windows.len());
        for window in windows {
            let time_range_predicate = time_range_predicate.and(&window);
            let reader = self.build_reader(schema, &time_range_predicate).await?;
            readers.push(reader);
        }
        let windowed_reader = WindowedReader::new(schema.clone(), readers, order_options);
        Ok(Box::new(windowed_reader) as Box<_>)
    }

    /// Builds a deduplicating merge reader over all picked memtables and over the picked
    /// SST files whose time range may intersect `time_range`.
    async fn build_reader(
        &self,
        schema: &ProjectedSchemaRef,
        time_range: &TimestampRange,
    ) -> Result<BoxedBatchReader> {
        let num_sources = self.memtables.len() + self.files_to_read.len();
        let mut reader_builder = MergeReaderBuilder::with_capacity(schema.clone(), num_sources)
            .batch_size(self.iter_ctx.batch_size);

        for mem in &self.memtables {
            // Every memtable iterator gets its own context restricted to `time_range`.
            let mut iter_ctx = self.iter_ctx.clone();
            iter_ctx.time_range = Some(*time_range);
            let iter = mem.iter(iter_ctx)?;
            reader_builder = reader_builder.push_batch_iter(iter);
        }

        let predicate = Predicate::new(self.filters.clone());
        let read_opts = ReadOptions {
            batch_size: self.iter_ctx.batch_size,
            projected_schema: schema.clone(),
            predicate,
            time_range: *time_range,
        };

        let mut num_read_files = 0;
        for file in &self.files_to_read {
            if !Self::file_in_range(file, time_range) {
                logging::debug!(
                    "Skip region {} file {:?}, predicate: {:?}",
                    self.region_id,
                    file,
                    time_range
                );
                continue;
            }
            let reader = self.sst_layer.read_sst(file.clone(), &read_opts).await?;
            reader_builder = reader_builder.push_batch_reader(reader);
            num_read_files += 1;
        }

        logging::debug!(
            "build reader done, region_id: {}, time_range: {:?}, total_files: {}, num_read_files: {}",
            self.region_id,
            time_range,
            self.files_to_read.len(),
            num_read_files,
        );

        let reader = reader_builder.build();
        let reader = DedupReader::new(schema.clone(), reader);
        Ok(Box::new(reader) as Box<_>)
    }

    /// Builds the [ChunkReaderImpl].
    ///
    /// The underlying reader is chosen as follows:
    /// 1. a windowed reader, if an output ordering was requested and time windows can be
    ///    inferred for it (only then is the ordering reported by the chunk reader);
    /// 2. otherwise a chain reader, if `use_chain_reader` is enabled;
    /// 3. otherwise a single merge reader over the whole time range predicate.
    ///
    /// # Errors
    ///
    /// Fails if the projection is invalid for the region schema or if any of the
    /// underlying readers can't be created.
    pub async fn build(mut self) -> Result<ChunkReaderImpl> {
        let time_range_predicate = self.build_time_range_predicate();
        let schema = Arc::new(
            ProjectedSchema::new(self.schema.clone(), self.projection.clone())
                .context(error::InvalidProjectionSnafu)?,
        );
        self.iter_ctx.projected_schema = Some(schema.clone());

        let mut output_ordering = None;
        let reader = if let Some(ordering) = self.output_ordering.take() &&
            let Some(windows) = self.infer_time_windows(&ordering) {
                output_ordering = Some(ordering.clone());
                self.build_windowed(&schema, &time_range_predicate, windows, ordering)
                    .await?
        } else if self.use_chain_reader {
            self.build_chained(&schema, &time_range_predicate).await?
        } else {
            self.build_reader(&schema, &time_range_predicate).await?
        };

        Ok(ChunkReaderImpl::new(schema, reader, output_ordering))
    }

    /// Builds one reader per inferred time window and chains them in a [ChainReader].
    async fn build_chained(
        &self,
        schema: &ProjectedSchemaRef,
        time_range: &TimestampRange,
    ) -> Result<BoxedBatchReader> {
        let windows = self.infer_window_for_chain_reader(time_range);

        logging::debug!(
            "Infer window for chain reader, region_id: {}, memtables: {}, files: {}, num_windows: {}",
            self.region_id,
            self.memtables.len(),
            self.files_to_read.len(),
            windows.len(),
        );

        let mut readers = Vec::with_capacity(windows.len());
        for window in &windows {
            let time_range = time_range.and(window);
            let reader = self.build_reader(schema, &time_range).await?;
            readers.push(reader);
        }

        logging::debug!(
            "Build chain reader, region_id: {}, time_range: {:?}, num_readers: {}",
            self.region_id,
            time_range,
            readers.len(),
        );

        let chain_reader = ChainReader::new(schema.clone(), readers);
        Ok(Box::new(chain_reader) as Box<_>)
    }

    /// Build time range predicate from schema and filters.
    ///
    /// Returns the unbounded range if the user schema has no timestamp column.
    fn build_time_range_predicate(&self) -> TimestampRange {
        let Some(ts_col) = self.schema.user_schema().timestamp_column() else {
            return TimestampRange::min_to_max();
        };
        let unit = ts_col
            .data_type
            .as_timestamp()
            .expect("Timestamp column must have timestamp-compatible type")
            .unit();
        TimeRangePredicateBuilder::new(&ts_col.name, unit, &self.filters).build()
    }

    /// Check if SST file's time range matches predicate.
    ///
    /// Files without a time range are always considered a match.
    fn file_in_range(file: &FileHandle, predicate: &TimestampRange) -> bool {
        if predicate == &TimestampRange::min_to_max() {
            return true;
        }
        // end_timestamp of sst file is inclusive.
        let Some((start, end)) = *file.time_range() else {
            return true;
        };
        let file_ts_range = TimestampRange::new_inclusive(Some(start), Some(end));
        file_ts_range.intersects(predicate)
    }

    /// Returns the time range of memtables to read, or `None` if all memtables are empty.
    fn compute_memtable_range(&self) -> Option<TimestampRange> {
        let (min_timestamp, max_timestamp) = self
            .memtables
            .iter()
            .filter(|m| m.num_rows() > 0) // Skip empty memtables.
            .map(|m| {
                let stats = m.stats();
                (stats.min_timestamp, stats.max_timestamp)
            })
            .reduce(|acc, e| (acc.0.min(e.0), acc.1.max(e.1)))?;

        logging::debug!(
            "Compute memtable range, region_id: {}, min: {:?}, max: {:?}",
            self.region_id,
            min_timestamp,
            max_timestamp,
        );

        Some(TimestampRange::new_inclusive(
            Some(min_timestamp),
            Some(max_timestamp),
        ))
    }

    /// Infer time window for chain reader according to the time range of memtables and files.
    ///
    /// The memtable range and the ranges of all files that match `time_range` are sorted
    /// by start time and every run of intersecting ranges is merged into one window, so
    /// the returned windows are sorted and never intersect each other.
    ///
    /// All ranges are merged in a single pass on purpose. Merging files into the memtable
    /// range while iterating makes the result depend on the file order: a file visited
    /// before the memtable range grows could be emitted as its own window although it
    /// intersects the grown range, and the intersection would then be scanned twice.
    fn infer_window_for_chain_reader(&self, time_range: &TimestampRange) -> Vec<TimestampRange> {
        let memtable_range = self.compute_memtable_range();

        let mut ranges = Vec::with_capacity(self.files_to_read.len() + 1);
        for file in &self.files_to_read {
            // Files without a time range can't contribute a window.
            let Some((start, end)) = *file.time_range() else {
                continue;
            };
            if !Self::file_in_range(file, time_range) {
                continue;
            }
            // end_timestamp of sst file is inclusive.
            ranges.push(TimestampRange::new_inclusive(Some(start), Some(end)));
        }
        ranges.extend(memtable_range);

        // Sort by start times so that intersecting ranges become neighbours.
        ranges.sort_unstable_by(|left, right| left.start().cmp(right.start()));

        let mut windows: Vec<TimestampRange> = Vec::with_capacity(ranges.len());
        for range in ranges {
            match windows.last_mut() {
                Some(prev) if prev.intersects(&range) => *prev = prev.or(&range),
                _ => windows.push(range),
            }
        }
        windows
    }
}

View File

@@ -1,33 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use common_error::ext::ErrorExt;
/// Encodes items of type [Encoder::Item] into bytes.
pub trait Encoder {
/// The type that is encoded.
type Item;
/// The error returned when encoding fails.
type Error: ErrorExt;
/// Encodes a message into the bytes buffer.
fn encode(&self, item: &Self::Item, dst: &mut Vec<u8>) -> Result<(), Self::Error>;
}
/// Decodes items of type [Decoder::Item] from bytes.
pub trait Decoder {
/// The type that is decoded.
type Item;
/// The error returned when decoding fails.
type Error: ErrorExt;
/// Decodes a message from the bytes buffer.
fn decode(&self, src: &[u8]) -> Result<Self::Item, Self::Error>;
}

View File

@@ -1,193 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
pub mod noop;
mod picker;
mod scheduler;
mod task;
mod twcs;
mod writer;
use std::sync::Arc;
use common_telemetry::warn;
use common_time::timestamp::TimeUnit;
use common_time::Timestamp;
pub use picker::{LeveledTimeWindowPicker, Picker, PickerContext};
pub use scheduler::{CompactionHandler, CompactionRequestImpl};
use store_api::logstore::LogStore;
use store_api::storage::CompactionStrategy;
pub use task::{CompactionTask, CompactionTaskImpl};
pub use twcs::TwcsPicker;
use crate::scheduler::Scheduler;
use crate::sst::FileHandle;
/// Shared, thread-safe handle to a [Picker] that takes [CompactionRequestImpl]s and
/// produces [CompactionTaskImpl]s.
pub type CompactionPickerRef<S> =
Arc<dyn Picker<Request = CompactionRequestImpl<S>, Task = CompactionTaskImpl<S>> + Send + Sync>;
/// Shared, thread-safe handle to a [Scheduler] that accepts [CompactionRequestImpl]s.
pub type CompactionSchedulerRef<S> =
Arc<dyn Scheduler<Request = CompactionRequestImpl<S>> + Send + Sync>;
/// Infers a suitable time bucket duration (in seconds) for the given files.
///
/// The smallest start and the largest end timestamp across all files are collected and
/// the span between them is fitted into one of the predefined [TIME_BUCKETS]. Files
/// without a time range are logged and ignored. If the span can't be computed without
/// overflow (which is also the case when no file has a time range), the largest bucket
/// is returned.
pub(crate) fn infer_time_bucket<'a>(files: impl Iterator<Item = &'a FileHandle>) -> i64 {
    // Start from an "inverted" range so that the first real timestamp replaces both ends.
    let inverted = (
        Timestamp::new(i64::MAX, TimeUnit::Second),
        Timestamp::new(i64::MIN, TimeUnit::Second),
    );
    let (min_ts, max_ts) = files.fold(inverted, |(lower, upper), f| match f.time_range() {
        Some((start, end)) => (lower.min(*start), upper.max(*end)),
        None => {
            // we don't expect an SST file without time range,
            // it's either a bug or data corruption.
            warn!("Found SST file without time range metadata: {f:?}");
            (lower, upper)
        }
    });

    // safety: Convert whatever timestamp into seconds will not cause overflow.
    let min_sec = min_ts.convert_to(TimeUnit::Second).unwrap().value();
    let max_sec = max_ts.convert_to(TimeUnit::Second).unwrap().value();

    match max_sec.checked_sub(min_sec) {
        Some(span) => TIME_BUCKETS.fit_time_bucket(span),
        // Return the max bucket on subtraction overflow.
        None => TIME_BUCKETS.max(),
    }
}
/// Bucket durations in seconds, sorted in ascending order.
pub(crate) struct TimeBuckets([i64; 7]);

impl TimeBuckets {
    /// Fits a given time span into a time bucket by finding the smallest bucket that can
    /// cover the span. Returns the max bucket if no such bucket can be found.
    ///
    /// # Panics
    ///
    /// Panics if `span_sec` is negative.
    fn fit_time_bucket(&self, span_sec: i64) -> i64 {
        assert!(span_sec >= 0);
        // The buckets are sorted, so the first bucket that is not smaller than the span
        // is the smallest one covering it.
        let idx = self.0.partition_point(|bucket| *bucket < span_sec);
        self.0.get(idx).copied().unwrap_or_else(|| self.max())
    }

    /// Returns the bucket at position `idx`.
    #[cfg(test)]
    fn get(&self, idx: usize) -> i64 {
        self.0[idx]
    }

    /// Returns the largest bucket.
    fn max(&self) -> i64 {
        self.0[self.0.len() - 1]
    }
}

/// A set of predefined time buckets.
pub(crate) const TIME_BUCKETS: TimeBuckets = {
    const HOUR: i64 = 60 * 60;
    const DAY: i64 = 24 * HOUR;
    TimeBuckets([
        HOUR,           // one hour
        2 * HOUR,       // two hours
        12 * HOUR,      // twelve hours
        DAY,            // one day
        7 * DAY,        // one week
        365 * DAY,      // one year
        10 * 365 * DAY, // ten years
    ])
};
/// Creates the compaction picker that implements the given `strategy`.
///
/// The only strategy at the moment is TWCS, which maps to a [TwcsPicker] configured with
/// the window file limits and the time window from the strategy options.
pub fn compaction_strategy_to_picker<S: LogStore>(
    strategy: &CompactionStrategy,
) -> CompactionPickerRef<S> {
    let picker: CompactionPickerRef<S> = match strategy {
        CompactionStrategy::Twcs(opts) => Arc::new(TwcsPicker::new(
            opts.max_active_window_files,
            opts.max_inactive_window_files,
            opts.time_window_seconds,
        )),
    };
    picker
}
// Tests for the time bucket helpers; also hosts `new_file_handle`, which the picker
// tests import.
#[cfg(test)]
mod tests {
use common_time::Timestamp;
use super::*;
use crate::file_purger::noop::new_noop_file_purger;
use crate::sst::{FileHandle, FileId, FileMeta, Level};
/// Test util to create file handles.
///
/// The time range is given in milliseconds; the handle uses a noop file purger, a mock
/// access layer, region id 0 and a file size of 0.
pub fn new_file_handle(
file_id: FileId,
start_ts_millis: i64,
end_ts_millis: i64,
level: Level,
) -> FileHandle {
let file_purger = new_noop_file_purger();
let layer = Arc::new(crate::test_util::access_layer_util::MockAccessLayer {});
FileHandle::new(
FileMeta {
region_id: 0.into(),
file_id,
time_range: Some((
Timestamp::new_millisecond(start_ts_millis),
Timestamp::new_millisecond(end_ts_millis),
)),
level,
file_size: 0,
},
layer,
file_purger,
)
}
// Spans are rounded up to the next predefined bucket; exact matches keep their bucket
// and anything larger than the last bucket is clamped to it.
#[test]
fn test_time_bucket() {
assert_eq!(TIME_BUCKETS.get(0), TIME_BUCKETS.fit_time_bucket(1));
assert_eq!(TIME_BUCKETS.get(0), TIME_BUCKETS.fit_time_bucket(60 * 60));
assert_eq!(
TIME_BUCKETS.get(1),
TIME_BUCKETS.fit_time_bucket(60 * 60 + 1)
);
assert_eq!(
TIME_BUCKETS.get(2),
TIME_BUCKETS.fit_time_bucket(TIME_BUCKETS.get(2) - 1)
);
assert_eq!(
TIME_BUCKETS.get(2),
TIME_BUCKETS.fit_time_bucket(TIME_BUCKETS.get(2))
);
assert_eq!(
TIME_BUCKETS.get(3),
TIME_BUCKETS.fit_time_bucket(TIME_BUCKETS.get(3) - 1)
);
assert_eq!(TIME_BUCKETS.get(6), TIME_BUCKETS.fit_time_bucket(i64::MAX));
}
// Two files that together span just under the first bucket must yield the first bucket.
#[test]
fn test_infer_time_buckets() {
assert_eq!(
TIME_BUCKETS.get(0),
infer_time_bucket(
[
new_file_handle(FileId::random(), 0, TIME_BUCKETS.get(0) * 1000 - 1, 0),
new_file_handle(FileId::random(), 1, 10_000, 0)
]
.iter()
)
);
}
}

View File

@@ -1,91 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::fmt::{Debug, Formatter};
use std::marker::PhantomData;
use store_api::storage::RegionId;
use crate::compaction::{CompactionTask, Picker};
use crate::error::Result;
use crate::scheduler::{Request, Scheduler};
/// A compaction scheduler that accepts requests of type `R` and does nothing with them.
pub struct NoopCompactionScheduler<R> {
    /// Ties the scheduler to its request type without storing a request.
    _phantom_data: PhantomData<R>,
}

// Implemented by hand so that `R` does not have to implement `Default`.
impl<R> Default for NoopCompactionScheduler<R> {
    fn default() -> Self {
        NoopCompactionScheduler {
            _phantom_data: PhantomData,
        }
    }
}

// Implemented by hand so that `R` does not have to implement `Debug`.
impl<R> Debug for NoopCompactionScheduler<R> {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        f.write_str("NoopCompactionScheduler<...>")
    }
}
/// A compaction request that carries no data.
#[derive(Default, Debug)]
pub struct NoopCompactionRequest;
/// A picker that never picks anything to compact.
#[derive(Default, Debug)]
pub struct NoopCompactionPicker;
impl Picker for NoopCompactionPicker {
type Request = NoopCompactionRequest;
type Task = NoopCompactionTask;
/// Always returns `Ok(None)`, i.e. there is never a task to run.
fn pick(&self, _req: &Self::Request) -> Result<Option<Self::Task>> {
Ok(None)
}
}
/// A compaction task that does nothing.
#[derive(Debug)]
pub struct NoopCompactionTask;
#[async_trait::async_trait]
impl CompactionTask for NoopCompactionTask {
/// Completes immediately with `Ok(())`.
async fn run(self) -> Result<()> {
Ok(())
}
}
impl Request for NoopCompactionRequest {
type Key = RegionId;
/// Every noop request uses the same key: region id 0.
fn key(&self) -> Self::Key {
RegionId::from(0)
}
/// Discards the result; nobody is notified.
fn complete(self, _result: Result<()>) {}
}
#[async_trait::async_trait]
impl<R> Scheduler for NoopCompactionScheduler<R>
where
R: Request<Key = RegionId>,
{
type Request = R;
/// Drops the request and returns `Ok(true)` without doing any work.
fn schedule(&self, _request: Self::Request) -> Result<bool> {
Ok(true)
}
/// Nothing is ever running, so stopping returns `Ok(())` immediately.
async fn stop(&self, _await_termination: bool) -> Result<()> {
Ok(())
}
}

View File

@@ -1,432 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::HashMap;
use std::fmt::{Debug, Formatter};
use std::marker::PhantomData;
use std::time::Duration;
use common_telemetry::{debug, error, info, warn};
use common_time::timestamp::TimeUnit;
use common_time::timestamp_millis::BucketAligned;
use common_time::Timestamp;
use snafu::ResultExt;
use store_api::logstore::LogStore;
use crate::compaction::infer_time_bucket;
use crate::compaction::scheduler::CompactionRequestImpl;
use crate::compaction::task::{CompactionOutput, CompactionTask, CompactionTaskImpl};
use crate::error::{Result, TtlCalculationSnafu};
use crate::scheduler::Request;
use crate::sst::{FileHandle, FileId, LevelMeta};
/// Picker picks input SST files and builds the compaction task.
/// Different compaction strategy may implement different pickers.
pub trait Picker: Debug + Send + 'static {
/// The type of request the picker inspects.
type Request: Request;
/// The type of task the picker builds.
type Task: CompactionTask;
/// Builds a task for `req`. `Ok(None)` means the picker produced no task.
fn pick(&self, req: &Self::Request) -> Result<Option<Self::Task>>;
}
/// Collects the expired files of all given levels.
///
/// Without a `ttl` nothing is returned. Otherwise the expire time is `now - ttl` and
/// each level is asked for its expired files with that expire time.
///
/// # Errors
///
/// Fails if `now - ttl` can't be computed.
pub(crate) fn get_expired_ssts(
    levels: &[LevelMeta],
    ttl: Option<Duration>,
    now: Timestamp,
) -> Result<Vec<FileHandle>> {
    match ttl {
        None => Ok(vec![]),
        Some(ttl) => {
            let expire_time = now.sub_duration(ttl).context(TtlCalculationSnafu)?;
            let mut expired_ssts = Vec::new();
            for level in levels {
                expired_ssts.extend(level.get_expired_files(&expire_time));
            }
            Ok(expired_ssts)
        }
    }
}
/// Parameters a picker needs in addition to the files of a level.
pub struct PickerContext {
    /// Configured compaction time window, if any.
    compaction_time_window: Option<i64>,
}

impl PickerContext {
    /// Creates a context with the given compaction time window.
    pub fn with(compaction_time_window: Option<i64>) -> Self {
        PickerContext {
            compaction_time_window,
        }
    }

    /// Returns the compaction time window the context was created with.
    pub fn compaction_time_window(&self) -> Option<i64> {
        let PickerContext {
            compaction_time_window,
        } = self;
        *compaction_time_window
    }
}
/// `LeveledTimeWindowPicker` only handles level 0 to level 1 compaction in a time-window
/// tiered manner. It picks all SSTs in level 0 and writes rows in these SSTs to a new file
/// partitioned by an inferred time bucket in level 1.
pub struct LeveledTimeWindowPicker<S> {
    /// Ties the picker to the log store type without storing one.
    _phantom_data: PhantomData<S>,
}

// Implemented by hand so that `S` does not have to implement `Debug`.
impl<S> Debug for LeveledTimeWindowPicker<S> {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        write!(f, "LeveledTimeWindowPicker{{..}}")
    }
}

// Implemented by hand so that `S` does not have to implement `Default`.
impl<S> Default for LeveledTimeWindowPicker<S> {
    fn default() -> Self {
        LeveledTimeWindowPicker {
            _phantom_data: PhantomData,
        }
    }
}

impl<S> LeveledTimeWindowPicker<S> {
    /// Creates a new picker; the picker has no configuration of its own.
    pub fn new() -> Self {
        LeveledTimeWindowPicker {
            _phantom_data: PhantomData,
        }
    }
}
impl<S: LogStore> Picker for LeveledTimeWindowPicker<S> {
    type Request = CompactionRequestImpl<S>;
    type Task = CompactionTaskImpl<S>;

    /// Builds a compaction task for the first level that has files to compact.
    ///
    /// Expired SSTs (according to the TTL of the request) are collected first and marked
    /// as compacting; a failure to collect them is logged and treated as "no expired
    /// SSTs". Returns `Ok(None)` if no level produced any compaction output.
    ///
    /// NOTE(review): when `Ok(None)` is returned, the expired SSTs have already been
    /// marked as compacting but are not handed to any task here. Confirm that something
    /// else clears the flag or removes these files.
    fn pick(&self, req: &CompactionRequestImpl<S>) -> Result<Option<CompactionTaskImpl<S>>> {
        let levels = req.levels();
        let expired_ssts =
            match get_expired_ssts(levels.levels(), req.ttl, Timestamp::current_millis()) {
                Ok(ssts) => ssts,
                Err(e) => {
                    error!(e;"Failed to get region expired SST files, region: {}, ttl: {:?}", req.region_id, req.ttl);
                    Vec::new()
                }
            };

        if !expired_ssts.is_empty() {
            info!(
                "Expired SSTs in region {}: {:?}",
                req.region_id, expired_ssts
            );
            // here we mark expired SSTs as compacting to avoid them being picked.
            for file in &expired_ssts {
                file.mark_compacting(true);
            }
        }

        let ctx = &PickerContext::with(req.compaction_time_window);

        let mut outputs = vec![];
        for level_num in 0..levels.level_num() {
            let level = levels.level(level_num as u8);
            let compaction_time_window = Self::pick_level(ctx, level, &mut outputs);

            if !outputs.is_empty() {
                debug!(
                    "Found SST files to compact {:?} on level: {}, compaction window: {:?}",
                    outputs, level_num, compaction_time_window,
                );
                // The first level with outputs wins; remaining levels are not inspected.
                return Ok(Some(CompactionTaskImpl {
                    schema: req.schema(),
                    sst_layer: req.sst_layer.clone(),
                    outputs,
                    writer: req.writer.clone(),
                    shared_data: req.shared.clone(),
                    wal: req.wal.clone(),
                    manifest: req.manifest.clone(),
                    expired_ssts,
                    sst_write_buffer_size: req.sst_write_buffer_size,
                    compaction_time_window,
                    reschedule_on_finish: req.reschedule_on_finish,
                }));
            }

            debug!(
                "No SST file can be compacted at level {}, path: {:?}",
                level_num, req.sst_layer
            );
        }

        Ok(None)
    }
}
impl<S> LeveledTimeWindowPicker<S> {
    /// Picks the files of `level` to compact and appends one [CompactionOutput] per time
    /// bucket to `results`.
    ///
    /// Returns the time window (in seconds) that was used to bucket the files, or `None`
    /// if the level is not level 0 or has no compactable file. The window is taken from
    /// `ctx` if present and inferred from the files otherwise.
    fn pick_level(
        ctx: &PickerContext,
        level: &LevelMeta,
        results: &mut Vec<CompactionOutput>,
    ) -> Option<i64> {
        // LeveledTimeWindowPicker only handles level 0 to level 1 compaction.
        if level.level() != 0 {
            return None;
        }
        let files = find_compactable_files(level);
        debug!("Compactable files found: {:?}", files);
        if files.is_empty() {
            return None;
        }

        let time_window = match ctx.compaction_time_window() {
            Some(window) => window,
            None => {
                let inferred = infer_time_bucket(files.iter());
                debug!(
                    "Compaction window is not present, inferring from files: {:?}",
                    inferred
                );
                inferred
            }
        };

        let buckets = calculate_time_buckets(time_window, &files);
        debug!("File bucket:{}, file groups: {:?}", time_window, buckets);

        for (bound, inputs) in buckets {
            results.push(CompactionOutput {
                output_file_id: FileId::random(),
                output_level: 1,
                time_window_bound: bound,
                time_window_sec: time_window,
                inputs,
                // strict window is used here because rows in one file may get compacted
                // to multiple destinations.
                strict_window: true,
            });
        }
        Some(time_window)
    }
}
/// Finds files that can be compacted in the given level.
/// Currently these are the files that are not already under compaction.
#[inline]
fn find_compactable_files(level: &LevelMeta) -> Vec<FileHandle> {
    level
        .files()
        .filter_map(|file| (!file.compacting()).then(|| file.clone()))
        .collect()
}
/// Calculates buckets for files.
///
/// The time range of every file is converted to seconds and the file is added to each
/// bucket of `bucket_sec` seconds that the range touches, so a file spanning several
/// buckets appears in all of them. The map is keyed by the bucket bounds returned by
/// [file_time_bucket_span].
///
/// Files without a time range in their metadata can't be assigned to any bucket; they
/// are logged as corrupted and left out of the result.
fn calculate_time_buckets(bucket_sec: i64, files: &[FileHandle]) -> HashMap<i64, Vec<FileHandle>> {
    let mut buckets: HashMap<i64, Vec<FileHandle>> = HashMap::new();

    for file in files {
        let Some((start, end)) = file.time_range() else {
            warn!("Found corrupted SST without timestamp bounds: {:?}", file);
            continue;
        };
        let bounds = file_time_bucket_span(
            start.convert_to(TimeUnit::Second).unwrap().value(),
            end.convert_to(TimeUnit::Second).unwrap().value(),
            bucket_sec,
        );
        for bound in bounds {
            buckets.entry(bound).or_default().push(file.clone());
        }
    }
    buckets
}
/// Calculates timestamp span between start and end timestamp.
fn file_time_bucket_span(start_sec: i64, end_sec: i64, bucket_sec: i64) -> Vec<i64> {
assert!(start_sec <= end_sec);
// if timestamp is between `[i64::MIN, i64::MIN.align_by_bucket(bucket)]`, which cannot
// be aligned to a valid i64 bound, simply return `i64::MIN` rather than just underflow.
let mut start_aligned = start_sec.align_by_bucket(bucket_sec).unwrap_or(i64::MIN);
let end_aligned = end_sec.align_by_bucket(bucket_sec).unwrap_or(i64::MIN);
let mut res = Vec::with_capacity(((end_aligned - start_aligned) / bucket_sec + 1) as usize);
while start_aligned < end_aligned {
res.push(start_aligned);
start_aligned += bucket_sec;
}
res.push(end_aligned);
res
}
// Tests for bucket span calculation, file bucketing and TTL based expiration.
#[cfg(test)]
mod tests {
use std::collections::{HashMap, HashSet};
use std::sync::Arc;
use super::*;
use crate::compaction::tests::new_file_handle;
use crate::compaction::TIME_BUCKETS;
use crate::file_purger::noop::new_noop_file_purger;
use crate::sst::{FileId, Level, LevelMetas};
// Start and end are aligned down to the bucket; both bounds are part of the result.
#[test]
fn test_time_bucket_span() {
assert_eq!(vec![0], file_time_bucket_span(1, 9, 10));
assert_eq!(vec![0, 10], file_time_bucket_span(1, 10, 10));
assert_eq!(vec![-10], file_time_bucket_span(-10, -1, 10));
assert_eq!(vec![-10, 0], file_time_bucket_span(-10, 0, 10));
}
// Spans at both ends of the `i64` range.
#[test]
fn test_time_bucket_span_large() {
assert_eq!(
vec![
(i64::MAX - 10).align_by_bucket(10).unwrap(),
i64::MAX.align_by_bucket(10).unwrap(),
],
file_time_bucket_span(i64::MAX - 10, i64::MAX, 10)
);
// A start of `i64::MIN` must end up in the bucket `i64::MIN`, whatever the bucket
// size is.
for bucket in 1..100 {
assert_eq!(
vec![
i64::MIN,
(i64::MIN + bucket).align_by_bucket(bucket).unwrap()
],
file_time_bucket_span(i64::MIN, i64::MIN + bucket, bucket)
);
}
}
/// Creates level 0 file handles from `(file id, start millis, end millis)` tuples.
fn new_file_handles(input: &[(FileId, i64, i64)]) -> Vec<FileHandle> {
input
.iter()
.map(|(file_id, start, end)| new_file_handle(*file_id, *start, *end, 0))
.collect()
}
/// Asserts that every expected bucket exists and contains exactly the expected file ids.
/// Buckets that are not listed in `expected` are not checked.
fn check_bucket_calculation(
bucket_sec: i64,
files: Vec<FileHandle>,
expected: &[(i64, &[FileId])],
) {
let res = calculate_time_buckets(bucket_sec, &files);
let expected = expected
.iter()
.map(|(bucket, file_ids)| (*bucket, file_ids.iter().copied().collect::<HashSet<_>>()))
.collect::<HashMap<_, _>>();
for (bucket, file_ids) in expected {
let actual = res
.get(&bucket)
.unwrap()
.iter()
.map(|f| f.file_id())
.collect();
assert_eq!(
file_ids, actual,
"bucket: {bucket}, expected: {file_ids:?}, actual: {actual:?}",
);
}
}
#[test]
fn test_calculate_time_buckets() {
let file_id_a = FileId::random();
let file_id_b = FileId::random();
// simple case, files with disjoint
check_bucket_calculation(
10,
new_file_handles(&[(file_id_a, 0, 9000), (file_id_b, 10000, 19000)]),
&[(0, &[file_id_a]), (10, &[file_id_b])],
);
// files across buckets
check_bucket_calculation(
10,
new_file_handles(&[(file_id_a, 0, 10001), (file_id_b, 10000, 19000)]),
&[(0, &[file_id_a]), (10, &[file_id_a, file_id_b])],
);
check_bucket_calculation(
10,
new_file_handles(&[(file_id_a, 0, 10000)]),
&[(0, &[file_id_a]), (10, &[file_id_a])],
);
// file with a large time range
let file_id_array = &[file_id_a];
let expected = (0..(TIME_BUCKETS.get(4) / TIME_BUCKETS.get(0)))
.map(|b| (b * TIME_BUCKETS.get(0), file_id_array as _))
.collect::<Vec<_>>();
check_bucket_calculation(
TIME_BUCKETS.get(0),
new_file_handles(&[(file_id_a, 0, TIME_BUCKETS.get(4) * 1000)]),
&expected,
);
}
/// One TTL test case: the files to create, the TTL, the current time and the indexes
/// (into `files`) of the files that are expected to be expired.
struct TtlTester {
files: Vec<(FileId, i64, i64, Level)>,
ttl: Option<Duration>,
expired: Vec<usize>,
now: Timestamp,
}
impl TtlTester {
/// Builds the level metas from `files` and asserts that `get_expired_ssts` returns
/// exactly the expected files.
fn check(&self) {
let expected_expired = self
.expired
.iter()
.map(|idx| self.files[*idx].0)
.collect::<HashSet<_>>();
let file_purger = new_noop_file_purger();
let layer = Arc::new(crate::test_util::access_layer_util::MockAccessLayer {});
let file_handles = self
.files
.iter()
.map(|(file_id, start_ts, end_ts, level)| {
new_file_handle(*file_id, *start_ts, *end_ts, *level).meta()
})
.collect::<Vec<_>>();
let levels = LevelMetas::new(layer, file_purger).merge(
file_handles.into_iter(),
vec![].into_iter(),
None,
);
let expired = get_expired_ssts(levels.levels(), self.ttl, self.now)
.unwrap()
.into_iter()
.map(|f| f.file_id())
.collect::<HashSet<_>>();
assert_eq!(expected_expired, expired);
}
}
// With `now` = 10s and a TTL of 1s, a file ending at 9000ms is kept while a file ending
// at 8999ms is expired.
#[test]
fn test_find_expired_ssts() {
TtlTester {
files: vec![
(FileId::random(), 8000, 9000, 0),
(FileId::random(), 10000, 11000, 0),
(FileId::random(), 8000, 11000, 1),
(FileId::random(), 2000, 3000, 1),
],
ttl: Some(Duration::from_secs(1)),
expired: vec![3],
now: Timestamp::new_second(10),
}
.check();
TtlTester {
files: vec![
(FileId::random(), 8000, 8999, 0),
(FileId::random(), 10000, 11000, 0),
(FileId::random(), 8000, 11000, 1),
(FileId::random(), 2000, 3000, 1),
],
ttl: Some(Duration::from_secs(1)),
expired: vec![0, 3],
now: Timestamp::new_second(10),
}
.check();
}
}

View File

@@ -1,157 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::marker::PhantomData;
use std::sync::Arc;
use std::time::Duration;
use common_base::readable_size::ReadableSize;
use common_telemetry::{debug, error, info};
use store_api::logstore::LogStore;
use store_api::storage::RegionId;
use tokio::sync::oneshot::Sender;
use tokio::sync::Notify;
use crate::compaction::task::CompactionTask;
use crate::compaction::CompactionPickerRef;
use crate::error::Result;
use crate::manifest::region::RegionManifest;
use crate::region::{RegionWriterRef, SharedDataRef};
use crate::scheduler::rate_limit::BoxedRateLimitToken;
use crate::scheduler::{Handler, Request};
use crate::schema::RegionSchemaRef;
use crate::sst::AccessLayerRef;
use crate::version::LevelMetasRef;
use crate::wal::Wal;
impl<S: LogStore> Request for CompactionRequestImpl<S> {
    type Key = RegionId;

    /// Compaction requests are keyed by the id of the region to compact.
    #[inline]
    fn key(&self) -> RegionId {
        self.region_id
    }

    /// Forwards `result` to the result sender, if the request has one.
    fn complete(self, result: Result<()>) {
        let Some(sender) = self.sender else {
            return;
        };
        // We don't care the send result as callers might not
        // wait the result.
        let _ = sender.send(result);
    }
}
/// Region compaction request.
pub struct CompactionRequestImpl<S: LogStore> {
/// Id of the region to compact; also the scheduling key of the request.
pub region_id: RegionId,
/// Access layer handed over to the compaction task.
pub sst_layer: AccessLayerRef,
/// Region writer handed over to the compaction task.
pub writer: RegionWriterRef<S>,
/// Shared region data; the current schema and SST levels are read from it.
pub shared: SharedDataRef,
/// Region manifest handed over to the compaction task.
pub manifest: RegionManifest,
/// Write-ahead log handed over to the compaction task.
pub wal: Wal<S>,
/// Time-to-live used to find expired SSTs; `None` disables expiration.
pub ttl: Option<Duration>,
/// Compaction time window; pickers may infer one if it is `None`.
pub compaction_time_window: Option<i64>,
/// Compaction result sender.
pub sender: Option<Sender<Result<()>>>,
/// Picker that builds the compaction task for this request.
pub picker: CompactionPickerRef<S>,
/// Write buffer size handed over to the compaction task.
pub sst_write_buffer_size: ReadableSize,
/// Whether to immediately reschedule another compaction when finished.
pub reschedule_on_finish: bool,
}
impl<S: LogStore> CompactionRequestImpl<S> {
    /// Returns the schema of the current version of the region.
    #[inline]
    pub(crate) fn schema(&self) -> RegionSchemaRef {
        let current = self.shared.version_control.current();
        current.schema().clone()
    }

    /// Returns the SST levels of the current version of the region.
    #[inline]
    pub(crate) fn levels(&self) -> LevelMetasRef {
        let current = self.shared.version_control.current();
        current.ssts().clone()
    }
}
/// Handles compaction requests by picking a task and running it in the background.
pub struct CompactionHandler<S: LogStore> {
    /// Ties the handler to the log store type without storing one.
    _phantom_data: PhantomData<S>,
    /// Join handles of the spawned compaction tasks, so that tests can wait for them.
    #[cfg(test)]
    pub pending_tasks: Arc<tokio::sync::RwLock<Vec<tokio::task::JoinHandle<()>>>>,
}

impl<S: LogStore> Default for CompactionHandler<S> {
    fn default() -> Self {
        CompactionHandler {
            _phantom_data: PhantomData,
            #[cfg(test)]
            pending_tasks: Arc::default(),
        }
    }
}

impl<S: LogStore> CompactionHandler<S> {
    /// Creates a handler that records the join handles of its tasks in `tasks`.
    #[cfg(test)]
    pub fn new_with_pending_tasks(
        tasks: Arc<tokio::sync::RwLock<Vec<tokio::task::JoinHandle<()>>>>,
    ) -> Self {
        CompactionHandler {
            _phantom_data: PhantomData,
            pending_tasks: tasks,
        }
    }
}
#[async_trait::async_trait]
impl<S> Handler for CompactionHandler<S>
where
    S: LogStore,
{
    type Request = CompactionRequestImpl<S>;

    /// Picks a compaction task for `req` and runs it on the background runtime.
    ///
    /// If the picker finds nothing to compact, the request is completed with `Ok(())`
    /// right away. Otherwise the task is spawned and this method returns without waiting
    /// for it. The spawned task completes the request with the task result, releases the
    /// rate limit token and notifies `finish_notifier`.
    ///
    /// NOTE(review): the two early exits (picker error, nothing to compact) neither
    /// release `token` nor notify `finish_notifier` here; presumably the scheduler deals
    /// with that. Confirm against the scheduler.
    async fn handle_request(
        &self,
        req: Self::Request,
        token: BoxedRateLimitToken,
        finish_notifier: Arc<Notify>,
    ) -> Result<()> {
        let region_id = req.key();
        let task = match req.picker.pick(&req)? {
            Some(task) => task,
            None => {
                info!("No file needs compaction in region: {:?}", region_id);
                req.complete(Ok(()));
                return Ok(());
            }
        };

        debug!("Compaction task, region: {:?}, task: {:?}", region_id, task);
        // TODO(hl): we need to keep a track of task handle here to allow task cancellation.
        let _handle = common_runtime::spawn_bg(async move {
            match task.run().await {
                Err(e) => {
                    // TODO(hl): maybe resubmit compaction task on failure?
                    error!(e; "Failed to compact region: {:?}", region_id);
                    req.complete(Err(e));
                }
                Ok(()) => {
                    info!("Successfully compacted region: {:?}", region_id);
                    req.complete(Ok(()));
                }
            }
            // releases rate limit token
            token.try_release();
            // notify scheduler to schedule next task when current task finishes.
            finish_notifier.notify_one();
        });

        #[cfg(test)]
        self.pending_tasks.write().await.push(_handle);

        Ok(())
    }
}

View File

@@ -1,309 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::HashSet;
use std::fmt::{Debug, Formatter};
use common_base::readable_size::ReadableSize;
use common_telemetry::{debug, error, info};
use itertools::Itertools;
use snafu::ResultExt;
use store_api::logstore::LogStore;
use store_api::storage::{CompactContext, RegionId};
use crate::compaction::writer::build_sst_reader;
use crate::error;
use crate::error::Result;
use crate::manifest::action::RegionEdit;
use crate::manifest::region::RegionManifest;
use crate::region::{RegionWriterRef, SharedDataRef, WriterCompactRequest};
use crate::schema::RegionSchemaRef;
use crate::sst::{
AccessLayerRef, FileHandle, FileId, FileMeta, Level, Source, SstInfo, WriteOptions,
};
use crate::wal::Wal;
/// Upper bound on the number of SST merge jobs `merge_ssts` spawns and awaits together.
const MAX_PARALLEL_COMPACTION: usize = 8;
/// A unit of compaction work that is consumed when it runs.
#[async_trait::async_trait]
pub trait CompactionTask: Debug + Send + Sync + 'static {
/// Runs the task to completion, consuming it.
async fn run(self) -> Result<()>;
}
/// Compaction task for one region: merges the picked input SSTs into new SSTs and
/// records the change through the region writer.
pub struct CompactionTaskImpl<S: LogStore> {
/// Region schema handed to every output when it is built.
pub schema: RegionSchemaRef,
/// SST access layer used to read the inputs and write the outputs.
pub sst_layer: AccessLayerRef,
/// Planned outputs; drained by `merge_ssts` when the task runs.
pub outputs: Vec<CompactionOutput>,
/// Region writer used to apply the region edit and to schedule a follow-up compaction.
pub writer: RegionWriterRef<S>,
/// Shared region data (id, name, version control).
pub shared_data: SharedDataRef,
/// Write-ahead log passed along when the region edit is applied.
pub wal: Wal<S>,
/// Region manifest passed along when the region edit is applied.
pub manifest: RegionManifest,
/// Expired SSTs that are removed together with the compacted inputs.
pub expired_ssts: Vec<FileHandle>,
/// Write buffer size used when writing output SSTs.
pub sst_write_buffer_size: ReadableSize,
/// Compaction time window recorded in the region edit.
pub compaction_time_window: Option<i64>,
/// Whether to schedule another compaction once this one produced output.
pub reschedule_on_finish: bool,
}
impl<S: LogStore> Debug for CompactionTaskImpl<S> {
    /// Prints only the region name; the remaining fields are left out.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        let region_name = self.shared_data.name();
        let mut repr = f.debug_struct("CompactionTaskImpl");
        repr.field("region_name", &region_name);
        repr.finish()
    }
}
/// Clears the compacting flag on the inputs of all outputs still held by the task.
impl<S: LogStore> Drop for CompactionTaskImpl<S> {
fn drop(&mut self) {
// NOTE(review): `merge_ssts` drains `self.outputs`, so after a run has reached
// the merge step there are no outputs left for this call to visit.
self.mark_files_compacting(false);
}
}
impl<S: LogStore> CompactionTaskImpl<S> {
/// Compacts inputs SSTs, returns `(output file, compacted input file)`.
async fn merge_ssts(&mut self) -> Result<(HashSet<FileMeta>, HashSet<FileMeta>)> {
let mut futs = Vec::with_capacity(self.outputs.len());
let mut compacted_inputs = HashSet::new();
let region_id = self.shared_data.id();
for output in self.outputs.drain(..) {
let schema = self.schema.clone();
let sst_layer = self.sst_layer.clone();
let sst_write_buffer_size = self.sst_write_buffer_size;
compacted_inputs.extend(output.inputs.iter().map(FileHandle::meta));
info!(
"Compaction output [{}]-> {}",
output
.inputs
.iter()
.map(|f| f.file_id().to_string())
.join(","),
output.output_file_id
);
// TODO(hl): Maybe spawn to runtime to exploit in-job parallelism.
futs.push(async move {
output
.build(region_id, schema, sst_layer, sst_write_buffer_size)
.await
});
}
let mut outputs = HashSet::with_capacity(futs.len());
while !futs.is_empty() {
let mut task_chunk = Vec::with_capacity(MAX_PARALLEL_COMPACTION);
for _ in 0..MAX_PARALLEL_COMPACTION {
if let Some(task) = futs.pop() {
task_chunk.push(common_runtime::spawn_bg(task));
}
}
let metas = futures::future::try_join_all(task_chunk)
.await
.context(error::JoinSnafu)?
.into_iter()
.collect::<Result<Vec<_>>>()?;
outputs.extend(metas.into_iter().flatten());
}
let inputs = compacted_inputs.into_iter().collect();
Ok((outputs, inputs))
}
/// Writes updated SST info into manifest.
async fn write_manifest_and_apply(
&self,
output: HashSet<FileMeta>,
input: HashSet<FileMeta>,
) -> Result<()> {
let version = &self.shared_data.version_control;
let region_version = version.metadata().version();
let edit = RegionEdit {
region_version,
flushed_sequence: None,
files_to_add: Vec::from_iter(output),
files_to_remove: Vec::from_iter(input),
compaction_time_window: self.compaction_time_window,
};
debug!(
"Compacted region: {}, region edit: {:?}",
version.metadata().name(),
edit
);
self.writer
.write_edit_and_apply(&self.wal, &self.shared_data, &self.manifest, edit, None)
.await
}
/// Mark files are under compaction.
fn mark_files_compacting(&self, compacting: bool) {
for o in &self.outputs {
for input in &o.inputs {
input.mark_compacting(compacting);
}
}
}
}
#[async_trait::async_trait]
impl<S: LogStore> CompactionTask for CompactionTaskImpl<S> {
    /// Merges the picked SSTs, records the result in the manifest and, when asked
    /// to, schedules a follow-up compaction.
    ///
    /// Returns the merge error if merging fails, otherwise the result of the
    /// manifest update. A failure to schedule the follow-up compaction is only logged.
    async fn run(mut self) -> Result<()> {
        let _timer = crate::metrics::COMPACT_ELAPSED.start_timer();
        self.mark_files_compacting(true);

        let (output, mut compacted) = match self.merge_ssts().await {
            Ok(merged) => merged,
            Err(e) => {
                error!(e; "Failed to compact region: {}", self.shared_data.name());
                return Err(e);
            }
        };
        // Expired files are removed together with the compacted inputs.
        compacted.extend(self.expired_ssts.iter().map(FileHandle::meta));

        let input_ids: Vec<_> = compacted.iter().map(|f| f.file_id).collect();
        let output_ids: Vec<_> = output.iter().map(|f| f.file_id).collect();
        info!(
            "Compacting SST files, input: {:?}, output: {:?}, window: {:?}",
            input_ids, output_ids, self.compaction_time_window
        );

        let has_output = !output.is_empty();
        let write_result = self
            .write_manifest_and_apply(output, compacted)
            .await
            .map_err(|e| {
                error!(e; "Failed to update region manifest: {}", self.shared_data.name());
                e
            });

        // only reschedule another compaction if current compaction has output and it's
        // triggered by flush.
        if has_output && self.reschedule_on_finish {
            let request = WriterCompactRequest {
                shared_data: self.shared_data.clone(),
                sst_layer: self.sst_layer.clone(),
                manifest: self.manifest.clone(),
                wal: self.wal.clone(),
                region_writer: self.writer.clone(),
                compact_ctx: CompactContext { wait: false },
            };
            match self.writer.compact(request).await {
                Ok(_) => {
                    info!(
                        "Immediately schedule another compaction for region: {}",
                        self.shared_data.id()
                    );
                }
                Err(e) => {
                    error!(e; "Failed to schedule a compaction after compaction, region id: {}", self.shared_data.id());
                }
            }
        }

        write_result
    }
}
/// Many-to-many compaction can be decomposed to a many-to-one compaction from level n to level n+1
/// and a many-to-one compaction from level n+1 to level n+1.
///
/// One `CompactionOutput` describes a single such many-to-one step: a set of input
/// files and the one file they are merged into.
#[derive(Debug)]
pub struct CompactionOutput {
/// Id of the SST file this compaction writes.
pub output_file_id: FileId,
/// Compaction output file level.
pub output_level: Level,
/// The left bound of time window.
pub time_window_bound: i64,
/// Time window size in seconds.
pub time_window_sec: i64,
/// Compaction input files.
pub inputs: Vec<FileHandle>,
/// If the compaction output is strictly windowed.
pub strict_window: bool,
}
impl CompactionOutput {
    /// Merges the input files into the output SST and returns its metadata.
    ///
    /// Rows are restricted to `[time_window_bound, time_window_bound + time_window_sec)`
    /// only when the output is strictly windowed; otherwise everything in the
    /// inputs is read. Returns `Ok(None)` when the access layer reports that no
    /// SST was written.
    async fn build(
        &self,
        region_id: RegionId,
        schema: RegionSchemaRef,
        sst_layer: AccessLayerRef,
        sst_write_buffer_size: ReadableSize,
    ) -> Result<Option<FileMeta>> {
        let (lower, upper) = if self.strict_window {
            let window_start = self.time_window_bound;
            (Some(window_start), Some(window_start + self.time_window_sec))
        } else {
            (None, None)
        };

        let reader = build_sst_reader(
            region_id,
            schema,
            sst_layer.clone(),
            &self.inputs,
            (lower, upper),
        )
        .await?;

        let opts = WriteOptions {
            sst_write_buffer_size,
        };
        // Timing starts after the reader is built and covers writing the output.
        let _timer = crate::metrics::MERGE_ELAPSED.start_timer();
        let written = sst_layer
            .write_sst(self.output_file_id, Source::Reader(reader), &opts)
            .await?;

        Ok(written.map(|info| FileMeta {
            region_id,
            file_id: self.output_file_id,
            time_range: info.time_range,
            level: self.output_level,
            file_size: info.file_size,
        }))
    }
}
#[cfg(test)]
pub mod tests {
    use std::sync::Arc;

    use super::*;
    use crate::compaction::task::CompactionTask;

    /// Shared callback invoked when a [`NoopCompactionTask`] runs.
    pub type CallbackRef = Arc<dyn Fn() + Send + Sync>;

    /// Compaction task that compacts nothing and only invokes its callbacks.
    pub struct NoopCompactionTask {
        /// Callbacks invoked, in order, when the task runs.
        pub cbs: Vec<CallbackRef>,
    }

    impl Debug for NoopCompactionTask {
        fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
            let mut repr = f.debug_struct("storage::compaction::task::tests::NoopCompactionTask");
            repr.finish()
        }
    }

    #[async_trait::async_trait]
    impl CompactionTask for NoopCompactionTask {
        /// Invokes every callback once and always succeeds.
        async fn run(self) -> Result<()> {
            self.cbs.iter().for_each(|cb| cb());
            Ok(())
        }
    }
}

View File

@@ -1,406 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Time-window compaction strategy
use std::collections::BTreeMap;
use std::fmt::{Debug, Formatter};
use std::marker::PhantomData;
use common_telemetry::{debug, info, warn};
use common_time::timestamp::TimeUnit;
use common_time::timestamp_millis::BucketAligned;
use common_time::Timestamp;
use store_api::logstore::LogStore;
use crate::compaction::picker::get_expired_ssts;
use crate::compaction::task::CompactionOutput;
use crate::compaction::{infer_time_bucket, CompactionRequestImpl, CompactionTaskImpl, Picker};
use crate::sst::{FileHandle, FileId, LevelMeta};
/// `TwcsPicker` picks files of which the max timestamp are in the same time window as compaction
/// candidates.
pub struct TwcsPicker<S> {
/// A window equal to the active window is compacted only when it holds more files than this.
max_active_window_files: usize,
/// Any other window is compacted only when it holds more files than this.
max_inactive_window_files: usize,
/// Window size in seconds used when the request carries none; when this is `None`
/// as well, `pick` infers the size from the level 0 files.
time_window_seconds: Option<i64>,
// Ties the picker to a log store type without storing one.
_phantom_data: PhantomData<S>,
}
impl<S> Debug for TwcsPicker<S> {
    /// Prints the two file-count thresholds; the other fields are left out.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        let Self {
            max_active_window_files,
            max_inactive_window_files,
            ..
        } = self;
        f.debug_struct("TwcsPicker")
            .field("max_active_window_files", max_active_window_files)
            .field("max_inactive_window_files", max_inactive_window_files)
            .finish()
    }
}
impl<S> TwcsPicker<S> {
pub fn new(
max_active_window_files: usize,
max_inactive_window_files: usize,
time_window_seconds: Option<i64>,
) -> Self {
Self {
max_inactive_window_files,
max_active_window_files,
_phantom_data: Default::default(),
time_window_seconds,
}
}
/// Builds compaction output from files.
/// For active writing window, we allow for at most `max_active_window_files` files to alleviate
/// fragmentation. For other windows, we allow at most 1 file at each window.
fn build_output(
&self,
time_windows: &BTreeMap<i64, Vec<FileHandle>>,
active_window: Option<i64>,
window_size: i64,
) -> Vec<CompactionOutput> {
let mut output = vec![];
for (window, files) in time_windows {
if let Some(active_window) = active_window && *window == active_window {
if files.len() > self.max_active_window_files {
output.push(CompactionOutput {
output_file_id: FileId::random(),
output_level: 1, // we only have two levels and always compact to l1
time_window_bound: *window,
time_window_sec: window_size,
inputs: files.clone(),
// Strict window is not needed since we always compact many files to one
// single file in TWCS.
strict_window: false,
});
} else {
debug!("Active window not present or no enough files in active window {:?}, window: {}", active_window, *window);
}
} else {
// not active writing window
if files.len() > self.max_inactive_window_files {
output.push(CompactionOutput {
output_file_id: FileId::random(),
output_level: 1,
time_window_bound: *window,
time_window_sec: window_size,
inputs: files.clone(),
strict_window: false,
});
} else {
debug!("No enough files, current: {}, max_inactive_window_files: {}", files.len(), self.max_inactive_window_files)
}
}
}
output
}
}
impl<S: LogStore> Picker for TwcsPicker<S> {
type Request = CompactionRequestImpl<S>;
type Task = CompactionTaskImpl<S>;
/// Picks the files to compact for `req` and packs them into a task.
///
/// Returns `Ok(None)` when there is neither a window to compact nor an expired SST.
fn pick(&self, req: &Self::Request) -> crate::error::Result<Option<Self::Task>> {
let levels = req.levels();
let expired_ssts = get_expired_ssts(levels.levels(), req.ttl, Timestamp::current_millis())?;
if !expired_ssts.is_empty() {
info!(
"Expired SSTs in region {}: {:?}",
req.region_id, expired_ssts
);
// here we mark expired SSTs as compacting to avoid them being picked.
expired_ssts.iter().for_each(|f| f.mark_compacting(true));
}
// Window size precedence: the request's own window, then the picker's
// configured window, and finally a size inferred from the level 0 files.
let time_window_size = req
.compaction_time_window
.or(self.time_window_seconds)
.unwrap_or_else(|| {
let inferred = infer_time_bucket(req.levels().level(0).files());
info!(
"Compaction window for region {} is not present, inferring from files: {:?}",
req.region_id, inferred
);
inferred
});
// Find active window from files in level 0.
let active_window =
find_latest_window_in_seconds(levels.level(0).files(), time_window_size);
// Files of every level are grouped into windows, not only those of level 0.
let windows = assign_to_windows(
levels.levels().iter().flat_map(LevelMeta::files),
time_window_size,
);
let outputs = self.build_output(&windows, active_window, time_window_size);
if outputs.is_empty() && expired_ssts.is_empty() {
return Ok(None);
}
let task = CompactionTaskImpl {
schema: req.schema(),
sst_layer: req.sst_layer.clone(),
outputs,
writer: req.writer.clone(),
shared_data: req.shared.clone(),
wal: req.wal.clone(),
manifest: req.manifest.clone(),
expired_ssts,
sst_write_buffer_size: req.sst_write_buffer_size,
// The window size actually used is carried on, even when it was inferred.
compaction_time_window: Some(time_window_size),
reschedule_on_finish: req.reschedule_on_finish,
};
Ok(Some(task))
}
}
/// Groups files by time window, keyed by the window their max timestamp falls into.
///
/// The max timestamp is converted to seconds and aligned up to a multiple of
/// `time_window_size` (in seconds); if the alignment yields no value the file is
/// filed under `i64::MIN`. Files without a time range are skipped with a warning.
fn assign_to_windows<'a>(
    files: impl Iterator<Item = &'a FileHandle>,
    time_window_size: i64,
) -> BTreeMap<i64, Vec<FileHandle>> {
    let mut windows = BTreeMap::<i64, Vec<FileHandle>>::new();
    for file in files {
        let Some((_, end)) = file.time_range() else {
            warn!("Unexpected file w/o timestamp: {:?}", file.file_id());
            continue;
        };
        let end_sec = end.convert_to(TimeUnit::Second).unwrap().value();
        let time_window = end_sec
            .align_to_ceil_by_bucket(time_window_size)
            .unwrap_or(i64::MIN);
        windows.entry(time_window).or_default().push(file.clone());
    }
    windows
}
/// Finds the latest active writing window among all files.
/// Returns `None` when there are no files or all files are corrupted.
///
/// The latest window is derived from the greatest max timestamp of all files,
/// whatever order the files are yielded in. That timestamp is rounded up to
/// seconds and then aligned up to a multiple of `time_window_size` (in seconds).
/// `None` is also returned when that conversion or alignment yields no value.
fn find_latest_window_in_seconds<'a>(
    files: impl Iterator<Item = &'a FileHandle>,
    time_window_size: i64,
) -> Option<i64> {
    let mut latest_timestamp = None;
    for f in files {
        if let Some((_, end)) = f.time_range() {
            // Only ever move forward: the files are not guaranteed to be ordered
            // by their max timestamps.
            match latest_timestamp {
                Some(latest) if end <= latest => {}
                _ => latest_timestamp = Some(end),
            }
        } else {
            warn!("Cannot find timestamp range of file: {}", f.file_id());
        }
    }
    latest_timestamp
        .and_then(|ts| ts.convert_to_ceil(TimeUnit::Second))
        .and_then(|ts| ts.value().align_to_ceil_by_bucket(time_window_size))
}
#[cfg(test)]
mod tests {
use std::collections::HashSet;
use log_store::NoopLogStore;
use super::*;
use crate::compaction::tests::new_file_handle;
use crate::sst::{FileId, Level};
// Checks rounding and alignment of the latest window.
// NOTE(review): `new_file_handle` is not visible here; the expected values are
// consistent with arguments of (file id, start millis, end millis, level) — confirm.
#[test]
fn test_get_latest_window_in_seconds() {
// An end of 999 is rounded up to second 1.
assert_eq!(
Some(1),
find_latest_window_in_seconds([new_file_handle(FileId::random(), 0, 999, 0)].iter(), 1)
);
// An end of 1000 is exactly second 1.
assert_eq!(
Some(1),
find_latest_window_in_seconds(
[new_file_handle(FileId::random(), 0, 1000, 0)].iter(),
1
)
);
// Near the lower end of the i64 range, aligned up to a one hour window.
assert_eq!(
Some(-9223372036854000),
find_latest_window_in_seconds(
[new_file_handle(FileId::random(), i64::MIN, i64::MIN + 1, 0)].iter(),
3600,
)
);
// At the upper end of the i64 range, aligned up to a 10000 second window.
assert_eq!(
(i64::MAX / 10000000 + 1) * 10000,
find_latest_window_in_seconds(
[new_file_handle(FileId::random(), i64::MIN, i64::MAX, 0)].iter(),
10000,
)
.unwrap()
);
}
/// Checks that files are grouped by the window their max timestamp falls into.
#[test]
fn test_assign_to_windows() {
    // Five files with the same max timestamp all land in one window.
    let windows = assign_to_windows(
        [
            new_file_handle(FileId::random(), 0, 999, 0),
            new_file_handle(FileId::random(), 0, 999, 0),
            new_file_handle(FileId::random(), 0, 999, 0),
            new_file_handle(FileId::random(), 0, 999, 0),
            new_file_handle(FileId::random(), 0, 999, 0),
        ]
        .iter(),
        3,
    );
    assert_eq!(5, windows.get(&0).unwrap().len());

    // Each id is generated separately. A repeat expression `[FileId::random(); 3]`
    // would evaluate `random()` once and copy the result, so all three ids would
    // be equal and the assertions below could not tell the files apart.
    let files = [FileId::random(), FileId::random(), FileId::random()];
    let windows = assign_to_windows(
        [
            new_file_handle(files[0], -2000, -3, 0),
            new_file_handle(files[1], 0, 2999, 0),
            new_file_handle(files[2], 50, 10001, 0),
        ]
        .iter(),
        3,
    );
    assert_eq!(files[0], windows.get(&0).unwrap().get(0).unwrap().file_id());
    assert_eq!(files[1], windows.get(&3).unwrap().get(0).unwrap().file_id());
    assert_eq!(
        files[2],
        windows.get(&12).unwrap().get(0).unwrap().file_id()
    );
}
/// One picker scenario: input files, a window size and the outputs expected from `build_output`.
struct CompactionPickerTestCase {
// Window size passed to the window helpers and to `build_output`.
window_size: i64,
// Files given to the picker; `ExpectedOutput::input_files` indexes into this list.
input_files: Vec<FileHandle>,
// Outputs the picker is expected to build, in order.
expected_outputs: Vec<ExpectedOutput>,
}
impl CompactionPickerTestCase {
    /// Runs `build_output` on the input files and compares the result with the
    /// expected outputs.
    ///
    /// The picker is created with thresholds of 4 (active window) and 1 (other
    /// windows). Outputs are compared in order; the input files of each output
    /// are compared as a set.
    fn check(&self) {
        let windows = assign_to_windows(self.input_files.iter(), self.window_size);
        let active_window =
            find_latest_window_in_seconds(self.input_files.iter(), self.window_size);
        let picker = TwcsPicker::<NoopLogStore>::new(4, 1, None);
        let built = picker.build_output(&windows, active_window, self.window_size);

        let mut actual = Vec::with_capacity(built.len());
        for o in &built {
            let input_file_ids = o.inputs.iter().map(|f| f.file_id()).collect::<HashSet<_>>();
            actual.push((
                input_file_ids,
                o.output_level,
                o.time_window_sec,
                o.time_window_bound,
                o.strict_window,
            ));
        }

        let mut expected = Vec::with_capacity(self.expected_outputs.len());
        for o in &self.expected_outputs {
            // Expected inputs are given as indices into `self.input_files`.
            let input_file_ids = o
                .input_files
                .iter()
                .map(|idx| self.input_files[*idx].file_id())
                .collect::<HashSet<_>>();
            expected.push((
                input_file_ids,
                o.output_level,
                o.time_window_sec,
                o.time_window_bound,
                o.strict_window,
            ));
        }

        assert_eq!(expected, actual);
    }
}
/// Expected shape of one compaction output in a `CompactionPickerTestCase`.
struct ExpectedOutput {
// Indices into `CompactionPickerTestCase::input_files` of the expected inputs.
input_files: Vec<usize>,
// Expected output level.
output_level: Level,
// Expected window size.
time_window_sec: i64,
// Expected window bound.
time_window_bound: i64,
// Expected strict-window flag.
strict_window: bool,
}
// Checks which windows `build_output` compacts; `check` uses thresholds of 4 files
// for the active window and 1 file for every other window.
#[test]
fn test_build_twcs_output() {
let file_ids = (0..4).map(|_| FileId::random()).collect::<Vec<_>>();
// Case 1: the active window holds two files, which is within its threshold, so
// only the older window with two files is compacted.
CompactionPickerTestCase {
window_size: 3,
input_files: [
new_file_handle(file_ids[0], -2000, -3, 0),
new_file_handle(file_ids[1], -3000, -100, 0),
new_file_handle(file_ids[2], 0, 2999, 0), //active windows
new_file_handle(file_ids[3], 50, 2998, 0), //active windows
]
.to_vec(),
expected_outputs: vec![ExpectedOutput {
input_files: vec![0, 1],
output_level: 1,
time_window_sec: 3,
time_window_bound: 0,
strict_window: false,
}],
}
.check();
let file_ids = (0..6).map(|_| FileId::random()).collect::<Vec<_>>();
// Case 2: the last file ends later than all others and makes a newer window the
// active one, so window 3 is no longer active and its three files are compacted too.
CompactionPickerTestCase {
window_size: 3,
input_files: [
new_file_handle(file_ids[0], -2000, -3, 0),
new_file_handle(file_ids[1], -3000, -100, 0),
new_file_handle(file_ids[2], 0, 2999, 0),
new_file_handle(file_ids[3], 50, 2998, 0),
new_file_handle(file_ids[4], 11, 2990, 0),
new_file_handle(file_ids[5], 50, 4998, 0),
]
.to_vec(),
expected_outputs: vec![
ExpectedOutput {
input_files: vec![0, 1],
output_level: 1,
time_window_sec: 3,
time_window_bound: 0,
strict_window: false,
},
ExpectedOutput {
input_files: vec![2, 3, 4],
output_level: 1,
time_window_sec: 3,
time_window_bound: 3,
strict_window: false,
},
],
}
.check();
}
}

View File

@@ -1,588 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use common_query::logical_plan::{DfExpr, Expr};
use common_time::timestamp::TimeUnit;
use datafusion_expr::Operator;
use datatypes::value::timestamp_to_scalar_value;
use store_api::storage::RegionId;
use crate::chunk::{ChunkReaderBuilder, ChunkReaderImpl};
use crate::error;
use crate::schema::RegionSchemaRef;
use crate::sst::{AccessLayerRef, FileHandle};
/// Builds a reader over `files` that yields only rows inside `time_range`.
///
/// `time_range` is `(lower inclusive, upper exclusive)` in seconds; a bound of
/// `None` leaves that side open.
///
/// # Panics
///
/// Panics if the region schema has no timestamp column or if that column is not
/// of a timestamp type.
pub(crate) async fn build_sst_reader(
    region_id: RegionId,
    schema: RegionSchemaRef,
    sst_layer: AccessLayerRef,
    files: &[FileHandle],
    time_range: (Option<i64>, Option<i64>),
) -> error::Result<ChunkReaderImpl> {
    // TODO(hl): Schemas in different SSTs may differ, thus we should infer
    // timestamp column name from Parquet metadata.
    let (ts_col_name, ts_col_unit) = {
        // safety: Region schema's timestamp column must present
        let ts_col = schema.user_schema().timestamp_column().unwrap();
        let unit = ts_col.data_type.as_timestamp().unwrap().unit();
        (ts_col.name.clone(), unit)
    };
    let time_filter = build_time_range_filter(time_range, &ts_col_name, ts_col_unit);

    ChunkReaderBuilder::new(region_id, schema, sst_layer)
        .pick_ssts(files)
        .filters(time_filter.into_iter().collect())
        .build()
        .await
}
/// Builds a filter expression on the timestamp column from a lower (inclusive)
/// and an upper (exclusive) bound given in seconds.
///
/// Each bound is converted to the unit of the timestamp column. A bound that is
/// absent, or that overflows during the conversion, is left out of the filter.
/// Returns `None` when no bound remains.
fn build_time_range_filter(
    time_range: (Option<i64>, Option<i64>),
    ts_col_name: &str,
    ts_col_unit: TimeUnit,
) -> Option<Expr> {
    let (low_ts_inclusive, high_ts_exclusive) = time_range;
    let ts_col = DfExpr::Column(datafusion_common::Column::from_name(ts_col_name));

    // Converting seconds to whatever unit won't lose precision.
    // Here only handles overflow.
    let to_literal = |bound_sec: Option<i64>| {
        bound_sec
            .map(common_time::Timestamp::new_second)
            .and_then(|ts| ts.convert_to(ts_col_unit))
            .map(|ts| DfExpr::Literal(timestamp_to_scalar_value(ts_col_unit, Some(ts.value()))))
    };
    let lower = to_literal(low_ts_inclusive)
        .map(|low| datafusion_expr::binary_expr(ts_col.clone(), Operator::GtEq, low));
    let upper = to_literal(high_ts_exclusive)
        .map(|high| datafusion_expr::binary_expr(ts_col, Operator::Lt, high));

    let expr = match (lower, upper) {
        (Some(lower), Some(upper)) => Some(datafusion_expr::and(lower, upper)),
        (Some(bound), None) | (None, Some(bound)) => Some(bound),
        (None, None) => None,
    };
    expr.map(Expr::from)
}
#[cfg(test)]
mod tests {
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Arc;
use api::v1::OpType;
use common_base::readable_size::ReadableSize;
use common_test_util::temp_dir::create_temp_dir;
use common_time::Timestamp;
use datatypes::prelude::{LogicalTypeId, ScalarVector, ScalarVectorBuilder};
use datatypes::timestamp::TimestampMillisecond;
use datatypes::vectors::{
TimestampMillisecondVector, TimestampMillisecondVectorBuilder, UInt64VectorBuilder,
};
use object_store::services::Fs;
use object_store::ObjectStore;
use store_api::storage::{ChunkReader, SequenceNumber};
use super::*;
use crate::file_purger::noop::new_noop_file_purger;
use crate::memtable::{
DefaultMemtableBuilder, IterContext, KeyValues, Memtable, MemtableBuilder,
};
use crate::metadata::RegionMetadata;
use crate::sst::parquet::ParquetWriter;
use crate::sst::{self, FileId, FileMeta, FsAccessLayer, Source, SstInfo, WriteOptions};
use crate::test_util::descriptor_util::RegionDescBuilder;
// Region id handed to `build_sst_reader` in these tests.
const REGION_ID: RegionId = RegionId::from_u64(1);
/// Returns the schema of a region named "test" with one nullable `u64` field column `v`.
fn schema_for_test() -> RegionSchemaRef {
    // Only the columns metadata of the region descriptor is of interest here.
    let builder = RegionDescBuilder::new("test");
    let desc = builder
        .push_field_column(("v", LogicalTypeId::UInt64, true))
        .build();
    let region_meta: RegionMetadata = desc.try_into().unwrap();
    region_meta.schema().clone()
}
/// Writes one batch of rows with the given sequence and op type into `memtable`.
///
/// `ts` holds the millisecond timestamps of the rows and `values` the matching
/// field values.
pub fn write_kvs(
    memtable: &dyn Memtable,
    sequence: SequenceNumber,
    op_type: OpType,
    ts: &[i64], // timestamp
    values: &[Option<u64>],
) {
    let to_key = |t: &i64| -> TimestampMillisecond { (*t).into() };
    let keys = ts.iter().map(to_key).collect::<Vec<_>>();
    let batch = kvs_for_test(sequence, op_type, &keys, values);
    memtable.write(&batch).unwrap();
}
/// Builds a `KeyValues` batch with a timestamp column, no other key columns and a
/// single value column.
///
/// Panics if `ts` and `values` differ in length.
fn kvs_for_test(
    sequence: SequenceNumber,
    op_type: OpType,
    ts: &[TimestampMillisecond],
    values: &[Option<u64>],
) -> KeyValues {
    assert_eq!(ts.len(), values.len());

    let mut ts_builder = TimestampMillisecondVectorBuilder::with_capacity(ts.len());
    for key in ts {
        ts_builder.push(Some(*key));
    }
    let ts_col = Arc::new(ts_builder.finish()) as _;

    let mut value_builder = UInt64VectorBuilder::with_capacity(values.len());
    for value in values {
        value_builder.push(*value);
    }
    let value_col = Arc::new(value_builder.finish()) as _;

    let kvs = KeyValues {
        sequence,
        op_type,
        start_index_in_batch: 0,
        keys: vec![],
        values: vec![value_col],
        timestamp: Some(ts_col),
    };

    // The batch must report as many rows as were passed in.
    assert_eq!(ts.len(), kvs.len());
    assert_eq!(ts.is_empty(), kvs.is_empty());
    kvs
}
/// Writes the rows `(ts[i], ops[i])` into a new SST file and returns a handle to it.
///
/// Consecutive rows with the same op type are written to the memtable as one
/// batch. Every batch takes the current value of `seq` as its sequence number and
/// increments `seq` afterwards; `seq` is incremented once more after the file is
/// written. The value of each row is its timestamp cast to `u64`.
///
/// Panics if `ts` and `ops` differ in length, or if writing fails.
async fn write_sst(
    sst_file_id: FileId,
    schema: RegionSchemaRef,
    seq: &AtomicU64,
    object_store: ObjectStore,
    ts: &[i64],
    ops: &[OpType],
) -> FileHandle {
    assert_eq!(
        ts.len(),
        ops.len(),
        "every timestamp needs exactly one op type"
    );
    let memtable = DefaultMemtableBuilder::default().build(schema);

    // Start offsets of every run of equal op types, followed by the end offset.
    let mut breaks = vec![0];
    breaks.extend(
        ops.windows(2)
            .enumerate()
            .filter(|(_, pair)| pair[0] != pair[1])
            .map(|(idx, _)| idx + 1),
    );
    breaks.push(ts.len());

    for bounds in breaks.windows(2) {
        let (start, end) = (bounds[0], bounds[1]);
        // The op type of a run is the op type of its first row. Indexing `ops`
        // by the run's ordinal would pick the wrong op as soon as a run is longer
        // than one row.
        let op = ops[start];
        let ts_seg = ts[start..end].to_vec();
        let value_seg = ts_seg.iter().map(|t| Some(*t as u64)).collect::<Vec<_>>();
        write_kvs(
            &*memtable,
            seq.load(Ordering::Relaxed), // sequence
            op,
            &ts_seg,    // keys
            &value_seg, // values
        );
        let _ = seq.fetch_add(1, Ordering::Relaxed);
    }

    let iter = memtable.iter(IterContext::default()).unwrap();
    let file_path = sst_file_id.as_parquet();
    let writer = ParquetWriter::new(&file_path, Source::Iter(iter), object_store.clone());

    let SstInfo {
        time_range,
        file_size,
        ..
    } = writer
        .write_sst(&sst::WriteOptions::default())
        .await
        .unwrap()
        .unwrap();
    let handle = FileHandle::new(
        FileMeta {
            region_id: 0.into(),
            file_id: sst_file_id,
            time_range,
            level: 0,
            file_size,
        },
        Arc::new(crate::test_util::access_layer_util::MockAccessLayer {}),
        new_noop_file_purger(),
    );
    let _ = seq.fetch_add(1, Ordering::Relaxed);
    handle
}
/// Reads `files` restricted to `[lower_sec_inclusive, upper_sec_exclusive)` and
/// asserts that the timestamps read equal `expect`.
///
/// The region id is only used to build the reader, we don't check its content.
async fn check_reads(
    region_id: RegionId,
    schema: RegionSchemaRef,
    sst_layer: AccessLayerRef,
    files: &[FileHandle],
    lower_sec_inclusive: i64,
    upper_sec_exclusive: i64,
    expect: &[i64],
) {
    let time_range = (Some(lower_sec_inclusive), Some(upper_sec_exclusive));
    let mut reader = build_sst_reader(region_id, schema, sst_layer, files, time_range)
        .await
        .unwrap();

    let mut actual = Vec::new();
    while let Some(chunk) = reader.next_chunk().await.unwrap() {
        // The first column of every chunk is read as the timestamp column.
        let ts_col = chunk.columns[0]
            .as_any()
            .downcast_ref::<TimestampMillisecondVector>()
            .unwrap();
        for ts in ts_col.iter_data() {
            actual.push(ts.unwrap().0.value());
        }
    }
    assert_eq!(expect, &actual);
}
// Writes two SST files and checks that reads restricted to a time range in
// seconds return exactly the rows inside that range, across both files.
#[tokio::test]
async fn test_sst_reader() {
let dir = create_temp_dir("write_parquet");
let path = dir.path().to_str().unwrap();
let mut builder = Fs::default();
let _ = builder.root(path);
let object_store = ObjectStore::new(builder).unwrap().finish();
let seq = AtomicU64::new(0);
let schema = schema_for_test();
let file1 = write_sst(
FileId::random(),
schema.clone(),
&seq,
object_store.clone(),
&[1000, 2000, 3000, 4001, 5001],
&[
OpType::Put,
OpType::Put,
OpType::Put,
OpType::Put,
OpType::Put,
],
)
.await;
let file2 = write_sst(
FileId::random(),
schema.clone(),
&seq,
object_store.clone(),
&[4002, 5002, 6000, 7000, 8000],
&[
OpType::Put,
OpType::Put,
OpType::Put,
OpType::Put,
OpType::Put,
],
)
.await;
let sst_layer = Arc::new(FsAccessLayer::new("./", object_store));
let files = vec![file1, file2];
// read from two sst files with time range filter,
// [3s, 6s) keeps 3000 up to 5002 and drops 6000, the exclusive upper bound.
check_reads(
REGION_ID,
schema.clone(),
sst_layer.clone(),
&files,
3,
6,
&[3000, 4001, 4002, 5001, 5002],
)
.await;
// [1s, 2s) keeps only the row at 1000.
check_reads(REGION_ID, schema, sst_layer, &files, 1, 2, &[1000]).await;
}
/// Reads `files` with the widest time range expressible in seconds and returns
/// the timestamps in the order the reader yields them.
async fn read_file(
    files: &[FileHandle],
    schema: RegionSchemaRef,
    sst_layer: AccessLayerRef,
) -> Vec<i64> {
    let full_range = (Some(i64::MIN), Some(i64::MAX));
    let mut reader = build_sst_reader(REGION_ID, schema, sst_layer, files, full_range)
        .await
        .unwrap();

    let mut timestamps = Vec::new();
    while let Some(chunk) = reader.next_chunk().await.unwrap() {
        // The first column of every chunk is read as the timestamp column.
        let ts_col = chunk.columns[0]
            .as_any()
            .downcast_ref::<TimestampMillisecondVector>()
            .unwrap();
        for ts in ts_col.iter_data() {
            timestamps.push(ts.unwrap().0.value());
        }
    }
    timestamps
}
/// Writes rows into file i1/i2 and splits these rows into sst file o1/o2/o3,
/// and check the output contains the same data as input files.
#[tokio::test]
async fn test_sst_split() {
let dir = create_temp_dir("write_parquet");
let path = dir.path().to_str().unwrap();
let mut builder = Fs::default();
let _ = builder.root(path);
let object_store = ObjectStore::new(builder).unwrap().finish();
let schema = schema_for_test();
let seq = AtomicU64::new(0);
let input_file_ids = [FileId::random(), FileId::random()];
let output_file_ids = [FileId::random(), FileId::random(), FileId::random()];
let file1 = write_sst(
input_file_ids[0],
schema.clone(),
&seq,
object_store.clone(),
&[1000, 2000, 3000, 4001, 5001],
&[
OpType::Put,
OpType::Put,
OpType::Put,
OpType::Put,
OpType::Put,
],
)
.await;
// in file2 we delete the row with timestamp 1000.
let file2 = write_sst(
input_file_ids[1],
schema.clone(),
&seq,
object_store.clone(),
&[1000, 5002, 6000, 7000, 8000],
&[
OpType::Delete, // a deletion
OpType::Put,
OpType::Put,
OpType::Put,
OpType::Put,
],
)
.await;
let sst_layer = Arc::new(FsAccessLayer::new("./", object_store.clone()));
let input_files = vec![file2, file1];
// Three readers over the same inputs, one per time range in seconds:
// [0, 3), [3, 6) and [6, 10).
let reader1 = build_sst_reader(
REGION_ID,
schema.clone(),
sst_layer.clone(),
&input_files,
(Some(0), Some(3)),
)
.await
.unwrap();
let reader2 = build_sst_reader(
REGION_ID,
schema.clone(),
sst_layer.clone(),
&input_files,
(Some(3), Some(6)),
)
.await
.unwrap();
let reader3 = build_sst_reader(
REGION_ID,
schema.clone(),
sst_layer.clone(),
&input_files,
(Some(6), Some(10)),
)
.await
.unwrap();
let opts = WriteOptions {
sst_write_buffer_size: ReadableSize::mb(8),
};
// Each reader is written out as its own SST file.
let s1 = ParquetWriter::new(
&output_file_ids[0].as_parquet(),
Source::Reader(reader1),
object_store.clone(),
)
.write_sst(&opts)
.await
.unwrap()
.unwrap();
// The first output starts at 2000: the row at 1000 was deleted by file2.
assert_eq!(
Some((
Timestamp::new_millisecond(2000),
Timestamp::new_millisecond(2000)
)),
s1.time_range,
);
let s2 = ParquetWriter::new(
&output_file_ids[1].as_parquet(),
Source::Reader(reader2),
object_store.clone(),
)
.write_sst(&opts)
.await
.unwrap()
.unwrap();
assert_eq!(
Some((
Timestamp::new_millisecond(3000),
Timestamp::new_millisecond(5002)
)),
s2.time_range,
);
let s3 = ParquetWriter::new(
&output_file_ids[2].as_parquet(),
Source::Reader(reader3),
object_store.clone(),
)
.write_sst(&opts)
.await
.unwrap()
.unwrap();
assert_eq!(
Some((
Timestamp::new_millisecond(6000),
Timestamp::new_millisecond(8000)
)),
s3.time_range
);
// Handles for the output files carry placeholder metadata (no time range, size 0).
let output_files = output_file_ids
.into_iter()
.map(|f| {
FileHandle::new(
FileMeta {
region_id: 0.into(),
file_id: f,
level: 1,
time_range: None,
file_size: 0,
},
Arc::new(crate::test_util::access_layer_util::MockAccessLayer {}),
new_noop_file_purger(),
)
})
.collect::<Vec<_>>();
// Reading all outputs must yield the same timestamps as reading all inputs.
let timestamps_in_inputs = read_file(&input_files, schema.clone(), sst_layer.clone()).await;
let timestamps_in_outputs =
read_file(&output_files, schema.clone(), sst_layer.clone()).await;
assert_eq!(timestamps_in_outputs, timestamps_in_inputs);
}
// Checks how `build_time_range_filter` treats bounds that overflow when they are
// converted from seconds to nanoseconds.
#[test]
fn test_build_time_range_filter() {
// Both bounds overflow: no filter at all.
assert!(build_time_range_filter(
(Some(i64::MIN), Some(i64::MAX)),
"ts",
TimeUnit::Nanosecond
)
.is_none());
// Only the lower bound overflows: the filter keeps just the upper bound.
assert_eq!(
Expr::from(datafusion_expr::binary_expr(
datafusion_expr::col("ts"),
Operator::Lt,
datafusion_expr::lit(timestamp_to_scalar_value(
TimeUnit::Nanosecond,
Some(TimeUnit::Second.factor() as i64 / TimeUnit::Nanosecond.factor() as i64),
)),
)),
build_time_range_filter((Some(i64::MIN), Some(1)), "ts", TimeUnit::Nanosecond).unwrap()
);
// Only the upper bound overflows: the filter keeps just the lower bound.
assert_eq!(
Expr::from(datafusion_expr::binary_expr(
datafusion_expr::col("ts"),
Operator::GtEq,
datafusion_expr::lit(timestamp_to_scalar_value(
TimeUnit::Nanosecond,
Some(
2 * TimeUnit::Second.factor() as i64 / TimeUnit::Nanosecond.factor() as i64
),
)),
)),
build_time_range_filter((Some(2), Some(i64::MAX)), "ts", TimeUnit::Nanosecond).unwrap()
);
}
}

View File

@@ -1,71 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! storage engine config
use std::time::Duration;
use common_base::readable_size::ReadableSize;
/// Default maximum number of inflight flush tasks
/// (the default of `EngineConfig::max_flush_tasks`).
pub const DEFAULT_MAX_FLUSH_TASKS: usize = 8;
/// Default write buffer size of a region
/// (the default of `EngineConfig::region_write_buffer_size`).
pub const DEFAULT_REGION_WRITE_BUFFER_SIZE: ReadableSize = ReadableSize::mb(32);
/// Default interval to trigger auto flush, in milliseconds (60 * 60 * 1000 ms = one hour).
pub const DEFAULT_AUTO_FLUSH_INTERVAL: u32 = 60 * 60 * 1000;
/// Default interval to schedule the picker that flushes regions automatically,
/// in milliseconds (5 * 60 * 1000 ms = five minutes).
pub const DEFAULT_PICKER_SCHEDULE_INTERVAL: u32 = 5 * 60 * 1000;
/// Configuration of the storage engine.
///
/// Use [`EngineConfig::default()`] to get the default values.
#[derive(Debug, Clone)]
pub struct EngineConfig {
/// Flag used to select the compression type of region manifests.
pub compress_manifest: bool,
/// Checkpoint margin handed to the region manifest checkpointer.
// NOTE(review): presumably the number of manifest actions between two checkpoints — confirm.
pub manifest_checkpoint_margin: Option<u16>,
/// GC duration handed to the region manifest checkpointer.
// NOTE(review): presumably the period of manifest garbage collection — confirm.
pub manifest_gc_duration: Option<Duration>,
// NOTE(review): presumably the number of level 0 files that triggers a compaction — confirm
// against the compaction picker.
pub max_files_in_l0: usize,
/// Max inflight file purge tasks.
pub max_purge_tasks: usize,
/// Max inflight flush tasks.
pub max_flush_tasks: usize,
/// Default write buffer size for a region.
pub region_write_buffer_size: ReadableSize,
/// Interval to schedule the auto flush picker.
pub picker_schedule_interval: Duration,
/// Interval to auto flush a region if it has not flushed yet.
pub auto_flush_interval: Duration,
/// Limit for global write buffer size. Disabled by default.
pub global_write_buffer_size: Option<ReadableSize>,
/// Global retention period for all regions.
///
/// The precedence order is: region ttl > global ttl.
pub global_ttl: Option<Duration>,
}
impl Default for EngineConfig {
fn default() -> Self {
Self {
compress_manifest: false,
manifest_checkpoint_margin: Some(10),
manifest_gc_duration: Some(Duration::from_secs(30)),
max_files_in_l0: 8,
max_purge_tasks: 32,
max_flush_tasks: DEFAULT_MAX_FLUSH_TASKS,
region_write_buffer_size: DEFAULT_REGION_WRITE_BUFFER_SIZE,
picker_schedule_interval: Duration::from_millis(
DEFAULT_PICKER_SCHEDULE_INTERVAL.into(),
),
auto_flush_interval: Duration::from_millis(DEFAULT_AUTO_FLUSH_INTERVAL.into()),
global_write_buffer_size: None,
global_ttl: None,
}
}
}

View File

@@ -1,750 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::HashMap;
use std::sync::{Arc, RwLock};
use std::time::Duration;
use async_trait::async_trait;
use common_telemetry::logging::{self, debug};
use object_store::{util, ObjectStore};
use snafu::ResultExt;
use store_api::logstore::LogStore;
use store_api::manifest::Manifest;
use store_api::storage::{
CloseContext, CloseOptions, CompactionStrategy, CreateOptions, EngineContext, OpenOptions,
Region, RegionDescriptor, StorageEngine,
};
use crate::compaction::CompactionSchedulerRef;
use crate::config::EngineConfig;
use crate::error::{self, Error, Result};
use crate::file_purger::{FilePurgeHandler, FilePurgerRef};
use crate::flush::{
FlushScheduler, FlushSchedulerRef, FlushStrategyRef, PickerConfig, SizeBasedStrategy,
};
use crate::manifest::region::RegionManifest;
use crate::manifest::storage::manifest_compress_type;
use crate::memtable::{DefaultMemtableBuilder, MemtableBuilderRef};
use crate::metadata::RegionMetadata;
use crate::region::{RegionImpl, StoreConfig};
use crate::scheduler::{LocalScheduler, Scheduler, SchedulerConfig};
use crate::sst::FsAccessLayer;
/// [StorageEngine] implementation.
///
/// This is a handle to the engine state kept in `EngineInner`; the state is held behind an
/// [`Arc`], and the `Clone` implementation only clones that `Arc`.
pub struct EngineImpl<S: LogStore> {
// Engine state shared by all clones of this handle.
inner: Arc<EngineInner<S>>,
}
impl<S: LogStore> Clone for EngineImpl<S> {
fn clone(&self) -> Self {
Self {
inner: self.inner.clone(),
}
}
}
// `StorageEngine` implementation that forwards every call to the shared `EngineInner`.
// The `EngineContext` argument is not used by any of the methods below.
#[async_trait]
impl<S: LogStore> StorageEngine for EngineImpl<S> {
type Error = Error;
type Region = RegionImpl<S>;
/// Opens the region `name` with `opts` by delegating to `EngineInner::open_region`.
async fn open_region(
&self,
_ctx: &EngineContext,
name: &str,
opts: &OpenOptions,
) -> Result<Option<Self::Region>> {
self.inner.open_region(name, opts).await
}
/// Closes the region `name` with `opts` by delegating to `EngineInner::close_region`.
async fn close_region(
&self,
_ctx: &EngineContext,
name: &str,
opts: &CloseOptions,
) -> Result<()> {
self.inner.close_region(name, opts).await
}
/// Creates a region from `descriptor` by delegating to `EngineInner::create_region`.
async fn create_region(
&self,
_ctx: &EngineContext,
descriptor: RegionDescriptor,
opts: &CreateOptions,
) -> Result<Self::Region> {
self.inner.create_region(descriptor, opts).await
}
/// Drops `region` and then removes it from the engine's region map.
///
/// The region stays in the map if dropping it fails.
async fn drop_region(&self, _ctx: &EngineContext, region: Self::Region) -> Result<()> {
region.drop_region().await?;
self.inner.remove_region(region.name());
Ok(())
}
/// Returns the region `name` if it is ready for access, otherwise `None`.
///
/// This implementation always returns `Ok`.
fn get_region(&self, _ctx: &EngineContext, name: &str) -> Result<Option<Self::Region>> {
Ok(self.inner.get_region(name))
}
/// Stops the engine by delegating to `EngineInner::close`, logging before and after.
async fn close(&self, _ctx: &EngineContext) -> Result<()> {
logging::info!("Stopping storage engine");
self.inner.close().await?;
logging::info!("Storage engine stopped");
Ok(())
}
}
impl<S: LogStore> EngineImpl<S> {
pub fn new(
config: EngineConfig,
log_store: Arc<S>,
object_store: ObjectStore,
compaction_scheduler: CompactionSchedulerRef<S>,
) -> Result<Self> {
Ok(Self {
inner: Arc::new(EngineInner::new(
config,
log_store,
object_store,
compaction_scheduler,
)?),
})
}
}
/// Builds the SST directory of a region: `{parent_dir}{region_name}/`.
///
/// No separator is inserted between `parent_dir` and `region_name`; `parent_dir` is resolved
/// in function `region_store_config` to ensure it's ended with '/'.
#[inline]
pub fn region_sst_dir(parent_dir: &str, region_name: &str) -> String {
    [parent_dir, region_name, "/"].concat()
}
/// Builds the manifest directory of a region: `{parent_dir}{region_name}/manifest/`.
///
/// No separator is inserted between `parent_dir` and `region_name`; `parent_dir` is resolved
/// in function `region_store_config` to ensure it's ended with '/'.
#[inline]
pub fn region_manifest_dir(parent_dir: &str, region_name: &str) -> String {
    [parent_dir, region_name, "/manifest/"].concat()
}
/// A slot for region in the engine.
///
/// Also used as a placeholder in the region map when the region isn't ready, e.g. during
/// creating/opening. Only a `Ready` slot hands out a region; the other states make
/// `try_get_ready_region()` fail with an invalid region state error.
#[derive(Debug)]
pub(crate) enum RegionSlot<S: LogStore> {
/// The region is during creation.
Creating,
/// The region is during opening.
Opening,
/// The region is ready for access.
Ready(RegionImpl<S>),
}
impl<S: LogStore> RegionSlot<S> {
/// Try to get a ready region.
fn try_get_ready_region(&self) -> Result<RegionImpl<S>> {
if let RegionSlot::Ready(region) = self {
Ok(region.clone())
} else {
error::InvalidRegionStateSnafu {
state: self.state_name(),
}
.fail()
}
}
/// Returns the ready region or `None`.
fn get_ready_region(&self) -> Option<RegionImpl<S>> {
if let RegionSlot::Ready(region) = self {
Some(region.clone())
} else {
None
}
}
fn state_name(&self) -> &'static str {
match self {
RegionSlot::Creating => "creating",
RegionSlot::Opening => "opening",
RegionSlot::Ready(_) => "ready",
}
}
}
impl<S: LogStore> Clone for RegionSlot<S> {
    // `Clone` is written by hand instead of derived because of
    // [rust#26925](https://github.com/rust-lang/rust/issues/26925).
    // Requiring `LogStore` to be clonable might be another way to work around it.
    /// Clones the slot; a `Ready` slot clones the region it holds.
    fn clone(&self) -> RegionSlot<S> {
        match self {
            Self::Ready(region) => Self::Ready(region.clone()),
            Self::Opening => Self::Opening,
            Self::Creating => Self::Creating,
        }
    }
}
/// Used to update slot or clean the slot on failure.
///
/// Unless `update()` has been called, dropping the guard removes the slot named `name`
/// from `regions`.
struct SlotGuard<'a, S: LogStore> {
// Name of the guarded slot in the region map.
name: &'a str,
// Region map that owns the guarded slot.
regions: &'a RegionMap<S>,
// Set to `true` by `update()`; when `true`, `drop` leaves the slot in the map.
skip_clean: bool,
}
impl<'a, S: LogStore> SlotGuard<'a, S> {
    /// Creates a guard for the slot `name` of `regions`; the slot is removed on drop
    /// unless [`SlotGuard::update`] is called first.
    fn new(name: &'a str, regions: &'a RegionMap<S>) -> SlotGuard<'a, S> {
        Self {
            skip_clean: false,
            regions,
            name,
        }
    }

    /// Stores `slot` under the guarded name and disarms the cleanup on drop.
    fn update(&mut self, slot: RegionSlot<S>) {
        let (regions, name) = (self.regions, self.name);
        regions.update(name, slot);
        // Only disarm the guard once the map has been updated.
        self.skip_clean = true;
    }
}
impl<'a, S: LogStore> Drop for SlotGuard<'a, S> {
    /// Removes the guarded slot from the region map unless the guard has been disarmed
    /// by a successful update.
    fn drop(&mut self) {
        if self.skip_clean {
            return;
        }
        self.regions.remove(self.name);
    }
}
/// Region slot map.
///
/// Maps a region name to its [`RegionSlot`]; the map is protected by a [`RwLock`].
pub struct RegionMap<S: LogStore>(RwLock<HashMap<String, RegionSlot<S>>>);
impl<S: LogStore> RegionMap<S> {
/// Returns a new region map.
pub fn new() -> RegionMap<S> {
RegionMap(RwLock::new(HashMap::new()))
}
/// Returns the `Some(slot)` if there is existing slot with given `name`, or insert
/// given `slot` and returns `None`.
pub(crate) fn get_or_occupy_slot(
&self,
name: &str,
slot: RegionSlot<S>,
) -> Option<RegionSlot<S>> {
{
// Try to get the region under read lock.
let regions = self.0.read().unwrap();
if let Some(slot) = regions.get(name) {
return Some(slot.clone());
}
}
// Get the region under write lock.
let mut regions = self.0.write().unwrap();
if let Some(slot) = regions.get(name) {
return Some(slot.clone());
}
// No slot in map, we can insert the slot now.
let _ = regions.insert(name.to_string(), slot);
None
}
/// Gets the region by the specific name.
fn get_region(&self, name: &str) -> Option<RegionImpl<S>> {
let slot = self.0.read().unwrap().get(name).cloned()?;
slot.get_ready_region()
}
/// Update the slot by name.
fn update(&self, name: &str, slot: RegionSlot<S>) {
let mut regions = self.0.write().unwrap();
if let Some(old) = regions.get_mut(name) {
*old = slot;
}
}
/// Remove region by name.
fn remove(&self, name: &str) {
let mut regions = self.0.write().unwrap();
let _ = regions.remove(name);
}
/// Collects regions.
pub(crate) fn list_regions(&self) -> Vec<RegionImpl<S>> {
let regions = self.0.read().unwrap();
regions
.values()
.filter_map(|slot| slot.get_ready_region())
.collect()
}
/// Clear the region map.
pub(crate) fn clear(&self) {
self.0.write().unwrap().clear();
}
}
impl<S: LogStore> Default for RegionMap<S> {
fn default() -> Self {
Self::new()
}
}
/// Engine state shared by all `EngineImpl` handles.
struct EngineInner<S: LogStore> {
// Object store used to build the SST access layer and the manifest of each region.
object_store: ObjectStore,
// Log store handed to every region through its store config.
log_store: Arc<S>,
// Slots of the regions known to the engine; also shared with the flush scheduler.
regions: Arc<RegionMap<S>>,
// Memtable builder handed to every region through its store config.
memtable_builder: MemtableBuilderRef,
// Scheduler of flush tasks, stopped when the engine is closed.
flush_scheduler: FlushSchedulerRef<S>,
// Flush strategy built from the global write buffer size of the config.
flush_strategy: FlushStrategyRef,
// Scheduler of compaction tasks, stopped when the engine is closed.
compaction_scheduler: CompactionSchedulerRef<S>,
// Scheduler of file purge tasks, stopped when the engine is closed.
file_purger: FilePurgerRef,
// Engine config, shared with every region through its store config.
config: Arc<EngineConfig>,
}
impl<S: LogStore> EngineInner<S> {
/// Builds the engine state: the region map, the flush scheduler, the file purger, the
/// flush strategy and the memtable builder.
///
/// Returns an error if the flush scheduler cannot be created.
pub fn new(
    config: EngineConfig,
    log_store: Arc<S>,
    object_store: ObjectStore,
    compaction_scheduler: CompactionSchedulerRef<S>,
) -> Result<Self> {
    let regions = Arc::new(RegionMap::new());

    // Flush scheduler, bounded by `max_flush_tasks` and driven by the auto flush picker.
    let flush_scheduler_config = SchedulerConfig {
        max_inflight_tasks: config.max_flush_tasks,
    };
    let picker_config = PickerConfig {
        schedule_interval: config.picker_schedule_interval,
        auto_flush_interval: config.auto_flush_interval,
    };
    let flush_scheduler = Arc::new(FlushScheduler::new(
        flush_scheduler_config,
        compaction_scheduler.clone(),
        regions.clone(),
        picker_config,
    )?);

    // File purger, bounded by `max_purge_tasks`.
    let purge_scheduler_config = SchedulerConfig {
        max_inflight_tasks: config.max_purge_tasks,
    };
    let file_purger = Arc::new(LocalScheduler::new(
        purge_scheduler_config,
        FilePurgeHandler,
    ));

    let global_buffer_limit = config
        .global_write_buffer_size
        .map(|size| size.as_bytes() as usize);
    let flush_strategy = Arc::new(SizeBasedStrategy::new(global_buffer_limit));

    let memtable_builder = match config.global_write_buffer_size {
        // With a global write buffer size, the flush strategy is given to the memtable
        // builder so that it can track the global memtable usage.
        Some(_) => DefaultMemtableBuilder::with_flush_strategy(Some(flush_strategy.clone())),
        None => DefaultMemtableBuilder::default(),
    };

    Ok(Self {
        object_store,
        log_store,
        regions,
        memtable_builder: Arc::new(memtable_builder),
        flush_scheduler,
        flush_strategy,
        compaction_scheduler,
        file_purger,
        config: Arc::new(config),
    })
}
/// Closes the region `name` if it is ready, then removes its slot from the region map.
///
/// The slot is kept if closing the region fails.
async fn close_region(&self, name: &str, opts: &CloseOptions) -> Result<()> {
    if let Some(region) = self.regions.get_region(name) {
        let close_ctx = CloseContext { flush: opts.flush };
        region.close(&close_ctx).await?;
    }

    self.remove_region(name);
    Ok(())
}
/// Opens the region `name`.
///
/// If a slot already exists for `name`, returns the region when it is ready and an error
/// when it is still being created or opened. Otherwise the slot is occupied with
/// `RegionSlot::Opening` while the region is opened, and released again if opening fails
/// or finds no region.
async fn open_region(&self, name: &str, opts: &OpenOptions) -> Result<Option<RegionImpl<S>>> {
    // Waiting for the slot to become ready would be possible, but it would make the code
    // more complicated, so a slot that is not ready is simply reported as an error.
    if let Some(slot) = self.regions.get_or_occupy_slot(name, RegionSlot::Opening) {
        return slot.try_get_ready_region().map(Some);
    }

    // From here on the guard cleans the `Opening` slot on every early return.
    let mut guard = SlotGuard::new(name, &self.regions);

    let store_config = self
        .region_store_config(
            &opts.parent_dir,
            opts.write_buffer_size,
            name,
            &self.config,
            opts.ttl,
            opts.compaction_strategy.clone(),
        )
        .await?;

    let opened = RegionImpl::open(name.to_string(), store_config, opts).await?;
    let region = match opened {
        Some(region) => region,
        None => return Ok(None),
    };

    guard.update(RegionSlot::Ready(region.clone()));
    debug!(
        "Storage engine open region {}, id: {}",
        region.name(),
        region.id()
    );

    Ok(Some(region))
}
/// Creates the region described by `descriptor`.
///
/// If a slot already exists for the region name, returns the region when it is ready and
/// an error when it is still being created or opened. Otherwise the slot is occupied with
/// `RegionSlot::Creating` while the region is created, and released again on failure.
async fn create_region(
    &self,
    descriptor: RegionDescriptor,
    opts: &CreateOptions,
) -> Result<RegionImpl<S>> {
    let occupied = self
        .regions
        .get_or_occupy_slot(&descriptor.name, RegionSlot::Creating);
    if let Some(slot) = occupied {
        return slot.try_get_ready_region();
    }

    // The region is now under the `Creating` state; the guard cleans the slot on every
    // early return.
    let region_name = descriptor.name.clone();
    let mut guard = SlotGuard::new(&region_name, &self.regions);

    let metadata: RegionMetadata =
        descriptor
            .try_into()
            .context(error::InvalidRegionDescSnafu {
                region: &region_name,
            })?;

    let store_config = self
        .region_store_config(
            &opts.parent_dir,
            opts.write_buffer_size,
            &region_name,
            &self.config,
            opts.ttl,
            opts.compaction_strategy.clone(),
        )
        .await?;

    let region = RegionImpl::create(metadata, store_config).await?;

    guard.update(RegionSlot::Ready(region.clone()));
    debug!(
        "Storage engine create region {}, id: {}",
        region.name(),
        region.id()
    );

    Ok(region)
}
/// Returns the region `name` if its slot in the region map is ready, otherwise `None`.
fn get_region(&self, name: &str) -> Option<RegionImpl<S>> {
self.regions.get_region(name)
}
/// Removes the slot of the region `name` from the region map, whatever its state.
fn remove_region(&self, name: &str) {
self.regions.remove(name)
}
/// Assembles the `StoreConfig` of the region `region_name` located under `parent_dir`.
///
/// Builds the SST access layer and the manifest of the region and starts the manifest;
/// returns an error if the manifest cannot be started. `region_ttl` and
/// `write_buffer_size` fall back to the engine-wide values when they are `None`.
async fn region_store_config(
    &self,
    parent_dir: &str,
    write_buffer_size: Option<usize>,
    region_name: &str,
    config: &EngineConfig,
    region_ttl: Option<Duration>,
    compaction_strategy: CompactionStrategy,
) -> Result<StoreConfig<S>> {
    let parent_dir = util::normalize_dir(parent_dir);

    let sst_dir = region_sst_dir(&parent_dir, region_name);
    let sst_layer = Arc::new(FsAccessLayer::new(&sst_dir, self.object_store.clone()));

    let manifest_dir = region_manifest_dir(&parent_dir, region_name);
    let compress_type = manifest_compress_type(config.compress_manifest);
    let manifest = RegionManifest::with_checkpointer(
        &manifest_dir,
        self.object_store.clone(),
        compress_type,
        config.manifest_checkpoint_margin,
        config.manifest_gc_duration,
    );
    manifest.start().await?;

    // The region ttl takes precedence; the global ttl applies only when it is `None`.
    let ttl = region_ttl.or(self.config.global_ttl);
    let default_write_buffer_size = self.config.region_write_buffer_size.as_bytes() as usize;
    let write_buffer_size = write_buffer_size.unwrap_or(default_write_buffer_size);

    Ok(StoreConfig {
        log_store: self.log_store.clone(),
        sst_layer,
        manifest,
        memtable_builder: self.memtable_builder.clone(),
        flush_scheduler: self.flush_scheduler.clone(),
        flush_strategy: self.flush_strategy.clone(),
        compaction_scheduler: self.compaction_scheduler.clone(),
        engine_config: self.config.clone(),
        file_purger: self.file_purger.clone(),
        ttl,
        write_buffer_size,
        compaction_strategy,
    })
}
/// Closes every ready region, clears the region map and stops the compaction scheduler,
/// the flush scheduler and the file purger, in that order.
///
/// A region that fails to close is only logged; a scheduler that fails to stop aborts
/// the shutdown with its error.
async fn close(&self) -> Result<()> {
    let close_ctx = CloseContext::default();
    for region in self.regions.list_regions() {
        // Failures while closing a region are tolerated.
        if let Err(e) = region.close(&close_ctx).await {
            logging::error!(e; "Failed to close region {}", region.id());
        }
    }

    // Release the references to the regions held by the region map.
    self.regions.clear();

    self.compaction_scheduler.stop(true).await?;
    self.flush_scheduler.stop().await?;
    self.file_purger.stop(true).await
}
}
#[cfg(test)]
mod tests {
use std::ffi::OsStr;
use std::path::Path;
use common_test_util::temp_dir::{create_temp_dir, TempDir};
use datatypes::type_id::LogicalTypeId;
use datatypes::vectors::{Float32Vector, Int32Vector, TimestampMillisecondVector, VectorRef};
use log_store::raft_engine::log_store::RaftEngineLogStore;
use log_store::test_util::log_store_util;
use object_store::services::Fs;
use store_api::storage::{
ChunkReader, FlushContext, ReadContext, Region, ScanRequest, Snapshot, WriteContext,
WriteRequest,
};
use super::*;
use crate::compaction::noop::NoopCompactionScheduler;
use crate::test_util::descriptor_util::RegionDescBuilder;
/// Engine type used by the tests, backed by the raft-engine log store.
type TestEngine = EngineImpl<RaftEngineLogStore>;
/// Region type produced by [`TestEngine`].
type TestRegion = RegionImpl<RaftEngineLogStore>;
/// Creates an engine storing its data under `tmp_dir` and its log under `log_file_dir`,
/// then creates a region named `region_name` with id `region_id` in it.
///
/// The region has an `Int32` key column `k1`, a nullable `Float32` field column `v1` and
/// a millisecond timestamp column `ts`.
async fn create_engine_and_region(
    tmp_dir: &TempDir,
    log_file_dir: &TempDir,
    region_name: &str,
    region_id: u64,
    config: EngineConfig,
) -> (TestEngine, TestRegion) {
    // Log store.
    let wal_dir = log_file_dir.path().to_str().unwrap();
    let log_store = log_store_util::create_tmp_local_file_log_store(wal_dir).await;

    // File-system object store rooted at the temp dir.
    let store_root = tmp_dir.path().to_string_lossy();
    let mut fs_builder = Fs::default();
    let _ = fs_builder.root(&store_root);
    let object_store = ObjectStore::new(fs_builder).unwrap().finish();

    // Engine with a no-op compaction scheduler.
    let compaction_scheduler = Arc::new(NoopCompactionScheduler::default());
    let engine = EngineImpl::new(
        config,
        Arc::new(log_store),
        object_store,
        compaction_scheduler,
    )
    .unwrap();

    let descriptor = RegionDescBuilder::new(region_name)
        .id(region_id)
        .push_key_column(("k1", LogicalTypeId::Int32, false))
        .push_field_column(("v1", LogicalTypeId::Float32, true))
        .timestamp(("ts", LogicalTypeId::TimestampMillisecond, false))
        .build();
    let engine_ctx = EngineContext::default();
    let create_opts = CreateOptions::default();
    let region = engine
        .create_region(&engine_ctx, descriptor, &create_opts)
        .await
        .unwrap();

    (engine, region)
}
/// Counts the direct entries of `path` whose extension is `parquet`.
///
/// Panics if the directory cannot be read; entries that fail to be read are skipped.
fn parquet_file_num(path: &Path) -> usize {
    let parquet_ext = OsStr::new("parquet");
    let mut count = 0;
    for entry in path.read_dir().unwrap() {
        if let Ok(entry) = entry {
            if entry.path().extension() == Some(parquet_ext) {
                count += 1;
            }
        }
    }
    count
}
/// A created region can be looked up by name, and an unknown name yields `None`.
#[tokio::test]
async fn test_create_new_region() {
    let region_name = "region-0";
    let dir = create_temp_dir("test_create_region");
    let log_file_dir = create_temp_dir("test_engine_wal");

    let (engine, created) = create_engine_and_region(
        &dir,
        &log_file_dir,
        region_name,
        123456,
        EngineConfig::default(),
    )
    .await;
    assert_eq!(region_name, created.name());

    let ctx = EngineContext::default();
    let fetched = engine.get_region(&ctx, region_name).unwrap().unwrap();
    assert_eq!(region_name, fetched.name());

    let missing = engine.get_region(&ctx, "no such region").unwrap();
    assert!(missing.is_none());
}
#[tokio::test]
async fn test_create_region_with_buffer_size() {
let dir = create_temp_dir("test_buffer_size");
let log_file_dir = create_temp_dir("test_buffer_wal");
let region_name = "region-0";
let region_id = 123456;
let mut config = EngineConfig::default();
let expect_buffer_size = config.region_write_buffer_size / 2;
config.region_write_buffer_size = expect_buffer_size;
let (_engine, region) =
create_engine_and_region(&dir, &log_file_dir, region_name, region_id, config).await;
assert_eq!(
expect_buffer_size.as_bytes() as usize,
region.write_buffer_size().await
);
}
/// Writes and flushes a region, closes it, reopens it and drops it, then checks that the
/// region, its manifest directory and its parquet files are gone.
#[tokio::test]
async fn test_drop_region() {
    common_telemetry::init_default_ut_logging();

    let region_name = "test_region";
    let dir = create_temp_dir("test_drop_region");
    let log_file_dir = create_temp_dir("test_engine_wal");
    let (engine, region) = create_engine_and_region(
        &dir,
        &log_file_dir,
        region_name,
        123456,
        EngineConfig::default(),
    )
    .await;
    assert_eq!(region_name, region.name());

    // Write three rows.
    let keys = Arc::new(Int32Vector::from_slice([1, 2, 3])) as VectorRef;
    let values = Arc::new(Float32Vector::from_slice([0.1, 0.2, 0.3])) as VectorRef;
    let timestamps = Arc::new(TimestampMillisecondVector::from_slice([0, 0, 0])) as VectorRef;
    let mut request = region.write_request();
    request
        .put(HashMap::from([
            ("k1".to_string(), keys),
            ("v1".to_string(), values),
            ("ts".to_string(), timestamps),
        ]))
        .unwrap();
    let _ = region.write(&WriteContext::default(), request).await.unwrap();

    // Flush memtable to sst.
    region.flush(&FlushContext::default()).await.unwrap();

    let ctx = EngineContext::default();
    let close_opts = CloseOptions::default();
    engine
        .close_region(&ctx, region.name(), &close_opts)
        .await
        .unwrap();

    let region_dir = dir.path().join(region_name);
    assert_eq!(1, parquet_file_num(&region_dir));

    {
        let reopened = engine
            .open_region(&ctx, region_name, &OpenOptions::default())
            .await
            .unwrap()
            .unwrap();
        engine.drop_region(&ctx, reopened).await.unwrap();

        assert!(engine.get_region(&ctx, region_name).unwrap().is_none());

        let manifest_dir = region_dir.join("manifest");
        let manifest_exists = engine
            .inner
            .object_store
            .is_exist(manifest_dir.to_str().unwrap())
            .await
            .unwrap();
        assert!(!manifest_exists);
    }

    // Wait for gc
    tokio::time::sleep(Duration::from_millis(60)).await;
    assert_eq!(0, parquet_file_num(&region_dir));
}
/// Truncating a region keeps it registered in the engine but leaves it without data.
#[tokio::test]
async fn test_truncate_region() {
    common_telemetry::init_default_ut_logging();

    let region_name = "test_region";
    let dir = create_temp_dir("test_truncate_region");
    let log_file_dir = create_temp_dir("test_engine_wal");
    let (engine, region) = create_engine_and_region(
        &dir,
        &log_file_dir,
        region_name,
        123456,
        EngineConfig::default(),
    )
    .await;
    assert_eq!(region_name, region.name());

    // Insert three rows.
    let keys = Arc::new(Int32Vector::from_slice([1, 2, 3])) as VectorRef;
    let values = Arc::new(Float32Vector::from_slice([0.1, 0.2, 0.3])) as VectorRef;
    let timestamps = Arc::new(TimestampMillisecondVector::from_slice([0, 0, 0])) as VectorRef;
    let mut request = region.write_request();
    request
        .put(HashMap::from([
            ("k1".to_string(), keys),
            ("v1".to_string(), values),
            ("ts".to_string(), timestamps),
        ]))
        .unwrap();
    region
        .write(&WriteContext::default(), request)
        .await
        .unwrap();

    let ctx = EngineContext::default();

    // Truncate the region; it must still be known to the engine afterwards.
    region.truncate().await.unwrap();
    assert!(engine.get_region(&ctx, region.name()).unwrap().is_some());

    // A scan over the truncated region yields no chunk.
    let read_ctx = ReadContext::default();
    let snapshot = region.snapshot(&read_ctx).unwrap();
    let scan_response = snapshot
        .scan(&read_ctx, ScanRequest::default())
        .await
        .unwrap();
    let mut reader = scan_response.reader;
    let first_chunk = reader.next_chunk().await.unwrap();
    assert!(first_chunk.is_none());
}
}

View File

@@ -1,635 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::any::Any;
use std::io::Error as IoError;
use std::str::Utf8Error;
use common_datasource::compression::CompressionType;
use common_error::ext::{BoxedError, ErrorExt};
use common_error::status_code::StatusCode;
use common_macro::stack_trace_debug;
use common_runtime::error::Error as RuntimeError;
use datatypes::arrow::error::ArrowError;
use datatypes::prelude::ConcreteDataType;
use object_store::ErrorKind;
use serde_json::error::Error as JsonError;
use snafu::{Location, Snafu};
use store_api::manifest::action::ProtocolVersion;
use store_api::manifest::ManifestVersion;
use store_api::storage::{RegionId, SequenceNumber};
use tokio::task::JoinError;
use crate::metadata::Error as MetadataError;
use crate::write_batch;
#[derive(Snafu)]
#[snafu(visibility(pub))]
#[stack_trace_debug]
pub enum Error {
#[snafu(display("Invalid region descriptor, region: {}", region))]
InvalidRegionDesc {
region: String,
location: Location,
source: MetadataError,
},
#[snafu(display("Missing column {} in write batch", column))]
BatchMissingColumn { column: String, location: Location },
#[snafu(display("Failed to write parquet file"))]
WriteParquet {
#[snafu(source)]
error: parquet::errors::ParquetError,
location: Location,
},
#[snafu(display("Failed to write to buffer"))]
WriteBuffer {
location: Location,
source: common_datasource::error::Error,
},
#[snafu(display("Failed to create RecordBatch from vectors"))]
NewRecordBatch {
location: Location,
#[snafu(source)]
error: ArrowError,
},
#[snafu(display("Fail to read object from path: {}", path))]
ReadObject {
path: String,
location: Location,
#[snafu(source)]
error: object_store::Error,
},
#[snafu(display("Fail to write object into path: {}", path))]
WriteObject {
path: String,
location: Location,
#[snafu(source)]
error: object_store::Error,
},
#[snafu(display("Fail to delete object from path: {}", path))]
DeleteObject {
path: String,
location: Location,
#[snafu(source)]
error: object_store::Error,
},
#[snafu(display("Fail to compress object by {}, path: {}", compress_type, path))]
CompressObject {
compress_type: CompressionType,
path: String,
#[snafu(source)]
error: std::io::Error,
},
#[snafu(display("Fail to decompress object by {}, path: {}", compress_type, path))]
DecompressObject {
compress_type: CompressionType,
path: String,
#[snafu(source)]
error: std::io::Error,
},
#[snafu(display("Fail to list objects in path: {}", path))]
ListObjects {
path: String,
location: Location,
#[snafu(source)]
error: object_store::Error,
},
#[snafu(display("Fail to create str from bytes"))]
Utf8 {
location: Location,
#[snafu(source)]
error: Utf8Error,
},
#[snafu(display("Fail to encode object into json "))]
EncodeJson {
location: Location,
#[snafu(source)]
error: JsonError,
},
#[snafu(display("Fail to decode object from json "))]
DecodeJson {
location: Location,
#[snafu(source)]
error: JsonError,
},
#[snafu(display("Invalid scan index, start: {}, end: {}", start, end))]
InvalidScanIndex {
start: ManifestVersion,
end: ManifestVersion,
location: Location,
},
#[snafu(display("Failed to write WAL, WAL region_id: {}", region_id))]
WriteWal {
region_id: RegionId,
location: Location,
source: BoxedError,
},
#[snafu(display("Failed to encode WAL header"))]
EncodeWalHeader {
location: Location,
#[snafu(source)]
error: std::io::Error,
},
#[snafu(display("Failed to decode WAL header"))]
DecodeWalHeader {
location: Location,
#[snafu(source)]
error: std::io::Error,
},
#[snafu(display("Failed to wait flushing, region_id: {}", region_id))]
WaitFlush {
region_id: RegionId,
#[snafu(source)]
error: tokio::sync::oneshot::error::RecvError,
location: Location,
},
#[snafu(display(
"Manifest protocol forbid to read, min_version: {}, supported_version: {}",
min_version,
supported_version
))]
ManifestProtocolForbidRead {
min_version: ProtocolVersion,
supported_version: ProtocolVersion,
location: Location,
},
#[snafu(display(
"Manifest protocol forbid to write, min_version: {}, supported_version: {}",
min_version,
supported_version
))]
ManifestProtocolForbidWrite {
min_version: ProtocolVersion,
supported_version: ProtocolVersion,
location: Location,
},
#[snafu(display("Failed to decode action list, {}", msg))]
DecodeMetaActionList { msg: String, location: Location },
#[snafu(display("Failed to read line, err"))]
Readline {
#[snafu(source)]
error: IoError,
},
#[snafu(display("Failed to read Parquet file: {}", file))]
ReadParquet {
file: String,
#[snafu(source)]
error: parquet::errors::ParquetError,
location: Location,
},
#[snafu(display("Region is under {} state, cannot proceed operation", state))]
InvalidRegionState {
state: &'static str,
location: Location,
},
#[snafu(display("Failed to read WAL, region_id: {}", region_id))]
ReadWal {
region_id: RegionId,
location: Location,
source: BoxedError,
},
#[snafu(display("Failed to mark WAL as obsolete, region id: {}", region_id))]
MarkWalObsolete {
region_id: u64,
location: Location,
source: BoxedError,
},
#[snafu(display("WAL data corrupted, region_id: {}, message: {}", region_id, message))]
WalDataCorrupted {
region_id: RegionId,
message: String,
location: Location,
},
#[snafu(display("Failed to delete WAL namespace, region id: {}", region_id))]
DeleteWalNamespace {
region_id: RegionId,
location: Location,
source: BoxedError,
},
#[snafu(display(
"Sequence of region should increase monotonically (should be {} < {})",
prev,
given
))]
SequenceNotMonotonic {
prev: SequenceNumber,
given: SequenceNumber,
location: Location,
},
#[snafu(display("Failed to convert store schema, file: {}", file))]
ConvertStoreSchema {
file: String,
location: Location,
source: MetadataError,
},
#[snafu(display("Invalid raw region metadata, region: {}", region))]
InvalidRawRegion {
region: String,
location: Location,
source: MetadataError,
},
#[snafu(display("Try to write the closed region"))]
ClosedRegion { location: Location },
#[snafu(display("Invalid projection"))]
InvalidProjection {
location: Location,
source: MetadataError,
},
#[snafu(display("Failed to push data to batch builder"))]
PushBatch {
location: Location,
source: datatypes::error::Error,
},
#[snafu(display("Failed to build batch, {}", msg))]
BuildBatch { msg: String, location: Location },
#[snafu(display("Failed to filter column {}", name))]
FilterColumn {
name: String,
location: Location,
source: datatypes::error::Error,
},
#[snafu(display("Invalid alter request"))]
InvalidAlterRequest {
location: Location,
source: MetadataError,
},
#[snafu(display("Failed to alter metadata"))]
AlterMetadata {
location: Location,
source: MetadataError,
},
#[snafu(display("Failed to create default value for column {}", name))]
CreateDefault {
name: String,
location: Location,
source: datatypes::error::Error,
},
#[snafu(display(
"Not allowed to write data with version {} to schema with version {}",
data_version,
schema_version
))]
WriteToOldVersion {
/// Schema version of data to write.
data_version: u32,
schema_version: u32,
location: Location,
},
#[snafu(display("Column {} not in schema with version {}", column, version))]
NotInSchemaToCompat {
column: String,
version: u32,
location: Location,
},
#[snafu(display("Incompatible schema to read, reason: {}", reason))]
CompatRead { reason: String, location: Location },
#[snafu(display("Failed to read column {}, could not create default value", column))]
CreateDefaultToRead {
column: String,
location: Location,
source: datatypes::error::Error,
},
#[snafu(display("Failed to read column {}, no proper default value for it", column))]
NoDefaultToRead { column: String, location: Location },
#[snafu(display("Failed to convert arrow chunk to batch, name: {}", name))]
ConvertChunk {
name: String,
location: Location,
source: datatypes::error::Error,
},
#[snafu(display("Unknown column {}", name))]
UnknownColumn { name: String, location: Location },
#[snafu(display("Failed to create record batch for write batch"))]
CreateRecordBatch {
location: Location,
source: common_recordbatch::error::Error,
},
#[snafu(display(
"Request is too large, max is {}, current is {}",
write_batch::MAX_BATCH_SIZE,
num_rows
))]
RequestTooLarge { num_rows: usize, location: Location },
#[snafu(display(
"Type of column {} does not match type in schema, expect {:?}, given {:?}",
name,
expect,
given
))]
TypeMismatch {
name: String,
expect: ConcreteDataType,
given: ConcreteDataType,
location: Location,
},
#[snafu(display("Column {} is not null but input has null", name))]
HasNull { name: String, location: Location },
#[snafu(display(
"Length of column {} not equals to other columns, expect {}, given {}",
name,
expect,
given
))]
UnequalLengths {
name: String,
expect: usize,
given: usize,
location: Location,
},
#[snafu(display("Failed to decode write batch, corrupted data {}", message))]
BatchCorrupted { message: String, location: Location },
#[snafu(display("Failed to decode arrow data"))]
DecodeArrow {
location: Location,
#[snafu(source)]
error: ArrowError,
},
#[snafu(display("Failed to encode arrow data"))]
EncodeArrow {
location: Location,
#[snafu(source)]
error: ArrowError,
},
#[snafu(display("Failed to parse schema"))]
ParseSchema {
location: Location,
source: datatypes::error::Error,
},
#[snafu(display("More columns than expected in the request"))]
MoreColumnThanExpected { location: Location },
#[snafu(display("Failed to decode parquet file time range, msg: {}", msg))]
DecodeParquetTimeRange { msg: String, location: Location },
#[snafu(display("Scheduler rate limited, msg: {}", msg))]
RateLimited { msg: String },
#[snafu(display("Cannot schedule request, scheduler's already stopped"))]
IllegalSchedulerState { location: Location },
#[snafu(display("Failed to start manifest gc task"))]
StartManifestGcTask {
location: Location,
source: RuntimeError,
},
#[snafu(display("Failed to stop manifest gc task"))]
StopManifestGcTask {
location: Location,
source: RuntimeError,
},
#[snafu(display("Failed to stop scheduler"))]
StopScheduler {
#[snafu(source)]
error: JoinError,
location: Location,
},
#[snafu(display("Failed to delete SST file"))]
DeleteSst {
#[snafu(source)]
error: object_store::Error,
location: Location,
},
#[snafu(display("Failed to calculate SST expire time"))]
TtlCalculation {
location: Location,
source: common_time::error::Error,
},
#[snafu(display("Failed to create a checkpoint: {}", msg))]
ManifestCheckpoint { msg: String, location: Location },
#[snafu(display("The compaction task is cancelled, region_id: {}", region_id))]
CompactTaskCancel {
region_id: RegionId,
#[snafu(source)]
error: tokio::sync::oneshot::error::RecvError,
},
#[snafu(display(
"The flush request is duplicate, region_id: {}, sequence: {}",
region_id,
sequence
))]
DuplicateFlush {
region_id: RegionId,
sequence: SequenceNumber,
location: Location,
},
#[snafu(display("Failed to start picking task for flush"))]
StartPickTask {
location: Location,
source: RuntimeError,
},
#[snafu(display("Failed to stop picking task for flush"))]
StopPickTask {
location: Location,
source: RuntimeError,
},
#[snafu(display("Failed to convert columns to rows"))]
ConvertColumnsToRows {
#[snafu(source)]
error: ArrowError,
location: Location,
},
#[snafu(display("Failed to sort arrays"))]
SortArrays {
#[snafu(source)]
error: ArrowError,
location: Location,
},
#[snafu(display("Failed to build scan predicate"))]
BuildPredicate {
source: table::error::Error,
location: Location,
},
#[snafu(display("Failed to join spawned tasks"))]
JoinError {
#[snafu(source)]
error: JoinError,
location: Location,
},
}
/// Shorthand for a [`std::result::Result`] whose error type is this module's [`Error`].
pub type Result<T> = std::result::Result<T, Error>;
impl Error {
/// Returns true if the error is the object path to delete
/// doesn't exist.
pub(crate) fn is_object_to_delete_not_found(&self) -> bool {
if let Error::DeleteObject { error, .. } = self {
error.kind() == ErrorKind::NotFound
} else {
false
}
}
}
impl ErrorExt for Error {
    /// Maps every error variant to the status code reported to callers.
    ///
    /// Arms are grouped by the resulting status code; variants that wrap
    /// another error exposing a status code delegate to that source.
    fn status_code(&self) -> StatusCode {
        use Error::*;

        match self {
            // Errors that delegate to the status code of their source.
            InvalidAlterRequest { source, .. } | InvalidRegionDesc { source, .. } => {
                source.status_code()
            }
            PushBatch { source, .. }
            | CreateDefault { source, .. }
            | ConvertChunk { source, .. } => source.status_code(),
            MarkWalObsolete { source, .. } | DeleteWalNamespace { source, .. } => {
                source.status_code()
            }
            WriteBuffer { source, .. } => source.status_code(),
            TtlCalculation { source, .. } => source.status_code(),
            BuildPredicate { source, .. } => source.status_code(),

            // The request itself is malformed.
            InvalidScanIndex { .. }
            | BatchMissingColumn { .. }
            | InvalidProjection { .. }
            | BuildBatch { .. }
            | NotInSchemaToCompat { .. }
            | WriteToOldVersion { .. }
            | CreateRecordBatch { .. }
            | RequestTooLarge { .. }
            | TypeMismatch { .. }
            | HasNull { .. }
            | UnequalLengths { .. }
            | MoreColumnThanExpected { .. } => StatusCode::InvalidArguments,

            UnknownColumn { .. } => StatusCode::TableColumnNotFound,

            // Failures of the underlying storage (objects, WAL, manifest, SST).
            WriteParquet { .. }
            | ReadObject { .. }
            | WriteObject { .. }
            | ListObjects { .. }
            | DeleteObject { .. }
            | WriteWal { .. }
            | DecodeWalHeader { .. }
            | EncodeWalHeader { .. }
            | ManifestProtocolForbidRead { .. }
            | ManifestProtocolForbidWrite { .. }
            | ReadParquet { .. }
            | InvalidRegionState { .. }
            | ReadWal { .. }
            | DeleteSst { .. } => StatusCode::StorageUnavailable,

            RateLimited { .. } | StopScheduler { .. } | CompactTaskCancel { .. } => {
                StatusCode::Internal
            }

            // Everything else is reported as unexpected.
            Utf8 { .. }
            | EncodeJson { .. }
            | DecodeJson { .. }
            | WaitFlush { .. }
            | DecodeMetaActionList { .. }
            | Readline { .. }
            | WalDataCorrupted { .. }
            | SequenceNotMonotonic { .. }
            | ConvertStoreSchema { .. }
            | InvalidRawRegion { .. }
            | ClosedRegion { .. }
            | FilterColumn { .. }
            | AlterMetadata { .. }
            | CompatRead { .. }
            | CreateDefaultToRead { .. }
            | NoDefaultToRead { .. }
            | NewRecordBatch { .. }
            | BatchCorrupted { .. }
            | DecodeArrow { .. }
            | EncodeArrow { .. }
            | ManifestCheckpoint { .. }
            | CompressObject { .. }
            | DecompressObject { .. }
            | ParseSchema { .. }
            | DecodeParquetTimeRange { .. }
            | StartManifestGcTask { .. }
            | StopManifestGcTask { .. }
            | IllegalSchedulerState { .. }
            | DuplicateFlush { .. }
            | StartPickTask { .. }
            | StopPickTask { .. }
            | ConvertColumnsToRows { .. }
            | SortArrays { .. }
            | JoinError { .. } => StatusCode::Unexpected,
        }
    }

    /// Exposes the error as [`Any`] so callers can downcast it.
    fn as_any(&self) -> &dyn Any {
        self
    }
}

View File

@@ -1,235 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::sync::Arc;
use common_telemetry::{debug, error};
use store_api::storage::RegionId;
use tokio::sync::Notify;
use crate::error::Result;
use crate::scheduler::rate_limit::{BoxedRateLimitToken, RateLimitToken};
use crate::scheduler::{Handler, LocalScheduler, Request};
use crate::sst::{AccessLayerRef, FileId};
/// Request asking the file purger to delete one SST file.
pub struct FilePurgeRequest {
/// Id of the region, used in the request key and in log messages.
pub region_id: RegionId,
/// Id of the SST file to delete.
pub file_id: FileId,
/// Access layer through which the SST file is deleted.
pub sst_layer: AccessLayerRef,
}
impl Request for FilePurgeRequest {
    type Key = String;

    /// Builds the request key in the form `<region_id>/<file_id>`.
    fn key(&self) -> Self::Key {
        let Self {
            region_id, file_id, ..
        } = self;
        format!("{region_id}/{file_id}")
    }

    /// Purge requests have nobody to report to, so the result is discarded.
    fn complete(self, _result: Result<()>) {}
}
/// Handler that serves [`FilePurgeRequest`]s by deleting the requested SST file.
pub struct FilePurgeHandler;
#[async_trait::async_trait]
impl Handler for FilePurgeHandler {
    type Request = FilePurgeRequest;

    /// Deletes the SST file named by `req` through its access layer.
    ///
    /// The rate-limit `token` is released and `finish_notifier` is notified
    /// whether or not the deletion succeeds, so a failed purge neither keeps
    /// holding its token nor leaves the notifier waiting.
    ///
    /// # Errors
    ///
    /// Returns the error reported by `delete_sst` (after logging it).
    async fn handle_request(
        &self,
        req: Self::Request,
        token: BoxedRateLimitToken,
        finish_notifier: Arc<Notify>,
    ) -> Result<()> {
        let outcome = req.sst_layer.delete_sst(req.file_id).await.map_err(|e| {
            error!(e; "Failed to delete SST file, file: {}, region: {}",
                req.file_id.as_parquet(), req.region_id);
            e
        });

        // The task is finished at this point regardless of its outcome:
        // hand the token back and signal completion before propagating errors.
        token.try_release();
        finish_notifier.notify_one();

        outcome?;
        debug!(
            "Successfully deleted SST file: {}, region: {}",
            req.file_id.as_parquet(),
            req.region_id
        );
        Ok(())
    }
}
/// Shared handle to a [`LocalScheduler`] that accepts [`FilePurgeRequest`]s.
pub type FilePurgerRef = Arc<LocalScheduler<FilePurgeRequest>>;
#[cfg(test)]
pub mod noop {
    use std::sync::Arc;

    use tokio::sync::Notify;

    use crate::error::Result;
    use crate::file_purger::{FilePurgeRequest, FilePurgerRef};
    use crate::scheduler::rate_limit::{BoxedRateLimitToken, RateLimitToken};
    use crate::scheduler::{Handler, LocalScheduler, SchedulerConfig};

    /// Creates a file purger backed by [`NoopFilePurgeHandler`], i.e. one
    /// that accepts purge requests but never deletes anything.
    pub fn new_noop_file_purger() -> FilePurgerRef {
        let scheduler = LocalScheduler::new(SchedulerConfig::default(), NoopFilePurgeHandler);
        Arc::new(scheduler)
    }

    /// Purge handler for tests: it only releases the token and notifies.
    #[derive(Debug)]
    pub struct NoopFilePurgeHandler;

    #[async_trait::async_trait]
    impl Handler for NoopFilePurgeHandler {
        type Request = FilePurgeRequest;

        /// Ignores the request, releases the token and signals completion.
        async fn handle_request(
            &self,
            _req: Self::Request,
            token: BoxedRateLimitToken,
            finish_notifier: Arc<Notify>,
        ) -> Result<()> {
            token.try_release();
            finish_notifier.notify_one();
            Ok(())
        }
    }
}
#[cfg(test)]
mod tests {
    use api::v1::OpType;
    use common_test_util::temp_dir::create_temp_dir;
    use object_store::services::Fs;
    use object_store::ObjectStore;

    use super::*;
    use crate::file_purger::noop::NoopFilePurgeHandler;
    use crate::memtable::tests::{schema_for_test, write_kvs};
    use crate::memtable::{DefaultMemtableBuilder, IterContext, MemtableBuilder};
    use crate::scheduler::{Scheduler, SchedulerConfig};
    use crate::sst::{AccessLayer, FileHandle, FileMeta, FsAccessLayer, Source, WriteOptions};

    /// Token whose release is a no-op.
    struct MockRateLimitToken;

    impl RateLimitToken for MockRateLimitToken {
        fn try_release(&self) {}
    }

    /// Builds a filesystem-backed object store rooted at `root`.
    fn new_fs_object_store(root: &str) -> ObjectStore {
        let mut builder = Fs::default();
        let _ = builder.root(root);
        ObjectStore::new(builder).unwrap().finish()
    }

    /// Checks whether the parquet file of `file_id` exists under `dir`.
    async fn sst_exists(store: &ObjectStore, dir: &str, file_id: FileId) -> bool {
        let file_path = format!("{}/{}", dir, file_id.as_parquet());
        store.is_exist(&file_path).await.unwrap()
    }

    /// Writes a small memtable into an SST file and returns its handle, the
    /// directory it was written to and the access layer used.
    async fn create_sst_file(
        os: ObjectStore,
        sst_file_id: FileId,
        file_purger: FilePurgerRef,
    ) -> (FileHandle, String, AccessLayerRef) {
        let memtable = DefaultMemtableBuilder::default().build(schema_for_test().clone());
        write_kvs(
            &*memtable,
            10,
            OpType::Put,
            &[1, 2],
            &[(Some(1), Some(1)), (Some(2), Some(2))],
        );
        let rows = memtable.iter(IterContext::default()).unwrap();

        let sst_path = "table1";
        let layer = Arc::new(FsAccessLayer::new(sst_path, os.clone()));
        let written = layer
            .write_sst(sst_file_id, Source::Iter(rows), &WriteOptions::default())
            .await
            .unwrap()
            .unwrap();

        let meta = FileMeta {
            region_id: 0.into(),
            file_id: sst_file_id,
            time_range: None,
            level: 0,
            file_size: written.file_size,
        };
        let handle = FileHandle::new(meta, layer.clone(), file_purger);
        (handle, sst_path.to_string(), layer as _)
    }

    #[tokio::test]
    async fn test_file_purger_handler() {
        let dir = create_temp_dir("file-purge");
        let object_store = new_fs_object_store(dir.path().to_str().unwrap());
        let sst_file_id = FileId::random();

        let noop_scheduler = LocalScheduler::new(SchedulerConfig::default(), NoopFilePurgeHandler);
        let noop_file_purger = Arc::new(noop_scheduler);
        let (_file, path, layer) =
            create_sst_file(object_store.clone(), sst_file_id, noop_file_purger).await;

        let request = FilePurgeRequest {
            region_id: 0.into(),
            file_id: sst_file_id,
            sst_layer: layer,
        };
        let notify = Arc::new(Notify::new());
        FilePurgeHandler
            .handle_request(request, Box::new(MockRateLimitToken {}), notify.clone())
            .await
            .unwrap();
        notify.notified().await;

        let exists = sst_exists(&object_store, &path, sst_file_id).await;
        assert!(!exists);
    }

    #[tokio::test]
    async fn test_file_purge_loop() {
        common_telemetry::init_default_ut_logging();
        let dir = create_temp_dir("file-purge");
        let object_store = new_fs_object_store(dir.path().to_str().unwrap());
        let sst_file_id = FileId::random();

        let scheduler = Arc::new(LocalScheduler::new(
            SchedulerConfig::default(),
            FilePurgeHandler,
        ));
        let (handle, path, _layer) =
            create_sst_file(object_store.clone(), sst_file_id, scheduler.clone()).await;

        // Mark the file as deleted and drop the handle; the file is then
        // expected to be removed by the purger.
        handle.mark_deleted();
        drop(handle);

        scheduler.stop(true).await.unwrap();
        assert!(!sst_exists(&object_store, &path, sst_file_id).await);
    }
}

View File

@@ -1,495 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
mod picker;
mod scheduler;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
use common_base::readable_size::ReadableSize;
use common_telemetry::logging;
pub use picker::{FlushPicker, PickerConfig};
pub use scheduler::{
FlushHandle, FlushRegionRequest, FlushRequest, FlushScheduler, FlushSchedulerRef,
};
use store_api::logstore::LogStore;
use store_api::storage::consts::WRITE_ROW_GROUP_SIZE;
use store_api::storage::{RegionId, SequenceNumber};
use crate::config::EngineConfig;
use crate::error::Result;
use crate::manifest::action::*;
use crate::manifest::region::RegionManifest;
use crate::memtable::{IterContext, MemtableId, MemtableRef};
use crate::metrics::{FLUSH_BYTES_TOTAL, FLUSH_ELAPSED};
use crate::region::{RegionWriterRef, SharedDataRef};
use crate::sst::{AccessLayerRef, FileId, FileMeta, Source, SstInfo, WriteOptions};
use crate::wal::Wal;
/// Current flush-related status of a region.
///
/// Passed by value to [`FlushStrategy::should_flush`] to decide whether a
/// flush is needed.
#[derive(Debug, Clone, Copy)]
pub struct RegionStatus {
/// Id of the region this status belongs to.
pub region_id: RegionId,
/// Size of the mutable memtable, in bytes.
pub bytes_mutable: usize,
/// Write buffer size of the region; `bytes_mutable` is compared against it.
pub write_buffer_size: usize,
}
/// Type of flush request to send, as decided by a [`FlushStrategy`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FlushType {
/// Flush current region.
Region,
/// Engine level flush. Find regions to flush globally.
Engine,
}
/// Strategy to control whether to flush a region before writing to the region.
///
/// Besides the flush decision, the strategy is told about memory being
/// reserved, scheduled to be freed, and actually freed.
pub trait FlushStrategy: Send + Sync + std::fmt::Debug {
/// Returns whether to trigger a flush operation, and of which type.
///
/// `None` means no flush is needed for the given `status`.
fn should_flush(&self, status: RegionStatus) -> Option<FlushType>;
/// Reserves `mem` bytes.
fn reserve_mem(&self, mem: usize);
/// Tells the strategy we are freeing `mem` bytes.
///
/// We are in the process of freeing `mem` bytes, so it is not considered
/// when checking the soft limit.
fn schedule_free_mem(&self, mem: usize);
/// We have freed `mem` bytes.
fn free_mem(&self, mem: usize);
}
/// Shared, dynamically dispatched [`FlushStrategy`].
pub type FlushStrategyRef = Arc<dyn FlushStrategy>;
/// Flush strategy based on memory usage.
///
/// The memory counters are only updated when `global_write_buffer_size`
/// is `Some`.
#[derive(Debug)]
pub struct SizeBasedStrategy {
/// Write buffer size for all memtables.
global_write_buffer_size: Option<usize>,
/// Mutable memtable memory size limitation, only valid when `global_write_buffer_size`
/// is `Some`.
mutable_limitation: usize,
/// Memory in use (e.g. used by mutable and immutable memtables).
memory_used: AtomicUsize,
/// Memory that hasn't been scheduled to free (e.g. used by mutable memtables).
memory_active: AtomicUsize,
}
impl SizeBasedStrategy {
    /// Creates a [SizeBasedStrategy] limited by `global_write_buffer_size`;
    /// `None` disables the global limit.
    pub fn new(global_write_buffer_size: Option<usize>) -> Self {
        let mutable_limitation = get_mutable_limitation(global_write_buffer_size);
        Self {
            global_write_buffer_size,
            mutable_limitation,
            memory_used: AtomicUsize::new(0),
            memory_active: AtomicUsize::new(0),
        }
    }

    /// Decides whether an engine level flush should be triggered.
    ///
    /// Inspired by RocksDB's WriteBufferManager.
    /// <https://github.com/facebook/rocksdb/blob/main/include/rocksdb/write_buffer_manager.h#L94>
    fn should_flush_engine(&self) -> bool {
        // Without a global limit there is nothing to check.
        let global_limit = match self.global_write_buffer_size {
            Some(limit) => limit,
            None => return false,
        };

        let active = self.memory_active.load(Ordering::Relaxed);
        if active > self.mutable_limitation {
            logging::info!(
                "Engine should flush (over mutable limit), mutable_usage: {}, mutable_limitation: {}.",
                active,
                self.mutable_limitation,
            );
            return true;
        }

        let used = self.memory_used.load(Ordering::Relaxed);
        // Exceeding the buffer size asks for a more aggressive flush, unless
        // more than half of the memory is already being flushed: then another
        // flush may not help, so we hold off.
        let over_total_limit = used >= global_limit && active >= global_limit / 2;
        if over_total_limit {
            logging::info!(
                "Engine should flush (over total limit), memory_usage: {}, global_write_buffer_size: {}, \
                 mutable_usage: {}.",
                used,
                global_limit,
                active,
            );
        }
        over_total_limit
    }

    /// Whether a global memory limitation is configured.
    #[inline]
    fn is_global_limit_enabled(&self) -> bool {
        self.global_write_buffer_size.is_some()
    }
}
/// Returns the memory limit for mutable memtables: 7/8 of the global write
/// buffer size (rounded down), or `0` when no global size is configured.
///
/// Inspired by RocksDB.
/// <https://github.com/facebook/rocksdb/blob/main/include/rocksdb/write_buffer_manager.h#L86>
#[inline]
fn get_mutable_limitation(global_write_buffer_size: Option<usize>) -> usize {
    // `size * 7 / 8` overflows for sizes above `usize::MAX / 7`. Splitting
    // `size` into `8 * q + r` gives the same floor value, `7 * q + (7 * r) / 8`,
    // without any intermediate result larger than `size`.
    global_write_buffer_size.map_or(0, |size| size / 8 * 7 + size % 8 * 7 / 8)
}
impl Default for SizeBasedStrategy {
fn default() -> Self {
Self {
global_write_buffer_size: None,
mutable_limitation: 0,
memory_used: AtomicUsize::new(0),
memory_active: AtomicUsize::new(0),
}
}
}
impl FlushStrategy for SizeBasedStrategy {
fn should_flush(&self, status: RegionStatus) -> Option<FlushType> {
if status.bytes_mutable >= status.write_buffer_size {
// If the mutable memtable is full, we should freeze it and flush it.
logging::debug!(
"Region should flush as mutable memtable is full, region: {}, bytes_mutable: {}, \
write_buffer_size: {}.",
status.region_id,
status.bytes_mutable,
status.write_buffer_size,
);
return Some(FlushType::Region);
}
if self.should_flush_engine() {
return Some(FlushType::Engine);
}
None
}
fn reserve_mem(&self, mem: usize) {
if self.is_global_limit_enabled() {
let _ = self.memory_used.fetch_add(mem, Ordering::Relaxed);
let _ = self.memory_active.fetch_add(mem, Ordering::Relaxed);
}
}
fn schedule_free_mem(&self, mem: usize) {
if self.is_global_limit_enabled() {
let _ = self.memory_active.fetch_sub(mem, Ordering::Relaxed);
}
}
fn free_mem(&self, mem: usize) {
if self.is_global_limit_enabled() {
let _ = self.memory_used.fetch_sub(mem, Ordering::Relaxed);
}
}
}
/// A job that writes a set of memtables of one region to SST files and then
/// records the new files through the region writer, manifest and WAL.
pub struct FlushJob<S: LogStore> {
/// Max memtable id in these memtables,
/// used to remove immutable memtables in current version.
pub max_memtable_id: MemtableId,
/// Memtables to be flushed.
pub memtables: Vec<MemtableRef>,
/// Last sequence of data to be flushed.
pub flush_sequence: SequenceNumber,
/// Shared data of region to be flushed.
pub shared: SharedDataRef,
/// Sst access layer of the region.
pub sst_layer: AccessLayerRef,
/// Region writer, used to persist log entry that points to the latest manifest file.
pub writer: RegionWriterRef<S>,
/// Region write-ahead logging, used to write data/meta to the log file.
pub wal: Wal<S>,
/// Region manifest service, used to persist metadata.
pub manifest: RegionManifest,
/// Storage engine config
pub engine_config: Arc<EngineConfig>,
}
impl<S: LogStore> FlushJob<S> {
/// Execute the flush job.
///
/// Writes the memtables to SST files first; the manifest/WAL update only
/// happens when at least one file was produced.
async fn run(&mut self) -> Result<()> {
// Kept alive for the whole function; presumably records the elapsed time
// into `FLUSH_ELAPSED` when dropped -- TODO confirm against the metrics type.
let _timer = FLUSH_ELAPSED.start_timer();
let file_metas = self.write_memtables_to_layer().await?;
if file_metas.is_empty() {
// skip writing manifest and wal if no files are flushed.
return Ok(());
}
self.write_manifest_and_apply(&file_metas).await?;
Ok(())
}
/// Writes every non-empty memtable to its own SST file and returns the
/// metadata of the files that were written.
///
/// One write future is built per memtable and all of them are awaited
/// together; the first error aborts the whole call.
async fn write_memtables_to_layer(&mut self) -> Result<Vec<FileMeta>> {
let region_id = self.shared.id();
let mut futures = Vec::with_capacity(self.memtables.len());
let iter_ctx = IterContext {
// TODO(ruihang): dynamic row group size based on content (#412)
batch_size: WRITE_ROW_GROUP_SIZE,
// All sequences are visible by default.
..Default::default()
};
for m in &self.memtables {
// skip empty memtable
if m.num_rows() == 0 {
continue;
}
let file_id = FileId::random();
// TODO(hl): Check if random file name already exists in meta.
let iter = m.iter(iter_ctx.clone())?;
let sst_layer = self.sst_layer.clone();
let write_options = WriteOptions {
sst_write_buffer_size: ReadableSize::mb(8), // deprecated usage
};
futures.push(async move {
// `write_sst` yields an optional `SstInfo`; when present it is turned
// into a level-0 `FileMeta` for this region.
Ok(sst_layer
.write_sst(file_id, Source::Iter(iter), &write_options)
.await?
.map(
|SstInfo {
time_range,
file_size,
..
}| FileMeta {
region_id,
file_id,
time_range,
level: 0,
file_size,
},
))
});
}
// `flatten` drops the writes that produced no `SstInfo`.
let metas: Vec<_> = futures_util::future::try_join_all(futures)
.await?
.into_iter()
.flatten()
.collect();
// Account the total size of the written files in the flush metric.
let flush_bytes = metas.iter().map(|f| f.file_size).sum();
FLUSH_BYTES_TOTAL.inc_by(flush_bytes);
let file_ids = metas.iter().map(|f| f.file_id).collect::<Vec<_>>();
logging::info!("Successfully flush memtables, region:{region_id}, files: {file_ids:?}");
Ok(metas)
}
/// Builds a [`RegionEdit`] that adds `file_metas` and marks
/// `flush_sequence` as flushed, applies it through the region writer, and
/// finally marks the WAL up to `flush_sequence` as obsolete.
async fn write_manifest_and_apply(&mut self, file_metas: &[FileMeta]) -> Result<()> {
let edit = RegionEdit {
region_version: self.shared.version_control.metadata().version(),
flushed_sequence: Some(self.flush_sequence),
files_to_add: file_metas.to_vec(),
files_to_remove: Vec::default(),
compaction_time_window: None,
};
self.writer
.write_edit_and_apply(
&self.wal,
&self.shared,
&self.manifest,
edit,
Some(self.max_memtable_id),
)
.await?;
// Only reached when the edit was written and applied successfully.
self.wal.obsolete(self.flush_sequence).await
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::memtable::AllocTracker;

    /// Builds a status of region 1 with the given sizes.
    fn new_status(bytes_mutable: usize, write_buffer_size: usize) -> RegionStatus {
        RegionStatus {
            region_id: 1.into(),
            bytes_mutable,
            write_buffer_size,
        }
    }

    /// Returns `(memory_used, memory_active)` of the strategy.
    fn mem_usage(strategy: &SizeBasedStrategy) -> (usize, usize) {
        (
            strategy.memory_used.load(Ordering::Relaxed),
            strategy.memory_active.load(Ordering::Relaxed),
        )
    }

    #[test]
    fn test_get_mutable_limitation() {
        let cases = [(7, Some(8)), (8, Some(10)), (56, Some(64)), (0, None)];
        for (expected, global_size) in cases {
            assert_eq!(expected, get_mutable_limitation(global_size));
        }
    }

    #[test]
    fn test_strategy_global_disabled() {
        let strategy = SizeBasedStrategy::new(None);

        // None of the memory hooks touches the counters without a global limit.
        strategy.reserve_mem(1000);
        assert_eq!((0, 0), mem_usage(&strategy));
        strategy.schedule_free_mem(1000);
        assert_eq!((0, 0), mem_usage(&strategy));
        strategy.free_mem(1000);
        assert_eq!((0, 0), mem_usage(&strategy));

        assert_eq!(
            Some(FlushType::Region),
            strategy.should_flush(new_status(400, 300))
        );
        assert_eq!(None, strategy.should_flush(new_status(100, 300)));
    }

    #[test]
    fn test_strategy_over_mutable_limit() {
        let strategy = SizeBasedStrategy::new(Some(1000));

        strategy.reserve_mem(500);
        assert_eq!(None, strategy.should_flush(new_status(300, 500)));

        strategy.reserve_mem(400);
        // Flush region.
        assert_eq!(
            Some(FlushType::Region),
            strategy.should_flush(new_status(400, 300))
        );

        // More than mutable limitation, Flush global.
        let status = new_status(100, 300);
        assert_eq!(Some(FlushType::Engine), strategy.should_flush(status));

        strategy.schedule_free_mem(500);
        assert_eq!(None, strategy.should_flush(status));
        assert_eq!((900, 400), mem_usage(&strategy));

        strategy.free_mem(500);
        assert_eq!((400, 400), mem_usage(&strategy));
    }

    #[test]
    fn test_strategy_over_global() {
        common_telemetry::init_default_ut_logging();
        let strategy = SizeBasedStrategy::new(Some(1000));
        let status = new_status(100, 300);

        strategy.reserve_mem(1100);
        strategy.schedule_free_mem(200);
        // More than global limit.
        assert_eq!(Some(FlushType::Engine), strategy.should_flush(status));

        // More than global limit, but mutable not enough (< 500).
        strategy.schedule_free_mem(450);
        assert_eq!(None, strategy.should_flush(status));
        strategy.schedule_free_mem(100);
        assert_eq!(None, strategy.should_flush(status));

        // Now mutable is enough.
        strategy.reserve_mem(150);
        // We can flush again.
        assert_eq!(Some(FlushType::Engine), strategy.should_flush(status));
        strategy.reserve_mem(100);
        assert_eq!(Some(FlushType::Engine), strategy.should_flush(status));
    }

    #[test]
    fn test_alloc_tracker_without_strategy() {
        let tracker = AllocTracker::new(None);
        assert_eq!(0, tracker.bytes_allocated());

        let mut expected_total = 0;
        for bytes in [100, 200] {
            tracker.on_allocate(bytes);
            expected_total += bytes;
            assert_eq!(expected_total, tracker.bytes_allocated());
        }

        tracker.done_allocating();
        assert_eq!(300, tracker.bytes_allocated());
    }

    #[test]
    fn test_alloc_tracker_with_strategy() {
        let strategy = Arc::new(SizeBasedStrategy::new(Some(1000)));
        {
            let tracker = AllocTracker::new(Some(strategy.clone() as FlushStrategyRef));
            tracker.on_allocate(100);
            assert_eq!(100, tracker.bytes_allocated());
            assert_eq!((100, 100), mem_usage(&strategy));

            for _ in 0..2 {
                // Done allocating won't free the same memory multiple times.
                tracker.done_allocating();
                assert_eq!((100, 0), mem_usage(&strategy));
            }
        }
        // Dropping the tracker gives everything back.
        assert_eq!((0, 0), mem_usage(&strategy));
    }

    #[test]
    fn test_alloc_tracker_without_done_allocating() {
        let strategy = Arc::new(SizeBasedStrategy::new(Some(1000)));
        {
            let tracker = AllocTracker::new(Some(strategy.clone() as FlushStrategyRef));
            tracker.on_allocate(100);
            assert_eq!(100, tracker.bytes_allocated());
            assert_eq!((100, 100), mem_usage(&strategy));
        }
        assert_eq!((0, 0), mem_usage(&strategy));
    }
}

View File

@@ -1,263 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::time::Duration;
use async_trait::async_trait;
use common_telemetry::logging;
use common_time::util;
use store_api::logstore::LogStore;
use store_api::storage::{FlushContext, FlushReason, Region};
use crate::config::{DEFAULT_AUTO_FLUSH_INTERVAL, DEFAULT_PICKER_SCHEDULE_INTERVAL};
use crate::region::RegionImpl;
/// Config for [FlushPicker].
///
/// Both intervals default to the crate-level constants, see the `Default` impl.
pub struct PickerConfig {
/// Interval to schedule the picker.
pub schedule_interval: Duration,
/// Interval to auto flush a region if it has not flushed yet.
pub auto_flush_interval: Duration,
}
impl PickerConfig {
    /// Converts the auto flush interval to milliseconds, falling back to
    /// `DEFAULT_AUTO_FLUSH_INTERVAL` when the value does not fit in an `i64`.
    fn auto_flush_interval_millis(&self) -> i64 {
        let millis = self.auto_flush_interval.as_millis();
        match i64::try_from(millis) {
            Ok(value) => value,
            Err(_) => DEFAULT_AUTO_FLUSH_INTERVAL.into(),
        }
    }
}
impl Default for PickerConfig {
    /// Uses the default schedule and auto flush intervals, both in milliseconds.
    fn default() -> Self {
        let schedule_interval = Duration::from_millis(DEFAULT_PICKER_SCHEDULE_INTERVAL.into());
        let auto_flush_interval = Duration::from_millis(DEFAULT_AUTO_FLUSH_INTERVAL.into());
        PickerConfig {
            schedule_interval,
            auto_flush_interval,
        }
    }
}
/// Flush task picker.
///
/// Picks items to flush either by elapsed time since their last flush or
/// because the write buffer is full.
#[derive(Debug, Clone)]
pub struct FlushPicker {
/// Interval to flush a region automatically, in milliseconds.
auto_flush_interval_millis: i64,
}
impl FlushPicker {
    /// Creates a FlushPicker from `config`.
    pub fn new(config: PickerConfig) -> FlushPicker {
        let auto_flush_interval_millis = config.auto_flush_interval_millis();
        FlushPicker {
            auto_flush_interval_millis,
        }
    }

    /// Flushes the regions whose last flush is older than the auto flush interval.
    ///
    /// Returns the number of flushed regions; nothing is flushed when the
    /// threshold time cannot be computed.
    pub async fn pick_by_interval<T: FlushItem>(&self, regions: &[T]) -> usize {
        let now = util::current_time_millis();
        match now.checked_sub(self.auto_flush_interval_millis) {
            Some(earliest_flush_millis) => {
                flush_regions_by_interval(regions, earliest_flush_millis).await
            }
            None => 0,
        }
    }

    /// Picks and flushes a region when the write buffer is full.
    pub async fn pick_by_write_buffer_full<T: FlushItem>(&self, regions: &[T]) {
        // Only the region with the oldest flush time (among those holding
        // mutable data) is flushed. If that is not enough, the region writer
        // triggers the picker again and another region gets picked, so the
        // total memory goes down eventually.
        let oldest = regions
            .iter()
            .filter(|candidate| candidate.mutable_memtable_usage() > 0)
            .min_by_key(|candidate| candidate.last_flush_time());
        let Some(region) = oldest else {
            return;
        };

        logging::debug!(
            "Request flush for region {} due to global buffer is full",
            region.item_id()
        );
        region.request_flush(FlushReason::GlobalBufferFull).await;
    }
}
/// Item for picker to flush.
///
/// Abstracts what [`FlushPicker`] needs from a region.
#[async_trait]
pub trait FlushItem {
/// Id of the item.
fn item_id(&self) -> u64;
/// Last flush time in millis.
fn last_flush_time(&self) -> i64;
/// Mutable memtable usage; items reporting `0` are not picked when the
/// write buffer is full.
fn mutable_memtable_usage(&self) -> usize;
/// Requests the item to schedule a flush for specific `reason`.
///
/// The flush job itself should run in background.
async fn request_flush(&self, reason: FlushReason);
}
#[async_trait]
impl<S: LogStore> FlushItem for RegionImpl<S> {
    /// The region id converted to `u64`.
    fn item_id(&self) -> u64 {
        self.id().into()
    }

    fn last_flush_time(&self) -> i64 {
        self.last_flush_millis()
    }

    /// Bytes allocated by the mutable memtables of the current version.
    fn mutable_memtable_usage(&self) -> usize {
        let version = self.version_control().current();
        version.memtables().mutable_bytes_allocated()
    }

    /// Requests a flush without waiting for it; a failure is only logged.
    async fn request_flush(&self, reason: FlushReason) {
        let ctx = FlushContext {
            wait: false,
            reason,
            ..Default::default()
        };
        let Err(e) = self.flush(&ctx).await else {
            return;
        };
        logging::error!(e; "Failed to flush region {}", self.id());
    }
}
/// Requests a periodic flush for every region whose last flush time is
/// earlier than `earliest_flush_millis`.
///
/// Returns the number of regions a flush was requested for.
async fn flush_regions_by_interval<T: FlushItem>(
    regions: &[T],
    earliest_flush_millis: i64,
) -> usize {
    let mut num_flushed = 0;
    for region in regions {
        // Flushed recently enough, nothing to do for this region.
        if region.last_flush_time() >= earliest_flush_millis {
            continue;
        }

        logging::debug!(
            "Auto flush region {} due to last flush time ({} < {})",
            region.item_id(),
            region.last_flush_time(),
            earliest_flush_millis,
        );
        num_flushed += 1;
        region.request_flush(FlushReason::Periodically).await;
    }
    num_flushed
}
#[cfg(test)]
mod tests {
use std::sync::Mutex;
use super::*;
/// Mock [FlushItem] that records the reason of the last flush request.
struct MockItem {
id: u64,
last_flush_time: i64,
usage: usize,
/// `None` until `request_flush()` is called on this item.
flush_reason: Mutex<Option<FlushReason>>,
}
impl MockItem {
/// Creates an item that has not received any flush request yet.
fn new(id: u64, last_flush_time: i64, usage: usize) -> MockItem {
MockItem {
id,
last_flush_time,
usage,
flush_reason: Mutex::new(None),
}
}
/// Returns the reason of the last flush request, if any.
fn flush_reason(&self) -> Option<FlushReason> {
*self.flush_reason.lock().unwrap()
}
}
#[async_trait]
impl FlushItem for MockItem {
fn item_id(&self) -> u64 {
self.id
}
fn last_flush_time(&self) -> i64 {
self.last_flush_time
}
fn mutable_memtable_usage(&self) -> usize {
self.usage
}
async fn request_flush(&self, reason: FlushReason) {
// Only record the reason, no real flush happens in the mock.
let mut flush_reason = self.flush_reason.lock().unwrap();
*flush_reason = Some(reason);
}
}
#[tokio::test]
async fn test_pick_by_interval() {
// Region 0 was flushed just now, region 1 was flushed 60s ago.
let regions = [
MockItem::new(0, util::current_time_millis(), 1),
MockItem::new(1, util::current_time_millis() - 60 * 1000, 1),
];
let picker = FlushPicker::new(PickerConfig {
// schedule_interval is unused in this test.
schedule_interval: Duration::from_millis(10),
auto_flush_interval: Duration::from_millis(30 * 1000),
});
// Only region 1 is older than the 30s auto flush interval.
let flushed = picker.pick_by_interval(&regions).await;
assert_eq!(1, flushed);
assert!(regions[0].flush_reason().is_none());
assert_eq!(Some(FlushReason::Periodically), regions[1].flush_reason());
}
#[tokio::test]
async fn test_pick_by_buffer_full() {
// NOTE(review): the second and third items share id 1; the id is not asserted
// here, but a distinct id would probably be clearer -- confirm.
let regions = [
MockItem::new(0, util::current_time_millis(), 10),
MockItem::new(1, util::current_time_millis() - 60 * 1000, 0),
MockItem::new(1, util::current_time_millis() - 60 * 1000, 10),
];
let picker = FlushPicker::new(PickerConfig {
schedule_interval: Duration::from_millis(10),
auto_flush_interval: Duration::from_millis(30 * 1000),
});
picker.pick_by_write_buffer_full(&regions).await;
// regions[1] has no mutable memtable usage, so the non-empty item with
// the older last flush time (regions[2]) is the one asked to flush.
assert!(regions[0].flush_reason().is_none());
assert!(regions[1].flush_reason().is_none());
assert_eq!(
Some(FlushReason::GlobalBufferFull),
regions[2].flush_reason()
);
// No target.
let regions = [MockItem::new(1, util::current_time_millis(), 0)];
picker.pick_by_write_buffer_full(&regions).await;
assert!(regions[0].flush_reason().is_none());
}
}

View File

@@ -1,378 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::sync::Arc;
use std::time::Duration;
use async_trait::async_trait;
use common_base::readable_size::ReadableSize;
use common_runtime::{RepeatedTask, TaskFunction};
use common_telemetry::logging;
use snafu::{ensure, ResultExt};
use store_api::logstore::LogStore;
use store_api::storage::{RegionId, SequenceNumber};
use tokio::sync::oneshot::{Receiver, Sender};
use tokio::sync::{oneshot, Notify};
use crate::compaction::{CompactionPickerRef, CompactionRequestImpl, CompactionSchedulerRef};
use crate::config::EngineConfig;
use crate::engine::RegionMap;
use crate::error::{
DuplicateFlushSnafu, Error, Result, StartPickTaskSnafu, StopPickTaskSnafu, WaitFlushSnafu,
};
use crate::flush::{FlushJob, FlushPicker, PickerConfig};
use crate::manifest::region::RegionManifest;
use crate::memtable::{MemtableId, MemtableRef};
use crate::metrics::FLUSH_ERRORS_TOTAL;
use crate::region;
use crate::region::{RegionWriterRef, SharedDataRef};
use crate::scheduler::rate_limit::BoxedRateLimitToken;
use crate::scheduler::{Handler, LocalScheduler, Request, Scheduler, SchedulerConfig};
use crate::sst::AccessLayerRef;
use crate::wal::Wal;
/// Key for [FlushRequest].
///
/// The key is what `Request::key()` returns for a flush request.
#[derive(Debug, Clone, Hash, PartialEq, Eq)]
pub enum FlushKey {
/// Key of an engine flush request.
Engine,
/// Key of a region flush request: the region id and the flush sequence.
Region(RegionId, SequenceNumber),
}
/// Flush request.
///
/// Only the `Region` variant carries a sender, so only region flushes
/// report a result back to the requester.
pub enum FlushRequest<S: LogStore> {
/// Flush the engine.
Engine,
/// Flush a region.
Region {
/// Region flush request.
req: FlushRegionRequest<S>,
/// Flush result sender.
sender: Sender<Result<()>>,
},
}
impl<S: LogStore> Request for FlushRequest<S> {
    type Key = FlushKey;

    /// Returns the key of this request.
    ///
    /// Region requests are keyed by region id and flush sequence; the engine
    /// request always uses [FlushKey::Engine].
    #[inline]
    fn key(&self) -> FlushKey {
        match self {
            FlushRequest::Region { req, .. } => {
                let region_id = req.shared.id();
                FlushKey::Region(region_id, req.flush_sequence)
            }
            FlushRequest::Engine => FlushKey::Engine,
        }
    }

    /// Completes the request with `result`.
    ///
    /// For a region request the result is sent to the waiting handle; a send
    /// failure (the receiver is gone) is ignored. An engine request has nobody
    /// to notify.
    fn complete(self, result: Result<()>) {
        match self {
            FlushRequest::Engine => {}
            FlushRequest::Region { sender, .. } => {
                let _ = sender.send(result);
            }
        }
    }
}
/// Region flush request.
///
/// Carries everything needed to build the flush job and, after the flush,
/// the follow-up compaction request.
pub struct FlushRegionRequest<S: LogStore> {
/// Max memtable id in these memtables,
/// used to remove immutable memtables in current version.
pub max_memtable_id: MemtableId,
/// Memtables to be flushed.
pub memtables: Vec<MemtableRef>,
/// Last sequence of data to be flushed.
pub flush_sequence: SequenceNumber,
/// Shared data of region to be flushed.
pub shared: SharedDataRef,
/// Sst access layer of the region.
pub sst_layer: AccessLayerRef,
/// Region writer, used to persist log entry that points to the latest manifest file.
pub writer: RegionWriterRef<S>,
/// Region write-ahead logging, used to write data/meta to the log file.
pub wal: Wal<S>,
/// Region manifest service, used to persist metadata.
pub manifest: RegionManifest,
/// Storage engine config
pub engine_config: Arc<EngineConfig>,
// Compaction related options:
/// TTL of the region.
pub ttl: Option<Duration>,
/// Time window for compaction.
pub compaction_time_window: Option<i64>,
/// Compaction picker passed on to the compaction request built from this request.
pub compaction_picker: CompactionPickerRef<S>,
}
impl<S: LogStore> FlushRegionRequest<S> {
    /// Id of the region to flush, read from the region's shared data.
    #[inline]
    fn region_id(&self) -> RegionId {
        let shared = &self.shared;
        shared.id()
    }
}
impl<S: LogStore> From<&FlushRegionRequest<S>> for FlushJob<S> {
fn from(req: &FlushRegionRequest<S>) -> FlushJob<S> {
FlushJob {
max_memtable_id: req.max_memtable_id,
memtables: req.memtables.clone(),
flush_sequence: req.flush_sequence,
shared: req.shared.clone(),
sst_layer: req.sst_layer.clone(),
writer: req.writer.clone(),
wal: req.wal.clone(),
manifest: req.manifest.clone(),
engine_config: req.engine_config.clone(),
}
}
}
impl<S: LogStore> From<&FlushRegionRequest<S>> for CompactionRequestImpl<S> {
fn from(req: &FlushRegionRequest<S>) -> CompactionRequestImpl<S> {
CompactionRequestImpl {
region_id: req.region_id(),
sst_layer: req.sst_layer.clone(),
writer: req.writer.clone(),
shared: req.shared.clone(),
manifest: req.manifest.clone(),
wal: req.wal.clone(),
ttl: req.ttl,
compaction_time_window: req.compaction_time_window,
sender: None,
picker: req.compaction_picker.clone(),
sst_write_buffer_size: ReadableSize::mb(8), // deprecated usage
// compaction triggered by flush always reschedules
reschedule_on_finish: true,
}
}
}
/// A handle to get the flush result.
///
/// Returned when a region flush is scheduled; call `wait()` to get the result.
#[derive(Debug)]
pub struct FlushHandle {
/// Id of the region being flushed, attached to the error if waiting fails.
region_id: RegionId,
/// Receiving end of the channel the flush result is sent to.
receiver: Receiver<Result<()>>,
}
impl FlushHandle {
/// Waits until the flush job is finished.
pub async fn wait(self) -> Result<()> {
self.receiver.await.context(WaitFlushSnafu {
region_id: self.region_id,
})?
}
}
/// Flush scheduler.
///
/// Owns the scheduler that runs flush requests and the repeated task that
/// periodically picks regions to flush.
pub struct FlushScheduler<S: LogStore> {
/// Flush task scheduler.
scheduler: LocalScheduler<FlushRequest<S>>,
/// Auto flush task.
auto_flush_task: RepeatedTask<Error>,
/// Join handles of the spawned flush jobs, only tracked in tests so
/// `stop()` can wait for them.
#[cfg(test)]
pending_tasks: Arc<tokio::sync::RwLock<Vec<tokio::task::JoinHandle<()>>>>,
}
/// Shared reference to a [FlushScheduler].
pub type FlushSchedulerRef<S> = Arc<FlushScheduler<S>>;
impl<S: LogStore> FlushScheduler<S> {
    /// Creates a [FlushScheduler] and starts its auto flush task.
    ///
    /// The auto flush task is started on the background runtime and repeats
    /// every `picker_config.schedule_interval`. Returns an error if the task
    /// fails to start.
    pub fn new(
        config: SchedulerConfig,
        compaction_scheduler: CompactionSchedulerRef<S>,
        regions: Arc<RegionMap<S>>,
        picker_config: PickerConfig,
    ) -> Result<Self> {
        // Read the interval before the config is moved into the picker.
        let interval = picker_config.schedule_interval;
        let picker = FlushPicker::new(picker_config);

        // Now we just clone the picker since we don't need to share states and
        // the clone of picker is cheap.
        let auto_flush_task = RepeatedTask::new(
            interval,
            Box::new(AutoFlushFunction {
                regions: regions.clone(),
                picker: picker.clone(),
            }),
        );
        auto_flush_task
            .start(common_runtime::bg_runtime())
            .context(StartPickTaskSnafu)?;

        #[cfg(test)]
        let pending_tasks = Arc::new(tokio::sync::RwLock::new(vec![]));

        let scheduler = LocalScheduler::new(
            config,
            FlushHandler {
                compaction_scheduler,
                regions,
                picker,
                #[cfg(test)]
                pending_tasks: pending_tasks.clone(),
            },
        );

        Ok(Self {
            scheduler,
            auto_flush_task,
            #[cfg(test)]
            pending_tasks,
        })
    }

    /// Schedules a region flush request and returns the handle to the flush task.
    ///
    /// Fails with a duplicate-flush error if the scheduler reports that the
    /// request was not scheduled.
    pub fn schedule_region_flush(&self, req: FlushRegionRequest<S>) -> Result<FlushHandle> {
        // Copy what the error and the handle need before `req` is moved.
        let sequence = req.flush_sequence;
        let region_id = req.region_id();

        let (sender, receiver) = oneshot::channel();
        let request = FlushRequest::Region { req, sender };
        let scheduled = self.scheduler.schedule(request)?;

        // Normally we should not have duplicate flush request.
        ensure!(
            scheduled,
            DuplicateFlushSnafu {
                region_id,
                sequence,
            }
        );

        Ok(FlushHandle {
            region_id,
            receiver,
        })
    }

    /// Schedules an engine flush request.
    ///
    /// Whether the request was actually scheduled is not reported.
    pub fn schedule_engine_flush(&self) -> Result<()> {
        let _ = self.scheduler.schedule(FlushRequest::Engine)?;
        Ok(())
    }

    /// Stops the auto flush task first, then the scheduler.
    ///
    /// In tests it also waits for all spawned flush jobs.
    pub async fn stop(&self) -> Result<()> {
        self.auto_flush_task
            .stop()
            .await
            .context(StopPickTaskSnafu)?;
        self.scheduler.stop(true).await?;

        #[cfg(test)]
        let _ = futures::future::join_all(self.pending_tasks.write().await.drain(..)).await;

        Ok(())
    }
}
/// Handler that executes flush requests for the flush scheduler.
struct FlushHandler<S: LogStore> {
/// Scheduler used to schedule a compaction after a region flush.
compaction_scheduler: CompactionSchedulerRef<S>,
/// Regions of the engine, listed when handling an engine flush.
regions: Arc<RegionMap<S>>,
/// Picker that chooses a region to flush for an engine flush.
picker: FlushPicker,
/// Join handles of the spawned flush jobs, only tracked in tests.
#[cfg(test)]
pending_tasks: Arc<tokio::sync::RwLock<Vec<tokio::task::JoinHandle<()>>>>,
}
#[async_trait::async_trait]
impl<S: LogStore> Handler for FlushHandler<S> {
    type Request = FlushRequest<S>;

    /// Spawns a background job for `req` and returns immediately.
    ///
    /// The job releases the rate limit token and notifies the scheduler once
    /// the request has been processed.
    async fn handle_request(
        &self,
        req: FlushRequest<S>,
        token: BoxedRateLimitToken,
        finish_notifier: Arc<Notify>,
    ) -> Result<()> {
        // The job must own everything it uses since it outlives this call.
        let picker = self.picker.clone();
        let region_map = self.regions.clone();
        let compaction_scheduler = self.compaction_scheduler.clone();

        let job = async move {
            match req {
                FlushRequest::Region { req, sender } => {
                    execute_flush_region(req, sender, compaction_scheduler).await;
                }
                FlushRequest::Engine => {
                    let region_list = region_map.list_regions();
                    picker.pick_by_write_buffer_full(&region_list).await;
                }
            }

            // releases rate limit token
            token.try_release();
            // notify scheduler to schedule next task when current task finishes.
            finish_notifier.notify_one();
        };
        let _handle = common_runtime::spawn_bg(job);

        #[cfg(test)]
        self.pending_tasks.write().await.push(_handle);

        Ok(())
    }
}
/// Runs the flush job of a region and completes the request with its result.
///
/// On failure the error is logged, counted in `FLUSH_ERRORS_TOTAL` and sent
/// back through `sender`. On success the last flush time of the region is
/// updated and, if level 0 holds more SST files than `max_files_in_l0`, a
/// compaction is scheduled before the request is completed with `Ok(())`.
async fn execute_flush_region<S: LogStore>(
    req: FlushRegionRequest<S>,
    sender: Sender<Result<()>>,
    compaction_scheduler: CompactionSchedulerRef<S>,
) {
    let mut flush_job = FlushJob::from(&req);
    match flush_job.run().await {
        Err(e) => {
            logging::error!(e; "Failed to flush region {}", req.region_id());
            FLUSH_ERRORS_TOTAL.inc();
            FlushRequest::Region { req, sender }.complete(Err(e));
        }
        Ok(_) => {
            logging::debug!("Successfully flush region: {}", req.region_id());
            // Update last flush time.
            req.shared.update_flush_millis();

            let compaction_request = CompactionRequestImpl::from(&req);
            let max_files_in_l0 = req.engine_config.max_files_in_l0;
            let shared_data = req.shared.clone();
            let current_version = shared_data.version_control.current();
            let level0_file_num = current_version.ssts().level(0).file_num();

            if level0_file_num > max_files_in_l0 {
                // If flush is success, schedule a compaction request for this region.
                let _ = region::schedule_compaction(
                    shared_data,
                    compaction_scheduler,
                    compaction_request,
                );
            } else {
                logging::debug!(
                    "No enough SST files in level 0 (threshold: {}), skip compaction",
                    max_files_in_l0
                );
            }

            // Complete the request.
            FlushRequest::Region { req, sender }.complete(Ok(()));
        }
    }
}
/// Task function to pick regions to flush.
///
/// Run repeatedly by the auto flush task of the flush scheduler.
struct AutoFlushFunction<S: LogStore> {
/// Regions of the engine.
regions: Arc<RegionMap<S>>,
/// Picker that requests flushes based on the last flush time of each region.
picker: FlushPicker,
}
#[async_trait]
impl<S: LogStore> TaskFunction<Error> for AutoFlushFunction<S> {
    /// Lists all regions and lets the picker flush those that are due.
    ///
    /// Always returns `Ok(())`; the number of flushed regions is discarded.
    async fn call(&mut self) -> Result<()> {
        let all_regions = self.regions.list_regions();
        let _ = self.picker.pick_by_interval(&all_regions).await;
        Ok(())
    }

    /// Name of the repeated task.
    fn name(&self) -> &str {
        "FlushPicker-pick-task"
    }
}

View File

@@ -1,49 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Storage engine implementation.
#![feature(let_chains)]
mod chunk;
pub mod codec;
pub mod compaction;
pub mod config;
mod engine;
pub mod error;
mod flush;
pub mod manifest;
pub mod memtable;
pub mod metadata;
pub mod proto;
pub mod read;
pub mod region;
pub mod scheduler;
pub mod schema;
mod snapshot;
pub mod sst;
mod sync;
#[cfg(test)]
mod test_util;
mod version;
mod wal;
pub mod write_batch;
pub use engine::EngineImpl;
mod file_purger;
mod metrics;
mod window_infer;
pub use sst::parquet::ParquetWriter;
pub use sst::Source;

View File

@@ -1,26 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! manifest storage
pub(crate) mod action;
pub mod checkpoint;
pub mod helper;
mod impl_;
pub mod region;
pub(crate) mod storage;
#[cfg(test)]
pub mod test_utils;
pub use self::impl_::*;
pub use self::storage::manifest_compress_type;

View File

@@ -1,443 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::HashMap;
use std::io::{BufRead, BufReader};
use serde::{Deserialize, Serialize};
use serde_json as json;
use snafu::{ensure, OptionExt, ResultExt};
use store_api::manifest::action::{ProtocolAction, ProtocolVersion, VersionHeader};
use store_api::manifest::{Checkpoint, ManifestVersion, MetaAction};
use store_api::storage::{RegionId, SequenceNumber};
use crate::error::{
self, DecodeJsonSnafu, DecodeMetaActionListSnafu, ManifestProtocolForbidReadSnafu,
ReadlineSnafu, Result,
};
use crate::manifest::helper;
use crate::metadata::{ColumnFamilyMetadata, ColumnMetadata, VersionNumber};
use crate::sst::{FileId, FileMeta};
/// Minimal data that could be used to persist and recover [RegionMetadata](crate::metadata::RegionMetadata).
///
/// This struct is serialized into the manifest, so changes to its fields
/// must stay compatible with manifests that were already written.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Default)]
pub struct RawRegionMetadata {
/// Id of the region.
pub id: RegionId,
/// Name of the region.
pub name: String,
/// Raw metadata of the columns.
pub columns: RawColumnsMetadata,
/// Raw metadata of the column families.
pub column_families: RawColumnFamiliesMetadata,
/// Version of the metadata.
pub version: VersionNumber,
}
/// Minimal data that could be used to persist and recover [ColumnsMetadata](crate::metadata::ColumnsMetadata).
// NOTE(review): the three index fields presumably refer to positions in
// `columns`; confirm against `ColumnsMetadata`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Default)]
pub struct RawColumnsMetadata {
/// Metadata of each column.
pub columns: Vec<ColumnMetadata>,
pub row_key_end: usize,
pub timestamp_key_index: usize,
pub user_column_end: usize,
}
/// Minimal data that could be used to persist and recover [ColumnFamiliesMetadata](crate::metadata::ColumnFamiliesMetadata).
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Default)]
pub struct RawColumnFamiliesMetadata {
/// Metadata of each column family.
pub column_families: Vec<ColumnFamilyMetadata>,
}
/// Manifest action that replaces the metadata of a region.
#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)]
pub struct RegionChange {
/// The committed sequence of the region when this change happens. So the
/// data with sequence **greater than** this sequence would use the new
/// metadata.
pub committed_sequence: SequenceNumber,
/// The metadata after changed.
pub metadata: RawRegionMetadata,
}
/// Manifest action that records the removal of a region.
#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)]
pub struct RegionRemove {
/// Id of the removed region.
pub region_id: RegionId,
}
/// Manifest action that adds files to and removes files from a region.
#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)]
pub struct RegionEdit {
/// Version of the region metadata this edit belongs to.
pub region_version: VersionNumber,
/// Flushed sequence carried by this edit, if any.
pub flushed_sequence: Option<SequenceNumber>,
/// Files added by this edit.
pub files_to_add: Vec<FileMeta>,
/// Files removed by this edit.
pub files_to_remove: Vec<FileMeta>,
/// Time window for compaction, if any.
pub compaction_time_window: Option<i64>,
}
/// Manifest action that records the truncation of a region.
#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)]
pub struct RegionTruncate {
/// Id of the truncated region.
pub region_id: RegionId,
/// Committed sequence recorded with the truncation.
pub committed_sequence: SequenceNumber,
}
/// The region version checkpoint
#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)]
pub struct RegionVersion {
/// Manifest version of the last edit applied to this version.
pub manifest_version: ManifestVersion,
/// Flushed sequence taken from the last edit applied to this version.
pub flushed_sequence: Option<SequenceNumber>,
/// Files of the region, keyed by file id.
pub files: HashMap<FileId, FileMeta>,
}
/// The region manifest data checkpoint
#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq, Default)]
pub struct RegionManifestData {
/// Committed sequence of the last applied region change.
pub committed_sequence: SequenceNumber,
/// Metadata of the last applied region change.
pub metadata: RawRegionMetadata,
/// Version built from the applied edits, `None` if no edit was applied.
pub version: Option<RegionVersion>,
}
/// Builder that accumulates region changes and edits into a [RegionManifestData].
#[derive(Debug, Default)]
pub struct RegionManifestDataBuilder {
/// Committed sequence of the last applied region change.
committed_sequence: SequenceNumber,
/// Metadata of the last applied region change.
metadata: RawRegionMetadata,
/// Version built from the applied edits, `None` until the first edit.
version: Option<RegionVersion>,
}
impl RegionManifestDataBuilder {
pub fn with_checkpoint(checkpoint: Option<RegionManifestData>) -> Self {
if let Some(s) = checkpoint {
Self {
metadata: s.metadata,
version: s.version,
committed_sequence: s.committed_sequence,
}
} else {
Default::default()
}
}
pub fn apply_change(&mut self, change: RegionChange) {
self.metadata = change.metadata;
self.committed_sequence = change.committed_sequence;
}
pub fn apply_edit(&mut self, manifest_version: ManifestVersion, edit: RegionEdit) {
if let Some(version) = &mut self.version {
version.manifest_version = manifest_version;
version.flushed_sequence = edit.flushed_sequence;
for file in edit.files_to_add {
let _ = version.files.insert(file.file_id, file);
}
for file in edit.files_to_remove {
let _ = version.files.remove(&file.file_id);
}
} else {
self.version = Some(RegionVersion {
manifest_version,
flushed_sequence: edit.flushed_sequence,
files: edit
.files_to_add
.into_iter()
.map(|f| (f.file_id, f))
.collect(),
});
}
}
pub fn build(self) -> RegionManifestData {
RegionManifestData {
metadata: self.metadata,
version: self.version,
committed_sequence: self.committed_sequence,
}
}
}
/// The checkpoint of region manifest, generated by checkpoint.
#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)]
pub struct RegionCheckpoint {
/// The snapshot protocol
pub protocol: ProtocolAction,
/// The last manifest version that this checkpoint compacts(inclusive).
pub last_version: ManifestVersion,
/// The number of manifest actions that this checkpoint compacts.
pub compacted_actions: usize,
/// The checkpoint data
pub checkpoint: Option<RegionManifestData>,
}
impl Checkpoint for RegionCheckpoint {
type Error = error::Error;
/// Replaces the protocol stored in this checkpoint.
fn set_protocol(&mut self, action: ProtocolAction) {
self.protocol = action;
}
/// Returns the last manifest version that this checkpoint compacts.
fn last_version(&self) -> ManifestVersion {
self.last_version
}
/// Encodes the checkpoint into bytes via `helper::encode_checkpoint`.
fn encode(&self) -> Result<Vec<u8>> {
helper::encode_checkpoint(self)
}
/// Decodes a checkpoint from `bs` via `helper::decode_checkpoint`,
/// passing on `reader_version` for the protocol check.
fn decode(bs: &[u8], reader_version: ProtocolVersion) -> Result<Self> {
helper::decode_checkpoint(bs, reader_version)
}
}
/// A single action recorded in the region manifest.
///
/// Each action is stored as one JSON line, so the variant names are part of
/// the persisted format.
#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)]
pub enum RegionMetaAction {
/// Protocol versions required to read or write the manifest.
Protocol(ProtocolAction),
/// Change of the region metadata.
Change(RegionChange),
/// Removal of the region.
Remove(RegionRemove),
/// Files added to and removed from the region.
Edit(RegionEdit),
/// Truncation of the region.
Truncate(RegionTruncate),
}
/// A list of manifest actions together with the previous manifest version.
#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)]
pub struct RegionMetaActionList {
/// Actions in the list; the protocol action, if set, is the first one.
pub actions: Vec<RegionMetaAction>,
/// The previous manifest version, encoded as the first line of the list.
pub prev_version: ManifestVersion,
}
impl RegionMetaActionList {
    /// Creates a list that contains only `action`, with `prev_version` set to 0.
    pub fn with_action(action: RegionMetaAction) -> Self {
        Self::new(vec![action])
    }

    /// Creates a list from `actions`, with `prev_version` set to 0.
    pub fn new(actions: Vec<RegionMetaAction>) -> Self {
        let prev_version = 0;
        Self {
            actions,
            prev_version,
        }
    }
}
impl MetaAction for RegionMetaActionList {
    type Error = error::Error;

    /// Puts the protocol action at the front of the action list.
    fn set_protocol(&mut self, action: ProtocolAction) {
        // The protocol action should be the first action in action list by convention.
        let protocol = RegionMetaAction::Protocol(action);
        self.actions.insert(0, protocol);
    }

    /// Sets the previous manifest version of this list.
    fn set_prev_version(&mut self, version: ManifestVersion) {
        self.prev_version = version;
    }

    /// Encode self into json in the form of string lines, starts with prev_version and then action json list.
    fn encode(&self) -> Result<Vec<u8>> {
        helper::encode_actions(self.prev_version, &self.actions)
    }

    /// Decodes an action list from the line based json produced by `encode()`.
    ///
    /// The first line holds the version header, every following line holds one
    /// action. Returns the list together with the last protocol action found,
    /// if any. Fails if the content is empty, a line cannot be read or parsed,
    /// or a protocol action is not readable by `reader_version`.
    fn decode(
        bs: &[u8],
        reader_version: ProtocolVersion,
    ) -> Result<(Self, Option<ProtocolAction>)> {
        let mut lines = BufReader::new(bs).lines();

        // Decode prev_version
        let header_line = lines
            .next()
            .with_context(|| DecodeMetaActionListSnafu {
                msg: format!(
                    "Invalid content in manifest: {}",
                    std::str::from_utf8(bs).unwrap_or("**invalid bytes**")
                ),
            })?
            .context(ReadlineSnafu)?;
        let header: VersionHeader = json::from_str(&header_line).context(DecodeJsonSnafu)?;

        // Decode actions
        let mut protocol_action = None;
        let mut actions = Vec::default();
        for line in lines {
            let line = line.context(ReadlineSnafu)?;
            let action = json::from_str::<RegionMetaAction>(&line).context(DecodeJsonSnafu)?;
            if let RegionMetaAction::Protocol(protocol) = &action {
                ensure!(
                    protocol.is_readable(reader_version),
                    ManifestProtocolForbidReadSnafu {
                        min_version: protocol.min_reader_version,
                        supported_version: reader_version,
                    }
                );
                protocol_action = Some(protocol.clone());
            }
            actions.push(action);
        }

        let action_list = RegionMetaActionList {
            actions,
            prev_version: header.prev_version,
        };
        Ok((action_list, protocol_action))
    }
}
#[cfg(test)]
mod tests {
use common_telemetry::logging;
use datatypes::type_id::LogicalTypeId;
use super::*;
use crate::manifest::test_utils;
use crate::metadata::RegionMetadata;
use crate::sst::FileId;
use crate::test_util::descriptor_util::RegionDescBuilder;
#[test]
fn test_encode_decode_action_list() {
common_telemetry::init_default_ut_logging();
// The protocol requires reader version 1 so decoding with version 0 must fail.
let mut protocol = ProtocolAction::new();
protocol.min_reader_version = 1;
let mut action_list = RegionMetaActionList::new(vec![
RegionMetaAction::Protocol(protocol.clone()),
RegionMetaAction::Edit(test_utils::build_region_edit(
99,
&[FileId::random(), FileId::random()],
&[FileId::random()],
)),
]);
action_list.set_prev_version(3);
let bs = action_list.encode().unwrap();
// Illustrative shape of the encoded output, one json document per line
// (the field names below may not match the current structs exactly):
// {"prev_version":3}
// {"Protocol":{"min_reader_version":1,"min_writer_version":0}}
// {"Edit":{"region_version":0,"flush_sequence":99,"files_to_add":[{"file_name":"test1","level":1},{"file_name":"test2","level":2}],"files_to_remove":[{"file_name":"test0","level":0}]}}
logging::debug!(
"Encoded action list: \r\n{}",
String::from_utf8(bs.clone()).unwrap()
);
// Reader version 0 is lower than the required version 1.
let e = RegionMetaActionList::decode(&bs, 0);
assert!(e.is_err());
assert_eq!(
"Manifest protocol forbid to read, min_version: 1, supported_version: 0",
format!("{}", e.err().unwrap())
);
let (decode_list, p) = RegionMetaActionList::decode(&bs, 1).unwrap();
assert_eq!(decode_list, action_list);
assert_eq!(p.unwrap(), protocol);
}
// These tests are used to ensure backward compatibility of manifest files.
// DO NOT modify the serialized string when they fail, check if your
// modification to manifest-related structs is compatible with older manifests.
#[test]
fn test_region_manifest_compatibility() {
let region_edit = r#"{"region_version":0,"flushed_sequence":null,"files_to_add":[{"region_id":4402341478400,"file_name":"4b220a70-2b03-4641-9687-b65d94641208.parquet","time_range":[{"value":1451609210000,"unit":"Millisecond"},{"value":1451609520000,"unit":"Millisecond"}],"level":1}],"files_to_remove":[{"region_id":4402341478400,"file_name":"34b6ebb9-b8a5-4a4b-b744-56f67defad02.parquet","time_range":[{"value":1451609210000,"unit":"Millisecond"},{"value":1451609520000,"unit":"Millisecond"}],"level":0}]}"#;
let _ = serde_json::from_str::<RegionEdit>(region_edit).unwrap();
let region_change = r#" {"committed_sequence":42,"metadata":{"id":0,"name":"region-0","columns":{"columns":[{"cf_id":0,"desc":{"id":2,"name":"k1","data_type":{"Int32":{}},"is_nullable":false,"is_time_index":false,"default_constraint":null,"comment":""}},{"cf_id":0,"desc":{"id":1,"name":"timestamp","data_type":{"Timestamp":{"Millisecond":null}},"is_nullable":false,"is_time_index":true,"default_constraint":null,"comment":""}},{"cf_id":1,"desc":{"id":3,"name":"v1","data_type":{"Float32":{}},"is_nullable":true,"is_time_index":false,"default_constraint":null,"comment":""}},{"cf_id":1,"desc":{"id":2147483649,"name":"__sequence","data_type":{"UInt64":{}},"is_nullable":false,"is_time_index":false,"default_constraint":null,"comment":""}},{"cf_id":1,"desc":{"id":2147483650,"name":"__op_type","data_type":{"UInt8":{}},"is_nullable":false,"is_time_index":false,"default_constraint":null,"comment":""}}],"row_key_end":2,"timestamp_key_index":1,"enable_version_column":false,"user_column_end":3},"column_families":{"column_families":[{"name":"default","cf_id":1,"column_index_start":2,"column_index_end":3}]},"version":0}}"#;
let _ = serde_json::from_str::<RegionChange>(region_change).unwrap();
let region_remove = r#"{"region_id":42}"#;
let _ = serde_json::from_str::<RegionRemove>(region_remove).unwrap();
let protocol_action = r#"{"min_reader_version":1,"min_writer_version":2}"#;
let _ = serde_json::from_str::<ProtocolAction>(protocol_action).unwrap();
}
/// Builds a file meta with a random file id for region 0.
fn mock_file_meta() -> FileMeta {
FileMeta {
region_id: 0.into(),
file_id: FileId::random(),
time_range: None,
level: 0,
file_size: 1024,
}
}
#[test]
fn test_region_manifest_builder() {
let desc = RegionDescBuilder::new("test_region_manifest_builder")
.push_field_column(("v0", LogicalTypeId::Int64, true))
.build();
let region_metadata: RegionMetadata = desc.try_into().unwrap();
let mut builder = RegionManifestDataBuilder::with_checkpoint(None);
builder.apply_change(RegionChange {
committed_sequence: 42,
metadata: RawRegionMetadata::from(&region_metadata),
});
let files = vec![mock_file_meta(), mock_file_meta()];
// The first edit adds both files.
builder.apply_edit(
84,
RegionEdit {
region_version: 0,
flushed_sequence: Some(99),
files_to_add: files.clone(),
files_to_remove: vec![],
compaction_time_window: None,
},
);
// The second edit removes the first file again.
builder.apply_edit(
85,
RegionEdit {
region_version: 0,
flushed_sequence: Some(100),
files_to_add: vec![],
files_to_remove: vec![files[0].clone()],
compaction_time_window: None,
},
);
let manifest = builder.build();
assert_eq!(manifest.metadata, RawRegionMetadata::from(&region_metadata));
assert_eq!(manifest.committed_sequence, 42);
// Only the second file is left, with the version info of the last edit.
assert_eq!(
manifest.version,
Some(RegionVersion {
manifest_version: 85,
flushed_sequence: Some(100),
files: files[1..].iter().map(|f| (f.file_id, f.clone())).collect(),
})
);
}
#[test]
fn test_encode_decode_region_checkpoint() {
let region_checkpoint = RegionCheckpoint {
protocol: ProtocolAction::default(),
last_version: 42,
compacted_actions: 10,
checkpoint: Some(RegionManifestData {
committed_sequence: 100,
metadata: RawRegionMetadata::default(),
version: Some(RegionVersion {
manifest_version: 84,
flushed_sequence: Some(99),
files: vec![mock_file_meta(), mock_file_meta()]
.into_iter()
.map(|f| (f.file_id, f))
.collect(),
}),
}),
};
// Encoding followed by decoding must give back the same checkpoint.
let bytes = region_checkpoint.encode().unwrap();
assert!(!bytes.is_empty());
let decoded_checkpoint = RegionCheckpoint::decode(&bytes, 0).unwrap();
assert_eq!(region_checkpoint, decoded_checkpoint);
}
}

View File

@@ -1,35 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::any::Any;
use async_trait::async_trait;
use store_api::manifest::{Checkpoint, MetaAction};
use crate::error::{Error, Result};
use crate::manifest::ManifestImpl;
/// Strategy that creates checkpoints for a manifest.
#[async_trait]
pub trait Checkpointer: Send + Sync + std::fmt::Debug {
/// Type of the checkpoint this checkpointer creates.
type Checkpoint: Checkpoint<Error = Error>;
/// Type of the meta action list stored in the manifest.
type MetaAction: MetaAction<Error = Error>;
/// Tries to create a checkpoint for `manifest`.
///
/// Returns the checkpoint on success; `Ok(None)` means no checkpoint was created.
async fn do_checkpoint(
&self,
manifest: &ManifestImpl<Self::Checkpoint, Self::MetaAction>,
) -> Result<Option<Self::Checkpoint>>;
/// Returns `self` as [Any] so callers can downcast to the concrete type.
fn as_any(&self) -> &dyn Any;
}

View File

@@ -1,69 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::io::Write;
use serde::Serialize;
use serde_json::to_writer;
use snafu::{ensure, ResultExt};
use store_api::manifest::action::{ProtocolVersion, VersionHeader};
use store_api::manifest::ManifestVersion;
use crate::error::{
DecodeJsonSnafu, EncodeJsonSnafu, ManifestProtocolForbidReadSnafu, Result, Utf8Snafu,
};
use crate::manifest::action::RegionCheckpoint;
/// Line separator written after the version header and after each encoded action.
pub const NEWLINE: &[u8] = b"\n";
/// Encodes `actions` into line based json.
///
/// The first line is the version header that holds `prev_version`, followed
/// by one line per action. Every line, including the last, ends with
/// [NEWLINE]. Fails if the header or an action cannot be serialized.
pub fn encode_actions<T: Serialize>(
    prev_version: ManifestVersion,
    actions: &[T],
) -> Result<Vec<u8>> {
    let mut buf = Vec::default();

    // Encode prev_version
    let header = VersionHeader { prev_version };
    to_writer(&mut buf, &header).context(EncodeJsonSnafu)?;
    // unwrap is fine here, because we write into a buffer.
    buf.write_all(NEWLINE).unwrap();

    for item in actions.iter() {
        to_writer(&mut buf, item).context(EncodeJsonSnafu)?;
        buf.write_all(NEWLINE).unwrap();
    }

    Ok(buf)
}
/// Encodes `snapshot` as a single json document.
///
/// Serializes straight into bytes; the output is the same as serializing
/// to a `String` and taking its bytes. Fails if the checkpoint cannot be
/// serialized.
pub fn encode_checkpoint(snapshot: &RegionCheckpoint) -> Result<Vec<u8>> {
    serde_json::to_vec(snapshot).context(EncodeJsonSnafu)
}
/// Decodes a [RegionCheckpoint] from its json bytes.
///
/// Fails if `bs` is not valid UTF-8, is not a valid checkpoint json, or if the
/// protocol stored in the checkpoint is not readable by `reader_version`.
pub fn decode_checkpoint(bs: &[u8], reader_version: ProtocolVersion) -> Result<RegionCheckpoint> {
    let text = std::str::from_utf8(bs).context(Utf8Snafu)?;
    let checkpoint = serde_json::from_str::<RegionCheckpoint>(text).context(DecodeJsonSnafu)?;

    // Refuse checkpoints written with a protocol this reader cannot read.
    let protocol = &checkpoint.protocol;
    ensure!(
        protocol.is_readable(reader_version),
        ManifestProtocolForbidReadSnafu {
            min_version: protocol.min_reader_version,
            supported_version: reader_version,
        }
    );

    Ok(checkpoint)
}

View File

@@ -1,405 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::marker::PhantomData;
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Arc;
use std::time::Duration;
use arc_swap::ArcSwap;
use async_trait::async_trait;
use common_datasource::compression::CompressionType;
use common_runtime::{RepeatedTask, TaskFunction};
use common_telemetry::{debug, logging, warn};
use object_store::ObjectStore;
use snafu::{ensure, ResultExt};
use store_api::manifest::action::{self, ProtocolAction, ProtocolVersion};
use store_api::manifest::*;
use crate::error::{
Error, ManifestProtocolForbidWriteSnafu, Result, StartManifestGcTaskSnafu,
StopManifestGcTaskSnafu,
};
use crate::manifest::action::RegionCheckpoint;
use crate::manifest::checkpoint::Checkpointer;
use crate::manifest::storage::{ManifestObjectStore, ObjectStoreLogIterator};
/// Default checkpoint margin used when `ManifestImpl::new` receives `None`
/// for `checkpoint_actions_margin`.
const CHECKPOINT_ACTIONS_MARGIN: u16 = 10;
/// Default interval, in seconds, of the manifest gc task when `ManifestImpl::new`
/// receives `None` for `gc_duration`.
const GC_DURATION_SECS: u64 = 600;
/// Manifest stored in an object store, with optional checkpointing and a
/// background gc task.
#[derive(Clone, Debug)]
pub struct ManifestImpl<S: Checkpoint<Error = Error>, M: MetaAction<Error = Error>> {
// State shared with the gc task (store, current version, protocol).
inner: Arc<ManifestImplInner<S, M>>,
// Checkpoint strategy; `None` means `do_checkpoint` is a no-op.
checkpointer: Option<Arc<dyn Checkpointer<Checkpoint = S, MetaAction = M>>>,
// Last version covered by a checkpoint, updated by `set_last_checkpoint_version`.
last_checkpoint_version: Arc<AtomicU64>,
// Distance between a saved version and the last checkpoint version at which
// `may_do_checkpoint` triggers a checkpoint.
checkpoint_actions_margin: u16,
// Background gc task; only created when a checkpointer is configured.
gc_task: Option<Arc<RepeatedTask<Error>>>,
}
impl<S: 'static + Checkpoint<Error = Error>, M: 'static + MetaAction<Error = Error>>
    ManifestImpl<S, M>
{
    /// Creates a manifest that keeps its files under `manifest_dir` in `object_store`.
    ///
    /// * `compress_type` - compression handed to the underlying `ManifestObjectStore`.
    /// * `checkpoint_actions_margin` - threshold used by `may_do_checkpoint`;
    ///   defaults to `CHECKPOINT_ACTIONS_MARGIN`.
    /// * `gc_duration` - interval of the gc task; defaults to `GC_DURATION_SECS` seconds.
    /// * `checkpointer` - checkpoint strategy. When `None`, no gc task is created.
    pub fn new(
        manifest_dir: &str,
        object_store: ObjectStore,
        compress_type: CompressionType,
        checkpoint_actions_margin: Option<u16>,
        gc_duration: Option<Duration>,
        checkpointer: Option<Arc<dyn Checkpointer<Checkpoint = S, MetaAction = M>>>,
    ) -> Self {
        let inner = Arc::new(ManifestImplInner::new(
            manifest_dir,
            object_store,
            compress_type,
        ));
        let gc_task = if checkpointer.is_some() {
            // Only start the gc task when checkpointing is enabled.
            Some(Arc::new(RepeatedTask::new(
                gc_duration.unwrap_or_else(|| Duration::from_secs(GC_DURATION_SECS)),
                Box::new(ManifestGcTask {
                    inner: inner.clone(),
                }),
            )))
        } else {
            None
        };

        ManifestImpl {
            inner,
            checkpointer,
            checkpoint_actions_margin: checkpoint_actions_margin
                .unwrap_or(CHECKPOINT_ACTIONS_MARGIN),
            last_checkpoint_version: Arc::new(AtomicU64::new(MIN_VERSION)),
            gc_task,
        }
    }

    /// Creates a manifest without checkpointer, gc task or custom margin.
    pub fn create(
        manifest_dir: &str,
        object_store: ObjectStore,
        compress_type: CompressionType,
    ) -> Self {
        Self::new(manifest_dir, object_store, compress_type, None, None, None)
    }

    /// Returns the configured checkpointer, if any.
    #[inline]
    pub(crate) fn checkpointer(
        &self,
    ) -> &Option<Arc<dyn Checkpointer<Checkpoint = S, MetaAction = M>>> {
        &self.checkpointer
    }

    /// Records `version` as the last version covered by a checkpoint.
    #[inline]
    pub(crate) fn set_last_checkpoint_version(&self, version: ManifestVersion) {
        self.last_checkpoint_version
            .store(version, Ordering::Relaxed);
    }

    /// Update inner state.
    pub fn update_state(&self, version: ManifestVersion, protocol: Option<ProtocolAction>) {
        self.inner.update_state(version, protocol);
    }

    /// Encodes `checkpoint` and writes it to the manifest store.
    ///
    /// # Errors
    /// Fails with `ManifestProtocolForbidWriteSnafu` when the checkpoint's
    /// protocol is not writable by this node, or with any encode/store error.
    pub(crate) async fn save_checkpoint(&self, checkpoint: &RegionCheckpoint) -> Result<()> {
        ensure!(
            checkpoint
                .protocol
                .is_writable(self.inner.supported_writer_version),
            ManifestProtocolForbidWriteSnafu {
                min_version: checkpoint.protocol.min_writer_version,
                supported_version: self.inner.supported_writer_version,
            }
        );
        let bytes = checkpoint.encode()?;
        self.manifest_store()
            .save_checkpoint(checkpoint.last_version, &bytes)
            .await
    }

    /// Runs a checkpoint when `version` is at least `checkpoint_actions_margin`
    /// versions ahead of the last checkpoint version.
    pub(crate) async fn may_do_checkpoint(&self, version: ManifestVersion) -> Result<()> {
        let last_checkpoint_version = self.last_checkpoint_version.load(Ordering::Relaxed);
        // `saturating_sub` keeps the distance at zero when a checkpoint newer
        // than `version` was already recorded (e.g. by a concurrent update),
        // instead of underflowing the unsigned subtraction.
        let distance = version.saturating_sub(last_checkpoint_version);
        if distance >= u64::from(self.checkpoint_actions_margin) {
            let s = self.do_checkpoint().await?;
            debug!("Manifest checkpoint, checkpoint: {:#?}", s);
        }

        Ok(())
    }

    /// Returns the object store wrapper that holds the manifest files.
    #[inline]
    pub(crate) fn manifest_store(&self) -> &Arc<ManifestObjectStore> {
        self.inner.manifest_store()
    }
}
#[async_trait]
impl<S: 'static + Checkpoint<Error = Error>, M: 'static + MetaAction<Error = Error>> Manifest
    for ManifestImpl<S, M>
{
    type Error = Error;
    type Checkpoint = S;
    type MetaAction = M;
    type MetaActionIterator = MetaActionIteratorImpl<M>;

    /// Persists `action_list`, then gives the checkpoint logic a chance to run.
    /// Returns the version the actions were saved under.
    async fn update(&self, action_list: M) -> Result<ManifestVersion> {
        let saved_version = self.inner.save(action_list).await?;
        self.may_do_checkpoint(saved_version).await?;

        Ok(saved_version)
    }

    /// Scans the stored actions for the `start`/`end` version range.
    async fn scan(
        &self,
        start: ManifestVersion,
        end: ManifestVersion,
    ) -> Result<Self::MetaActionIterator> {
        self.inner.scan(start, end).await
    }

    /// Delegates to the configured checkpointer and remembers the version of
    /// the checkpoint it produced. Returns `None` without a checkpointer.
    async fn do_checkpoint(&self) -> Result<Option<S>> {
        let checkpointer = match &self.checkpointer {
            Some(checkpointer) => checkpointer,
            None => return Ok(None),
        };

        let produced = checkpointer.do_checkpoint(self).await?;
        if let Some(cp) = produced.as_ref() {
            self.set_last_checkpoint_version(cp.last_version());
        }

        Ok(produced)
    }

    /// Loads the most recent checkpoint from the store.
    async fn last_checkpoint(&self) -> Result<Option<S>> {
        self.inner.last_checkpoint().await
    }

    /// Returns the current value of the in-memory version counter.
    fn last_version(&self) -> ManifestVersion {
        self.inner.last_version()
    }

    /// Starts the gc task on the background runtime, if there is one.
    async fn start(&self) -> Result<()> {
        if let Some(gc) = self.gc_task.as_ref() {
            gc.start(common_runtime::bg_runtime())
                .context(StartManifestGcTaskSnafu)?;
        }

        Ok(())
    }

    /// Stops the gc task, if there is one.
    async fn stop(&self) -> Result<()> {
        if let Some(gc) = self.gc_task.as_ref() {
            gc.stop().await.context(StopManifestGcTaskSnafu)?;
        }

        Ok(())
    }
}
/// State shared between `ManifestImpl` and its gc task.
#[derive(Debug)]
struct ManifestImplInner<S: Checkpoint<Error = Error>, M: MetaAction<Error = Error>> {
// Object store wrapper holding delta and checkpoint files.
store: Arc<ManifestObjectStore>,
// In-memory version counter; `save` takes the current value and increments it.
version: AtomicU64,
/// Current using protocol
protocol: ArcSwap<ProtocolAction>,
/// Current node supported protocols (reader_version, writer_version)
supported_reader_version: ProtocolVersion,
supported_writer_version: ProtocolVersion,
// Ties the checkpoint and action types to the struct without storing them.
_phantom: PhantomData<(S, M)>,
}
/// Iterator that decodes raw manifest logs into meta actions of type `M`.
pub struct MetaActionIteratorImpl<M: MetaAction<Error = Error>> {
// Source of raw (version, bytes) log entries.
log_iter: ObjectStoreLogIterator,
// Reader version passed to `M::decode` for every entry.
reader_version: ProtocolVersion,
// Most recent protocol returned by `M::decode`, if any entry carried one.
last_protocol: Option<ProtocolAction>,
_phantom: PhantomData<M>,
}
impl<M: MetaAction<Error = Error>> MetaActionIteratorImpl<M> {
/// Returns the protocol of the last decoded entry that carried one, or
/// `None` if no such entry has been seen so far.
pub fn last_protocol(&self) -> &Option<ProtocolAction> {
&self.last_protocol
}
}
#[async_trait]
impl<M: MetaAction<Error = Error>> MetaActionIterator for MetaActionIteratorImpl<M> {
    type Error = Error;
    type MetaAction = M;

    /// Reads and decodes the next log entry.
    ///
    /// Returns `None` once the underlying log iterator is exhausted. When the
    /// decoded entry carries a protocol, it is remembered as the last protocol.
    async fn next_action(&mut self) -> Result<Option<(ManifestVersion, M)>> {
        let (version, raw) = match self.log_iter.next_log().await? {
            Some(entry) => entry,
            None => return Ok(None),
        };

        let (actions, protocol) = M::decode(&raw, self.reader_version)?;
        if let Some(found) = protocol {
            self.last_protocol = Some(found);
        }

        Ok(Some((version, actions)))
    }
}
/// Repeated task that purges manifest files already covered by the last checkpoint.
struct ManifestGcTask<S: Checkpoint<Error = Error>, M: MetaAction<Error = Error>> {
// Shared manifest state; the task only uses its `store`.
inner: Arc<ManifestImplInner<S, M>>,
}
#[async_trait::async_trait]
impl<S: Checkpoint<Error = Error>, M: MetaAction<Error = Error>> TaskFunction<Error>
    for ManifestGcTask<S, M>
{
    /// Name of the gc task.
    fn name(&self) -> &str {
        "region-manifest-gc"
    }

    /// Deletes files made obsolete by the last checkpoint; does nothing when
    /// no checkpoint exists yet.
    async fn call(&mut self) -> Result<()> {
        let store = &self.inner.store;
        let last_version = match store.load_last_checkpoint().await? {
            Some((version, _)) => version,
            None => return Ok(()),
        };

        // Purge all manifest <= last_version and checkpoint files < last_version.
        let deleted = store.delete_until(last_version + 1, true).await?;
        debug!(
            "Deleted {} logs from region manifest storage(path={}), last_version: {}.",
            deleted,
            store.path(),
            last_version,
        );

        Ok(())
    }
}
impl<S: Checkpoint<Error = Error>, M: MetaAction<Error = Error>> ManifestImplInner<S, M> {
    /// Creates the inner state with version `0`, a fresh `ProtocolAction` and
    /// the reader/writer versions supported by this node.
    fn new(manifest_dir: &str, object_store: ObjectStore, compress_type: CompressionType) -> Self {
        let (reader_version, writer_version) = action::supported_protocol_version();

        Self {
            store: Arc::new(ManifestObjectStore::new(
                manifest_dir,
                object_store,
                compress_type,
            )),
            version: AtomicU64::new(0),
            protocol: ArcSwap::new(Arc::new(ProtocolAction::new())),
            supported_reader_version: reader_version,
            supported_writer_version: writer_version,
            _phantom: PhantomData,
        }
    }

    /// Returns the manifest object store.
    #[inline]
    fn manifest_store(&self) -> &Arc<ManifestObjectStore> {
        &self.store
    }

    /// Increments the version counter and returns the value it held *before*
    /// the increment.
    #[inline]
    fn inc_version(&self) -> ManifestVersion {
        self.version.fetch_add(1, Ordering::Relaxed)
    }

    /// Overwrites the version counter and, when given, the current protocol.
    fn update_state(&self, version: ManifestVersion, protocol: Option<ProtocolAction>) {
        self.version.store(version, Ordering::Relaxed);
        if let Some(p) = protocol {
            self.protocol.store(Arc::new(p));
        }
    }

    /// Returns the current value of the version counter.
    #[inline]
    fn last_version(&self) -> ManifestVersion {
        self.version.load(Ordering::Relaxed)
    }

    /// Saves `action_list` under the next version and returns that version.
    ///
    /// For version `0`, or when the current protocol's writer version is older
    /// than the supported one, a protocol action is attached to the list and
    /// the in-memory protocol is upgraded.
    ///
    /// # Errors
    /// Fails with `ManifestProtocolForbidWriteSnafu` when the current protocol
    /// is not writable by this node, or with any encode/store error.
    async fn save(&self, mut action_list: M) -> Result<ManifestVersion> {
        let protocol = self.protocol.load();

        ensure!(
            protocol.is_writable(self.supported_writer_version),
            ManifestProtocolForbidWriteSnafu {
                min_version: protocol.min_writer_version,
                supported_version: self.supported_writer_version,
            }
        );

        // NOTE(review): the version is taken before the write; it is not
        // given back when the write below fails.
        let version = self.inc_version();

        if version == 0 || protocol.min_writer_version < self.supported_writer_version {
            let new_protocol = ProtocolAction {
                min_reader_version: self.supported_reader_version,
                min_writer_version: self.supported_writer_version,
            };
            action_list.set_protocol(new_protocol.clone());

            logging::info!(
                "Updated manifest protocol from {} to {}.",
                protocol,
                new_protocol
            );

            self.protocol.store(Arc::new(new_protocol));
        }

        logging::debug!(
            "Save region metadata action: {:?}, version: {}",
            action_list,
            version
        );

        self.store.save(version, &action_list.encode()?).await?;

        Ok(version)
    }

    /// Returns an iterator over the stored actions for the `start`/`end` range.
    async fn scan(
        &self,
        start: ManifestVersion,
        end: ManifestVersion,
    ) -> Result<MetaActionIteratorImpl<M>> {
        Ok(MetaActionIteratorImpl {
            log_iter: self.store.scan(start, end).await?,
            reader_version: self.supported_reader_version,
            last_protocol: None,
            _phantom: PhantomData,
        })
    }

    /// Loads and decodes the last checkpoint, if one exists.
    ///
    /// # Panics
    /// Panics if the decoded checkpoint's version is older than the version
    /// recorded in the checkpoint metadata.
    async fn last_checkpoint(&self) -> Result<Option<S>> {
        let protocol = self.protocol.load();
        let last_checkpoint = self.store.load_last_checkpoint().await?;

        if let Some((version, bytes)) = last_checkpoint {
            let checkpoint = S::decode(&bytes, protocol.min_reader_version)?;
            assert!(checkpoint.last_version() >= version);
            if checkpoint.last_version() > version {
                // It happens when saving checkpoint successfully, but failed at saving checkpoint metadata(the "_last_checkpoint" file).
                // Then we try to use the old checkpoint and do the checkpoint next time.
                // If the old checkpoint was deleted, it's fine that we return the latest checkpoint.
                // The only side effect is leaving some unused checkpoint files,
                // and they will be purged by gc task.
                // Argument order follows the placeholders: checkpoint version, path, metadata version.
                warn!("The checkpoint manifest version {} in {} is greater than checkpoint metadata version {}.", checkpoint.last_version(), self.store.path(), version);

                if let Some((_, bytes)) = self.store.load_checkpoint(version).await? {
                    let old_checkpoint = S::decode(&bytes, protocol.min_reader_version)?;
                    return Ok(Some(old_checkpoint));
                }
            }
            Ok(Some(checkpoint))
        } else {
            Ok(None)
        }
    }
}

View File

@@ -1,690 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Region manifest impl
use std::any::Any;
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Arc;
use std::time::Duration;
use async_trait::async_trait;
use common_datasource::compression::CompressionType;
use common_telemetry::{info, warn};
use object_store::ObjectStore;
use store_api::manifest::action::ProtocolAction;
use store_api::manifest::{
Manifest, ManifestLogStorage, ManifestVersion, MetaActionIterator, MIN_VERSION,
};
use crate::error::{ManifestCheckpointSnafu, Result};
use crate::manifest::action::*;
use crate::manifest::checkpoint::Checkpointer;
use crate::manifest::ManifestImpl;
/// Manifest of a region: `ManifestImpl` specialized to region checkpoints and
/// region meta action lists.
pub type RegionManifest = ManifestImpl<RegionCheckpoint, RegionMetaActionList>;
/// Checkpointer for region manifests, bounded by the flushed manifest version.
#[derive(Debug)]
pub struct RegionManifestCheckpointer {
// The latest manifest version when flushing memtables.
// Checkpoint can't exceed over flushed manifest version because we have to keep
// the region metadata for replaying WAL to ensure correct data schema.
flushed_manifest_version: AtomicU64,
}
impl RegionManifestCheckpointer {
    /// Raises the flushed manifest version to `manifest_version`.
    ///
    /// The stored value never decreases: a smaller `manifest_version` leaves
    /// it untouched.
    pub(crate) fn set_flushed_manifest_version(&self, manifest_version: ManifestVersion) {
        // A single atomic max avoids the lost update that a separate
        // load + store could suffer under concurrent callers.
        let _ = self
            .flushed_manifest_version
            .fetch_max(manifest_version, Ordering::Relaxed);
    }
}
#[async_trait]
impl Checkpointer for RegionManifestCheckpointer {
    type Checkpoint = RegionCheckpoint;
    type MetaAction = RegionMetaActionList;

    /// Compacts the manifest logs that follow the last checkpoint into a new
    /// checkpoint, up to the flushed manifest version.
    ///
    /// Returns `None` when there is nothing to compact. Fails with
    /// `ManifestCheckpointSnafu` on an action that cannot be applied.
    async fn do_checkpoint(
        &self,
        manifest: &ManifestImpl<RegionCheckpoint, RegionMetaActionList>,
    ) -> Result<Option<RegionCheckpoint>> {
        let last_checkpoint = manifest.last_checkpoint().await?;
        let current_version = manifest.last_version();

        // Continue from the previous checkpoint, or start from scratch.
        let (start_version, mut protocol, mut manifest_builder) = match last_checkpoint {
            Some(previous) => (
                previous.last_version + 1,
                previous.protocol,
                RegionManifestDataBuilder::with_checkpoint(previous.checkpoint),
            ),
            None => (
                MIN_VERSION,
                ProtocolAction::default(),
                RegionManifestDataBuilder::default(),
            ),
        };

        // Exclusive upper bound: never go past the flushed manifest version.
        let end_version =
            current_version.min(self.flushed_manifest_version.load(Ordering::Relaxed)) + 1;
        if start_version >= end_version {
            return Ok(None);
        }

        info!("Begin to do region manifest checkpoint, path: {}, start_version: {}, end_version: {}, flushed_manifest_version: {}",
            manifest.manifest_store().path(),
            start_version,
            end_version,
            self.flushed_manifest_version.load(Ordering::Relaxed));

        let mut log_iter = manifest.scan(start_version, end_version).await?;
        let mut last_version = start_version;
        let mut compacted_actions = 0;
        while let Some((version, action_list)) = log_iter.next_action().await? {
            for action in action_list.actions {
                match action {
                    RegionMetaAction::Change(change) => manifest_builder.apply_change(change),
                    RegionMetaAction::Edit(edit) => manifest_builder.apply_edit(version, edit),
                    RegionMetaAction::Protocol(p) => protocol = p,
                    action => {
                        return ManifestCheckpointSnafu {
                            msg: format!("can't apply region action: {:?}", action),
                        }
                        .fail();
                    }
                }
            }
            last_version = version;
            compacted_actions += 1;
        }

        if compacted_actions == 0 {
            return Ok(None);
        }

        let checkpoint = RegionCheckpoint {
            protocol,
            last_version,
            compacted_actions,
            checkpoint: Some(manifest_builder.build()),
        };
        manifest.save_checkpoint(&checkpoint).await?;

        match manifest
            .manifest_store()
            .delete(start_version, last_version + 1)
            .await
        {
            // We only log when the error kind isn't `NotFound`.
            // It doesn't matter when deletion fails, they will be purged by gc task.
            Err(e) if !e.is_object_to_delete_not_found() => {
                warn!(
                    "Failed to delete manifest logs [{},{}] in path: {}. err: {}",
                    start_version,
                    last_version,
                    manifest.manifest_store().path(),
                    e
                );
            }
            _ => {}
        }

        info!("Region manifest checkpoint, path: {}, start_version: {}, last_version: {}, compacted actions: {}",
            manifest.manifest_store().path(),
            start_version,
            last_version,
            compacted_actions);

        Ok(Some(checkpoint))
    }

    /// Exposes `self` as `Any` so callers can downcast to the concrete type.
    fn as_any(&self) -> &dyn Any {
        self
    }
}
impl RegionManifest {
    /// Creates a region manifest with a `RegionManifestCheckpointer` whose
    /// flushed manifest version starts at `0`.
    pub fn with_checkpointer(
        manifest_dir: &str,
        object_store: ObjectStore,
        compress_type: CompressionType,
        checkpoint_actions_margin: Option<u16>,
        gc_duration: Option<Duration>,
    ) -> Self {
        let checkpointer = Arc::new(RegionManifestCheckpointer {
            flushed_manifest_version: AtomicU64::new(0),
        });

        Self::new(
            manifest_dir,
            object_store,
            compress_type,
            checkpoint_actions_margin,
            gc_duration,
            Some(checkpointer),
        )
    }

    /// Forwards the flushed manifest version to the checkpointer. Does nothing
    /// when there is no checkpointer or it is not a `RegionManifestCheckpointer`.
    pub fn set_flushed_manifest_version(&self, manifest_version: ManifestVersion) {
        let region_checkpointer = self.checkpointer().as_ref().and_then(|checkpointer| {
            checkpointer
                .as_any()
                .downcast_ref::<RegionManifestCheckpointer>()
        });

        if let Some(region_checkpointer) = region_checkpointer {
            region_checkpointer.set_flushed_manifest_version(manifest_version);
        }
    }
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use common_test_util::temp_dir::create_temp_dir;
use object_store::services::{Fs, S3};
use object_store::test_util::{s3_test_config, TempFolder};
use object_store::ObjectStore;
use store_api::manifest::action::ProtocolAction;
use store_api::manifest::{Manifest, MetaActionIterator, MAX_VERSION};
use super::*;
use crate::manifest::manifest_compress_type;
use crate::manifest::test_utils::*;
use crate::metadata::RegionMetadata;
use crate::sst::FileId;
/// Basic manifest scenario on a filesystem store with compression enabled.
#[tokio::test]
async fn test_fs_region_manifest_compress() {
    let compressed = new_fs_manifest(true, None).await;
    test_region_manifest(&compressed).await;
}
/// Basic manifest scenario on a filesystem store with compression disabled.
#[tokio::test]
async fn test_fs_region_manifest_uncompress() {
    let uncompressed = new_fs_manifest(false, None).await;
    test_region_manifest(&uncompressed).await;
}
/// Basic manifest scenario on S3 with compression enabled.
#[tokio::test]
async fn test_s3_region_manifest_compress() {
    // Nothing to do unless an S3 test configuration is available.
    if s3_test_config().is_none() {
        return;
    }

    let (manifest, temp_dir) = new_s3_manifest(true, None).await;
    test_region_manifest(&manifest).await;
    temp_dir.remove_all().await.unwrap();
}
/// Basic manifest scenario on S3 with compression disabled.
#[tokio::test]
async fn test_s3_region_manifest_uncompress() {
    // Nothing to do unless an S3 test configuration is available.
    if s3_test_config().is_none() {
        return;
    }

    let (manifest, temp_dir) = new_s3_manifest(false, None).await;
    test_region_manifest(&manifest).await;
    temp_dir.remove_all().await.unwrap();
}
/// Builds and starts a region manifest on a filesystem object store rooted in
/// a fresh temp directory.
async fn new_fs_manifest(compress: bool, gc_duration: Option<Duration>) -> RegionManifest {
    let tmp_dir = create_temp_dir("test_region_manifest");
    let root = tmp_dir.path().to_string_lossy();

    let mut fs_builder = Fs::default();
    let _ = fs_builder.root(&root);
    let store = ObjectStore::new(fs_builder).unwrap().finish();

    let compress_type = manifest_compress_type(compress);
    let manifest =
        RegionManifest::with_checkpointer("/manifest/", store, compress_type, None, gc_duration);
    manifest.start().await.unwrap();

    manifest
}
/// Builds and starts a region manifest on the S3 bucket described by
/// `s3_test_config()`.
///
/// Returns the manifest together with a `TempFolder` created on the same store
/// with path `"/"`, which callers in this module use for cleanup.
///
/// # Panics
/// Panics when no S3 test configuration is available.
async fn new_s3_manifest(
    compress: bool,
    gc_duration: Option<Duration>,
) -> (RegionManifest, TempFolder) {
    let s3_config = s3_test_config().unwrap();
    let mut builder = S3::default();
    let _ = builder
        .root(&s3_config.root)
        .access_key_id(&s3_config.access_key_id)
        .secret_access_key(&s3_config.secret_access_key)
        .bucket(&s3_config.bucket);
    // The region is optional; only set it when configured.
    if let Some(region) = &s3_config.region {
        let _ = builder.region(region);
    }

    let store = ObjectStore::new(builder).unwrap().finish();
    let temp_folder = TempFolder::new(&store, "/");
    let manifest = RegionManifest::with_checkpointer(
        "/manifest/",
        store,
        manifest_compress_type(compress),
        None,
        gc_duration,
    );
    manifest.start().await.unwrap();

    (manifest, temp_folder)
}
/// Shared scenario: saves a change and two edits, and checks that scanning
/// returns them in order with the expected versions. Stops the manifest at the end.
async fn test_region_manifest(manifest: &RegionManifest) {
common_telemetry::init_default_ut_logging();
let region_meta = Arc::new(build_region_meta());
// A fresh manifest has nothing to scan.
assert_eq!(
None,
manifest
.scan(0, MAX_VERSION)
.await
.unwrap()
.next_action()
.await
.unwrap()
);
// Save the first action list: a single region change.
assert!(manifest
.update(RegionMetaActionList::with_action(RegionMetaAction::Change(
RegionChange {
metadata: region_meta.as_ref().into(),
committed_sequence: 99,
},
)))
.await
.is_ok());
let mut iter = manifest.scan(0, MAX_VERSION).await.unwrap();
let (v, action_list) = iter.next_action().await.unwrap().unwrap();
assert_eq!(0, v);
// Version 0 is expected to hold a protocol action followed by the change.
assert_eq!(2, action_list.actions.len());
let protocol = &action_list.actions[0];
assert!(matches!(
protocol,
RegionMetaAction::Protocol(ProtocolAction { .. })
));
let action = &action_list.actions[1];
match action {
RegionMetaAction::Change(c) => {
assert_eq!(
RegionMetadata::try_from(c.metadata.clone()).unwrap(),
*region_meta
);
assert_eq!(c.committed_sequence, 99);
}
_ => unreachable!(),
}
// Save some actions
assert!(manifest
.update(RegionMetaActionList::new(vec![
RegionMetaAction::Edit(build_region_edit(1, &[FileId::random()], &[])),
RegionMetaAction::Edit(build_region_edit(
2,
&[FileId::random(), FileId::random()],
&[],
)),
]))
.await
.is_ok());
// Scan again from the start: version 0 is unchanged and version 1 holds the two edits.
let mut iter = manifest.scan(0, MAX_VERSION).await.unwrap();
let (v, action_list) = iter.next_action().await.unwrap().unwrap();
assert_eq!(0, v);
assert_eq!(2, action_list.actions.len());
let protocol = &action_list.actions[0];
assert!(matches!(
protocol,
RegionMetaAction::Protocol(ProtocolAction { .. })
));
let action = &action_list.actions[1];
match action {
RegionMetaAction::Change(c) => {
assert_eq!(
RegionMetadata::try_from(c.metadata.clone()).unwrap(),
*region_meta
);
assert_eq!(c.committed_sequence, 99);
}
_ => unreachable!(),
}
let (v, action_list) = iter.next_action().await.unwrap().unwrap();
assert_eq!(1, v);
assert_eq!(2, action_list.actions.len());
assert!(matches!(&action_list.actions[0], RegionMetaAction::Edit(_)));
assert!(matches!(&action_list.actions[1], RegionMetaAction::Edit(_)));
// Reach end
assert!(iter.next_action().await.unwrap().is_none());
manifest.stop().await.unwrap();
}
/// Scans the whole manifest and asserts that it yields exactly `expected`
/// entries with consecutive versions starting at `start_version`.
async fn assert_scan(manifest: &RegionManifest, start_version: ManifestVersion, expected: u64) {
    let mut log_iter = manifest.scan(0, MAX_VERSION).await.unwrap();

    let mut seen: u64 = 0;
    while let Some((version, _)) = log_iter.next_action().await.unwrap() {
        assert_eq!(version, start_version + seen);
        seen += 1;
    }

    assert_eq!(expected, seen);
}
/// Checkpoint scenario on a filesystem store with compression enabled.
#[tokio::test(flavor = "multi_thread")]
async fn test_fs_region_manifest_checkpoint_compress() {
    let gc_interval = Duration::from_millis(50);
    let manifest = new_fs_manifest(true, Some(gc_interval)).await;
    test_region_manifest_checkpoint(&manifest, gc_interval).await;
}
/// Checkpoint scenario on a filesystem store with compression disabled.
#[tokio::test]
async fn test_fs_region_manifest_checkpoint_uncompress() {
    let gc_interval = Duration::from_millis(50);
    let manifest = new_fs_manifest(false, Some(gc_interval)).await;
    test_region_manifest_checkpoint(&manifest, gc_interval).await;
}
/// Checkpoint scenario on S3 with compression enabled.
#[tokio::test]
async fn test_s3_region_manifest_checkpoint_compress() {
    // Nothing to do unless an S3 test configuration is available.
    if s3_test_config().is_none() {
        return;
    }

    let gc_interval = Duration::from_millis(50);
    let (manifest, temp_dir) = new_s3_manifest(true, Some(gc_interval)).await;
    test_region_manifest_checkpoint(&manifest, gc_interval).await;
    temp_dir.remove_all().await.unwrap();
}
/// Checkpoint scenario on S3 with compression disabled.
#[tokio::test]
async fn test_s3_region_manifest_checkpoint_uncompress() {
    // Nothing to do unless an S3 test configuration is available.
    if s3_test_config().is_none() {
        return;
    }

    let gc_interval = Duration::from_millis(50);
    let (manifest, temp_dir) = new_s3_manifest(false, Some(gc_interval)).await;
    test_region_manifest_checkpoint(&manifest, gc_interval).await;
    temp_dir.remove_all().await.unwrap();
}
/// Shared scenario: saves actions, runs three checkpoints bounded by the
/// flushed manifest version, and finally waits for the gc task to purge the
/// older checkpoints. `test_gc_duration` is the gc interval the manifest was
/// created with. Stops the manifest at the end.
async fn test_region_manifest_checkpoint(
manifest: &RegionManifest,
test_gc_duration: Duration,
) {
common_telemetry::init_default_ut_logging();
let region_meta = Arc::new(build_region_meta());
let new_region_meta = Arc::new(build_altered_region_meta());
let file = FileId::random();
let file_ids = vec![FileId::random(), FileId::random()];
// Three action lists, saved as versions 0, 1 and 2.
let actions: Vec<RegionMetaActionList> = vec![
RegionMetaActionList::with_action(RegionMetaAction::Change(RegionChange {
metadata: region_meta.as_ref().into(),
committed_sequence: 1,
})),
RegionMetaActionList::new(vec![
RegionMetaAction::Edit(build_region_edit(2, &[file], &[])),
RegionMetaAction::Edit(build_region_edit(3, &file_ids, &[file])),
]),
RegionMetaActionList::with_action(RegionMetaAction::Change(RegionChange {
metadata: new_region_meta.as_ref().into(),
committed_sequence: 99,
})),
];
for action in actions {
let _ = manifest.update(action).await.unwrap();
}
// No checkpoint yet: the flushed manifest version has not been set.
assert!(manifest.last_checkpoint().await.unwrap().is_none());
assert_scan(manifest, 0, 3).await;
// update flushed manifest version for doing checkpoint
manifest.set_flushed_manifest_version(2);
let mut checkpoint_versions = vec![];
// do a checkpoint
let checkpoint = manifest.do_checkpoint().await.unwrap().unwrap();
let last_checkpoint = manifest.last_checkpoint().await.unwrap().unwrap();
assert_eq!(checkpoint, last_checkpoint);
assert_eq!(checkpoint.compacted_actions, 3);
assert_eq!(checkpoint.last_version, 2);
checkpoint_versions.push(2);
let alterd_raw_meta = RawRegionMetadata::from(new_region_meta.as_ref());
assert!(matches!(&checkpoint.checkpoint, Some(RegionManifestData {
committed_sequence: 99,
metadata,
version: Some(RegionVersion {
manifest_version: 1,
flushed_sequence: Some(3),
files,
}),
}) if files.len() == 2 &&
files.contains_key(&file_ids[0]) &&
files.contains_key(&file_ids[1]) &&
*metadata == alterd_raw_meta));
// all actions were compacted
assert_eq!(
None,
manifest
.scan(0, MAX_VERSION)
.await
.unwrap()
.next_action()
.await
.unwrap()
);
// Nothing new to compact, so a second checkpoint is a no-op.
assert!(manifest.do_checkpoint().await.unwrap().is_none());
let last_checkpoint = manifest.last_checkpoint().await.unwrap().unwrap();
assert_eq!(checkpoint, last_checkpoint);
// add new actions
let new_file = FileId::random();
let actions: Vec<RegionMetaActionList> = vec![
RegionMetaActionList::with_action(RegionMetaAction::Change(RegionChange {
metadata: region_meta.as_ref().into(),
committed_sequence: 200,
})),
RegionMetaActionList::new(vec![RegionMetaAction::Edit(build_region_edit(
201,
&[new_file],
&file_ids,
))]),
];
for action in actions {
let _ = manifest.update(action).await.unwrap();
}
assert_scan(manifest, 3, 2).await;
// do another checkpoint
// compacted RegionChange
manifest.set_flushed_manifest_version(3);
let checkpoint = manifest.do_checkpoint().await.unwrap().unwrap();
let last_checkpoint = manifest.last_checkpoint().await.unwrap().unwrap();
assert_eq!(checkpoint, last_checkpoint);
assert_eq!(checkpoint.compacted_actions, 1);
assert_eq!(checkpoint.last_version, 3);
checkpoint_versions.push(3);
assert!(matches!(&checkpoint.checkpoint, Some(RegionManifestData {
committed_sequence: 200,
metadata,
version: Some(RegionVersion {
manifest_version: 1,
flushed_sequence: Some(3),
files,
}),
}) if files.len() == 2 &&
files.contains_key(&file_ids[0]) &&
files.contains_key(&file_ids[1]) &&
*metadata == RawRegionMetadata::from(region_meta.as_ref())));
// Only version 4 (the edit) is left in the log.
assert_scan(manifest, 4, 1).await;
// compacted RegionEdit
manifest.set_flushed_manifest_version(4);
let checkpoint = manifest.do_checkpoint().await.unwrap().unwrap();
let last_checkpoint = manifest.last_checkpoint().await.unwrap().unwrap();
assert_eq!(checkpoint, last_checkpoint);
assert_eq!(checkpoint.compacted_actions, 1);
assert_eq!(checkpoint.last_version, 4);
checkpoint_versions.push(4);
assert!(matches!(&checkpoint.checkpoint, Some(RegionManifestData {
committed_sequence: 200,
metadata,
version: Some(RegionVersion {
manifest_version: 4,
flushed_sequence: Some(201),
files,
}),
}) if files.len() == 1 &&
files.contains_key(&new_file) &&
*metadata == RawRegionMetadata::from(region_meta.as_ref())));
// all actions were compacted
assert_eq!(
None,
manifest
.scan(0, MAX_VERSION)
.await
.unwrap()
.next_action()
.await
.unwrap()
);
// wait for gc
tokio::time::sleep(test_gc_duration * 3).await;
for v in checkpoint_versions {
if v < 4 {
// ensure old checkpoints were purged.
assert!(manifest
.manifest_store()
.load_checkpoint(v)
.await
.unwrap()
.is_none());
} else {
// the last checkpoint still exists.
let last_checkpoint = manifest.last_checkpoint().await.unwrap().unwrap();
assert_eq!(checkpoint, last_checkpoint);
}
}
manifest.stop().await.unwrap();
}
/// Saves a change, two edits, a truncate and another change, then checks that
/// scanning returns all four versions in order, including the truncate action.
#[tokio::test]
async fn test_region_manifest_truncate() {
common_telemetry::init_default_ut_logging();
let manifest = new_fs_manifest(false, None).await;
let region_meta = Arc::new(build_region_meta());
let committed_sequence = 99;
let file = FileId::random();
let file_ids = vec![FileId::random(), FileId::random()];
// Save some actions.
let actions: Vec<RegionMetaActionList> = vec![
RegionMetaActionList::with_action(RegionMetaAction::Change(RegionChange {
metadata: region_meta.as_ref().into(),
committed_sequence: 1,
})),
RegionMetaActionList::new(vec![
RegionMetaAction::Edit(build_region_edit(2, &[file], &[])),
RegionMetaAction::Edit(build_region_edit(3, &file_ids, &[file])),
]),
RegionMetaActionList::with_action(RegionMetaAction::Truncate(RegionTruncate {
region_id: 0.into(),
committed_sequence,
})),
RegionMetaActionList::with_action(RegionMetaAction::Change(RegionChange {
metadata: region_meta.as_ref().into(),
committed_sequence: 1,
})),
];
for action in actions {
manifest.update(action).await.unwrap();
}
// Scan manifest.
let mut iter = manifest.scan(0, MAX_VERSION).await.unwrap();
// Version 0: protocol action followed by the first change.
let (v, action_list) = iter.next_action().await.unwrap().unwrap();
info!("action_list = {:?}", action_list.actions);
assert_eq!(0, v);
assert_eq!(2, action_list.actions.len());
let protocol = &action_list.actions[0];
assert!(matches!(
protocol,
RegionMetaAction::Protocol(ProtocolAction { .. })
));
let change = &action_list.actions[1];
assert!(matches!(
change,
RegionMetaAction::Change(RegionChange {
committed_sequence: 1,
..
})
));
// Version 1: the two edits.
let (v, action_list) = iter.next_action().await.unwrap().unwrap();
assert_eq!(1, v);
assert_eq!(2, action_list.actions.len());
assert!(matches!(&action_list.actions[0], RegionMetaAction::Edit(_)));
assert!(matches!(&action_list.actions[1], RegionMetaAction::Edit(_)));
// Version 2: the truncate action.
let (v, action_list) = iter.next_action().await.unwrap().unwrap();
assert_eq!(2, v);
assert_eq!(1, action_list.actions.len());
let truncate = &action_list.actions[0];
assert!(matches!(
truncate,
RegionMetaAction::Truncate(RegionTruncate {
committed_sequence: 99,
..
})
));
// Version 3: the change saved after the truncate.
let (v, action_list) = iter.next_action().await.unwrap().unwrap();
assert_eq!(3, v);
assert_eq!(1, action_list.actions.len());
let change = &action_list.actions[0];
assert!(matches!(
change,
RegionMetaAction::Change(RegionChange {
committed_sequence: 1,
..
})
));
// Reach end
assert!(iter.next_action().await.unwrap().is_none());
}
}

View File

@@ -1,741 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::HashMap;
use std::iter::Iterator;
use std::str::FromStr;
use async_trait::async_trait;
use common_datasource::compression::CompressionType;
use common_telemetry::logging;
use futures::TryStreamExt;
use lazy_static::lazy_static;
use object_store::{raw_normalize_path, util, Entry, ErrorKind, ObjectStore};
use regex::Regex;
use serde::{Deserialize, Serialize};
use snafu::{ensure, ResultExt};
use store_api::manifest::{LogIterator, ManifestLogStorage, ManifestVersion};
use crate::error::{
CompressObjectSnafu, DecodeJsonSnafu, DecompressObjectSnafu, DeleteObjectSnafu,
EncodeJsonSnafu, Error, InvalidScanIndexSnafu, ListObjectsSnafu, ReadObjectSnafu, Result,
Utf8Snafu, WriteObjectSnafu,
};
lazy_static! {
// Matches names that start with one or more digits followed by `.json`.
// The pattern is not anchored at the end, so a trailing extension still matches.
static ref DELTA_RE: Regex = Regex::new("^\\d+\\.json").unwrap();
// Matches names that start with one or more digits followed by `.checkpoint`.
// The pattern is not anchored at the end, so a trailing extension still matches.
static ref CHECKPOINT_RE: Regex = Regex::new("^\\d+\\.checkpoint").unwrap();
}
/// File name, relative to the manifest directory, used by `last_checkpoint_path`.
const LAST_CHECKPOINT_FILE: &str = "_last_checkpoint";
/// Compression type returned by `manifest_compress_type(true)`.
const DEFAULT_MANIFEST_COMPRESSION_TYPE: CompressionType = CompressionType::Gzip;
/// Due to backward compatibility, it is possible that the user's manifest file has not been compressed.
/// So when we encounter problems, we need to fall back to `FALL_BACK_COMPRESS_TYPE` for processing.
const FALL_BACK_COMPRESS_TYPE: CompressionType = CompressionType::Uncompressed;
/// Maps the `compress` flag to a compression type:
/// `DEFAULT_MANIFEST_COMPRESSION_TYPE` when `true`, `FALL_BACK_COMPRESS_TYPE` otherwise.
#[inline]
pub const fn manifest_compress_type(compress: bool) -> CompressionType {
    match compress {
        true => DEFAULT_MANIFEST_COMPRESSION_TYPE,
        false => FALL_BACK_COMPRESS_TYPE,
    }
}
/// Returns the delta file name for `version`: the version zero-padded to
/// 20 digits, followed by `.json`.
#[inline]
pub fn delta_file(version: ManifestVersion) -> String {
    format!("{:020}.json", version)
}
/// Returns the checkpoint file name for `version`: the version zero-padded to
/// 20 digits, followed by `.checkpoint`.
#[inline]
pub fn checkpoint_file(version: ManifestVersion) -> String {
    format!("{:020}.checkpoint", version)
}
/// Joins `path` and `file` by plain concatenation and, unless the compression
/// type is `Uncompressed`, appends `.` plus the compression file extension.
#[inline]
pub fn gen_path(path: &str, file: &str, compress_type: CompressionType) -> String {
    match compress_type {
        CompressionType::Uncompressed => format!("{path}{file}"),
        _ => format!("{path}{file}.{}", compress_type.file_extension()),
    }
}
/// Returns the manifest version encoded in a file name, i.e. the number in
/// front of the first `.`.
///
/// # Panics
/// Panics if the part before the first `.` is not a valid version number.
#[inline]
pub fn file_version(path: &str) -> ManifestVersion {
    let stem = path.split('.').next().unwrap();
    match stem.parse() {
        Ok(version) => version,
        Err(_) => panic!("Invalid file: {path}"),
    }
}
/// Returns the compression type implied by the file extension, i.e. the text
/// after the last `.`. Falls back to `CompressionType::Uncompressed` when the
/// extension is not recognized by `CompressionType::from_str`.
///
/// For example, the file
/// `00000000000000000000.json.gz` -> `CompressionType::Gzip`
#[inline]
pub fn file_compress_type(path: &str) -> CompressionType {
    let extension = path.rsplit('.').next().unwrap_or("");
    match CompressionType::from_str(extension) {
        Ok(compress_type) => compress_type,
        Err(_) => CompressionType::Uncompressed,
    }
}
/// Returns `true` when `file_name` matches `DELTA_RE`.
#[inline]
pub fn is_delta_file(file_name: &str) -> bool {
DELTA_RE.is_match(file_name)
}
/// Returns `true` when `file_name` matches `CHECKPOINT_RE`.
#[inline]
pub fn is_checkpoint_file(file_name: &str) -> bool {
CHECKPOINT_RE.is_match(file_name)
}
/// Iterator over manifest log files that reads and decompresses each entry on demand.
pub struct ObjectStoreLogIterator {
// Store the entries are read from.
object_store: ObjectStore,
// Remaining (version, entry) pairs to read.
iter: Box<dyn Iterator<Item = (ManifestVersion, Entry)> + Send + Sync>,
}
#[async_trait]
impl LogIterator for ObjectStoreLogIterator {
    type Error = Error;

    /// Reads the next log file and returns its version with the decompressed
    /// content, or `None` when no entries are left.
    ///
    /// The compression type is derived from the entry's file name.
    async fn next_log(&mut self) -> Result<Option<(ManifestVersion, Vec<u8>)>> {
        let (version, entry) = match self.iter.next() {
            Some(item) => item,
            None => return Ok(None),
        };

        let compress_type = file_compress_type(entry.name());
        let raw = self
            .object_store
            .read(entry.path())
            .await
            .context(ReadObjectSnafu { path: entry.path() })?;
        let data = compress_type
            .decode(raw)
            .await
            .context(DecompressObjectSnafu {
                compress_type,
                path: entry.path(),
            })?;

        Ok(Some((version, data)))
    }
}
/// Manifest log storage backed by an object store.
#[derive(Clone, Debug)]
pub struct ManifestObjectStore {
// Object store holding the manifest files.
object_store: ObjectStore,
// Compression type used when writing new delta and checkpoint files.
compress_type: CompressionType,
// Manifest directory, normalized by `util::normalize_dir` in `new`.
path: String,
}
impl ManifestObjectStore {
    /// Creates a manifest store rooted at `path` (normalized with `util::normalize_dir`)
    /// that writes files with `compress_type`.
    pub fn new(path: &str, object_store: ObjectStore, compress_type: CompressionType) -> Self {
        let path = util::normalize_dir(path);
        Self {
            object_store,
            compress_type,
            path,
        }
    }

    /// Returns the delta file path under the **current** compression algorithm
    #[inline]
    fn delta_file_path(&self, version: ManifestVersion) -> String {
        let file_name = delta_file(version);
        gen_path(&self.path, &file_name, self.compress_type)
    }

    /// Returns the checkpoint file path under the **current** compression algorithm
    #[inline]
    fn checkpoint_file_path(&self, version: ManifestVersion) -> String {
        let file_name = checkpoint_file(version);
        gen_path(&self.path, &file_name, self.compress_type)
    }

    /// Returns the path of the last checkpoint file. The last checkpoint file is never
    /// compressed, so its name does not depend on the compression algorithm used by
    /// `ManifestObjectStore`.
    #[inline]
    fn last_checkpoint_path(&self) -> String {
        format!("{}{LAST_CHECKPOINT_FILE}", self.path)
    }

    /// Lists the manifest directory and maps every entry through `filter`.
    ///
    /// Entries for which `filter` returns `Some(R)` are collected into the result;
    /// entries for which it returns `None` are discarded.
    async fn get_paths<F, R>(&self, filter: F) -> Result<Vec<R>>
    where
        F: Fn(Entry) -> Option<R>,
    {
        let root = &self.path;
        let lister = self
            .object_store
            .lister_with(root)
            .await
            .context(ListObjectsSnafu { path: root })?;

        lister
            .try_filter_map(|entry| async { Ok(filter(entry)) })
            .try_collect::<Vec<_>>()
            .await
            .context(ListObjectsSnafu { path: root })
    }

    /// Returns the normalized manifest directory.
    pub(crate) fn path(&self) -> &str {
        self.path.as_str()
    }
}
/// Metadata stored in the last checkpoint file; it points to the latest checkpoint.
#[derive(Serialize, Deserialize, Debug)]
struct CheckpointMetadata {
/// Length in bytes of the checkpoint data before compression.
pub size: usize,
/// The latest version this checkpoint contains.
pub version: ManifestVersion,
/// Optional checksum; `save_checkpoint` currently always writes `None`.
pub checksum: Option<String>,
/// Optional extra metadata; `save_checkpoint` currently always writes `None`.
pub extend_metadata: Option<HashMap<String, String>>,
}
impl CheckpointMetadata {
    /// Serializes the metadata into its JSON representation.
    fn encode(&self) -> Result<impl AsRef<[u8]>> {
        let json = serde_json::to_string(self).context(EncodeJsonSnafu)?;
        Ok(json)
    }

    /// Parses metadata from the UTF-8 encoded JSON in `bs`.
    ///
    /// Fails with a UTF-8 error if `bs` is not valid UTF-8, or with a JSON decode
    /// error if the text is not valid metadata.
    fn decode(bs: &[u8]) -> Result<Self> {
        std::str::from_utf8(bs)
            .context(Utf8Snafu)
            .and_then(|text| serde_json::from_str(text).context(DecodeJsonSnafu))
    }
}
#[async_trait]
impl ManifestLogStorage for ManifestObjectStore {
type Error = Error;
type Iter = ObjectStoreLogIterator;
/// Returns an iterator over the delta files whose version lies in `[start, end)`,
/// ordered by ascending version.
///
/// Fails with an invalid scan index error if `start > end`.
async fn scan(
    &self,
    start: ManifestVersion,
    end: ManifestVersion,
) -> Result<ObjectStoreLogIterator> {
    ensure!(start <= end, InvalidScanIndexSnafu { start, end });

    let wanted = start..end;
    let mut entries: Vec<(ManifestVersion, Entry)> = self
        .get_paths(|entry| {
            let file_name = entry.name();
            if !is_delta_file(file_name) {
                return None;
            }
            let version = file_version(file_name);
            wanted.contains(&version).then_some((version, entry))
        })
        .await?;

    // The listing order is not specified, so sort by version explicitly.
    entries.sort_unstable_by_key(|(version, _)| *version);

    Ok(ObjectStoreLogIterator {
        object_store: self.object_store.clone(),
        iter: Box::new(entries.into_iter()),
    })
}
async fn delete_until(
&self,
end: ManifestVersion,
keep_last_checkpoint: bool,
) -> Result<usize> {
// Stores (entry, is_checkpoint, version) in a Vec.
let entries: Vec<_> = self
.get_paths(|entry| {
let file_name = entry.name();
let is_checkpoint = is_checkpoint_file(file_name);
if is_delta_file(file_name) || is_checkpoint_file(file_name) {
let version = file_version(file_name);
if version < end {
return Some((entry, is_checkpoint, version));
}
}
None
})
.await?;
let checkpoint_version = if keep_last_checkpoint {
// Note that the order of entries is unspecific.
entries
.iter()
.filter_map(
|(_e, is_checkpoint, version)| {
if *is_checkpoint {
Some(version)
} else {
None
}
},
)
.max()
} else {
None
};
let paths: Vec<_> = entries
.iter()
.filter(|(_e, is_checkpoint, version)| {
if let Some(max_version) = checkpoint_version {
if *is_checkpoint {
// We need to keep the checkpoint file.
version < max_version
} else {
// We can delete the log file with max_version as the checkpoint
// file contains the log file's content.
version <= max_version
}
} else {
true
}
})
.map(|e| e.0.path().to_string())
.collect();
let ret = paths.len();
logging::debug!(
"Deleting {} logs from manifest storage path {} until {}, checkpoint: {:?}, paths: {:?}",
ret,
self.path,
end,
checkpoint_version,
paths,
);
self.object_store
.remove(paths)
.await
.with_context(|_| DeleteObjectSnafu {
path: self.path.clone(),
})?;
Ok(ret)
}
/// Deletes every file in the manifest directory and then the directory itself.
///
/// All files except the delta file at version `remove_action_manifest` are removed
/// first; that delta file is removed last, together with the directory.
async fn delete_all(&self, remove_action_manifest: ManifestVersion) -> Result<()> {
    let entries: Vec<Entry> = self.get_paths(Some).await?;

    // Skip the delta file at `remove_action_manifest` in the first pass.
    let paths: Vec<_> = entries
        .iter()
        .filter(|entry| {
            let name = entry.name();
            !(is_delta_file(name) && file_version(name) == remove_action_manifest)
        })
        .map(|entry| entry.path().to_string())
        .collect();

    logging::info!(
        "Deleting {} from manifest storage path {} paths: {:?}",
        paths.len(),
        self.path,
        paths,
    );

    // First pass: everything except the skipped delta file.
    self.object_store
        .remove(paths)
        .await
        .with_context(|_| DeleteObjectSnafu {
            path: self.path.clone(),
        })?;

    // Second pass: the remaining delta file and the manifest directory.
    self.object_store
        .remove_all(&self.path)
        .await
        .with_context(|_| DeleteObjectSnafu {
            path: self.path.clone(),
        })?;

    logging::info!("Deleted manifest storage path {}", self.path);
    Ok(())
}
/// Compresses `bytes` with the current compression type and writes them as the
/// delta file for `version`.
async fn save(&self, version: ManifestVersion, bytes: &[u8]) -> Result<()> {
    let path = self.delta_file_path(version);
    logging::debug!("Save log to manifest storage, version: {}", version);

    let compress_type = self.compress_type;
    let data = compress_type
        .encode(bytes)
        .await
        .context(CompressObjectSnafu {
            compress_type,
            path: &path,
        })?;

    self.object_store
        .write(&path, data)
        .await
        .context(WriteObjectSnafu { path })
}
/// Deletes the delta files whose version lies in `[start, end)`.
///
/// Fails with an invalid scan index error if `start > end`.
async fn delete(&self, start: ManifestVersion, end: ManifestVersion) -> Result<()> {
    ensure!(start <= end, InvalidScanIndexSnafu { start, end });

    // For backward compatibility, logs between start and end may have been written
    // uncompressed. So the uncompressed file of each version is deleted as well,
    // even if that file does not exist.
    let also_uncompressed = self.compress_type != FALL_BACK_COMPRESS_TYPE;
    let mut paths = Vec::with_capacity(((end - start) * 2) as usize);
    for version in start..end {
        let current = self.delta_file_path(version);
        paths.push(raw_normalize_path(&current));
        if also_uncompressed {
            let fall_back = gen_path(&self.path, &delta_file(version), FALL_BACK_COMPRESS_TYPE);
            paths.push(raw_normalize_path(&fall_back));
        }
    }

    logging::debug!(
        "Deleting logs from manifest storage, start: {}, end: {}",
        start,
        end
    );

    self.object_store
        .remove(paths.clone())
        .await
        .with_context(|_| DeleteObjectSnafu {
            path: paths.join(","),
        })?;

    Ok(())
}
/// Writes the checkpoint for `version` and then updates the last checkpoint file
/// to point to it.
///
/// The checkpoint data is compressed with the current compression type; the last
/// checkpoint file is written uncompressed.
async fn save_checkpoint(&self, version: ManifestVersion, bytes: &[u8]) -> Result<()> {
    let path = self.checkpoint_file_path(version);
    let compress_type = self.compress_type;
    let data = compress_type
        .encode(bytes)
        .await
        .context(CompressObjectSnafu {
            compress_type,
            path: &path,
        })?;
    self.object_store
        .write(&path, data)
        .await
        .context(WriteObjectSnafu { path })?;

    // The last checkpoint file only contains the size and version, which is tiny,
    // so it is not compressed.
    let checkpoint_metadata = CheckpointMetadata {
        size: bytes.len(),
        version,
        checksum: None,
        extend_metadata: None,
    };
    let last_checkpoint_path = self.last_checkpoint_path();
    logging::debug!(
        "Save checkpoint in path: {}, metadata: {:?}",
        last_checkpoint_path,
        checkpoint_metadata
    );

    let encoded = checkpoint_metadata.encode()?;
    self.object_store
        .write(&last_checkpoint_path, encoded.as_ref().to_vec())
        .await
        .context(WriteObjectSnafu {
            path: last_checkpoint_path,
        })?;

    Ok(())
}
/// Loads the checkpoint for `version`.
///
/// The checkpoint is first looked up under the current compression type. For
/// backward compatibility the checkpoint may have been written uncompressed, so if
/// it is not found and the current compression type is not
/// `FALL_BACK_COMPRESS_TYPE`, the uncompressed path is tried as well.
///
/// Returns `Ok(None)` if no checkpoint file exists for `version`.
///
/// # Errors
/// Returns a read error if the object store fails with anything other than
/// "not found", and a decompress error (carrying the path of the file that was
/// actually read) if the checkpoint data cannot be decoded.
async fn load_checkpoint(
    &self,
    version: ManifestVersion,
) -> Result<Option<(ManifestVersion, Vec<u8>)>> {
    let path = self.checkpoint_file_path(version);

    match self.object_store.read(&path).await {
        Ok(checkpoint) => {
            let data = self.compress_type.decode(checkpoint).await.context(
                DecompressObjectSnafu {
                    compress_type: self.compress_type,
                    path,
                },
            )?;
            return Ok(Some((version, data)));
        }
        // Not found under the current compression type: try the fallback below.
        Err(e) if e.kind() == ErrorKind::NotFound => {}
        Err(e) => return Err(e).context(ReadObjectSnafu { path: &path }),
    }

    if self.compress_type == FALL_BACK_COMPRESS_TYPE {
        // The path just tried already was the fallback path.
        return Ok(None);
    }

    let fall_back_path = gen_path(
        &self.path,
        &checkpoint_file(version),
        FALL_BACK_COMPRESS_TYPE,
    );
    logging::debug!(
        "Failed to load checkpoint from path: {}, fall back to path: {}",
        path,
        fall_back_path
    );

    match self.object_store.read(&fall_back_path).await {
        Ok(checkpoint) => {
            let data = FALL_BACK_COMPRESS_TYPE
                .decode(checkpoint)
                .await
                .context(DecompressObjectSnafu {
                    compress_type: FALL_BACK_COMPRESS_TYPE,
                    // Report the file that was actually read, not the compressed
                    // path that was not found.
                    path: fall_back_path,
                })?;
            Ok(Some((version, data)))
        }
        Err(e) if e.kind() == ErrorKind::NotFound => Ok(None),
        Err(e) => Err(e).context(ReadObjectSnafu {
            path: &fall_back_path,
        }),
    }
}
/// Deletes the checkpoint file for `version`.
async fn delete_checkpoint(&self, version: ManifestVersion) -> Result<()> {
    // For backward compatibility, the checkpoint file may have been written
    // uncompressed. So the uncompressed checkpoint file of that version is deleted
    // as well, even if that file does not exist.
    let mut paths = vec![raw_normalize_path(&self.checkpoint_file_path(version))];
    if self.compress_type != FALL_BACK_COMPRESS_TYPE {
        let fall_back = gen_path(
            &self.path,
            &checkpoint_file(version),
            FALL_BACK_COMPRESS_TYPE,
        );
        paths.push(raw_normalize_path(&fall_back));
    }

    self.object_store
        .remove(paths.clone())
        .await
        .context(DeleteObjectSnafu {
            path: paths.join(","),
        })?;

    Ok(())
}
/// Loads the checkpoint referenced by the last checkpoint file.
///
/// Returns `Ok(None)` if the last checkpoint file does not exist, or if the
/// checkpoint it refers to cannot be found.
async fn load_last_checkpoint(&self) -> Result<Option<(ManifestVersion, Vec<u8>)>> {
    let last_checkpoint_path = self.last_checkpoint_path();

    let raw_metadata = match self.object_store.read(&last_checkpoint_path).await {
        Ok(data) => data,
        Err(e) if e.kind() == ErrorKind::NotFound => return Ok(None),
        Err(e) => {
            return Err(e).context(ReadObjectSnafu {
                path: last_checkpoint_path,
            })
        }
    };

    let checkpoint_metadata = CheckpointMetadata::decode(&raw_metadata)?;
    logging::debug!(
        "Load checkpoint in path: {}, metadata: {:?}",
        last_checkpoint_path,
        checkpoint_metadata
    );

    let latest_version = checkpoint_metadata.version;
    self.load_checkpoint(latest_version).await
}
}
#[cfg(test)]
mod tests {
use common_test_util::temp_dir::create_temp_dir;
use object_store::services::Fs;
use object_store::ObjectStore;
use super::*;
// Builds an uncompressed `ManifestObjectStore` on top of a file system object
// store rooted at a fresh temporary directory.
fn new_test_manifest_store() -> ManifestObjectStore {
    common_telemetry::init_default_ut_logging();

    let root_dir = create_temp_dir("test_manifest_log_store");
    let mut fs_builder = Fs::default();
    let _ = fs_builder.root(&root_dir.path().to_string_lossy());

    let store = ObjectStore::new(fs_builder).unwrap().finish();
    ManifestObjectStore::new("/", store, CompressionType::Uncompressed)
}
#[test]
// This test mainly guards against future unintentional changes that would break
// backward compatibility of the generated file paths.
fn test_compress_file_path_generation() {
    let version: ManifestVersion = 0;
    let file_name = delta_file(version);
    let file_path = gen_path("/foo/bar/", &file_name, CompressionType::Gzip);
    assert_eq!(file_path.as_str(), "/foo/bar/00000000000000000000.json.gz")
}
#[tokio::test]
// Runs the shared manifest log store scenario with uncompressed files.
async fn test_manifest_log_store_uncompress() {
let mut log_store = new_test_manifest_store();
log_store.compress_type = CompressionType::Uncompressed;
test_manifest_log_store_case(log_store).await;
}
#[tokio::test]
// Runs the shared manifest log store scenario with gzip-compressed files.
async fn test_manifest_log_store_compress() {
let mut log_store = new_test_manifest_store();
log_store.compress_type = CompressionType::Gzip;
test_manifest_log_store_case(log_store).await;
}
// Shared scenario used by the compressed and uncompressed tests: exercises
// save, scan, delete, checkpoint save/load and `delete_until` on `log_store`.
async fn test_manifest_log_store_case(log_store: ManifestObjectStore) {
// Save logs for versions [0, 5).
for v in 0..5 {
log_store
.save(v, format!("hello, {v}").as_bytes())
.await
.unwrap();
}
// Scanning [1, 4) yields exactly versions 1, 2 and 3 in order.
let mut it = log_store.scan(1, 4).await.unwrap();
for v in 1..4 {
let (version, bytes) = it.next_log().await.unwrap().unwrap();
assert_eq!(v, version);
assert_eq!(format!("hello, {v}").as_bytes(), bytes);
}
assert!(it.next_log().await.unwrap().is_none());
// Scanning a range wider than the saved versions yields all saved logs.
let mut it = log_store.scan(0, 11).await.unwrap();
for v in 0..5 {
let (version, bytes) = it.next_log().await.unwrap().unwrap();
assert_eq!(v, version);
assert_eq!(format!("hello, {v}").as_bytes(), bytes);
}
assert!(it.next_log().await.unwrap().is_none());
// Delete [0, 3)
log_store.delete(0, 3).await.unwrap();
// [3, 5) remains
let mut it = log_store.scan(0, 11).await.unwrap();
for v in 3..5 {
let (version, bytes) = it.next_log().await.unwrap().unwrap();
assert_eq!(v, version);
assert_eq!(format!("hello, {v}").as_bytes(), bytes);
}
assert!(it.next_log().await.unwrap().is_none());
// Test checkpoints: none exists before the first `save_checkpoint`.
assert!(log_store.load_last_checkpoint().await.unwrap().is_none());
log_store
.save_checkpoint(3, "checkpoint".as_bytes())
.await
.unwrap();
let (v, checkpoint) = log_store.load_last_checkpoint().await.unwrap().unwrap();
assert_eq!(checkpoint, "checkpoint".as_bytes());
assert_eq!(3, v);
// Delete logs with version < 4 and keep checkpoint 3.
let _ = log_store.delete_until(4, true).await.unwrap();
let _ = log_store.load_checkpoint(3).await.unwrap().unwrap();
let _ = log_store.load_last_checkpoint().await.unwrap().unwrap();
// Only log 4 remains.
let mut it = log_store.scan(0, 11).await.unwrap();
let (version, bytes) = it.next_log().await.unwrap().unwrap();
assert_eq!(4, version);
assert_eq!("hello, 4".as_bytes(), bytes);
assert!(it.next_log().await.unwrap().is_none());
// Delete all logs and checkpoints.
let _ = log_store.delete_until(11, false).await.unwrap();
assert!(log_store.load_checkpoint(3).await.unwrap().is_none());
assert!(log_store.load_last_checkpoint().await.unwrap().is_none());
let mut it = log_store.scan(0, 11).await.unwrap();
assert!(it.next_log().await.unwrap().is_none());
}
#[tokio::test]
// Tests that `ManifestObjectStore` can read/delete previously uncompressed data correctly.
async fn test_compress_backward_compatible() {
let mut log_store = new_test_manifest_store();
// Write uncompressed data to simulate data written before compression was enabled.
log_store.compress_type = CompressionType::Uncompressed;
for v in 0..5 {
log_store
.save(v, format!("hello, {v}").as_bytes())
.await
.unwrap();
}
log_store
.save_checkpoint(5, "checkpoint_uncompressed".as_bytes())
.await
.unwrap();
// Change the compression type.
log_store.compress_type = CompressionType::Gzip;
// `load_last_checkpoint` must work correctly for previously uncompressed data.
let (v, checkpoint) = log_store.load_last_checkpoint().await.unwrap().unwrap();
assert_eq!(v, 5);
assert_eq!(checkpoint, "checkpoint_uncompressed".as_bytes());
// Write compressed data to simulate the compression algorithm taking effect.
for v in 5..10 {
log_store
.save(v, format!("hello, {v}").as_bytes())
.await
.unwrap();
}
log_store
.save_checkpoint(10, "checkpoint_compressed".as_bytes())
.await
.unwrap();
// Test reading a mix of uncompressed and compressed data.
let mut it = log_store.scan(0, 10).await.unwrap();
for v in 0..10 {
let (version, bytes) = it.next_log().await.unwrap().unwrap();
assert_eq!(v, version);
assert_eq!(format!("hello, {v}").as_bytes(), bytes);
}
let (v, checkpoint) = log_store.load_checkpoint(5).await.unwrap().unwrap();
assert_eq!(v, 5);
assert_eq!(checkpoint, "checkpoint_uncompressed".as_bytes());
let (v, checkpoint) = log_store.load_last_checkpoint().await.unwrap().unwrap();
assert_eq!(v, 10);
assert_eq!(checkpoint, "checkpoint_compressed".as_bytes());
// Delete the previously uncompressed checkpoint.
log_store.delete_checkpoint(5).await.unwrap();
assert!(log_store.load_checkpoint(5).await.unwrap().is_none());
// Delete [3, 7), which contains both uncompressed and compressed data.
log_store.delete(3, 7).await.unwrap();
// [3, 7) deleted
let mut it = log_store.scan(3, 7).await.unwrap();
assert!(it.next_log().await.unwrap().is_none());
// Delete until 10, which contains both uncompressed and compressed data.
// Logs 0, 1, 2, 7, 8, 9 will be deleted.
assert_eq!(6, log_store.delete_until(10, false).await.unwrap());
let mut it = log_store.scan(0, 10).await.unwrap();
assert!(it.next_log().await.unwrap().is_none());
}
}

View File

@@ -1,83 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use datatypes::type_id::LogicalTypeId;
use store_api::storage::SequenceNumber;
use crate::manifest::action::*;
use crate::metadata::RegionMetadata;
use crate::sst::{FileId, FileMeta};
use crate::test_util::descriptor_util::RegionDescBuilder;
/// File size assigned to every `FileMeta` built by `build_region_edit`.
pub const DEFAULT_TEST_FILE_SIZE: u64 = 1024;
/// Builds the metadata of test region `region-0` (id 0) with an `Int32` key column
/// `k1` and a `Float32` field column `v1`.
///
/// # Panics
/// Panics if the descriptor cannot be converted into `RegionMetadata`.
pub fn build_region_meta() -> RegionMetadata {
    RegionDescBuilder::new("region-0")
        .id(0)
        .push_key_column(("k1", LogicalTypeId::Int32, false))
        .push_field_column(("v1", LogicalTypeId::Float32, true))
        .build()
        .try_into()
        .unwrap()
}
/// Builds the metadata of test region `region-0` (id 0) as produced by
/// `build_region_meta`, plus an additional `Float32` field column `v2`.
///
/// # Panics
/// Panics if the descriptor cannot be converted into `RegionMetadata`.
pub fn build_altered_region_meta() -> RegionMetadata {
    RegionDescBuilder::new("region-0")
        .id(0)
        .push_key_column(("k1", LogicalTypeId::Int32, false))
        .push_field_column(("v1", LogicalTypeId::Float32, true))
        .push_field_column(("v2", LogicalTypeId::Float32, true))
        .build()
        .try_into()
        .unwrap()
}
/// Builds a `RegionEdit` for region 0 that flushes up to `sequence`, adds
/// `files_to_add` and removes `files_to_remove`.
///
/// Every file is described by a level-0 `FileMeta` without a time range and with
/// size `DEFAULT_TEST_FILE_SIZE`.
pub fn build_region_edit(
    sequence: SequenceNumber,
    files_to_add: &[FileId],
    files_to_remove: &[FileId],
) -> RegionEdit {
    // Added and removed files share the same metadata shape.
    let to_file_meta = |file_id: &FileId| FileMeta {
        region_id: 0.into(),
        file_id: *file_id,
        time_range: None,
        level: 0,
        file_size: DEFAULT_TEST_FILE_SIZE,
    };

    RegionEdit {
        region_version: 0,
        flushed_sequence: Some(sequence),
        files_to_add: files_to_add.iter().map(to_file_meta).collect(),
        files_to_remove: files_to_remove.iter().map(to_file_meta).collect(),
        compaction_time_window: None,
    }
}
/// Builds a `RegionTruncate` action for region 0 with the given `committed_sequence`.
pub fn build_region_truncate(committed_sequence: u64) -> RegionTruncate {
RegionTruncate {
region_id: 0.into(),
committed_sequence,
}
}

View File

@@ -1,294 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
mod btree;
mod inserter;
#[cfg(test)]
pub mod tests;
mod version;
use std::fmt;
use std::sync::atomic::{AtomicBool, AtomicU32, AtomicUsize, Ordering};
use std::sync::Arc;
use api::v1::OpType;
use common_time::range::TimestampRange;
use common_time::Timestamp;
use datatypes::vectors::VectorRef;
use store_api::storage::{consts, SequenceNumber};
use crate::error::Result;
use crate::flush::FlushStrategyRef;
use crate::memtable::btree::BTreeMemtable;
pub use crate::memtable::inserter::Inserter;
pub use crate::memtable::version::MemtableVersion;
use crate::metrics::WRITE_BUFFER_BYTES;
use crate::read::Batch;
use crate::schema::{ProjectedSchemaRef, RegionSchemaRef};
/// Unique id for memtables under same region.
pub type MemtableId = u32;
/// Statistics of a memtable.
#[derive(Debug, Default)]
pub struct MemtableStats {
/// The estimated bytes allocated by this memtable from heap. This value
/// may be larger than an estimate based on `num_rows` because
/// of the implementor's pre-alloc behavior.
pub estimated_bytes: usize,
/// The max timestamp that this memtable contains.
pub max_timestamp: Timestamp,
/// The min timestamp that this memtable contains.
pub min_timestamp: Timestamp,
}
impl MemtableStats {
/// Returns the estimated bytes allocated by the memtable (`estimated_bytes`).
pub fn bytes_allocated(&self) -> usize {
self.estimated_bytes
}
}
/// In-memory storage.
pub trait Memtable: Send + Sync + fmt::Debug {
/// Returns id of this memtable.
fn id(&self) -> MemtableId;
/// Returns schema of the memtable.
fn schema(&self) -> RegionSchemaRef;
/// Write key/values to the memtable.
///
/// # Panics
/// Panics if the schema of key/value differs from memtable's schema.
fn write(&self, kvs: &KeyValues) -> Result<()>;
/// Iterates the memtable.
fn iter(&self, ctx: IterContext) -> Result<BoxedBatchIterator>;
/// Returns the number of rows in the memtable.
fn num_rows(&self) -> usize;
/// Returns stats of this memtable.
fn stats(&self) -> MemtableStats;
/// Marks the memtable as immutable.
///
/// The region MUST call this inside the region writer's write lock.
fn mark_immutable(&self);
}
/// Shared reference to a [`Memtable`] trait object.
pub type MemtableRef = Arc<dyn Memtable>;
/// Context for iterating a memtable.
///
/// Should be cheap to clone.
#[derive(Debug, Clone)]
pub struct IterContext {
/// The suggested batch size of the iterator.
pub batch_size: usize,
/// Max visible sequence (inclusive).
pub visible_sequence: SequenceNumber,
/// Schema the reader expects to read.
///
/// Set to `None` to read all columns.
pub projected_schema: Option<ProjectedSchemaRef>,
/// Timestamp range to read. `None` is the default.
pub time_range: Option<TimestampRange>,
}
impl Default for IterContext {
/// Returns a context that uses `consts::READ_BATCH_SIZE` as batch size, makes all
/// sequences visible, and sets neither a projection nor a time range.
fn default() -> Self {
Self {
batch_size: consts::READ_BATCH_SIZE,
// All data in memory is visible by default.
visible_sequence: SequenceNumber::MAX,
projected_schema: None,
time_range: None,
}
}
}
/// The ordering of the rows output by a [`BatchIterator`].
#[derive(Debug, PartialEq, Eq)]
pub enum RowOrdering {
/// The output rows are unordered.
Unordered,
/// The output rows are ordered by key.
Key,
}
/// Iterator over the batches of a memtable.
///
/// Since the data of a memtable is stored in memory, this trait is deliberately
/// not defined as an async trait.
pub trait BatchIterator: Iterator<Item = Result<Batch>> + Send + Sync {
/// Returns the schema of this iterator.
fn schema(&self) -> ProjectedSchemaRef;
/// Returns the ordering of the output rows from this iterator.
fn ordering(&self) -> RowOrdering;
}
/// Boxed [`BatchIterator`] trait object.
pub type BoxedBatchIterator = Box<dyn BatchIterator>;
/// Builder that creates memtables.
pub trait MemtableBuilder: Send + Sync + fmt::Debug {
/// Builds a new memtable with the given region `schema`.
fn build(&self, schema: RegionSchemaRef) -> MemtableRef;
}
/// Shared reference to a [`MemtableBuilder`] trait object.
pub type MemtableBuilderRef = Arc<dyn MemtableBuilder>;
/// Key-value pairs in columnar format.
pub struct KeyValues {
/// Sequence number of these key-value pairs.
pub sequence: SequenceNumber,
/// Operation type of these key-value pairs.
pub op_type: OpType,
/// Start index of these key-value pairs in batch. Each row in the same batch has
/// a unique index to identify it.
pub start_index_in_batch: usize,
/// Key columns.
pub keys: Vec<VectorRef>,
/// Value columns.
pub values: Vec<VectorRef>,
/// Timestamp column. `len()` reports 0 rows while this is `None`.
pub timestamp: Option<VectorRef>,
}
impl KeyValues {
    /// Clears all columns and sets a new `op_type` and start index.
    ///
    /// Note that `sequence` is not reset.
    fn reset(&mut self, op_type: OpType, index_in_batch: usize) {
        self.keys.clear();
        self.values.clear();
        self.timestamp = None;
        self.start_index_in_batch = index_in_batch;
        self.op_type = op_type;
    }

    /// Returns the number of rows, taken from the length of the timestamp column
    /// (0 if there is no timestamp column).
    pub fn len(&self) -> usize {
        match &self.timestamp {
            Some(timestamps) => timestamps.len(),
            None => 0,
        }
    }

    /// Returns `true` if there are no rows.
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Returns the sum of the `memory_size()` of all key, value and timestamp columns.
    pub fn estimated_memory_size(&self) -> usize {
        let keys_size = self.keys.iter().fold(0, |total, key| total + key.memory_size());
        let values_size = self
            .values
            .iter()
            .fold(0, |total, value| total + value.memory_size());
        let timestamp_size = match &self.timestamp {
            Some(timestamps) => timestamps.memory_size(),
            None => 0,
        };

        keys_size + values_size + timestamp_size
    }
}
/// Memtable memory allocation tracker.
pub struct AllocTracker {
/// Flush strategy that is notified about reserved and freed memory, if any.
flush_strategy: Option<FlushStrategyRef>,
/// Bytes allocated by the tracker.
bytes_allocated: AtomicUsize,
/// Whether allocating is done.
is_done_allocating: AtomicBool,
}
impl fmt::Debug for AllocTracker {
/// Formats the tracker; the `flush_strategy` field is left out of the output.
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
f.debug_struct("AllocTracker")
.field("bytes_allocated", &self.bytes_allocated)
.field("is_done_allocating", &self.is_done_allocating)
.finish()
}
}
impl AllocTracker {
    /// Returns a new [AllocTracker] that tracks zero bytes and reports to
    /// `flush_strategy`, if there is one.
    pub fn new(flush_strategy: Option<FlushStrategyRef>) -> AllocTracker {
        let bytes_allocated = AtomicUsize::new(0);
        let is_done_allocating = AtomicBool::new(false);
        AllocTracker {
            flush_strategy,
            bytes_allocated,
            is_done_allocating,
        }
    }

    /// Tracks `bytes` memory is allocated.
    ///
    /// Updates the tracker's counter and the `WRITE_BUFFER_BYTES` metric, and
    /// reserves the memory in the flush strategy.
    pub(crate) fn on_allocate(&self, bytes: usize) {
        let _ = self.bytes_allocated.fetch_add(bytes, Ordering::Relaxed);
        WRITE_BUFFER_BYTES.add(bytes as i64);
        if let Some(strategy) = self.flush_strategy.as_ref() {
            strategy.reserve_mem(bytes);
        }
    }

    /// Marks we have finished allocating memory so we can free it from
    /// the write buffer's limit.
    ///
    /// Does nothing without a flush strategy. Otherwise only the first call
    /// schedules the tracked bytes to be freed.
    ///
    /// The region MUST ensure that it calls this method inside the region writer's write lock.
    pub(crate) fn done_allocating(&self) {
        let Some(strategy) = &self.flush_strategy else {
            return;
        };

        let first_call = self
            .is_done_allocating
            .compare_exchange(false, true, Ordering::Relaxed, Ordering::Relaxed)
            .is_ok();
        if first_call {
            strategy.schedule_free_mem(self.bytes_allocated.load(Ordering::Relaxed));
        }
    }

    /// Returns bytes allocated.
    pub(crate) fn bytes_allocated(&self) -> usize {
        self.bytes_allocated.load(Ordering::Relaxed)
    }
}
impl Drop for AllocTracker {
    /// Releases everything this tracker accounted for: finishes allocating if that
    /// has not happened yet, decreases the `WRITE_BUFFER_BYTES` metric and tells the
    /// flush strategy that the memory is freed.
    fn drop(&mut self) {
        let already_done = self.is_done_allocating.load(Ordering::Relaxed);
        if !already_done {
            self.done_allocating();
        }

        let tracked = self.bytes_allocated.load(Ordering::Relaxed);
        WRITE_BUFFER_BYTES.sub(tracked as i64);

        // Memory tracked by this tracker is freed.
        if let Some(strategy) = self.flush_strategy.as_ref() {
            strategy.free_mem(tracked);
        }
    }
}
/// Default memtable builder that builds `BTreeMemtable`.
#[derive(Debug, Default)]
pub struct DefaultMemtableBuilder {
/// Id assigned to the next memtable; incremented on every `build`.
memtable_id: AtomicU32,
/// Flush strategy passed to every memtable built, if any.
flush_strategy: Option<FlushStrategyRef>,
}
impl DefaultMemtableBuilder {
/// Returns a new [DefaultMemtableBuilder] with specific `flush_strategy`.
///
/// If `flush_strategy` is `Some`, the memtable will report its memory usage
/// to the `flush_strategy`.
pub fn with_flush_strategy(flush_strategy: Option<FlushStrategyRef>) -> Self {
Self {
// Memtable ids start from 0.
memtable_id: AtomicU32::new(0),
flush_strategy,
}
}
}
impl MemtableBuilder for DefaultMemtableBuilder {
    /// Builds a `BTreeMemtable` with the next memtable id, the given `schema` and a
    /// clone of the builder's flush strategy.
    fn build(&self, schema: RegionSchemaRef) -> MemtableRef {
        let next_id = self.memtable_id.fetch_add(1, Ordering::Relaxed);
        let flush_strategy = self.flush_strategy.clone();
        let memtable = BTreeMemtable::new(next_id, schema, flush_strategy);
        Arc::new(memtable)
    }
}

View File

@@ -1,573 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::cmp::Ordering;
use std::collections::{btree_map, BTreeMap};
use std::fmt;
use std::ops::Bound;
use std::sync::atomic::{AtomicI64, Ordering as AtomicOrdering};
use std::sync::{Arc, RwLock};
use api::v1::OpType;
use common_time::range::TimestampRange;
use datatypes::data_type::DataType;
use datatypes::prelude::*;
use datatypes::value::Value;
use datatypes::vectors::{UInt64Vector, UInt64VectorBuilder, UInt8Vector, UInt8VectorBuilder};
use store_api::storage::{SequenceNumber, MIN_OP_TYPE};
use crate::error::Result;
use crate::flush::FlushStrategyRef;
use crate::memtable::{
AllocTracker, BatchIterator, BoxedBatchIterator, IterContext, KeyValues, Memtable, MemtableId,
MemtableStats, RowOrdering,
};
use crate::read::Batch;
use crate::schema::compat::ReadAdapter;
use crate::schema::{ProjectedSchema, ProjectedSchemaRef, RegionSchemaRef};
/// The ordered row map of a `BTreeMemtable`, protected by a read-write lock.
type RwLockMap = RwLock<BTreeMap<InnerKey, RowValue>>;
/// A simple memtable implementation based on std's [`BTreeMap`].
///
/// Mainly for test purpose, don't use in production.
pub struct BTreeMemtable {
// Id of this memtable.
id: MemtableId,
// Schema of the region this memtable belongs to.
schema: RegionSchemaRef,
// Rows of the memtable; shared with the iterators created by `iter`.
map: Arc<RwLockMap>,
// Tracks the memory allocated by writes.
alloc_tracker: AllocTracker,
// Max timestamp value written so far; `i64::MIN` until the first write.
max_timestamp: AtomicI64,
// Min timestamp value written so far; `i64::MAX` until the first write.
min_timestamp: AtomicI64,
}
impl BTreeMemtable {
    /// Creates an empty memtable with the given `id` and `schema`.
    ///
    /// If `flush_strategy` is `Some`, the memory allocated by the memtable is
    /// reported to it through the memtable's `AllocTracker`.
    pub fn new(
        id: MemtableId,
        schema: RegionSchemaRef,
        flush_strategy: Option<FlushStrategyRef>,
    ) -> BTreeMemtable {
        let map = Arc::new(RwLock::new(BTreeMap::new()));
        let alloc_tracker = AllocTracker::new(flush_strategy);
        BTreeMemtable {
            id,
            schema,
            map,
            alloc_tracker,
            // Start from the extreme values so the first write replaces them.
            max_timestamp: AtomicI64::new(i64::MIN),
            min_timestamp: AtomicI64::new(i64::MAX),
        }
    }

    /// Updates memtable stats: tracks `request_size` bytes as allocated and widens
    /// the min/max timestamps with `min` and `max` if they are given.
    ///
    /// # Panics
    /// Panics if `min` or `max` is not a timestamp value.
    fn update_stats(&self, request_size: usize, min: Option<Value>, max: Option<Value>) {
        self.alloc_tracker.on_allocate(request_size);

        if let Some(min) = min {
            let min_val = min
                .as_timestamp()
                .expect("Min timestamp must be a valid timestamp value")
                .value();
            // Keeps the smaller of the stored and the new value.
            let _ = self.min_timestamp.fetch_min(min_val, AtomicOrdering::Relaxed);
        }

        if let Some(max) = max {
            let max_val = max
                .as_timestamp()
                .expect("Max timestamp must be a valid timestamp value")
                .value();
            // Keeps the larger of the stored and the new value.
            let _ = self.max_timestamp.fetch_max(max_val, AtomicOrdering::Relaxed);
        }
    }
}
impl fmt::Debug for BTreeMemtable {
/// Formats the memtable. The rows themselves are not printed, only their count.
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
// Takes the read lock on the map to get the number of rows.
let len = self.map.read().unwrap().len();
f.debug_struct("BTreeMemtable")
.field("id", &self.id)
// NOTE(review): the original comment here said "Only show StoreSchema", but the
// whole `schema` is passed to the formatter; confirm which is intended.
.field("schema", &self.schema)
.field("rows", &len)
.field("alloc_tracker", &self.alloc_tracker)
.field("max_timestamp", &self.max_timestamp)
.field("min_timestamp", &self.min_timestamp)
.finish()
}
}
impl Memtable for BTreeMemtable {
fn id(&self) -> MemtableId {
self.id
}
fn schema(&self) -> RegionSchemaRef {
self.schema.clone()
}
/// Inserts every row of `kvs` into the map and updates the memtable stats.
///
/// A row whose key already exists in the map replaces the existing value.
fn write(&self, kvs: &KeyValues) -> Result<()> {
debug_assert!(kvs.timestamp.is_some());
let iter_row = IterRow::new(kvs);
// The write guard lives until the end of this function, so `update_stats`
// below runs while the map is still locked.
let mut map = self.map.write().unwrap();
// Min/max timestamps among the rows of this write; `None` if there are no rows.
let mut min_ts = None;
let mut max_ts = None;
for (inner_key, row_value) in iter_row {
let ts = inner_key.timestamp();
// Initialize with the first timestamp seen, then widen the range.
let min_ts = min_ts.get_or_insert_with(|| ts.clone());
let max_ts = max_ts.get_or_insert_with(|| ts.clone());
if ts < min_ts {
*min_ts = ts.clone();
}
if ts > max_ts {
*max_ts = ts.clone();
}
let _ = map.insert(inner_key, row_value);
}
self.update_stats(kvs.estimated_memory_size(), min_ts, max_ts);
Ok(())
}
/// Returns an iterator over the rows of this memtable.
///
/// # Panics
/// Panics if `ctx.batch_size` is 0.
fn iter(&self, ctx: IterContext) -> Result<BoxedBatchIterator> {
assert!(ctx.batch_size > 0);
let iter = BTreeIterator::new(ctx, self.schema.clone(), self.map.clone())?;
Ok(Box::new(iter))
}
fn num_rows(&self) -> usize {
self.map.read().unwrap().len()
}
/// Returns the tracked allocated bytes and the min/max timestamps, converted to
/// the timestamp type of the schema's timestamp column.
///
/// # Panics
/// Panics if the timestamp column does not have a timestamp data type.
fn stats(&self) -> MemtableStats {
let ts_meta = self.schema.column_metadata(self.schema.timestamp_index());
let Some(timestamp_type) = ts_meta.desc.data_type.as_timestamp() else {
// safety: timestamp column always has timestamp type, otherwise it's a bug.
panic!(
"Timestamp column is not a valid timestamp type: {:?}",
self.schema
);
};
MemtableStats {
estimated_bytes: self.alloc_tracker.bytes_allocated(),
max_timestamp: timestamp_type
.create_timestamp(self.max_timestamp.load(AtomicOrdering::Relaxed)),
min_timestamp: timestamp_type
.create_timestamp(self.min_timestamp.load(AtomicOrdering::Relaxed)),
}
}
fn mark_immutable(&self) {
// No more allocations are expected once the memtable is immutable.
self.alloc_tracker.done_allocating();
}
}
/// Iterator that reads a `BTreeMemtable` batch by batch.
struct BTreeIterator {
/// Context of this iteration (batch size, visible sequence, time range).
ctx: IterContext,
/// Schema of this memtable.
schema: RegionSchemaRef,
/// Projected schema that user expect to read.
projected_schema: ProjectedSchemaRef,
/// Adapter used to build batches from the key and field columns read.
adapter: ReadAdapter,
/// The rows of the memtable, shared with the memtable itself.
map: Arc<RwLockMap>,
/// Seek position derived from the last key of the previous batch; `None` before the first batch.
last_key: Option<InnerKey>,
}
impl BatchIterator for BTreeIterator {
fn schema(&self) -> ProjectedSchemaRef {
self.projected_schema.clone()
}
/// Rows are read from the map in key order.
fn ordering(&self) -> RowOrdering {
RowOrdering::Key
}
}
impl Iterator for BTreeIterator {
    type Item = Result<Batch>;

    /// Yields the next batch, an error, or `None` once no more rows are available.
    fn next(&mut self) -> Option<Result<Batch>> {
        match self.next_batch() {
            Ok(Some(batch)) => Some(Ok(batch)),
            Ok(None) => None,
            Err(err) => Some(Err(err)),
        }
    }
}
impl BTreeIterator {
    /// Builds an iterator over `map`.
    ///
    /// When `ctx` carries no projected schema, a "no projection" schema derived from
    /// `schema` is used instead.
    fn new(
        ctx: IterContext,
        schema: RegionSchemaRef,
        map: Arc<RwLockMap>,
    ) -> Result<BTreeIterator> {
        let projected_schema = match &ctx.projected_schema {
            Some(projected) => Arc::clone(projected),
            None => Arc::new(ProjectedSchema::no_projection(schema.clone())),
        };
        let adapter = ReadAdapter::new(schema.store_schema().clone(), projected_schema.clone())?;

        Ok(BTreeIterator {
            ctx,
            schema,
            projected_schema,
            adapter,
            map,
            last_key: None,
        })
    }

    /// Reads up to `ctx.batch_size` visible rows located after the last returned key.
    ///
    /// Returns `Ok(None)` when no more rows are available. The map's read lock is held
    /// for the whole call.
    fn next_batch(&mut self) -> Result<Option<Batch>> {
        let guard = self.map.read().unwrap();

        // Resume strictly after the seek key of the previous batch, if any.
        let range = match &self.last_key {
            Some(seek_key) => guard.range((Bound::Excluded(seek_key), Bound::Unbounded)),
            None => guard.range(..),
        };
        let visible_rows =
            MapIterWrapper::new(range, self.ctx.visible_sequence, self.ctx.time_range);
        let (keys, sequences, op_types, values) = collect_iter(visible_rows, self.ctx.batch_size);
        if keys.is_empty() {
            return Ok(None);
        }

        // Remember where to continue: the last key, reset so that it sorts after
        // every entry sharing its row key.
        self.last_key = keys.last().map(|last| {
            let mut seek_key = (*last).clone();
            seek_key.reset_for_seek();
            seek_key
        });

        let key_types = self
            .schema
            .row_key_columns()
            .map(|column| column.desc.data_type.clone());
        let key_columns = rows_to_vectors(
            key_types,
            self.adapter.source_key_needed(),
            keys.as_slice(),
        );

        let field_types = self
            .schema
            .field_columns()
            .map(|column| column.desc.data_type.clone());
        let field_columns = rows_to_vectors(
            field_types,
            self.adapter.source_value_needed(),
            values.as_slice(),
        );

        let batch = self.adapter.batch_from_parts(
            key_columns,
            field_columns,
            Arc::new(sequences),
            Arc::new(op_types),
        )?;
        Ok(Some(batch))
    }
}
/// Drains at most `batch_size` entries from `iter` and splits them into parallel
/// collections: key references, sequence numbers, op types (as `u8`) and value
/// references, all in iteration order.
fn collect_iter<'a, I: Iterator<Item = (&'a InnerKey, &'a RowValue)>>(
    iter: I,
    batch_size: usize,
) -> (
    Vec<&'a InnerKey>,
    UInt64Vector,
    UInt8Vector,
    Vec<&'a RowValue>,
) {
    let mut keys = Vec::with_capacity(batch_size);
    let mut values = Vec::with_capacity(batch_size);
    let mut sequence_builder = UInt64VectorBuilder::with_capacity(batch_size);
    let mut op_type_builder = UInt8VectorBuilder::with_capacity(batch_size);

    iter.take(batch_size).for_each(|(key, value)| {
        keys.push(key);
        sequence_builder.push(Some(key.sequence));
        op_type_builder.push(Some(key.op_type as u8));
        values.push(value);
    });

    (
        keys,
        sequence_builder.finish(),
        op_type_builder.finish(),
        values,
    )
}
/// `MapIterWrapper` wraps a range over the map and yields at most one entry per row
/// key, skipping entries that are not visible at `visible_sequence` or that fall
/// outside `time_range`.
///
/// NOTE(review): `InnerKey` and `RowValue` in this definition are generic type
/// parameters that share their names with the concrete types; the impl blocks
/// instantiate them with the concrete `InnerKey` and `RowValue`.
struct MapIterWrapper<'a, InnerKey, RowValue> {
/// Underlying range iterator over the map.
iter: btree_map::Range<'a, InnerKey, RowValue>,
/// Key of the previously yielded entry, used to skip entries with the same row key.
prev_key: Option<InnerKey>,
/// Entries with a sequence greater than this value are skipped.
visible_sequence: SequenceNumber,
/// Optional time range filter; `None` accepts every timestamp.
time_range: Option<TimestampRange>,
}
impl<'a> MapIterWrapper<'a, InnerKey, RowValue> {
fn new(
iter: btree_map::Range<'a, InnerKey, RowValue>,
visible_sequence: SequenceNumber,
time_range: Option<TimestampRange>,
) -> MapIterWrapper<'a, InnerKey, RowValue> {
MapIterWrapper {
iter,
prev_key: None,
visible_sequence,
time_range,
}
}
fn next_visible_entry(&mut self) -> Option<(&'a InnerKey, &'a RowValue)> {
for (k, v) in self.iter.by_ref() {
if k.is_visible(self.visible_sequence) && k.is_in_time_range(&self.time_range) {
return Some((k, v));
}
}
None
}
}
/// Yields visible entries, keeping only the first visible entry of each row key.
///
/// Keys are ordered by row key ascending and then by sequence descending (see
/// `InnerKey::cmp`), so the first visible entry of a row key is the one with the
/// largest visible sequence.
impl<'a> Iterator for MapIterWrapper<'a, InnerKey, RowValue> {
type Item = (&'a InnerKey, &'a RowValue);
fn next(&mut self) -> Option<(&'a InnerKey, &'a RowValue)> {
// Nothing visible is left in the range.
let (mut current_key, mut current_value) = self.next_visible_entry()?;
// First entry ever yielded: there is no previous key to compare against.
if self.prev_key.is_none() {
self.prev_key = Some(current_key.clone());
return Some((current_key, current_value));
}
// `take()` leaves `prev_key` empty; it is refilled below unless the range runs out.
let prev_key = self.prev_key.take().unwrap();
// Skip the remaining visible entries that share the previous row key.
while prev_key.is_row_key_equal(current_key) {
if let Some((next_key, next_value)) = self.next_visible_entry() {
(current_key, current_value) = (next_key, next_value);
} else {
// Range exhausted while skipping duplicates.
return None;
}
}
self.prev_key = Some(current_key.clone());
Some((current_key, current_value))
}
}
/// Row-by-row iterator over the column-oriented `KeyValues`.
struct IterRow<'a> {
/// Source columns.
kvs: &'a KeyValues,
/// Index of the next row to read.
index: usize,
/// Number of rows, taken from `kvs.len()` at construction.
len: usize,
}
impl<'a> IterRow<'a> {
    /// Creates an iterator positioned at the first row of `kvs`.
    fn new(kvs: &KeyValues) -> IterRow {
        let len = kvs.len();
        IterRow { kvs, index: 0, len }
    }

    /// Materializes the row at the current index as an `(InnerKey, RowValue)` pair and
    /// advances the index by one.
    ///
    /// The row key is made of the key columns followed by the timestamp.
    ///
    /// # Panics
    ///
    /// Panics if `kvs.timestamp` is `None`.
    fn fetch_row(&mut self) -> (InnerKey, RowValue) {
        let row_index = self.index;
        let kvs = self.kvs;

        let mut row_key: Vec<_> = kvs
            .keys
            .iter()
            .map(|column| column.get(row_index))
            .collect();
        // unwrap safety: KeyValues always contains a timestamp as guaranteed in [Inserter::write_one_mutation]
        let ts_column = kvs.timestamp.as_ref().unwrap();
        row_key.push(ts_column.get(row_index));

        let values = kvs
            .values
            .iter()
            .map(|column| column.get(row_index))
            .collect();

        let inner_key = InnerKey {
            row_key,
            sequence: kvs.sequence,
            index_in_batch: kvs.start_index_in_batch + row_index,
            op_type: kvs.op_type,
        };

        self.index += 1;
        (inner_key, RowValue { values })
    }
}
/// Yields one `(InnerKey, RowValue)` pair per row of the underlying `KeyValues`.
impl<'a> Iterator for IterRow<'a> {
    type Item = (InnerKey, RowValue);

    /// Returns the next row, or `None` once all `len` rows have been yielded.
    fn next(&mut self) -> Option<(InnerKey, RowValue)> {
        if self.index >= self.len {
            return None;
        }
        Some(self.fetch_row())
    }

    /// Reports the exact number of rows that remain.
    ///
    /// `kvs.keys` holds one vector per key column, so its length is the number of
    /// key columns, not the number of rows; the hint has to be derived from
    /// `len` and `index` instead.
    fn size_hint(&self) -> (usize, Option<usize>) {
        let remaining = self.len.saturating_sub(self.index);
        (remaining, Some(remaining))
    }
}
/// Key of an entry in the memtable map: the row key plus the fields that make every
/// written row unique.
#[derive(Clone, Debug, PartialEq, Eq)]
struct InnerKey {
/// User defined primary keys; the timestamp is stored as the last element.
row_key: Vec<Value>,
/// Sequence number of row
sequence: SequenceNumber,
/// Position of the row within its write batch.
index_in_batch: usize,
/// Operation type of the row.
op_type: OpType,
}
impl Ord for InnerKey {
    /// Orders by (row_key asc, sequence desc, index_in_batch desc, op_type desc).
    ///
    /// (row_key, sequence, index_in_batch) should already be enough to disambiguate;
    /// `op_type` is compared last as a tie-breaker.
    fn cmp(&self, other: &InnerKey) -> Ordering {
        match self.row_key.cmp(&other.row_key) {
            Ordering::Equal => {}
            unequal => return unequal,
        }
        // The remaining fields are compared with the operands swapped to get a
        // descending order.
        match other.sequence.cmp(&self.sequence) {
            Ordering::Equal => {}
            unequal => return unequal,
        }
        match other.index_in_batch.cmp(&self.index_in_batch) {
            Ordering::Equal => {}
            unequal => return unequal,
        }
        other.op_type.cmp(&self.op_type)
    }
}
impl PartialOrd for InnerKey {
    /// Delegates to the total order defined by `Ord`, so the result is always `Some`.
    fn partial_cmp(&self, other: &InnerKey) -> Option<Ordering> {
        let ordering = Ord::cmp(self, other);
        Some(ordering)
    }
}
impl InnerKey {
    /// Returns the timestamp, which is the last element of the row key.
    ///
    /// # Panics
    ///
    /// Panics if the row key is empty.
    #[inline]
    fn timestamp(&self) -> &Value {
        // safety: row key shall at least contain a timestamp column
        let last = self.row_key.last();
        last.unwrap()
    }

    /// Returns true if both keys have the same row key, ignoring sequence,
    /// index in batch and op type.
    #[inline]
    fn is_row_key_equal(&self, other: &InnerKey) -> bool {
        other.row_key == self.row_key
    }

    /// Returns true if this key's sequence is not greater than `sequence`.
    #[inline]
    fn is_visible(&self, sequence: SequenceNumber) -> bool {
        sequence >= self.sequence
    }

    /// Returns true if `range` is `None` or contains this key's timestamp.
    ///
    /// # Panics
    ///
    /// Panics if a range is given and the timestamp value is not a timestamp.
    #[inline]
    fn is_in_time_range(&self, range: &Option<TimestampRange>) -> bool {
        match range {
            None => true,
            Some(range) => {
                let ts = self
                    .timestamp()
                    .as_timestamp()
                    .expect("Timestamp field must be a valid timestamp value");
                range.contains(&ts)
            }
        }
    }

    /// Reset the `InnerKey` so that we can use it to seek next key that
    /// has different row key.
    fn reset_for_seek(&mut self) {
        // sequence, index_in_batch and op_type are sorted in descending order, so
        // assigning each its minimum value yields the last possible inner key
        // for this row key.
        self.op_type = MIN_OP_TYPE;
        self.index_in_batch = 0;
        self.sequence = 0;
    }
}
/// Value part of a memtable entry.
#[derive(Clone, Debug)]
struct RowValue {
/// One value per value column of the written `KeyValues`, in column order.
values: Vec<Value>,
}
/// Row-oriented view over a collection of rows, where every row is a `Vec<Value>`.
trait RowsProvider {
/// Returns the number of rows.
fn row_num(&self) -> usize;
/// Returns the number of columns, taken from the length of the first row.
///
/// The default implementation reads row 0, so callers should check `is_empty()`
/// first (as `rows_to_vectors` does).
fn column_num(&self) -> usize {
self.row_by_index(0).len()
}
/// Returns true if there are no rows.
fn is_empty(&self) -> bool {
self.row_num() == 0
}
/// Returns the values of the row at `idx`.
fn row_by_index(&self, idx: usize) -> &Vec<Value>;
}
/// Exposes the row keys of a slice of `InnerKey` references as rows.
impl<'a> RowsProvider for &'a [&InnerKey] {
    fn row_num(&self) -> usize {
        let keys: &[&InnerKey] = self;
        keys.len()
    }

    /// Returns the row key of the key at `idx`; panics if `idx` is out of bounds.
    fn row_by_index(&self, idx: usize) -> &Vec<Value> {
        let keys: &[&InnerKey] = self;
        &keys[idx].row_key
    }
}
/// Exposes the values of a slice of `RowValue` references as rows.
impl<'a> RowsProvider for &'a [&RowValue] {
    fn row_num(&self) -> usize {
        let rows: &[&RowValue] = self;
        rows.len()
    }

    /// Returns the values of the row at `idx`; panics if `idx` is out of bounds.
    fn row_by_index(&self, idx: usize) -> &Vec<Value> {
        let rows: &[&RowValue] = self;
        &rows[idx].values
    }
}
/// Transposes the rows of `provider` into column vectors.
///
/// `data_types` gives the data type of each column in order, and `column_needed[i]`
/// tells whether column `i` is part of the output; columns that are not needed are
/// left out of the result. Returns an empty list when `provider` has no rows.
///
/// # Panics
///
/// Panics if `column_needed` has fewer entries than `data_types`, or if a row has
/// fewer values than the index of a needed column.
fn rows_to_vectors<I: Iterator<Item = ConcreteDataType>, T: RowsProvider>(
    data_types: I,
    column_needed: &[bool],
    provider: T,
) -> Vec<VectorRef> {
    if provider.is_empty() {
        return Vec::new();
    }

    let row_num = provider.row_num();
    let column_num = provider.column_num();

    // One builder per data type, including the columns that end up skipped.
    let mut builders: Vec<_> = data_types
        .map(|data_type| data_type.create_mutable_vector(row_num))
        .collect();

    let mut vectors = Vec::with_capacity(column_num);
    for (col_idx, builder) in builders.iter_mut().enumerate() {
        if column_needed[col_idx] {
            for row_idx in 0..row_num {
                let value = &provider.row_by_index(row_idx)[col_idx];
                builder.as_mut().push_value_ref(value.as_value_ref());
            }
            vectors.push(builder.to_vector());
        }
    }
    vectors
}

View File

@@ -1,251 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use api::v1::OpType;
use store_api::storage::SequenceNumber;
use super::MemtableRef;
use crate::error::Result;
use crate::memtable::KeyValues;
use crate::metrics::MEMTABLE_WRITE_ELAPSED;
use crate::write_batch::{Mutation, Payload};
/// Wraps logic of inserting key/values in [WriteBatch](crate::write_batch::WriteBatch) to [Memtable](crate::memtable::Memtable).
///
/// All rows written through one `Inserter` share the same sequence; the running
/// `index_in_batch` gives each `KeyValues` its start index in the batch.
pub struct Inserter {
/// Sequence of the batch to be inserted.
sequence: SequenceNumber,
/// Used to calculate the start index in batch for `KeyValues`.
index_in_batch: usize,
}
impl Inserter {
    /// Creates an inserter for a batch with the given `sequence`; the index in batch
    /// starts at zero.
    pub fn new(sequence: SequenceNumber) -> Inserter {
        Inserter {
            index_in_batch: 0,
            sequence,
        }
    }

    /// Insert write batch payload into memtable.
    ///
    /// Schemas are only validated in debug builds. The caller (mostly the
    /// `RegionWriter`) should ensure the schema of `memtable` is consistent with the
    /// schema of `payload`.
    ///
    /// The elapsed time is recorded in `MEMTABLE_WRITE_ELAPSED`, including the early
    /// return taken when `payload` is empty.
    pub fn insert_memtable(&mut self, payload: &Payload, memtable: &MemtableRef) -> Result<()> {
        let _timer = MEMTABLE_WRITE_ELAPSED.start_timer();

        if payload.is_empty() {
            return Ok(());
        }

        // This function only makes effect in debug mode.
        validate_input_and_memtable_schemas(payload, memtable);

        // Enough to hold all key or value columns.
        let column_capacity = payload.schema.num_columns();
        // One buffer reused for every mutation of the payload.
        let mut buffer = KeyValues {
            sequence: self.sequence,
            op_type: OpType::Put,
            start_index_in_batch: self.index_in_batch,
            keys: Vec::with_capacity(column_capacity),
            values: Vec::with_capacity(column_capacity),
            timestamp: None,
        };

        for mutation in &payload.mutations {
            self.write_one_mutation(mutation, memtable, &mut buffer)?;
        }
        Ok(())
    }

    /// Fills `kvs` with the columns of `mutation` and writes it to `memtable`, then
    /// advances the index in batch by the number of rows of the mutation.
    ///
    /// Key columns are the columns located before the timestamp column of the
    /// memtable schema.
    fn write_one_mutation(
        &mut self,
        mutation: &Mutation,
        memtable: &MemtableRef,
        kvs: &mut KeyValues,
    ) -> Result<()> {
        let schema = memtable.schema();
        let num_rows = mutation.record_batch.num_rows();
        let columns = &mutation.record_batch;

        kvs.reset(mutation.op_type, self.index_in_batch);

        let ts_idx = schema.timestamp_index();
        kvs.timestamp = Some(columns.column(ts_idx).clone());
        kvs.keys
            .extend((0..ts_idx).map(|key_idx| columns.column(key_idx).clone()));
        for value_idx in schema.value_indices() {
            let column = columns.column(value_idx).clone();
            kvs.values.push(column);
        }

        memtable.write(kvs)?;
        self.index_in_batch += num_rows;
        Ok(())
    }
}
/// Debug-only sanity check that the payload schema and the memtable's user schema
/// have the same version and the same column schemas. Does nothing when debug
/// assertions are disabled.
fn validate_input_and_memtable_schemas(payload: &Payload, memtable: &MemtableRef) {
    if !cfg!(debug_assertions) {
        return;
    }

    let payload_schema = &payload.schema;
    let memtable_schema = memtable.schema();
    let user_schema = memtable_schema.user_schema();

    debug_assert_eq!(payload_schema.version(), user_schema.version());
    // Only validate column schemas.
    debug_assert_eq!(
        payload_schema.column_schemas(),
        user_schema.column_schemas()
    );
}
/// Holds `start` and `end` indexes to get a slice `[start, end)` from the vector whose
/// timestamps belong to same time range at `range_index`.
///
/// NOTE(review): nothing in the visible part of this file uses this type.
#[derive(Debug, PartialEq)]
struct SliceIndex {
/// Inclusive start of the slice.
start: usize,
/// Exclusive end of the slice.
end: usize,
/// Index in time ranges.
range_index: usize,
}
#[cfg(test)]
mod tests {
    use std::collections::HashMap;
    use std::sync::Arc;
    use common_time::timestamp::Timestamp;
    use datatypes::type_id::LogicalTypeId;
    use datatypes::value::Value;
    use datatypes::vectors::{Int64Vector, TimestampMillisecondVector, VectorRef};
    use store_api::storage::WriteRequest;
    use super::*;
    use crate::memtable::{DefaultMemtableBuilder, IterContext, MemtableBuilder};
    use crate::metadata::RegionMetadata;
    use crate::schema::RegionSchemaRef;
    use crate::test_util::descriptor_util::RegionDescBuilder;
    use crate::test_util::write_batch_util;
    use crate::write_batch::WriteBatch;

    /// Builds an empty write batch with a millisecond timestamp column `ts` and a
    /// nullable `Int64` column `value`.
    fn new_test_write_batch() -> WriteBatch {
        let columns = [
            ("ts", LogicalTypeId::TimestampMillisecond, false),
            ("value", LogicalTypeId::Int64, true),
        ];
        write_batch_util::new_write_batch(&columns, Some(0), 1)
    }

    /// Builds the region schema matching `new_test_write_batch()`.
    fn new_region_schema() -> RegionSchemaRef {
        let builder = RegionDescBuilder::new("test")
            .timestamp(("ts", LogicalTypeId::TimestampMillisecond, false))
            .push_field_column(("value", LogicalTypeId::Int64, true));
        let metadata: RegionMetadata = builder.build().try_into().unwrap();
        metadata.schema().clone()
    }

    /// Appends one put mutation holding the `(ts, value)` pairs of `data` to `batch`.
    fn put_batch(batch: &mut WriteBatch, data: &[(i64, Option<i64>)]) {
        let ts_column = TimestampMillisecondVector::from_values(data.iter().map(|row| row.0));
        let value_column = Int64Vector::from(data.iter().map(|row| row.1).collect::<Vec<_>>());

        let mut columns = HashMap::with_capacity(2);
        let _ = columns.insert("ts".to_string(), Arc::new(ts_column) as VectorRef);
        let _ = columns.insert("value".to_string(), Arc::new(value_column) as VectorRef);
        batch.put(columns).unwrap();
    }

    /// Asserts that `mem` reports the given min/max timestamps and that iterating it
    /// yields exactly the rows of `data`, in order, all tagged with `sequence`.
    fn check_memtable_content(
        mem: &MemtableRef,
        sequence: SequenceNumber,
        data: &[(i64, Option<i64>)],
        max_ts: i64,
        min_ts: i64,
    ) {
        let iter = mem.iter(IterContext::default()).unwrap();
        assert_eq!(min_ts, mem.stats().min_timestamp.value());
        assert_eq!(max_ts, mem.stats().max_timestamp.value());

        let mut checked_rows = 0;
        for batch in iter {
            let batch = batch.unwrap();
            for row in 0..batch.column(0).len() {
                let actual_ts = batch.column(0).get(row);
                let actual_value = batch.column(1).get(row);
                let (expected_ts, expected_value) = data[checked_rows];

                assert_eq!(
                    Value::Timestamp(Timestamp::new_millisecond(expected_ts)),
                    actual_ts
                );
                assert_eq!(Value::from(expected_value), actual_value);
                assert_eq!(Value::from(sequence), batch.column(2).get(row));
                checked_rows += 1;
            }
        }
        assert_eq!(data.len(), checked_rows);
    }

    #[test]
    fn test_inserter_put_one_memtable() {
        let sequence = 11111;
        let mutable_memtable = DefaultMemtableBuilder::default().build(new_region_schema());
        let mut inserter = Inserter::new(sequence);

        let mut batch = new_test_write_batch();
        put_batch(&mut batch, &[(1, Some(1)), (2, None)]);
        // Also test multiple put data in one batch.
        let second_put = [
            (3, None),
            (2, None), // Duplicate entries in same put data.
            (2, Some(2)),
            (4, Some(4)),
            (201, Some(201)),
            (102, None),
            (101, Some(101)),
        ];
        put_batch(&mut batch, &second_put);

        inserter
            .insert_memtable(batch.payload(), &mutable_memtable)
            .unwrap();

        // Rows come back ordered by timestamp; for timestamp 2 the last written
        // row wins.
        let expected = [
            (1, Some(1)),
            (2, Some(2)),
            (3, None),
            (4, Some(4)),
            (101, Some(101)),
            (102, None),
            (201, Some(201)),
        ];
        check_memtable_content(&mutable_memtable, sequence, &expected, 201, 1);
    }
}

View File

@@ -1,595 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use common_time::Timestamp;
use datatypes::prelude::*;
use datatypes::timestamp::TimestampMillisecond;
use datatypes::type_id::LogicalTypeId;
use datatypes::vectors::{
TimestampMillisecondVector, TimestampMillisecondVectorBuilder, UInt64Vector,
UInt64VectorBuilder, UInt8Vector,
};
use super::*;
use crate::metadata::RegionMetadata;
use crate::schema::{ProjectedSchema, RegionSchemaRef};
use crate::test_util::descriptor_util::RegionDescBuilder;
// Schema for testing memtable:
// - key: the timestamp column provided by `RegionDescBuilder` (the helpers in this
//   file write it as millisecond timestamps; no other key column is added here),
// - value: v0 (UInt64, nullable), v1 (UInt64, nullable)
pub fn schema_for_test() -> RegionSchemaRef {
    // Just build a region desc and use its columns metadata.
    let region_desc = RegionDescBuilder::new("test")
        .push_field_column(("v0", LogicalTypeId::UInt64, true))
        .push_field_column(("v1", LogicalTypeId::UInt64, true))
        .build();
    let region_metadata: RegionMetadata = region_desc.try_into().unwrap();
    region_metadata.schema().clone()
}
/// Builds a `KeyValues` with no key columns besides the timestamp: `keys` become the
/// timestamp column and `values` become the two value columns.
///
/// Panics if `keys` and `values` differ in length.
fn kvs_for_test_with_index(
    sequence: SequenceNumber,
    op_type: OpType,
    start_index_in_batch: usize,
    keys: &[TimestampMillisecond],
    values: &[(Option<u64>, Option<u64>)],
) -> KeyValues {
    assert_eq!(keys.len(), values.len());

    let mut ts_builder = TimestampMillisecondVectorBuilder::with_capacity(keys.len());
    for ts in keys {
        ts_builder.push(Some(*ts));
    }
    let ts_col = Arc::new(ts_builder.finish()) as _;

    let mut first_builder = UInt64VectorBuilder::with_capacity(values.len());
    let mut second_builder = UInt64VectorBuilder::with_capacity(values.len());
    for (first, second) in values {
        first_builder.push(*first);
        second_builder.push(*second);
    }
    let row_values = vec![
        Arc::new(first_builder.finish()) as _,
        Arc::new(second_builder.finish()) as _,
    ];

    let kvs = KeyValues {
        sequence,
        op_type,
        start_index_in_batch,
        keys: vec![],
        values: row_values,
        timestamp: Some(ts_col),
    };

    // The row count of the result must match the input.
    assert_eq!(keys.len(), kvs.len());
    assert_eq!(keys.is_empty(), kvs.is_empty());
    kvs
}
/// Same as `kvs_for_test_with_index()` with a start index in batch of zero.
fn kvs_for_test(
    sequence: SequenceNumber,
    op_type: OpType,
    keys: &[TimestampMillisecond],
    values: &[(Option<u64>, Option<u64>)],
) -> KeyValues {
    let start_index_in_batch = 0;
    kvs_for_test_with_index(sequence, op_type, start_index_in_batch, keys, values)
}
/// Writes the given rows to `memtable` in a single `KeyValues`, converting the raw
/// `i64` keys into millisecond timestamps. Panics if the write fails.
pub fn write_kvs(
    memtable: &dyn Memtable,
    sequence: SequenceNumber,
    op_type: OpType,
    keys: &[i64],
    values: &[(Option<u64>, Option<u64>)],
) {
    let timestamps: Vec<TimestampMillisecond> = keys.iter().map(|&raw| raw.into()).collect();
    let kvs = kvs_for_test(sequence, op_type, &timestamps, values);
    memtable.write(&kvs).unwrap();
}
/// Asserts that `batch` has five columns and that they all have the same length.
fn check_batch_valid(batch: &Batch) {
    assert_eq!(5, batch.num_columns());

    let row_num = batch.column(0).len();
    for column_idx in 1..5 {
        let column = batch.column(column_idx);
        assert_eq!(row_num, column.len());
    }
}
/// Drains `iter` and asserts that its rows match the expected keys, values,
/// sequences and op types, in order, and that exactly `keys.len()` rows are yielded.
///
/// Batch columns are read as: 0 = key, 1 and 2 = values, 3 = sequence, 4 = op type.
fn check_iter_content(
    iter: &mut dyn BatchIterator,
    keys: &[i64],
    sequences: &[u64],
    op_types: &[OpType],
    values: &[(Option<u64>, Option<u64>)],
) {
    let keys: Vec<TimestampMillisecond> = keys.iter().map(|&raw| raw.into()).collect();

    let mut checked_rows = 0;
    for batch in iter {
        let batch = batch.unwrap();
        check_batch_valid(&batch);

        for row in 0..batch.column(0).len() {
            let actual_key = batch.column(0).get(row);
            let actual_v0 = batch.column(1).get(row);
            let actual_v1 = batch.column(2).get(row);
            let actual_sequence = batch.column(3).get(row);
            let actual_op_type = batch.column(4).get(row);

            assert_eq!(Value::from(keys[checked_rows]), actual_key);
            assert_eq!(Value::from(values[checked_rows].0), actual_v0);
            assert_eq!(Value::from(values[checked_rows].1), actual_v1);
            assert_eq!(Value::from(sequences[checked_rows]), actual_sequence);
            assert_eq!(Value::from(op_types[checked_rows] as u8), actual_op_type);
            checked_rows += 1;
        }
    }
    assert_eq!(keys.len(), checked_rows);
}
/// Test harness that runs a test case against a fresh memtable from every builder.
struct MemtableTester {
/// Schema used to build the memtables.
schema: RegionSchemaRef,
/// Builders of the memtable implementations under test.
builders: Vec<MemtableBuilderRef>,
}
impl Default for MemtableTester {
fn default() -> MemtableTester {
MemtableTester::new()
}
}
impl MemtableTester {
    /// Creates a tester using `schema_for_test()` and the default memtable builder.
    fn new() -> MemtableTester {
        MemtableTester {
            schema: schema_for_test(),
            builders: vec![Arc::new(DefaultMemtableBuilder::default()) as _],
        }
    }

    /// Builds one fresh memtable per registered builder.
    fn new_memtables(&self) -> Vec<MemtableRef> {
        let mut memtables = Vec::with_capacity(self.builders.len());
        for builder in &self.builders {
            memtables.push(builder.build(self.schema.clone()));
        }
        memtables
    }

    /// Runs `testcase` once per memtable, each time with a fresh `TestContext`.
    fn run_testcase<F>(&self, testcase: F)
    where
        F: Fn(TestContext),
    {
        self.new_memtables().into_iter().for_each(|memtable| {
            testcase(TestContext {
                schema: self.schema.clone(),
                memtable,
            })
        });
    }
}
/// Context handed to each test case by `MemtableTester::run_testcase()`.
struct TestContext {
/// Schema the memtable was built with.
schema: RegionSchemaRef,
/// Memtable under test.
memtable: MemtableRef,
}
/// Writes two batches (with duplicate keys inside the first one) and checks that
/// iteration returns the rows deduplicated and ordered by key, for several batch
/// sizes.
fn write_iter_memtable_case(ctx: &TestContext) {
    let memtable = &ctx.memtable;

    // An empty memtable yields nothing, even when polled twice, and reports no
    // allocated bytes.
    let mut empty_iter = memtable.iter(IterContext::default()).unwrap();
    assert!(empty_iter.next().is_none());
    assert!(empty_iter.next().is_none());
    assert_eq!(0, memtable.stats().bytes_allocated());

    // Init test data.
    write_kvs(
        &**memtable,
        10, // sequence
        OpType::Put,
        &[1000, 1000, 2002, 2003, 2003, 1001], // keys
        &[
            (Some(1), None),
            (Some(2), None),
            (Some(7), None),
            (Some(8), None),
            (Some(9), None),
            (Some(3), None),
        ], // values
    );
    write_kvs(
        &**memtable,
        11, // sequence
        OpType::Put,
        &[1002, 1003, 1004],                            // keys
        &[(None, None), (Some(5), None), (None, None)], // values
    );

    // 9 key value pairs (6 + 3).
    assert_eq!(576, memtable.stats().bytes_allocated());

    // For duplicate keys written in the same batch, the later row is expected.
    let expected_keys = [1000, 1001, 1002, 1003, 1004, 2002, 2003];
    let expected_sequences = [10, 10, 11, 11, 11, 10, 10];
    let expected_op_types = [OpType::Put; 7];
    let expected_values = [
        (Some(2), None),
        (Some(3), None),
        (None, None),
        (Some(5), None),
        (None, None),
        (Some(7), None),
        (Some(9), None),
    ];

    for batch_size in [1, 4, 8, consts::READ_BATCH_SIZE] {
        let iter_ctx = IterContext {
            batch_size,
            ..Default::default()
        };
        let mut iter = memtable.iter(iter_ctx).unwrap();
        assert_eq!(
            ctx.schema.user_schema(),
            iter.schema().projected_user_schema()
        );
        assert_eq!(RowOrdering::Key, iter.ordering());

        check_iter_content(
            &mut *iter,
            &expected_keys,
            &expected_sequences,
            &expected_op_types,
            &expected_values,
        );
    }
}
/// The default iteration context makes every sequence visible.
#[test]
fn test_iter_context_default() {
    let visible_sequence = IterContext::default().visible_sequence;
    assert_eq!(SequenceNumber::MAX, visible_sequence);
}
/// Runs `write_iter_memtable_case()` against every memtable implementation.
#[test]
fn test_write_iter_memtable() {
    MemtableTester::default().run_testcase(|ctx| write_iter_memtable_case(&ctx));
}
/// Drains `iter` and asserts that every batch holds `batch_size` rows, except that a
/// batch may hold fewer rows when fewer than `batch_size` of the `total` rows remain,
/// and that `total` rows are yielded overall.
fn check_iter_batch_size(iter: &mut dyn BatchIterator, total: usize, batch_size: usize) {
    let mut remains = total;
    for batch in iter {
        let batch = batch.unwrap();
        check_batch_valid(&batch);

        // A batch is full unless fewer than `batch_size` rows are left.
        let expected_rows = remains.min(batch_size);
        assert_eq!(expected_rows, batch.column(0).len());
        remains -= expected_rows;
    }
    assert_eq!(0, remains);
}
/// Batches respect the configured batch size; six written rows with two duplicated
/// keys are expected to yield four rows.
#[test]
fn test_iter_batch_size() {
    MemtableTester::default().run_testcase(|ctx| {
        write_kvs(
            &*ctx.memtable,
            10, // sequence
            OpType::Put,
            &[1000, 1000, 1001, 2002, 2003, 2003], // keys
            &[
                (Some(1), None),
                (Some(2), None),
                (Some(3), None),
                (Some(4), None),
                (None, None),
                (None, None),
            ], // values
        );

        let total = 4;
        // Batch size [less than, equal to, greater than] total
        for batch_size in [1, 6, 8] {
            let iter_ctx = IterContext {
                batch_size,
                ..Default::default()
            };
            let mut iter = ctx.memtable.iter(iter_ctx).unwrap();
            check_iter_batch_size(&mut *iter, total, batch_size);
        }
    });
}
/// A key rewritten with a higher sequence is returned once, with the newer value.
#[test]
fn test_duplicate_key_across_batch() {
    MemtableTester::default().run_testcase(|ctx| {
        write_kvs(
            &*ctx.memtable,
            10, // sequence
            OpType::Put,
            &[1000, 1001, 2000, 2001],                                    // keys
            &[(Some(1), None), (None, None), (None, None), (None, None)], // values
        );
        // Overwrite the first and the last key with a newer sequence.
        write_kvs(
            &*ctx.memtable,
            11, // sequence
            OpType::Put,
            &[1000, 2001],                             // keys
            &[(Some(1231), None), (Some(1232), None)], // values
        );

        let expected_keys = [1000, 1001, 2000, 2001];
        let expected_sequences = [11, 10, 10, 11];
        let expected_op_types = [OpType::Put; 4];
        let expected_values = [
            (Some(1231), None),
            (None, None),
            (None, None),
            (Some(1232), None),
        ];

        for batch_size in [1, 2, 3, 4, 5] {
            let iter_ctx = IterContext {
                batch_size,
                ..Default::default()
            };
            let mut iter = ctx.memtable.iter(iter_ctx).unwrap();
            check_iter_content(
                &mut *iter,
                &expected_keys,
                &expected_sequences,
                &expected_op_types,
                &expected_values,
            );
        }
    });
}
/// A key written twice in the same batch is returned only once.
#[test]
fn test_duplicate_key_in_batch() {
    MemtableTester::default().run_testcase(|ctx| {
        write_kvs(
            &*ctx.memtable,
            10, // sequence
            OpType::Put,
            &[1000, 1000, 1001, 2001],                                       // keys
            &[(None, None), (None, None), (Some(1234), None), (None, None)], // values
        );

        let expected_keys = [1000, 1001, 2001];
        let expected_sequences = [10, 10, 10];
        let expected_op_types = [OpType::Put; 3];
        let expected_values = [(None, None), (Some(1234), None), (None, None)];

        for batch_size in [1, 2, 3, 4, 5] {
            let iter_ctx = IterContext {
                batch_size,
                ..Default::default()
            };
            let mut iter = ctx.memtable.iter(iter_ctx).unwrap();
            check_iter_content(
                &mut *iter,
                &expected_keys,
                &expected_sequences,
                &expected_op_types,
                &expected_values,
            );
        }
    });
}
/// Rows written with a sequence above `visible_sequence` must not be returned, and
/// for a given key only the newest visible row is returned.
///
/// The key 1000 is written twice in each of three batches (sequences 10, 11, 12);
/// within a batch the later row is the expected one.
#[test]
fn test_sequence_visibility() {
    let tester = MemtableTester::default();
    tester.run_testcase(|ctx| {
        write_kvs(
            &*ctx.memtable,
            10, // sequence
            OpType::Put,
            &[1000, 1000],                       // keys
            &[(Some(1), None), (Some(2), None)], // values
        );
        write_kvs(
            &*ctx.memtable,
            11, // sequence
            OpType::Put,
            &[1000, 1000],                         // keys
            &[(Some(11), None), (Some(12), None)], // values
        );
        write_kvs(
            &*ctx.memtable,
            12, // sequence
            OpType::Put,
            &[1000, 1000],                         // keys
            &[(Some(21), None), (Some(22), None)], // values
        );

        // Nothing is visible below the first written sequence.
        {
            let iter_ctx = IterContext {
                batch_size: 1,
                visible_sequence: 9,
                projected_schema: None,
                time_range: None,
            };
            let mut iter = ctx.memtable.iter(iter_ctx).unwrap();
            check_iter_content(
                &mut *iter,
                &[], // keys
                &[], // sequences
                &[], // op_types
                &[], // values
            );
        }

        // Only the first batch is visible. Every expected slice holds exactly one
        // entry because exactly one row is expected.
        {
            let iter_ctx = IterContext {
                batch_size: 1,
                visible_sequence: 10,
                projected_schema: None,
                time_range: None,
            };
            let mut iter = ctx.memtable.iter(iter_ctx).unwrap();
            check_iter_content(
                &mut *iter,
                &[1000],            // keys
                &[10],              // sequences
                &[OpType::Put],     // op_types
                &[(Some(2), None)], // values
            );
        }

        // The first two batches are visible; the second one wins.
        {
            let iter_ctx = IterContext {
                batch_size: 1,
                visible_sequence: 11,
                projected_schema: None,
                time_range: None,
            };
            let mut iter = ctx.memtable.iter(iter_ctx).unwrap();
            check_iter_content(
                &mut *iter,
                &[1000],             // keys
                &[11],               // sequences
                &[OpType::Put],      // op_types
                &[(Some(12), None)], // values
            );
        }
    });
}
/// Once the iterator has returned `None`, polling it again keeps returning `None`.
#[test]
fn test_iter_after_none() {
    MemtableTester::default().run_testcase(|ctx| {
        write_kvs(
            &*ctx.memtable,
            10, // sequence
            OpType::Put,
            &[1000, 1001, 1002],                                  // keys
            &[(Some(0), None), (Some(1), None), (Some(2), None)], // values
        );

        // A batch size of 4 fits all three rows in the first batch.
        let iter_ctx = IterContext {
            batch_size: 4,
            ..Default::default()
        };
        let mut iter = ctx.memtable.iter(iter_ctx).unwrap();

        let _ = iter.next().unwrap();
        for _ in 0..2 {
            assert!(iter.next().is_none());
        }
    });
}
/// Rows outside the requested time range are filtered out: with the range built from
/// 0 and 1001, only the row at 1000 is expected.
#[test]
fn test_filter_memtable() {
    MemtableTester::default().run_testcase(|ctx| {
        write_kvs(
            &*ctx.memtable,
            10, // sequence
            OpType::Put,
            &[1000, 1001, 1002],                                  // keys
            &[(Some(0), None), (Some(1), None), (Some(2), None)], // values
        );

        let time_range = TimestampRange::new(
            Timestamp::new_millisecond(0),
            Timestamp::new_millisecond(1001),
        )
        .unwrap();
        let iter_ctx = IterContext {
            batch_size: 4,
            time_range: Some(time_range),
            ..Default::default()
        };

        let mut iter = ctx.memtable.iter(iter_ctx).unwrap();
        let batch = iter.next().unwrap().unwrap();
        assert_eq!(5, batch.columns.len());

        let expected_ts = Arc::new(TimestampMillisecondVector::from_slice([1000])) as Arc<_>;
        assert_eq!(expected_ts, batch.columns[0]);
    });
}
/// Projecting a single field column still returns the key and the internal columns.
#[test]
fn test_memtable_projection() {
    let tester = MemtableTester::default();
    // Only need v1 (the column at index 2), but row key columns and internal columns
    // would also be read.
    let projected_schema =
        Arc::new(ProjectedSchema::new(tester.schema.clone(), Some(vec![2])).unwrap());
    tester.run_testcase(|ctx| {
        write_kvs(
            &*ctx.memtable,
            9, // sequence
            OpType::Put,
            &[1000, 1001, 1002], // keys
            &[
                (Some(10), Some(20)),
                (Some(11), Some(21)),
                (Some(12), Some(22)),
            ], // values (v0, v1)
        );
        let iter_ctx = IterContext {
            batch_size: 4,
            projected_schema: Some(projected_schema.clone()),
            ..Default::default()
        };
        let mut iter = ctx.memtable.iter(iter_ctx).unwrap();
        let batch = iter.next().unwrap().unwrap();
        assert!(iter.next().is_none());

        // key + projected field + sequence + op type.
        assert_eq!(4, batch.num_columns());
        let k0 = Arc::new(TimestampMillisecondVector::from_slice([1000, 1001, 1002])) as VectorRef;
        // The projected field holds the second element of each written value pair.
        let v1 = Arc::new(UInt64Vector::from_slice([20, 21, 22])) as VectorRef;
        let sequences = Arc::new(UInt64Vector::from_slice([9, 9, 9])) as VectorRef;
        let op_types = Arc::new(UInt8Vector::from_slice([1, 1, 1])) as VectorRef;
        assert_eq!(k0, *batch.column(0));
        assert_eq!(v1, *batch.column(1));
        assert_eq!(sequences, *batch.column(2));
        assert_eq!(op_types, *batch.column(3));
    });
}

View File

@@ -1,166 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::cmp::Ordering;
use common_time::RangeMillis;
use crate::memtable::{MemtableId, MemtableRef};
/// A version of all memtables.
///
/// This structure is immutable now: the methods that change the set of memtables
/// return a new `MemtableVersion` instead of modifying `self`.
#[derive(Debug)]
pub struct MemtableVersion {
/// The memtable returned by `mutable_memtable()`.
mutable: MemtableRef,
/// Immutable memtables.
immutables: Vec<MemtableRef>,
}
impl MemtableVersion {
    /// Creates a version holding only the given mutable memtable.
    pub fn new(mutable: MemtableRef) -> MemtableVersion {
        MemtableVersion {
            mutable,
            immutables: Vec::new(),
        }
    }

    /// Returns the mutable memtable.
    #[inline]
    pub fn mutable_memtable(&self) -> &MemtableRef {
        &self.mutable
    }

    /// Returns the immutable memtables.
    #[inline]
    pub fn immutable_memtables(&self) -> &[MemtableRef] {
        &self.immutables
    }

    /// Returns the number of memtables, mutable one included.
    pub fn num_memtables(&self) -> usize {
        // the leading `1` is for `mutable`
        1 + self.immutable_memtables().len()
    }

    /// Clone current memtable version and freeze its mutable memtables, which moves
    /// all mutable memtables to immutable memtable list.
    ///
    /// This method also calls [Memtable::mark_immutable()](crate::memtable::Memtable::mark_immutable()) to
    /// mark the mutable memtable as immutable.
    pub fn freeze_mutable(&self, new_mutable: MemtableRef) -> MemtableVersion {
        // Marks the mutable memtable as immutable so it can free the memory usage from our
        // soft limit.
        self.mutable.mark_immutable();

        let mut immutables = Vec::with_capacity(self.immutables.len() + 1);
        immutables.extend(self.immutables.iter().cloned());
        immutables.push(self.mutable.clone());

        MemtableVersion {
            mutable: new_mutable,
            immutables,
        }
    }

    /// Returns the bytes allocated by the mutable memtable.
    pub fn mutable_bytes_allocated(&self) -> usize {
        let stats = self.mutable.stats();
        stats.bytes_allocated()
    }

    /// Returns the bytes allocated by all memtables, mutable and immutable.
    pub fn total_bytes_allocated(&self) -> usize {
        let immutable_bytes: usize = self
            .immutables
            .iter()
            .map(|memtable| memtable.stats().bytes_allocated())
            .sum();
        immutable_bytes + self.mutable.stats().bytes_allocated()
    }

    /// Creates a new `MemtableVersion` that removes immutable memtables
    /// less than or equal to max_memtable_id.
    pub fn remove_immutables(&self, max_memtable_id: MemtableId) -> MemtableVersion {
        let mut immutables = self.immutables.clone();
        immutables.retain(|memtable| memtable.id() > max_memtable_id);

        MemtableVersion {
            mutable: self.mutable.clone(),
            immutables,
        }
    }

    /// Returns all immutable memtables together with the largest id among them
    /// (`None` when there is no immutable memtable).
    pub fn memtables_to_flush(&self) -> (Option<MemtableId>, Vec<MemtableRef>) {
        let memtables = self.immutables.clone();
        let max_memtable_id = memtables.iter().map(|memtable| memtable.id()).max();
        (max_memtable_id, memtables)
    }
}
// We use a new type to order time ranges by (end, start); the ordering itself is
// defined by the `Ord` implementation of `RangeKey`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct RangeKey(RangeMillis);
impl Ord for RangeKey {
    /// Compares by range end first, then by range start.
    fn cmp(&self, other: &RangeKey) -> Ordering {
        match self.0.end().cmp(other.0.end()) {
            Ordering::Equal => self.0.start().cmp(other.0.start()),
            unequal => unequal,
        }
    }
}
impl PartialOrd for RangeKey {
    /// Delegates to the total order defined by `Ord`, so the result is always `Some`.
    fn partial_cmp(&self, other: &RangeKey) -> Option<Ordering> {
        let ordering = Ord::cmp(self, other);
        Some(ordering)
    }
}
#[cfg(test)]
mod tests {
    use std::sync::Arc;
    use super::*;
    use crate::memtable::{DefaultMemtableBuilder, MemtableBuilder};
    use crate::test_util::schema_util;

    /// Walks a version through freeze / flush-selection / removal and checks the
    /// memtable ids and counts at every step. The asserted ids follow the order in
    /// which the memtables are built (0, 1, 2).
    #[test]
    fn test_memtable_version() {
        let builder = DefaultMemtableBuilder::default();
        let schema = Arc::new(schema_util::new_region_schema(1, 1));

        // A new version only has its mutable memtable.
        let first = builder.build(schema.clone());
        let v1 = MemtableVersion::new(first);
        assert_eq!(1, v1.num_memtables());

        // Freeze and add new mutable.
        let second = builder.build(schema.clone());
        let v2 = v1.freeze_mutable(second);
        let frozen = v2.immutable_memtables();
        assert_eq!(1, frozen.len());
        assert_eq!(0, frozen[0].id());
        assert_eq!(1, v2.mutable_memtable().id());
        assert_eq!(2, v2.num_memtables());

        // Add another one and check immutable memtables that need flush
        let third = builder.build(schema);
        let v3 = v2.freeze_mutable(third);
        let (max_table_id, to_flush) = v3.memtables_to_flush();
        assert_eq!(1, max_table_id.unwrap());
        assert_eq!(2, to_flush.len());

        // Remove memtables
        let v4 = v3.remove_immutables(1);
        assert_eq!(1, v4.num_memtables());
        assert_eq!(0, v4.immutable_memtables().len());
        assert_eq!(2, v4.mutable_memtable().id());
    }
}

File diff suppressed because it is too large Load Diff

View File

@@ -1,66 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! storage metrics
use lazy_static::lazy_static;
use prometheus::*;
/// Label name carrying the reason of a flush; used as the label of
/// [`FLUSH_REQUESTS_TOTAL`].
pub const FLUSH_REASON: &str = "reason";
// Every metric below is registered on first access; the `unwrap()` calls panic
// if the registration macro returns an error.
lazy_static! {
/// Elapsed time of updating manifest when creating regions.
pub static ref CREATE_REGION_UPDATE_MANIFEST: Histogram =
register_histogram!("storage_create_region_update_manifest", "storage create region update manifest").unwrap();
/// Counter of scheduled flush requests.
pub static ref FLUSH_REQUESTS_TOTAL: IntCounterVec =
register_int_counter_vec!("storage_flush_requests_total", "storage flush requests total", &[FLUSH_REASON]).unwrap();
/// Counter of scheduled failed flush jobs.
pub static ref FLUSH_ERRORS_TOTAL: IntCounter =
register_int_counter!("storage_flush_errors_total", "storage flush errors total").unwrap();
/// Elapsed time of a flush job.
pub static ref FLUSH_ELAPSED: Histogram =
register_histogram!("storage_flush_elapsed", "storage flush elapsed").unwrap();
/// Counter of flushed bytes.
pub static ref FLUSH_BYTES_TOTAL: IntCounter =
register_int_counter!("storage_flush_bytes_total", "storage flush bytes total").unwrap();
/// Gauge for open regions.
pub static ref REGION_COUNT: IntGauge =
register_int_gauge!("storage_region_count", "storage region count").unwrap();
/// Timer for logstore write.
pub static ref LOG_STORE_WRITE_ELAPSED: Histogram =
register_histogram!("storage_logstore_write_elapsed", "storage logstore write elapsed").unwrap();
/// Elapsed time of a compact job.
pub static ref COMPACT_ELAPSED: Histogram =
register_histogram!("storage_compact_elapsed", "storage compact elapsed").unwrap();
/// Elapsed time for merging SST files.
pub static ref MERGE_ELAPSED: Histogram =
register_histogram!("storage_compaction_merge_elapsed", "storage compaction merge elapsed").unwrap();
/// Global write buffer size in bytes.
pub static ref WRITE_BUFFER_BYTES: IntGauge =
register_int_gauge!("storage_write_buffer_bytes", "storage write buffer bytes").unwrap();
/// Elapsed time of inserting memtable.
pub static ref MEMTABLE_WRITE_ELAPSED: Histogram =
register_histogram!("storage_memtable_write_elapsed", "storage memtable write elapsed").unwrap();
/// Elapsed time of preprocessing write batch.
pub static ref PREPROCESS_ELAPSED: Histogram =
register_histogram!("storage_write_preprocess_elapsed", "storage write preprocess elapsed").unwrap();
/// Elapsed time for windowed scan.
// NOTE(review): unlike the metrics above, this one and the next use a `query_` name
// prefix instead of `storage_` — confirm that is intentional.
pub static ref WINDOW_SCAN_ELAPSED: Histogram =
register_histogram!("query_scan_window_scan_elapsed", "query scan window scan elapsed").unwrap();
/// Rows per window during window scan.
pub static ref WINDOW_SCAN_ROWS_PER_WINDOW: Histogram =
register_histogram!("query_scan_window_scan_window_row_size", "query scan window scan window row size").unwrap();
}

View File

@@ -1,15 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
pub mod wal;

View File

@@ -1,40 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Clippy is silenced for this whole module; it pulls in the items that tonic
// generated for the `greptime.storage.wal.v1` protobuf package.
#![allow(clippy::all)]
tonic::include_proto!("greptime.storage.wal.v1");
use api::v1::OpType;
use crate::write_batch::Payload;
/// Maps every mutation of `payload` to the `i32` value of the matching WAL
/// [`MutationType`].
///
/// The returned vector has one entry per mutation, in the payload's order.
pub fn gen_mutation_types(payload: &Payload) -> Vec<i32> {
    let mut mutation_types: Vec<i32> = Vec::new();
    for mutation in payload.mutations.iter() {
        let mutation_type = match mutation.op_type {
            OpType::Put => MutationType::Put,
            OpType::Delete => MutationType::Delete,
        };
        mutation_types.push(mutation_type.into());
    }
    mutation_types
}
impl WalHeader {
pub fn with_last_manifest_version(last_manifest_version: u64) -> Self {
Self {
last_manifest_version,
..Default::default()
}
}
}

View File

@@ -1,271 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Common structs and utilities for read.
mod chain;
mod dedup;
mod merge;
mod windowed;
use std::cmp::Ordering;
use async_trait::async_trait;
use common_base::BitVec;
use datatypes::data_type::DataType;
use datatypes::prelude::ConcreteDataType;
use datatypes::vectors::{BooleanVector, MutableVector, VectorRef};
use snafu::{ensure, ResultExt};
use crate::error::{self, Result};
pub use crate::read::chain::ChainReader;
pub use crate::read::dedup::DedupReader;
pub use crate::read::merge::{MergeReader, MergeReaderBuilder};
pub use crate::read::windowed::WindowedReader;
/// Storage internal representation of a batch of rows.
// Now the structure of `Batch` is still unstable, all pub fields may be changed.
#[derive(Debug, Default, PartialEq, Eq, Clone)]
pub struct Batch {
    /// Rows stored column by column.
    ///
    /// Columns follow the same order convention as the region schema:
    /// key, value, internal columns.
    pub columns: Vec<VectorRef>,
}

impl Batch {
    /// Builds a `Batch` that owns `columns`.
    ///
    /// # Panics
    /// Panics if the vectors in `columns` do not all have the same length.
    pub fn new(columns: Vec<VectorRef>) -> Batch {
        Self::assert_columns(&columns);
        Self { columns }
    }

    /// Number of columns in the batch.
    #[inline]
    pub fn num_columns(&self) -> usize {
        self.columns.len()
    }

    /// Number of rows, taken from the first column; `0` when there are no columns.
    #[inline]
    pub fn num_rows(&self) -> usize {
        match self.columns.first() {
            Some(first) => first.len(),
            None => 0,
        }
    }

    /// Returns true if the batch has no rows.
    #[inline]
    pub fn is_empty(&self) -> bool {
        0 == self.num_rows()
    }

    /// All columns of the batch.
    #[inline]
    pub fn columns(&self) -> &[VectorRef] {
        self.columns.as_slice()
    }

    /// The column at position `idx`.
    ///
    /// # Panics
    /// Panics if `idx` is out of bounds.
    #[inline]
    pub fn column(&self, idx: usize) -> &VectorRef {
        &self.columns[idx]
    }

    /// Slices every column with the same `offset` and `length` and returns the
    /// result as a new batch.
    ///
    /// # Panics
    /// Panics if `offset + length > self.num_rows()`.
    fn slice(&self, offset: usize, length: usize) -> Batch {
        let mut columns = Vec::with_capacity(self.columns.len());
        for column in &self.columns {
            columns.push(column.slice(offset, length));
        }
        Batch { columns }
    }

    /// Asserts that all `columns` have the same length; an empty slice is accepted.
    fn assert_columns(columns: &[VectorRef]) {
        if let Some(first) = columns.first() {
            let length = first.len();
            assert!(columns.iter().all(|col| col.len() == length));
        }
    }
}
/// Compute operations for [Batch].
pub trait BatchOp {
/// Compare the `i-th` row in `left` to the `j-th` row in `right` by key (row key +
/// internal columns).
///
/// The caller should ensure `left` and `right` have the same schema as `self`.
///
/// # Panics
/// Panics if
/// - `i` or `j` is out of bound.
/// - `left` or `right` has insufficient column num.
fn compare_row(&self, left: &Batch, i: usize, right: &Batch, j: usize) -> Ordering;
/// Find unique rows in `batch` by row key.
///
/// If `prev` is `Some` and not empty, the last row of `prev` would be used to dedup
/// current `batch`. Set `i-th` bit of `selected` to `true` if `i-th` row is unique,
/// which means the row key of `i-th` row is different from `i+1-th`'s.
///
/// NOTE(review): the `DedupReader` tests keep the first row of every key, which
/// suggests the comparison is against the previous (`i-1`-th) row rather than the
/// `i+1`-th one — confirm against the implementation.
///
/// The caller could use `selected` to build a [BooleanVector] to filter the
/// batch, and must ensure `selected` is initialized by filling `batch.num_rows()` bits
/// to zero.
///
/// # Panics
/// Panics if
/// - `batch` and `prev` have different number of columns (unless `prev` is
/// empty).
/// - `selected.len()` is less than the number of rows.
fn find_unique(&self, batch: &Batch, selected: &mut BitVec, prev: Option<&Batch>);
/// Filters the `batch`, returning the elements matching the `filter` (i.e. where the
/// values are true).
///
/// Note that nulls in `filter` are interpreted as `false`, so the corresponding
/// elements are masked out.
fn filter(&self, batch: &Batch, filter: &BooleanVector) -> Result<Batch>;
/// Unselect deleted rows according to the [`OpType`](api::v1::OpType).
///
/// # Panics
/// Panics if
/// - `batch` doesn't have a valid op type column.
/// - `selected.len()` is less than the number of rows.
fn unselect_deleted(&self, batch: &Batch, selected: &mut BitVec);
}
/// Reusable [Batch] builder.
pub struct BatchBuilder {
    /// One mutable vector per column of the batch being built.
    builders: Vec<Box<dyn MutableVector>>,
}

impl BatchBuilder {
    /// Creates a builder with one mutable vector per entry of `types`, each
    /// created with the given `capacity`.
    ///
    /// # Panics
    /// Panics if `types` is empty.
    pub fn with_capacity<'a, I>(types: I, capacity: usize) -> BatchBuilder
    where
        I: IntoIterator<Item = &'a ConcreteDataType>,
    {
        let mut builders = Vec::new();
        for data_type in types {
            builders.push(data_type.create_mutable_vector(capacity));
        }
        assert!(!builders.is_empty());

        BatchBuilder { builders }
    }

    /// Number of rows currently buffered, read from the first column builder.
    #[inline]
    pub fn num_rows(&self) -> usize {
        let first = &self.builders[0];
        first.len()
    }

    /// Returns true if the builder holds no rows.
    #[inline]
    pub fn is_empty(&self) -> bool {
        0 == self.num_rows()
    }

    /// Appends `length` rows of `batch`, starting at `offset`, to the builder.
    ///
    /// # Panics
    /// Panics if
    /// - `offset + length > batch.num_rows()`.
    /// - Number of columns in `batch` is not equal to the builder's.
    pub fn extend_slice_of(&mut self, batch: &Batch, offset: usize, length: usize) -> Result<()> {
        assert_eq!(self.builders.len(), batch.num_columns());

        let pairs = batch.columns().iter().zip(self.builders.iter_mut());
        for (column, builder) in pairs {
            builder
                .extend_slice_of(&**column, offset, length)
                .context(error::PushBatchSnafu)?;
        }

        Ok(())
    }

    /// Appends the `i-th` row of `batch` to the builder.
    ///
    /// # Panics
    /// Panics if
    /// - `i` is out of bound.
    /// - Number of columns in `batch` is not equal to the builder's.
    pub fn push_row_of(&mut self, batch: &Batch, i: usize) -> Result<()> {
        assert_eq!(self.builders.len(), batch.num_columns());

        let pairs = batch.columns().iter().zip(self.builders.iter_mut());
        for (column, builder) in pairs {
            builder
                .try_push_value_ref(column.get_ref(i))
                .context(error::PushBatchSnafu)?;
        }

        Ok(())
    }

    /// Create a new [Batch] and reset this builder.
    ///
    /// Returns an error if the column builders do not all hold the same number
    /// of rows.
    pub fn build(&mut self) -> Result<Batch> {
        // Every column builder must agree with the row count of the first one.
        let rows = self.num_rows();
        for (index, builder) in self.builders.iter().enumerate() {
            let actual = builder.len();
            ensure!(
                rows == actual,
                error::BuildBatchSnafu {
                    msg: format!(
                        "expect row num {} but builder {} has {}",
                        rows, index, actual
                    ),
                }
            );
        }

        let mut columns = Vec::with_capacity(self.builders.len());
        for builder in self.builders.iter_mut() {
            columns.push(builder.to_vector());
        }

        Ok(Batch { columns })
    }
}
/// Async batch reader.
#[async_trait]
pub trait BatchReader: Send {
// TODO(yingwen): Schema of batch.
/// Fetch next [Batch].
///
/// Returns `Ok(None)` when the reader has reached its end; further calls to
/// `next_batch()` will not yield any more batches.
///
/// If `Err` is returned, the caller **must not** call this method again; the
/// implementor may or may not panic in such case.
async fn next_batch(&mut self) -> Result<Option<Batch>>;
}
/// Boxed, dynamically dispatched [BatchReader].
pub type BoxedBatchReader = Box<dyn BatchReader>;
/// Lets a boxed reader (including `Box<dyn BatchReader>`) be used wherever a
/// [BatchReader] is expected by forwarding to the boxed value.
#[async_trait::async_trait]
impl<T: BatchReader + ?Sized> BatchReader for Box<T> {
    async fn next_batch(&mut self) -> Result<Option<Batch>> {
        let inner: &mut T = &mut **self;
        inner.next_batch().await
    }
}

View File

@@ -1,124 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use crate::error::Result;
use crate::read::{Batch, BatchReader};
use crate::schema::ProjectedSchemaRef;
/// A reader that simply chains the outputs of its input readers.
pub struct ChainReader<R> {
    /// Schema to read.
    pub schema: ProjectedSchemaRef,
    /// Each reader reads a slice of time window.
    ///
    /// Kept in reverse order: the reader to consume next is the last element.
    pub readers: Vec<R>,
}

impl<R> ChainReader<R> {
    /// Returns a new [ChainReader] that reads from `readers` in the given order.
    pub fn new(schema: ProjectedSchemaRef, readers: Vec<R>) -> Self {
        // Readers are consumed from the back of the vector, so store them reversed.
        let readers: Vec<R> = readers.into_iter().rev().collect();
        Self { schema, readers }
    }
}
#[async_trait::async_trait]
impl<R> BatchReader for ChainReader<R>
where
    R: BatchReader,
{
    /// Returns the next batch of the current reader, dropping readers as they
    /// run out; yields `Ok(None)` once no reader is left.
    async fn next_batch(&mut self) -> Result<Option<Batch>> {
        loop {
            let current = match self.readers.last_mut() {
                Some(reader) => reader,
                // All readers have been consumed.
                None => return Ok(None),
            };

            match current.next_batch().await? {
                Some(batch) => return Ok(Some(batch)),
                None => {
                    // Remove the exhausted reader and try the next one.
                    self.readers.pop();
                }
            }
        }
    }
}
// Unit tests for `ChainReader`.
#[cfg(test)]
mod tests {
use super::*;
use crate::test_util::read_util::{self, Batches, VecBatchReader};
// Builds a `ChainReader` with one `VecBatchReader` per entry of `sources`.
fn build_chain_reader(sources: &[Batches]) -> ChainReader<VecBatchReader> {
let schema = read_util::new_projected_schema();
let readers = sources
.iter()
.map(|source| read_util::build_vec_reader(source))
.collect();
ChainReader::new(schema, readers)
}
// Checks that `reader` yields all rows of `input`, flattened in input order, and
// that it keeps returning `None` afterwards.
async fn check_chain_reader_result(
mut reader: ChainReader<VecBatchReader>,
input: &[Batches<'_>],
) {
let expect: Vec<_> = input
.iter()
.flat_map(|v| v.iter())
.flat_map(|v| v.iter().copied())
.collect();
let result = read_util::collect_kv_batch(&mut reader).await;
assert_eq!(expect, result);
// Call next_batch() again is allowed.
assert!(reader.next_batch().await.unwrap().is_none());
}
// A chain without any reader yields nothing.
#[tokio::test]
async fn test_chain_empty() {
let mut reader = build_chain_reader(&[]);
assert!(reader.next_batch().await.unwrap().is_none());
// Call next_batch() again is allowed.
assert!(reader.next_batch().await.unwrap().is_none());
}
// A chain over a single reader yields that reader's batches.
#[tokio::test]
async fn test_chain_one() {
let input: &[Batches] = &[&[
&[(1, Some(1)), (2, Some(2))],
&[(3, Some(3)), (4, Some(4))],
&[(5, Some(5))],
]];
let reader = build_chain_reader(input);
check_chain_reader_result(reader, input).await;
}
// A chain over several readers yields their batches one reader after another.
#[tokio::test]
async fn test_chain_multi() {
let input: &[Batches] = &[
&[
&[(1, Some(1)), (2, Some(2))],
&[(3, Some(3)), (4, Some(4))],
&[(5, Some(5))],
],
&[&[(6, Some(3)), (7, Some(4)), (8, Some(8))], &[(9, Some(9))]],
&[&[(10, Some(10)), (11, Some(11))], &[(12, Some(12))]],
];
let reader = build_chain_reader(input);
check_chain_reader_result(reader, input).await;
}
}

View File

@@ -1,181 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use async_trait::async_trait;
use common_base::BitVec;
use datatypes::prelude::ScalarVector;
use datatypes::vectors::BooleanVector;
use crate::error::Result;
use crate::read::{Batch, BatchOp, BatchReader};
use crate::schema::ProjectedSchemaRef;
/// A reader that removes duplicated rows coming from an inner reader.
pub struct DedupReader<R> {
    /// Projected schema to read.
    schema: ProjectedSchemaRef,
    /// The inner reader.
    reader: R,
    /// Last non-empty batch fetched from the inner reader, before filtering.
    prev_batch: Option<Batch>,
    /// Reused bitmap buffer marking the rows to keep.
    selected: BitVec,
}

impl<R> DedupReader<R> {
    /// Wraps `reader` so that the rows it yields are deduplicated according to `schema`.
    pub fn new(schema: ProjectedSchemaRef, reader: R) -> DedupReader<R> {
        Self {
            schema,
            reader,
            prev_batch: None,
            selected: BitVec::default(),
        }
    }

    /// Consumes `batch` and returns a new batch without duplicated rows.
    ///
    /// The returned `Batch` may be empty.
    fn dedup_batch(&mut self, batch: Batch) -> Result<Batch> {
        // An empty batch is passed through and does not replace `prev_batch`.
        if batch.is_empty() {
            return Ok(batch);
        }

        // Reset the bitmap to `batch.num_rows()` zero bits.
        let num_rows = batch.num_rows();
        self.selected.clear();
        self.selected.resize(num_rows, false);

        let prev = self.prev_batch.as_ref();
        self.schema.find_unique(&batch, &mut self.selected, prev);

        // Remember the unfiltered batch so the next batch can be compared with it.
        // The batch is stored before filtering mainly for correctness: rows with
        // `OpType::Delete` are removed by the filter, and the filtered batch could
        // then expose an incorrect `last row` of the previous batch.
        match self.prev_batch.as_mut() {
            Some(stored) => stored.clone_from(&batch),
            None => self.prev_batch = Some(batch.clone()),
        }

        // Clear the `selected` bit of every row whose op type is `OpType::Delete`.
        self.schema.unselect_deleted(&batch, &mut self.selected);

        // Keep only the rows that are still selected.
        let keep = BooleanVector::from_iterator(self.selected.iter().by_vals());
        self.schema.filter(&batch, &keep)
    }
}
#[async_trait]
impl<R: BatchReader> BatchReader for DedupReader<R> {
    /// Returns the next non-empty deduplicated batch, or `Ok(None)` once the
    /// inner reader is exhausted.
    async fn next_batch(&mut self) -> Result<Option<Batch>> {
        loop {
            let batch = match self.reader.next_batch().await? {
                Some(batch) => batch,
                None => return Ok(None),
            };

            let deduped = self.dedup_batch(batch)?;
            // Batches left empty by deduplication are skipped.
            if deduped.is_empty() {
                continue;
            }
            return Ok(Some(deduped));
        }
    }
}
// Unit tests for `DedupReader`.
#[cfg(test)]
mod tests {
use api::v1::OpType;
use super::*;
use crate::test_util::read_util;
// A reader without input yields nothing.
#[tokio::test]
async fn test_dedup_reader_empty() {
let schema = read_util::new_projected_schema();
let reader = read_util::build_vec_reader(&[]);
let mut reader = DedupReader::new(schema, reader);
assert!(reader.next_batch().await.unwrap().is_none());
// Call next_batch() again is allowed.
assert!(reader.next_batch().await.unwrap().is_none());
}
// Rows sharing a key, within a batch or across adjacent batches, are expected to
// collapse to the first row listed for that key.
#[tokio::test]
async fn test_dedup_by_sequence() {
let schema = read_util::new_projected_schema();
let reader = read_util::build_full_vec_reader(&[
// key, value, sequence, op_type
&[
(100, 1, 1000, OpType::Put),
(100, 2, 999, OpType::Put),
(100, 3, 998, OpType::Put),
(101, 1, 1000, OpType::Put),
],
&[
(101, 2, 999, OpType::Put),
(102, 12, 1000, OpType::Put),
(103, 13, 1000, OpType::Put),
],
&[(103, 2, 999, OpType::Put)],
]);
let mut reader = DedupReader::new(schema, reader);
let result = read_util::collect_kv_batch(&mut reader).await;
let expect = [
(100, Some(1)),
(101, Some(1)),
(102, Some(12)),
(103, Some(13)),
];
assert_eq!(&expect, &result[..]);
}
// An empty batch between two batches must not break deduplication across them.
#[tokio::test]
async fn test_dedup_contains_empty_input() {
let schema = read_util::new_projected_schema();
let reader = read_util::build_full_vec_reader(&[
// key, value, sequence, op_type
&[
(100, 1, 1000, OpType::Put),
(100, 2, 999, OpType::Put),
(101, 1, 1000, OpType::Put),
],
&[],
&[(101, 2, 999, OpType::Put), (102, 12, 1000, OpType::Put)],
]);
let mut reader = DedupReader::new(schema, reader);
let result = read_util::collect_kv_batch(&mut reader).await;
let expect = [(100, Some(1)), (101, Some(1)), (102, Some(12))];
assert_eq!(&expect, &result[..]);
}
// Batches that contain only duplicates of an earlier key contribute no rows to
// the result.
#[tokio::test]
async fn test_dedup_contains_empty_output() {
let schema = read_util::new_projected_schema();
let reader = read_util::build_full_vec_reader(&[
// key, value, sequence, op_type
&[
(100, 1, 1000, OpType::Put),
(100, 2, 999, OpType::Put),
(101, 1, 1000, OpType::Put),
],
&[(101, 2, 999, OpType::Put)],
&[(101, 3, 998, OpType::Put), (101, 4, 997, OpType::Put)],
&[(102, 12, 998, OpType::Put)],
]);
let mut reader = DedupReader::new(schema, reader);
let result = read_util::collect_kv_batch(&mut reader).await;
let expect = [(100, Some(1)), (101, Some(1)), (102, Some(12))];
assert_eq!(&expect, &result[..]);
}
}

View File

@@ -1,828 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Merge reader.
//!
//! The implementation of [`MergeReader`] is inspired by
//! [`kudu's MergeIterator`](https://github.com/apache/kudu/blob/9021f275824faa2bdfe699786957c40c219697c1/src/kudu/common/generic_iterators.cc#L107)
//! and [`CeresDB's MergeIterator`](https://github.com/CeresDB/ceresdb/blob/02a7e3100f47cf16aa6c245ed529a6978be20fbd/analytic_engine/src/row_iter/merge.rs)
//!
//! The main idea of the merge algorithm is to maintain a `merge window`. The window describes,
//! at any given time, the key range where we expect to find the row with the smallest key.
//! A [`Node`] (known as the sub-iterator in kudu) whose NEXT overlaps with the `merge window`
//! is said to be actively participating in the merge.
//!
//! The `merge window` is defined as follows:
//! 1. The window's start is the smallest lower bound of all nodes. We
//! refer to the node that owns this lower bound as LOW.
//! 2. The window's end is the smallest upper bound of all nodes whose
//! lower bounds are less than or equal to LOW's upper bound.
//! 2a. The window's end could be LOW's upper bound itself, if it is the smallest
//! upper bound, but this isn't necessarily the case.
//! 3. The merge window's dimensions change as the merge proceeds, though it
//! only ever moves "to the right" (i.e. the window start/end only increase).
//!
//! We can divide the nodes into two sets: HOT, for the nodes whose next rows overlap with the
//! `merge window`, and COLD, for the nodes whose next rows do not. The merge steady state
//! resembles that of a traditional heap-based merge: the top-most node is popped from HOT, the
//! lower bound is copied to the output and advanced, and the node is pushed back to HOT.
//!
//! In the steady state, we need to move nodes from COLD to HOT whenever the end of the merge window
//! moves; that's a sign that the window may now overlap with a NEXT belonging to a node in the
//! second set (COLD). The end of the merge window moves when a node is fully exhausted (i.e. all rows have
//! been copied to the output), or when a node finishes its NEXT and needs to peek again.
//!
//! At any given time, the NEXT belonging to the top-most node in COLD is nearest the merge window.
//! When the merge window's end has moved and we need to refill HOT, the top-most node in COLD is
//! the best candidate. To figure out whether it should be moved, we compare its NEXT's lower bound
//! against the upper bound in HOT's first node: if the lower bound is less than or equal to the key,
//! we move the node from COLD to HOT. On the flip side, when a node from HOT finishes its NEXT and peeks
//! again, we also need to check whether it has exited the merge window. The approach is similar: if
//! its NEXT's lower bound is greater than the upper bound of HOT's first node, it's time to move it to COLD.
//!
//! A full description of the merge algorithm could be found in [`kudu's comment`](https://github.com/apache/kudu/blob/9021f275824faa2bdfe699786957c40c219697c1/src/kudu/common/generic_iterators.cc#L349)
//! and the [google doc](https://docs.google.com/document/d/1uP0ubjM6ulnKVCRrXtwT_dqrTWjF9tlFSRk0JN2e_O0/edit#).
use std::cmp::Ordering;
use std::collections::BinaryHeap;
use std::fmt;
use async_trait::async_trait;
use store_api::storage::consts;
use crate::error::Result;
use crate::memtable::BoxedBatchIterator;
use crate::read::{Batch, BatchBuilder, BatchOp, BatchReader, BoxedBatchReader};
use crate::schema::{ProjectedSchema, ProjectedSchemaRef};
/// Batch data source.
enum Source {
    // To avoid the overhead of async-trait (typically a heap allocation), wraps the
    // BatchIterator into an enum instead of converting the iterator into a BatchReader.
    Iter(BoxedBatchIterator),
    Reader(BoxedBatchReader),
}

impl Source {
    /// Fetches the next batch from the wrapped iterator or reader.
    ///
    /// Returns `Ok(None)` when the source has nothing more to yield.
    async fn next_batch(&mut self) -> Result<Option<Batch>> {
        match self {
            Source::Reader(reader) => reader.next_batch().await,
            Source::Iter(iter) => match iter.next() {
                Some(result) => result.map(Some),
                None => Ok(None),
            },
        }
    }

    /// Fetches the next batch that contains at least one row, skipping empty ones.
    async fn next_non_empty_batch(&mut self) -> Result<Option<Batch>> {
        loop {
            match self.next_batch().await? {
                Some(batch) if batch.is_empty() => continue,
                // Either a non-empty batch or the end of the source.
                other => return Ok(other),
            }
        }
    }
}
impl fmt::Debug for Source {
    /// Prints only the variant name; the wrapped iterator/reader is elided.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let label = match self {
            Source::Iter(_) => "Iter(..)",
            Source::Reader(_) => "Reader(..)",
        };
        f.write_str(label)
    }
}
/// Reference to a row in [BatchCursor].
#[derive(Debug)]
struct RowCursor<'a> {
    /// Batch that contains the row.
    batch: &'a Batch,
    /// Index of the row inside `batch`.
    pos: usize,
}

impl RowCursor<'_> {
    /// Compares this row with `other` using `schema.compare_row`.
    #[inline]
    fn compare(&self, schema: &ProjectedSchema, other: &RowCursor) -> Ordering {
        let (left, i) = (self.batch, self.pos);
        let (right, j) = (other.batch, other.pos);
        schema.compare_row(left, i, right, j)
    }
}
/// A `BatchCursor` wraps a `Batch` and allows reading it row by row.
#[derive(Debug)]
struct BatchCursor {
    /// Current buffered `Batch`.
    ///
    /// `Batch` must contain at least one row.
    batch: Batch,
    /// Index of current row.
    ///
    /// `pos == batch.num_rows()` indicates no more rows to read.
    pos: usize,
}

impl BatchCursor {
    /// Creates a cursor positioned at the first row of `batch`.
    ///
    /// # Panics
    /// Panics if `batch` is empty.
    fn new(batch: Batch) -> BatchCursor {
        assert!(!batch.is_empty());
        Self { batch, pos: 0 }
    }

    /// Returns true if there are remaining rows to read.
    #[inline]
    fn is_valid(&self) -> bool {
        self.pos < self.batch.num_rows()
    }

    /// Returns the row at the cursor, i.e. the first unread row of the batch.
    ///
    /// # Panics
    /// Panics if `self` is invalid.
    fn first_row(&self) -> RowCursor<'_> {
        assert!(self.is_valid());
        let pos = self.pos;
        RowCursor {
            batch: &self.batch,
            pos,
        }
    }

    /// Returns the last row of the batch.
    ///
    /// # Panics
    /// Panics if `self` is invalid.
    fn last_row(&self) -> RowCursor<'_> {
        assert!(self.is_valid());
        let pos = self.batch.num_rows() - 1;
        RowCursor {
            batch: &self.batch,
            pos,
        }
    }

    /// Returns true if every row of the batch has been read.
    #[inline]
    fn is_empty(&self) -> bool {
        self.pos >= self.batch.num_rows()
    }

    /// Takes a slice of at most `length` rows starting at the cursor, then
    /// advances the cursor past the rows taken.
    ///
    /// `length` is clamped to the number of rows left after the cursor.
    fn take_batch_slice(&mut self, length: usize) -> Batch {
        let remaining = self.batch.num_rows() - self.pos;
        let taken = self.batch.slice(self.pos, length.min(remaining));
        self.pos += taken.num_rows();
        taken
    }

    /// Appends at most `length` rows starting at the cursor to `builder`, then
    /// advances the cursor past the rows pushed.
    ///
    /// `length` is clamped to the number of rows left after the cursor. The
    /// cursor is not advanced if extending the builder fails.
    fn push_rows_to(&mut self, builder: &mut BatchBuilder, length: usize) -> Result<()> {
        let remaining = self.batch.num_rows() - self.pos;
        let count = length.min(remaining);
        builder.extend_slice_of(&self.batch, self.pos, count)?;
        self.pos += count;
        Ok(())
    }

    /// Appends the row at the cursor to `builder` and advances the cursor by one.
    ///
    /// The cursor is not advanced if pushing the row fails.
    fn push_next_row_to(&mut self, builder: &mut BatchBuilder) -> Result<()> {
        let row = self.pos;
        builder.push_row_of(&self.batch, row)?;
        self.pos = row + 1;
        Ok(())
    }
}
/// A `Node` represents an individual input data source to be merged.
struct Node {
    /// Schema of data source.
    schema: ProjectedSchemaRef,
    /// Data source of this `Node`.
    source: Source,
    /// Cursor over the batch currently being read.
    ///
    /// `None` means the `source` has reached EOF.
    cursor: Option<BatchCursor>,
}

impl fmt::Debug for Node {
    /// Prints the `source` and `cursor` fields; the schema is left out.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let mut debug = f.debug_struct("Node");
        debug.field("source", &self.source);
        debug.field("cursor", &self.cursor);
        debug.finish_non_exhaustive()
    }
}
impl Node {
    /// Creates a node over `source`, eagerly fetching its first non-empty batch.
    ///
    /// The node starts at EOF if the source has no non-empty batch.
    async fn new(schema: ProjectedSchemaRef, mut source: Source) -> Result<Node> {
        let first_batch = source.next_non_empty_batch().await?;
        Ok(Node {
            schema,
            source,
            cursor: first_batch.map(BatchCursor::new),
        })
    }

    /// Returns the reference to the cursor.
    ///
    /// # Panics
    /// Panics if `self` is EOF.
    fn cursor_ref(&self) -> &BatchCursor {
        self.cursor.as_ref().unwrap()
    }

    /// Returns first row in cursor.
    ///
    /// # Panics
    /// Panics if `self` is EOF.
    fn first_row(&self) -> RowCursor<'_> {
        let cursor = self.cursor_ref();
        cursor.first_row()
    }

    /// Returns last row in cursor.
    ///
    /// # Panics
    /// Panics if `self` is EOF.
    fn last_row(&self) -> RowCursor<'_> {
        let cursor = self.cursor_ref();
        cursor.last_row()
    }

    /// Compares the first row of `self` with the first row of `other`, using the
    /// schema of `self`.
    ///
    /// # Panics
    /// Panics if
    /// - either `self` or `other` is EOF.
    fn compare_first_row(&self, other: &Node) -> Ordering {
        let mine = self.first_row();
        let theirs = other.first_row();
        mine.compare(&self.schema, &theirs)
    }

    /// Returns true if no more batch could be fetched from this node.
    fn is_eof(&self) -> bool {
        self.cursor.is_none()
    }

    /// Returns true if the key range of current batch in `self` is behind (exclusive) current
    /// batch in `other`.
    ///
    /// # Panics
    /// Panics if
    /// - either `self` or `other` is EOF.
    fn is_behind(&self, other: &Node) -> bool {
        // `self` is after `other` when the smallest (first) row of `self` is
        // greater than the largest (last) row of `other`.
        let first = self.first_row();
        let last = other.last_row();
        first.compare(&self.schema, &last).is_gt()
    }

    /// Fetches the next batch and resets the cursor when `self` isn't EOF and
    /// the current cursor has no rows left.
    ///
    /// Returns true if a new batch has been fetched.
    async fn maybe_fetch_next_batch(&mut self) -> Result<bool> {
        let exhausted = match &self.cursor {
            Some(cursor) => cursor.is_empty(),
            None => false,
        };
        if !exhausted {
            // Either EOF already or rows remain: nothing to fetch.
            return Ok(false);
        }

        // Afterwards the cursor is either non-empty or `None` (EOF).
        let next = self.source.next_non_empty_batch().await?;
        let fetched = next.is_some();
        self.cursor = next.map(BatchCursor::new);
        Ok(fetched)
    }

    /// Returns the mutable reference to the cursor.
    ///
    /// # Panics
    /// Panics if `self` is EOF.
    fn cursor_mut(&mut self) -> &mut BatchCursor {
        self.cursor.as_mut().unwrap()
    }

    /// Takes a slice of at most `length` rows from this node.
    ///
    /// # Panics
    /// Panics if `self` is EOF.
    fn take_batch_slice(&mut self, length: usize) -> Batch {
        let cursor = self.cursor_mut();
        cursor.take_batch_slice(length)
    }

    /// Pushes at most `length` rows from `self` to the `builder`.
    ///
    /// # Panics
    /// Panics if `self` is EOF.
    fn push_rows_to(&mut self, builder: &mut BatchBuilder, length: usize) -> Result<()> {
        let cursor = self.cursor_mut();
        cursor.push_rows_to(builder, length)
    }

    /// Pushes the next row from `self` to the `builder`.
    ///
    /// # Panics
    /// Panics if `self` is EOF.
    fn push_next_row_to(&mut self, builder: &mut BatchBuilder) -> Result<()> {
        let cursor = self.cursor_mut();
        cursor.push_next_row_to(builder)
    }
}
impl PartialEq for Node {
fn eq(&self, other: &Node) -> bool {
self.compare_first_row(other) == Ordering::Equal
}
}
impl Eq for Node {}
impl PartialOrd for Node {
fn partial_cmp(&self, other: &Node) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl Ord for Node {
fn cmp(&self, other: &Node) -> Ordering {
// The std binary heap is a max heap, but we want the nodes are ordered in
// ascend order, so we compare the nodes in reverse order.
other.compare_first_row(self)
}
}
/// A reader that sorts and merges `Batch`es from multiple sources by key.
///
/// `Batch`es from each `Source` **must** be sorted.
pub struct MergeReader {
/// Whether the reader has been initialized.
initialized: bool,
/// Schema of the data sources.
schema: ProjectedSchemaRef,
/// Input data sources.
///
/// All data sources must have the same schema. Initializing the reader
/// converts all `Source`s into `Node`s and drains this vector.
sources: Vec<Source>,
/// Holds `Node`s whose current batch key range **is** overlapped with the merge window.
///
/// `Node`s in this heap **must** not be empty. The `merge window` is the key range of the
/// root node in the `hot` heap.
hot: BinaryHeap<Node>,
/// Holds `Node`s whose current batch key range **isn't** overlapped with the merge window.
///
/// `Node`s in this heap **must** not be empty.
cold: BinaryHeap<Node>,
/// Suggested number of rows in each batch.
///
/// The size of a batch yielded from this reader may not always equal this suggested size.
batch_size: usize,
/// Builder that buffers merged rows until a batch is emitted.
batch_builder: BatchBuilder,
}
#[async_trait]
impl BatchReader for MergeReader {
    /// Yields the next merged batch, or `None` once every source is exhausted.
    async fn next_batch(&mut self) -> Result<Option<Batch>> {
        let batch = self.fetch_next_batch().await?;
        Ok(batch)
    }
}
/// Builder that collects sources and options for a [MergeReader].
pub struct MergeReaderBuilder {
/// Schema handed to the built reader.
schema: ProjectedSchemaRef,
/// Sources pushed so far, in insertion order.
sources: Vec<Source>,
/// Suggested number of rows per output batch.
batch_size: usize,
}
impl MergeReaderBuilder {
    /// Creates a builder without reserving space for sources.
    pub fn new(schema: ProjectedSchemaRef) -> Self {
        Self::with_capacity(schema, 0)
    }

    /// Creates a builder that reserves space for `capacity` sources and uses
    /// `consts::READ_BATCH_SIZE` as the initial batch size.
    pub fn with_capacity(schema: ProjectedSchemaRef, capacity: usize) -> Self {
        let sources = Vec::with_capacity(capacity);
        Self {
            schema,
            sources,
            batch_size: consts::READ_BATCH_SIZE,
        }
    }

    /// Adds a batch iterator as a source.
    pub fn push_batch_iter(mut self, iter: BoxedBatchIterator) -> Self {
        let source = Source::Iter(iter);
        self.sources.push(source);
        self
    }

    /// Adds a batch reader as a source.
    pub fn push_batch_reader(mut self, reader: BoxedBatchReader) -> Self {
        let source = Source::Reader(reader);
        self.sources.push(source);
        self
    }

    /// Overrides the suggested number of rows per output batch.
    pub fn batch_size(mut self, size: usize) -> Self {
        self.batch_size = size;
        self
    }

    /// Consumes the builder and creates an uninitialized [MergeReader].
    ///
    /// Both heaps are sized for the number of sources, and the batch builder is created
    /// from the data types of the columns to read.
    pub fn build(self) -> MergeReader {
        let Self {
            schema,
            sources,
            batch_size,
        } = self;
        let num_sources = sources.len();
        let column_schemas = schema.schema_to_read().schema().column_schemas();
        let data_types = column_schemas.iter().map(|c| &c.data_type);
        let batch_builder = BatchBuilder::with_capacity(data_types, batch_size);
        MergeReader {
            initialized: false,
            schema,
            sources,
            hot: BinaryHeap::with_capacity(num_sources),
            cold: BinaryHeap::with_capacity(num_sources),
            batch_size,
            batch_builder,
        }
    }
}
impl MergeReader {
    /// Builds the heaps from the sources on first use; later calls return immediately.
    async fn try_init(&mut self) -> Result<()> {
        if self.initialized {
            return Ok(());
        }
        if !self.sources.is_empty() {
            for source in self.sources.drain(..) {
                let node = Node::new(self.schema.clone(), source).await?;
                // Sources that are already exhausted never enter a heap.
                if node.is_eof() {
                    continue;
                }
                self.cold.push(node);
            }
            self.refill_hot();
        }
        self.initialized = true;
        Ok(())
    }

    /// Produces the next output batch, or `None` when nothing is left to read.
    async fn fetch_next_batch(&mut self) -> Result<Option<Batch>> {
        self.try_init().await?;
        while !self.hot.is_empty() && self.batch_builder.num_rows() < self.batch_size {
            if self.hot.len() > 1 {
                // Several nodes overlap, so only one row can be taken from the hottest node
                // before the order has to be re-evaluated.
                self.fetch_one_row_from_hottest().await?;
                continue;
            }
            // Only one node is in the hot heap, no merge sort is needed.
            let fetch_row_num = self.batch_size - self.batch_builder.num_rows();
            let direct = self.fetch_batch_from_hottest(fetch_row_num).await?;
            if let Some(batch) = direct {
                // The builder was empty and a slice was taken straight from the node.
                return Ok(Some(batch));
            }
            // Otherwise the rows went into the builder; keep looping.
        }
        // Flush whatever has been buffered in the builder.
        if self.batch_builder.is_empty() {
            return Ok(None);
        }
        self.batch_builder.build().map(Some)
    }

    /// Moves nodes from the `cold` heap to the `hot` heap while their key range overlaps
    /// with the current merge window.
    fn refill_hot(&mut self) {
        while let Some(warmest) = self.cold.peek() {
            if let Some(merge_window) = self.hot.peek() {
                if warmest.is_behind(merge_window) {
                    // The warmest cold node is entirely after the merge window, so no
                    // further node has to take part in the merge sort.
                    break;
                }
            }
            let warmest = self.cold.pop().unwrap();
            self.hot.push(warmest);
        }
    }

    /// Fetches at most `fetch_row_num` rows from the only hot node.
    ///
    /// When `self.batch_builder` is empty the rows are returned directly as a batch;
    /// otherwise they are pushed into the builder and `None` is returned.
    async fn fetch_batch_from_hottest(&mut self, fetch_row_num: usize) -> Result<Option<Batch>> {
        assert_eq!(1, self.hot.len());
        let mut node = self.hot.pop().unwrap();
        let mut output = None;
        if self.batch_builder.is_empty() {
            output = Some(node.take_batch_slice(fetch_row_num));
        } else {
            node.push_rows_to(&mut self.batch_builder, fetch_row_num)?;
        }
        self.reheap(node).await?;
        Ok(output)
    }

    /// Pushes a single row from the hottest node into the builder.
    async fn fetch_one_row_from_hottest(&mut self) -> Result<()> {
        let mut node = self.hot.pop().unwrap();
        node.push_next_row_to(&mut self.batch_builder)?;
        self.reheap(node).await
    }

    /// Lets `node` fetch its next batch if needed, then puts it back into the proper heap
    /// (or drops it when it reached EOF).
    async fn reheap(&mut self, mut node: Node) -> Result<()> {
        let fetched_new_batch = node.maybe_fetch_next_batch().await?;
        if node.is_eof() {
            // The node is gone, so the merge window changes; refill the hot heap.
            self.refill_hot();
            return Ok(());
        }
        if !fetched_new_batch {
            // Same batch as before: the end key of the merge window is unchanged and the
            // node simply returns to the hot heap.
            self.hot.push(node);
            return Ok(());
        }
        // A new batch changed the key range of this node; pick a heap for it.
        let node_is_cold = match self.hot.peek() {
            // Cold only if the node is now entirely behind the hottest node.
            Some(hottest) => node.is_behind(hottest),
            // Setting this to false should not affect correctness but performance because
            // `refill_hot()` ensures the hottest node is correct.
            None => true,
        };
        let target = if node_is_cold {
            &mut self.cold
        } else {
            &mut self.hot
        };
        target.push(node);
        // Either way the merge window has changed, so refill the hot heap.
        self.refill_hot();
        Ok(())
    }
}
#[cfg(test)]
mod tests {
use datatypes::prelude::ScalarVector;
use datatypes::vectors::{Int64Vector, TimestampMillisecondVector};
use super::*;
use crate::test_util::read_util::{self, Batches};
#[tokio::test]
async fn test_merge_reader_empty() {
    let mut reader = MergeReaderBuilder::new(read_util::new_projected_schema()).build();
    // A reader without sources yields nothing, and polling it again is allowed.
    for _ in 0..2 {
        let batch = reader.next_batch().await.unwrap();
        assert!(batch.is_none());
    }
}
// Checks the reversed ordering, equality and `Debug` output of `Node`.
#[tokio::test]
async fn test_node() {
let schema = read_util::new_projected_schema();
// Left node is backed by an iterator source, right node by a reader source.
let left_source = read_util::build_boxed_iter(&[&[(1, None), (3, None), (5, None)]]);
let mut left = Node::new(schema.clone(), Source::Iter(left_source))
.await
.unwrap();
let right_source = read_util::build_boxed_reader(&[&[(2, None), (3, None), (6, None)]]);
let mut right = Node::new(schema.clone(), Source::Reader(right_source))
.await
.unwrap();
// We use reverse order for a node: the smaller first row (1 < 2) compares as greater.
assert!(left > right);
assert_ne!(left, right);
// Advance the left and right node so both point at key 3.
left.cursor_mut().pos += 1;
right.cursor_mut().pos += 1;
assert_eq!(left, right);
// Check Debug is implemented.
let output = format!("{left:?}");
assert!(output.contains("cursor"));
assert!(output.contains("pos: 1"));
let output = format!("{right:?}");
assert!(output.contains("cursor"));
let output = format!("{:?}", left.first_row());
assert!(output.contains("pos: 1"));
}
/// Builds a [MergeReader] over `sources`.
///
/// The first `num_iter` sources are registered as batch iterators, the rest as batch readers.
fn build_merge_reader(sources: &[Batches], num_iter: usize, batch_size: usize) -> MergeReader {
    let schema = read_util::new_projected_schema();
    let builder = MergeReaderBuilder::with_capacity(schema, sources.len()).batch_size(batch_size);
    sources
        .iter()
        .enumerate()
        .fold(builder, |builder, (idx, source)| {
            if idx < num_iter {
                builder.push_batch_iter(read_util::build_boxed_iter(source))
            } else {
                builder.push_batch_reader(read_util::build_boxed_reader(source))
            }
        })
        .build()
}
async fn check_merge_reader_result(mut reader: MergeReader, input: &[Batches<'_>]) {
let mut expect: Vec<_> = input
.iter()
.flat_map(|v| v.iter())
.flat_map(|v| v.iter().copied())
.collect();
expect.sort_by_key(|k| k.0);
let result = read_util::collect_kv_batch(&mut reader).await;
assert_eq!(expect, result);
// Call next_batch() again is allowed.
assert!(reader.next_batch().await.unwrap().is_none());
}
/// Asserts that `reader` yields exactly the batches in `expect_batches`, in order.
///
/// Each batch is decoded into `(timestamp, value)` pairs from column 0
/// (`TimestampMillisecondVector`) and column 1 (`Int64Vector`).
async fn check_merge_reader_by_batch(mut reader: MergeReader, expect_batches: Batches<'_>) {
    let mut result = Vec::new();
    while let Some(batch) = reader.next_batch().await.unwrap() {
        let key = batch
            .column(0)
            .as_any()
            .downcast_ref::<TimestampMillisecondVector>()
            .unwrap();
        let value = batch
            .column(1)
            .as_any()
            .downcast_ref::<Int64Vector>()
            .unwrap();
        let batch: Vec<_> = key
            .iter_data()
            .zip(value.iter_data())
            .map(|(k, v)| (k.unwrap().into(), v))
            .collect();
        result.push(batch);
    }
    // Compare batch by batch first so a mismatch reports the offending rows.
    for (expect, actual) in expect_batches.iter().zip(result.iter()) {
        assert_eq!(expect, actual);
    }
    // `zip` stops at the shorter side, so missing or extra batches would otherwise
    // go unnoticed; check the batch count explicitly.
    assert_eq!(
        expect_batches.len(),
        result.len(),
        "unexpected number of batches"
    );
}
// Merges several sources whose key ranges interleave and checks both the merged rows
// and the exact batch boundaries.
#[tokio::test]
async fn test_merge_multiple_interleave() {
common_telemetry::init_default_ut_logging();
// Case 1: three single-batch sources; only the merged row order is checked.
let input: &[Batches] = &[
&[&[(1, Some(1)), (5, Some(5)), (9, Some(9))]],
&[&[(2, Some(2)), (3, Some(3)), (8, Some(8))]],
&[&[(7, Some(7)), (12, Some(12))]],
];
let reader = build_merge_reader(input, 1, 3);
check_merge_reader_result(reader, input).await;
// Case 2: batch size 3; the first source has leading batches that overlap nothing else.
let input: &[Batches] = &[
&[
&[(1, Some(1)), (2, Some(2))],
&[(3, Some(3)), (4, Some(4))],
&[(5, Some(5)), (12, Some(12))],
],
&[&[(6, Some(6)), (7, Some(7)), (18, Some(18))]],
&[&[(13, Some(13)), (15, Some(15))]],
];
let reader = build_merge_reader(input, 1, 3);
check_merge_reader_by_batch(
reader,
&[
// The former two batches could be returned directly.
&[(1, Some(1)), (2, Some(2))],
&[(3, Some(3)), (4, Some(4))],
&[(5, Some(5)), (6, Some(6)), (7, Some(7))],
&[(12, Some(12)), (13, Some(13)), (15, Some(15))],
&[(18, Some(18))],
],
)
.await;
// Case 3: batch size 2; a batch of the second source falls inside the key range of
// a batch of the first source.
let input: &[Batches] = &[
&[
&[(1, Some(1)), (2, Some(2))],
&[(5, Some(5)), (9, Some(9))],
&[(14, Some(14)), (17, Some(17))],
],
&[&[(6, Some(6)), (7, Some(7))], &[(15, Some(15))]],
];
let reader = build_merge_reader(input, 1, 2);
check_merge_reader_by_batch(
reader,
&[
&[(1, Some(1)), (2, Some(2))],
// Could not return batch (6, 7) directly.
&[(5, Some(5)), (6, Some(6))],
&[(7, Some(7)), (9, Some(9))],
&[(14, Some(14)), (15, Some(15))],
&[(17, Some(17))],
],
)
.await;
}
// A single source with two batches of three rows, read with a batch size of 2.
#[tokio::test]
async fn test_merge_one_source() {
common_telemetry::init_default_ut_logging();
let input: &[Batches] = &[&[
&[(1, Some(1)), (2, Some(2)), (3, Some(3))],
&[(4, Some(4)), (5, Some(5)), (6, Some(6))],
]];
let reader = build_merge_reader(input, 1, 2);
check_merge_reader_result(reader, input).await;
}
// Sources contain empty batches in the middle and at the end; the merged rows must
// still match the sorted input.
#[tokio::test]
async fn test_merge_with_empty_batch() {
let input: &[Batches] = &[
&[
&[(1, Some(1)), (2, Some(2))],
&[(3, Some(3)), (6, Some(6))],
&[],
&[],
&[(8, Some(8)), (12, Some(12))],
&[],
],
&[
&[(4, Some(4)), (5, Some(5))],
&[],
&[(15, None), (18, None), (20, None)],
],
&[&[(13, Some(13)), (19, None)], &[], &[]],
];
let reader = build_merge_reader(input, 1, 2);
check_merge_reader_result(reader, input).await;
}
// Several sources contain the same keys (e.g. 1, 8, 12, 15, 16); duplicates must all
// be kept in the merged output.
#[tokio::test]
async fn test_merge_duplicate_key() {
let input: &[Batches] = &[
&[
&[(1, Some(1)), (5, Some(5)), (8, Some(8))],
&[(9, None), (11, None)],
&[(12, Some(12)), (15, None)],
],
&[&[(1, Some(1)), (3, Some(3)), (8, Some(8))], &[(16, None)]],
&[
&[(7, Some(7)), (12, Some(12))],
&[(15, None), (16, None), (17, None)],
],
&[&[(15, None)]],
];
// The first two sources are iterators, the remaining two are readers.
let reader = build_merge_reader(input, 2, 2);
check_merge_reader_result(reader, input).await;
}
}

View File

@@ -1,171 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use arrow::compute::SortOptions;
use arrow::row::{RowConverter, SortField};
use arrow_array::{Array, ArrayRef};
use common_recordbatch::OrderOption;
use datatypes::data_type::DataType;
use datatypes::vectors::Helper;
use snafu::ResultExt;
use crate::error::{self, Result};
use crate::read::{Batch, BatchReader};
use crate::schema::{ProjectedSchemaRef, StoreSchema};
/// [WindowedReader] provides a windowed record batch reader that scans all rows within a window
/// at a time and sorts these rows according to `order_options`.
pub struct WindowedReader<R> {
/// Schema to read
pub schema: ProjectedSchemaRef,
/// Each reader reads a slice of time window; readers are consumed from the back of the vector.
pub readers: Vec<R>,
/// `order_options` defines how records within windows are sorted.
pub order_options: Vec<OrderOption>,
}
impl<R> WindowedReader<R> {
/// Creates a new [WindowedReader] from given schema and a set of boxed readers.
///
/// ### Note
/// [WindowedReader] always reads the readers in a reverse order. The last reader in `readers`
/// gets polled first.
pub fn new(
schema: ProjectedSchemaRef,
readers: Vec<R>,
order_options: Vec<OrderOption>,
) -> Self {
Self {
schema,
readers,
order_options,
}
}
}
#[async_trait::async_trait]
impl<R> BatchReader for WindowedReader<R>
where
R: BatchReader,
{
async fn next_batch(&mut self) -> Result<Option<Batch>> {
let _window_scan_elapsed = crate::metrics::WINDOW_SCAN_ELAPSED.start_timer();
let Some(mut reader) = self.readers.pop() else {
return Ok(None);
};
let store_schema = self.schema.schema_to_read();
let mut batches = vec![];
while let Some(batch) = reader.next_batch().await? {
batches.push(
batch
.columns
.into_iter()
.map(|v| v.to_arrow_array())
.collect::<Vec<_>>(),
);
}
let Some(num_columns) = batches.get(0).map(|b| b.len()) else {
// the reader does not yield data, a batch of empty vectors must be returned instead of
// an empty batch without any column.
let empty_columns = store_schema
.columns()
.iter()
.map(|s| s.desc.data_type.create_mutable_vector(0).to_vector())
.collect();
return Ok(Some(Batch::new(empty_columns)));
};
let mut vectors_in_batch = Vec::with_capacity(num_columns);
for idx in 0..num_columns {
let columns: Vec<&dyn Array> =
batches.iter().map(|b| b[idx].as_ref()).collect::<Vec<_>>();
vectors_in_batch
.push(arrow::compute::concat(&columns).context(error::ConvertColumnsToRowsSnafu)?);
}
if let Some(v) = vectors_in_batch.get(0) {
crate::metrics::WINDOW_SCAN_ROWS_PER_WINDOW.observe(v.len() as f64);
}
let sorted = sort_by_rows(&self.schema, vectors_in_batch, &self.order_options)?;
let vectors = sorted
.iter()
.zip(store_schema.columns().iter().map(|c| &c.desc.name))
.map(|(arr, name)| {
Helper::try_into_vector(arr).context(error::ConvertChunkSnafu { name })
})
.collect::<Result<_>>()?;
Ok(Some(Batch::new(vectors)))
}
}
fn sort_by_rows(
schema: &ProjectedSchemaRef,
arrays: Vec<ArrayRef>,
order_options: &[OrderOption],
) -> Result<Vec<ArrayRef>> {
let store_schema = schema.schema_to_read();
let sort_columns = build_sorted_columns(store_schema, order_options);
// Convert columns to rows to speed lexicographic sort
// TODO(hl): maybe optimize to lexsort_to_index when only timestamp column is involved.
let row_converter = RowConverter::new(
sort_columns
.iter()
.map(|(idx, descending)| {
SortField::new_with_options(
store_schema.columns()[*idx].desc.data_type.as_arrow_type(),
SortOptions {
descending: *descending,
nulls_first: true,
},
)
})
.collect(),
)
.context(error::ConvertColumnsToRowsSnafu)?;
let columns_to_sort = sort_columns
.into_iter()
.map(|(idx, _)| arrays[idx].clone())
.collect::<Vec<_>>();
let rows_to_sort = row_converter
.convert_columns(&columns_to_sort)
.context(error::ConvertColumnsToRowsSnafu)?;
let mut sort_pairs = rows_to_sort.iter().enumerate().collect::<Vec<_>>();
sort_pairs.sort_unstable_by(|(_, a), (_, b)| a.cmp(b));
let idx =
arrow::array::UInt32Array::from_iter_values(sort_pairs.iter().map(|(i, _)| *i as u32));
let sorted = arrays
.iter()
.map(|arr| arrow::compute::take(arr, &idx, None))
.collect::<arrow::error::Result<Vec<_>>>()
.context(error::SortArraysSnafu)?;
debug_assert_eq!(sorted.len(), store_schema.num_columns());
Ok(sorted)
}
/// Resolves `order_options` against `schema`.
///
/// Returns, for each option in order, the index of the column to sort by and its sort
/// order (`true` means descending).
fn build_sorted_columns(schema: &StoreSchema, order_options: &[OrderOption]) -> Vec<(usize, bool)> {
    let mut columns = Vec::with_capacity(order_options.len());
    for option in order_options {
        let index = schema.column_index(&option.name);
        columns.push((index, option.options.descending));
    }
    columns
}

View File

@@ -1,808 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#[cfg(test)]
mod tests;
mod writer;
use std::collections::BTreeMap;
use std::fmt;
use std::sync::atomic::{AtomicI64, Ordering};
use std::sync::Arc;
use std::time::Duration;
use async_trait::async_trait;
use common_telemetry::{info, logging};
use common_time::util;
use snafu::ResultExt;
use store_api::logstore::LogStore;
use store_api::manifest::{
self, Manifest, ManifestLogStorage, ManifestVersion, MetaActionIterator,
};
use store_api::storage::{
AlterRequest, CloseContext, CompactContext, CompactionStrategy, FlushContext, FlushReason,
OpenOptions, ReadContext, Region, RegionId, SequenceNumber, WriteContext, WriteResponse,
};
use crate::compaction::{
compaction_strategy_to_picker, CompactionPickerRef, CompactionSchedulerRef,
};
use crate::config::EngineConfig;
use crate::error::{self, Error, Result};
use crate::file_purger::FilePurgerRef;
use crate::flush::{FlushSchedulerRef, FlushStrategyRef};
use crate::manifest::action::{
RawRegionMetadata, RegionChange, RegionCheckpoint, RegionMetaAction, RegionMetaActionList,
};
use crate::manifest::region::RegionManifest;
use crate::memtable::{MemtableBuilderRef, MemtableVersion};
use crate::metadata::{RegionMetaImpl, RegionMetadata, RegionMetadataRef};
pub(crate) use crate::region::writer::schedule_compaction;
pub use crate::region::writer::{
AlterContext, RegionWriter, RegionWriterRef, WriterCompactRequest, WriterContext,
};
use crate::region::writer::{DropContext, TruncateContext};
use crate::schema::compat::CompatWrite;
use crate::snapshot::SnapshotImpl;
use crate::sst::{AccessLayerRef, LevelMetas};
use crate::version::{
Version, VersionControl, VersionControlRef, VersionEdit, INIT_COMMITTED_SEQUENCE,
};
use crate::wal::Wal;
use crate::write_batch::WriteBatch;
/// [Region] implementation.
///
/// A handle around a shared `RegionInner`; cloning the handle only clones the `Arc`.
pub struct RegionImpl<S: LogStore> {
inner: Arc<RegionInner<S>>,
}
impl<S: LogStore> Clone for RegionImpl<S> {
fn clone(&self) -> Self {
Self {
inner: self.inner.clone(),
}
}
}
impl<S: LogStore> fmt::Debug for RegionImpl<S> {
    /// Formats the region's id, name and main components.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let inner = &self.inner;
        let mut debug = f.debug_struct("RegionImpl");
        debug.field("id", &inner.shared.id);
        debug.field("name", &inner.shared.name);
        debug.field("wal", &inner.wal);
        debug.field("flush_strategy", &inner.flush_strategy);
        debug.field("compaction_scheduler", &inner.compaction_scheduler);
        debug.field("sst_layer", &inner.sst_layer);
        debug.field("manifest", &inner.manifest);
        debug.finish()
    }
}
#[async_trait]
impl<S: LogStore> Region for RegionImpl<S> {
    type Error = Error;
    type Meta = RegionMetaImpl;
    type WriteRequest = WriteBatch;
    type Snapshot = SnapshotImpl;

    /// Returns the id cached in the shared data.
    fn id(&self) -> RegionId {
        self.inner.shared.id()
    }

    /// Returns the name cached in the shared data.
    fn name(&self) -> &str {
        self.inner.shared.name()
    }

    fn in_memory_metadata(&self) -> RegionMetaImpl {
        let inner = &self.inner;
        inner.in_memory_metadata()
    }

    /// Adapts the schema of `request` to the region, then writes it.
    async fn write(&self, ctx: &WriteContext, mut request: WriteBatch) -> Result<WriteResponse> {
        let inner = &self.inner;
        // Compat the schema of the write batch outside of the write lock.
        inner.compat_write_batch(&mut request)?;
        inner.write(ctx, request).await
    }

    fn snapshot(&self, _ctx: &ReadContext) -> Result<SnapshotImpl> {
        let snapshot = self.inner.create_snapshot();
        Ok(snapshot)
    }

    /// Creates an empty write batch that uses the region's current user schema.
    fn write_request(&self) -> Self::WriteRequest {
        let metadata = self.inner.version_control().metadata();
        let row_key_end = metadata.schema().store_schema().row_key_end();
        WriteBatch::new(metadata.user_schema().clone(), row_key_end)
    }

    async fn alter(&self, request: AlterRequest) -> Result<()> {
        let inner = &self.inner;
        inner.alter(request).await
    }

    /// Decrements the region counter and then drops the region.
    async fn drop_region(&self) -> Result<()> {
        crate::metrics::REGION_COUNT.dec();
        let inner = &self.inner;
        inner.drop_region().await
    }

    /// Sums the file sizes of all SSTs in every level of the current version.
    fn disk_usage_bytes(&self) -> u64 {
        let version = self.inner.version_control().current();
        let mut total = Vec::new();
        for level_ssts in version.ssts().levels().iter() {
            let level_size = level_ssts.files().map(|sst| sst.file_size()).sum::<u64>();
            total.push(level_size);
        }
        total.into_iter().sum()
    }

    async fn flush(&self, ctx: &FlushContext) -> Result<()> {
        let inner = &self.inner;
        inner.flush(ctx).await
    }

    async fn compact(&self, ctx: &CompactContext) -> std::result::Result<(), Self::Error> {
        let inner = &self.inner;
        inner.compact(ctx).await
    }

    async fn truncate(&self) -> Result<()> {
        let inner = &self.inner;
        inner.truncate().await
    }
}
/// Storage related config for region.
///
/// Contains all necessary storage related components needed by the region, such as logstore,
/// manifest, memtable builder.
pub struct StoreConfig<S: LogStore> {
/// Log store used to create the region's `Wal`.
pub log_store: Arc<S>,
pub sst_layer: AccessLayerRef,
pub manifest: RegionManifest,
pub memtable_builder: MemtableBuilderRef,
pub flush_scheduler: FlushSchedulerRef<S>,
pub flush_strategy: FlushStrategyRef,
pub compaction_scheduler: CompactionSchedulerRef<S>,
pub engine_config: Arc<EngineConfig>,
pub file_purger: FilePurgerRef,
/// Passed to the `RegionWriter` when the region is created or opened.
pub ttl: Option<Duration>,
/// Passed to the `RegionWriter` when the region is created or opened.
pub write_buffer_size: usize,
/// Converted into a compaction picker via `compaction_strategy_to_picker`.
pub compaction_strategy: CompactionStrategy,
}
/// A region metadata change recovered from the manifest: the committed sequence of the
/// change, paired with the manifest version and the raw metadata.
pub type RecoveredMetadata = (SequenceNumber, (ManifestVersion, RawRegionMetadata));
/// Recovered metadata changes, keyed (and therefore ordered) by committed sequence.
pub type RecoveredMetadataMap = BTreeMap<SequenceNumber, (ManifestVersion, RawRegionMetadata)>;
impl<S: LogStore> RegionImpl<S> {
/// Creates a new region and persists its metadata to the manifest.
///
/// The caller should avoid calling this method simultaneously.
pub async fn create(
    metadata: RegionMetadata,
    store_config: StoreConfig<S>,
) -> Result<RegionImpl<S>> {
    let metadata = Arc::new(metadata);

    // Persist the region metadata to the manifest first, so that the new region can be
    // recovered from the manifest.
    let manifest_version = {
        let _timer = crate::metrics::CREATE_REGION_UPDATE_MANIFEST.start_timer();
        let change = RegionChange {
            metadata: metadata.as_ref().into(),
            committed_sequence: INIT_COMMITTED_SEQUENCE,
        };
        let actions = RegionMetaActionList::with_action(RegionMetaAction::Change(change));
        store_config.manifest.update(actions).await?
    };

    let schema = metadata.schema().clone();
    let mutable_memtable = store_config.memtable_builder.build(schema);
    let sst_layer = store_config.sst_layer.clone();
    let file_purger = store_config.file_purger.clone();
    let version = Version::with_manifest_version(
        metadata,
        manifest_version,
        mutable_memtable,
        sst_layer,
        file_purger,
    );

    let region = RegionImpl::new(version, store_config);
    crate::metrics::REGION_COUNT.inc();
    Ok(region)
}
/// Assembles a region from `version` and `store_config` without touching the manifest.
fn new(version: Version, store_config: StoreConfig<S>) -> RegionImpl<S> {
    let metadata = version.metadata();
    let id = metadata.id();
    let name = metadata.name().to_string();
    let version_control = VersionControl::with_version(version);
    let wal = Wal::new(id, store_config.log_store);
    let compaction_picker = compaction_strategy_to_picker(&store_config.compaction_strategy);

    let shared = Arc::new(SharedData {
        id,
        name,
        version_control: Arc::new(version_control),
        last_flush_millis: AtomicI64::new(0),
    });
    let writer = Arc::new(RegionWriter::new(
        store_config.memtable_builder,
        store_config.engine_config.clone(),
        store_config.ttl,
        store_config.write_buffer_size,
        store_config.compaction_scheduler.clone(),
        compaction_picker.clone(),
    ));

    let inner = Arc::new(RegionInner {
        shared,
        writer,
        wal,
        flush_strategy: store_config.flush_strategy,
        flush_scheduler: store_config.flush_scheduler,
        compaction_scheduler: store_config.compaction_scheduler,
        compaction_picker,
        sst_layer: store_config.sst_layer,
        manifest: store_config.manifest,
    });
    RegionImpl { inner }
}
/// Open an existing region and recover its data.
///
/// Recovers the version from the manifest, applies the last metadata change at or before
/// the flushed sequence, marks WAL entries up to the flushed sequence as obsolete and
/// replays the remaining metadata changes through the writer.
///
/// Returns `Ok(None)` if no version can be recovered from the manifest.
///
/// The caller should avoid calling this method simultaneously.
pub async fn open(
name: String,
store_config: StoreConfig<S>,
_opts: &OpenOptions,
) -> Result<Option<RegionImpl<S>>> {
// Load version meta data from manifest.
let (version, mut recovered_metadata) = match Self::recover_from_manifest(
&store_config.manifest,
&store_config.memtable_builder,
&store_config.sst_layer,
&store_config.file_purger,
)
.await?
{
(None, _) => return Ok(None),
(Some(v), m) => (v, m),
};
logging::debug!(
"Region recovered version from manifest, version: {:?}",
version
);
let metadata = version.metadata().clone();
let flushed_sequence = version.flushed_sequence();
let version_control = Arc::new(VersionControl::with_version(version));
// Split the recovered changes: `recovered_metadata` keeps the entries with a sequence
// <= `flushed_sequence`, the returned map holds the entries after it.
let recovered_metadata_after_flushed =
recovered_metadata.split_off(&(flushed_sequence + 1));
// apply the last flushed metadata
if let Some((sequence, (manifest_version, metadata))) = recovered_metadata.pop_last() {
let metadata: RegionMetadataRef = Arc::new(
metadata
.try_into()
.context(error::InvalidRawRegionSnafu { region: &name })?,
);
let mutable_memtable = store_config
.memtable_builder
.build(metadata.schema().clone());
version_control.freeze_mutable_and_apply_metadata(
metadata,
manifest_version,
mutable_memtable,
);
logging::debug!(
"Applied the last flushed metadata to region: {}, sequence: {}, manifest: {}",
name,
sequence,
manifest_version,
);
}
// Note: `metadata` here is the one taken from the recovered version above, not the
// one shadowed inside the `if let` block.
let wal = Wal::new(metadata.id(), store_config.log_store);
wal.obsolete(flushed_sequence).await?;
info!(
"Obsolete WAL entries on startup, region: {}, flushed sequence: {}",
metadata.id(),
flushed_sequence
);
let shared = Arc::new(SharedData {
id: metadata.id(),
name,
version_control,
last_flush_millis: AtomicI64::new(0),
});
let compaction_picker = compaction_strategy_to_picker(&store_config.compaction_strategy);
let writer = Arc::new(RegionWriter::new(
store_config.memtable_builder,
store_config.engine_config.clone(),
store_config.ttl,
store_config.write_buffer_size,
store_config.compaction_scheduler.clone(),
compaction_picker.clone(),
));
let writer_ctx = WriterContext {
shared: &shared,
flush_strategy: &store_config.flush_strategy,
flush_scheduler: &store_config.flush_scheduler,
compaction_scheduler: &store_config.compaction_scheduler,
sst_layer: &store_config.sst_layer,
wal: &wal,
writer: &writer,
manifest: &store_config.manifest,
compaction_picker: compaction_picker.clone(),
};
// Replay all unflushed data.
writer
.replay(recovered_metadata_after_flushed, writer_ctx)
.await?;
let inner = Arc::new(RegionInner {
shared,
writer,
wal,
flush_strategy: store_config.flush_strategy,
flush_scheduler: store_config.flush_scheduler,
compaction_scheduler: store_config.compaction_scheduler,
compaction_picker,
sst_layer: store_config.sst_layer,
manifest: store_config.manifest,
});
crate::metrics::REGION_COUNT.inc();
Ok(Some(RegionImpl { inner }))
}
/// Returns the id of this region.
pub fn id(&self) -> RegionId {
    let shared = &self.inner.shared;
    shared.id()
}
/// Returns the timestamp of the last flush, in milliseconds.
pub(crate) fn last_flush_millis(&self) -> i64 {
    let shared = &self.inner.shared;
    shared.last_flush_millis()
}
/// Borrows the [VersionControl] of the region.
pub(crate) fn version_control(&self) -> &VersionControl {
    let inner = &self.inner;
    inner.version_control()
}
/// Builds a [Version] from a manifest checkpoint.
///
/// Returns `Ok(None)` when the checkpoint carries no checkpoint data. Fails with
/// `InvalidRawRegion` if the raw metadata in the checkpoint cannot be converted.
fn create_version_with_checkpoint(
    checkpoint: RegionCheckpoint,
    memtable_builder: &MemtableBuilderRef,
    sst_layer: &AccessLayerRef,
    file_purger: &FilePurgerRef,
) -> Result<Option<Version>> {
    let Some(snapshot) = checkpoint.checkpoint else {
        return Ok(None);
    };

    let region = snapshot.metadata.name.clone();
    let region_metadata: RegionMetadata = snapshot
        .metadata
        .try_into()
        .context(error::InvalidRawRegionSnafu { region })?;
    let memtable = memtable_builder.build(region_metadata.schema().clone());
    let mut version = Version::with_manifest_version(
        Arc::new(region_metadata),
        checkpoint.last_version,
        memtable,
        sst_layer.clone(),
        file_purger.clone(),
    );

    // Restore the flushed state and files when the checkpoint recorded a version.
    if let Some(checkpointed) = snapshot.version {
        version.apply_checkpoint(
            checkpointed.flushed_sequence,
            checkpointed.manifest_version,
            checkpointed.files.into_values(),
        );
    }
    Ok(Some(version))
}
/// Recovers the region version by loading the last checkpoint (if any) and replaying the
/// manifest actions after it.
///
/// Returns the recovered version (`None` if the manifest has no usable `Change` action or
/// the region has been removed) together with the metadata changes that were seen after
/// the version was established, keyed by committed sequence.
async fn recover_from_manifest(
manifest: &RegionManifest,
memtable_builder: &MemtableBuilderRef,
sst_layer: &AccessLayerRef,
file_purger: &FilePurgerRef,
) -> Result<(Option<Version>, RecoveredMetadataMap)> {
let checkpoint = manifest.last_checkpoint().await?;
// With a checkpoint, scanning starts right after the checkpointed version; otherwise
// the whole manifest is scanned.
let (start, end, mut version) = if let Some(checkpoint) = checkpoint {
(
checkpoint.last_version + 1,
manifest::MAX_VERSION,
Self::create_version_with_checkpoint(
checkpoint,
memtable_builder,
sst_layer,
file_purger,
)?,
)
} else {
(manifest::MIN_VERSION, manifest::MAX_VERSION, None)
};
let mut iter = manifest.scan(start, end).await?;
// Actions seen before any version exists; they are replayed once the first `Change`
// action creates the version.
let mut actions = Vec::new();
// NOTE(review): if a checkpoint exists but the scan yields no actions, this stays at
// `MIN_VERSION`, so `update_state` below receives `MIN_VERSION + 1` rather than a value
// derived from the checkpoint — confirm this is intended.
let mut last_manifest_version = manifest::MIN_VERSION;
let mut recovered_metadata = BTreeMap::new();
while let Some((manifest_version, action_list)) = iter.next_action().await? {
last_manifest_version = manifest_version;
for action in action_list.actions {
// `version` is moved into the match; every non-returning arm assigns it again.
match (action, version) {
(RegionMetaAction::Change(c), None) => {
let region = c.metadata.name.clone();
let region_metadata: RegionMetadata = c
.metadata
.try_into()
.context(error::InvalidRawRegionSnafu { region })?;
// Use current schema to build a memtable. This might be replaced later
// in `freeze_mutable_and_apply_metadata()`.
let memtable = memtable_builder.build(region_metadata.schema().clone());
version = Some(Version::with_manifest_version(
Arc::new(region_metadata),
last_manifest_version,
memtable,
sst_layer.clone(),
file_purger.clone(),
));
for (manifest_version, action) in actions.drain(..) {
version = Self::replay_edit(manifest_version, action, version);
}
}
(RegionMetaAction::Change(c), Some(v)) => {
// A later metadata change: remember it for the caller instead of applying it.
let _ = recovered_metadata
.insert(c.committed_sequence, (manifest_version, c.metadata));
version = Some(v);
}
(RegionMetaAction::Remove(r), Some(v)) => {
// The region has been removed: stop the manifest, mark all SSTs as deleted,
// delete the manifest files and report that there is no version.
manifest.stop().await?;
let files = v.ssts().mark_all_files_deleted();
logging::info!(
"Try to remove all SSTs, region: {}, files: {:?}",
r.region_id,
files
);
manifest
.manifest_store()
.delete_all(v.manifest_version())
.await?;
return Ok((None, recovered_metadata));
}
(RegionMetaAction::Truncate(t), Some(mut v)) => {
// Truncate: mark all SSTs as deleted and reset the version with a fresh
// memtable and empty SST levels.
let files = v.ssts().mark_all_files_deleted();
logging::info!(
"Try to remove all SSTs on truncate, region: {}, files: {:?}",
t.region_id,
files
);
let region_metadata = v.metadata().clone();
let memtables = Arc::new(MemtableVersion::new(
memtable_builder.build(region_metadata.schema().clone()),
));
let ssts =
Arc::new(LevelMetas::new(sst_layer.clone(), file_purger.clone()));
v.reset(
v.manifest_version() + 1,
memtables,
ssts,
t.committed_sequence,
);
version = Some(v);
}
(action, None) => {
// No version yet: buffer the action until a `Change` action shows up.
actions.push((manifest_version, action));
version = None;
}
(action, Some(v)) => {
version = Self::replay_edit(manifest_version, action, Some(v));
}
}
}
}
// Buffered actions can only remain if no version was ever created.
assert!(actions.is_empty() || version.is_none());
if let Some(version) = &version {
// update manifest state after recovering
let protocol = iter.last_protocol();
manifest.update_state(last_manifest_version + 1, protocol.clone());
manifest.set_flushed_manifest_version(version.manifest_version());
}
Ok((version, recovered_metadata))
}
/// Applies `action` to `version` if it is an `Edit` action; any other action leaves
/// `version` untouched.
fn replay_edit(
    manifest_version: ManifestVersion,
    action: RegionMetaAction,
    version: Option<Version>,
) -> Option<Version> {
    let RegionMetaAction::Edit(e) = action else {
        return version;
    };
    let edit = VersionEdit {
        files_to_add: e.files_to_add,
        files_to_remove: e.files_to_remove,
        flushed_sequence: e.flushed_sequence,
        manifest_version,
        max_memtable_id: None,
        compaction_time_window: e.compaction_time_window,
    };
    // Without a version there is nothing to apply the edit to.
    let mut updated = version?;
    updated.apply_edit(edit);
    Some(updated)
}
/// Triggers a manual compaction of the region.
pub async fn compact(&self, ctx: &CompactContext) -> Result<()> {
    let inner = &self.inner;
    inner.compact(ctx).await
}
/// Decrements the region counter and then closes the region.
pub async fn close(&self, ctx: &CloseContext) -> Result<()> {
    crate::metrics::REGION_COUNT.dec();
    let inner = &self.inner;
    inner.close(ctx).await
}
}
// Private methods for tests.
#[cfg(test)]
impl<S: LogStore> RegionImpl<S> {
    /// Returns the committed sequence of the region's version control.
    #[inline]
    fn committed_sequence(&self) -> store_api::storage::SequenceNumber {
        let version_control = self.inner.version_control();
        version_control.committed_sequence()
    }

    /// Returns the manifest version of the region's current version.
    fn current_manifest_version(&self) -> ManifestVersion {
        let version_control = self.inner.version_control();
        version_control.current_manifest_version()
    }

    /// Writes `request` through the inner region directly, without the schema
    /// compatibility step performed by `Region::write`.
    async fn write_inner(&self, ctx: &WriteContext, request: WriteBatch) -> Result<WriteResponse> {
        let inner = &self.inner;
        inner.write(ctx, request).await
    }

    /// Replays `recovered_metadata` through the inner writer.
    async fn replay_inner(&self, recovered_metadata: RecoveredMetadataMap) -> Result<()> {
        let inner = &self.inner;
        let compaction_picker = inner.compaction_picker.clone();
        let writer_ctx = WriterContext {
            shared: &inner.shared,
            flush_strategy: &inner.flush_strategy,
            flush_scheduler: &inner.flush_scheduler,
            compaction_scheduler: &inner.compaction_scheduler,
            sst_layer: &inner.sst_layer,
            wal: &inner.wal,
            writer: &inner.writer,
            manifest: &inner.manifest,
            compaction_picker,
        };
        inner.writer.replay(recovered_metadata, writer_ctx).await
    }

    /// Returns the write buffer size reported by the writer.
    pub(crate) async fn write_buffer_size(&self) -> usize {
        let writer = &self.inner.writer;
        writer.write_buffer_size().await
    }
}
/// Shared data of region.
///
/// Holds the region's identity, its version control handle and the timestamp
/// of the last flush (kept in an atomic so it can be updated through `&self`).
#[derive(Debug)]
pub struct SharedData {
// Region id and name are immutable, so we cache them in shared data to avoid loading
// the current version from `version_control` each time we need to access them.
id: RegionId,
name: String,
// TODO(yingwen): Maybe no need to use Arc for version control.
pub version_control: VersionControlRef,
/// Last flush time in millis.
last_flush_millis: AtomicI64,
}
impl SharedData {
    /// Returns the cached region id.
    #[inline]
    pub fn id(&self) -> RegionId {
        self.id
    }

    /// Returns the cached region name.
    #[inline]
    pub fn name(&self) -> &str {
        self.name.as_str()
    }

    /// Records the current time (in millis) as the last flush time.
    pub(crate) fn update_flush_millis(&self) {
        self.last_flush_millis
            .store(util::current_time_millis(), Ordering::Relaxed);
    }

    /// Returns the last recorded flush timestamp in millis.
    fn last_flush_millis(&self) -> i64 {
        let last_flush = &self.last_flush_millis;
        last_flush.load(Ordering::Relaxed)
    }
}

/// Reference-counted handle to [`SharedData`].
pub type SharedDataRef = Arc<SharedData>;
/// Internal state of a region: the shared data plus every component
/// (writer, WAL, flush/compaction machinery, SST access layer, manifest)
/// that the region operations below hand to the writer.
struct RegionInner<S: LogStore> {
// Shared region data (id, name, version control).
shared: SharedDataRef,
// Writer that all write/alter/flush/compact/truncate requests are forwarded to.
writer: RegionWriterRef<S>,
wal: Wal<S>,
flush_strategy: FlushStrategyRef,
flush_scheduler: FlushSchedulerRef<S>,
compaction_scheduler: CompactionSchedulerRef<S>,
compaction_picker: CompactionPickerRef<S>,
sst_layer: AccessLayerRef,
manifest: RegionManifest,
}
impl<S: LogStore> RegionInner<S> {
    /// Returns the version control stored in the shared region data.
    #[inline]
    fn version_control(&self) -> &VersionControl {
        let shared = &self.shared;
        &shared.version_control
    }

    /// Builds a `RegionMetaImpl` from the metadata currently held by version control.
    fn in_memory_metadata(&self) -> RegionMetaImpl {
        RegionMetaImpl::new(self.version_control().metadata())
    }

    /// Creates a snapshot from the current version and committed sequence.
    fn create_snapshot(&self) -> SnapshotImpl {
        let version_control = self.version_control();
        let current_version = version_control.current();
        let committed = version_control.committed_sequence();
        SnapshotImpl::new(current_version, committed, self.sst_layer.clone())
    }

    /// Makes the schema of `request` compatible with the region's user schema.
    fn compat_write_batch(&self, request: &mut WriteBatch) -> Result<()> {
        let region_metadata = self.version_control().metadata();
        // Try to make request schema compatible with region's outside of write lock. Note that
        // schema might be altered after this step.
        request.compat_write(region_metadata.schema().user_schema())
    }

    /// Write to writer directly.
    async fn write(&self, ctx: &WriteContext, request: WriteBatch) -> Result<WriteResponse> {
        let context = WriterContext {
            shared: &self.shared,
            flush_strategy: &self.flush_strategy,
            flush_scheduler: &self.flush_scheduler,
            compaction_scheduler: &self.compaction_scheduler,
            sst_layer: &self.sst_layer,
            wal: &self.wal,
            writer: &self.writer,
            manifest: &self.manifest,
            compaction_picker: self.compaction_picker.clone(),
        };
        // The writer would also try to compat the schema of write batch if it finds out the
        // schema version of request is less than current schema version.
        let writer = &self.writer;
        writer.write(ctx, request, context).await
    }

    /// Logs the alter request and forwards it to the writer.
    async fn alter(&self, request: AlterRequest) -> Result<()> {
        logging::info!(
            "Alter region {}, name: {}, request: {:?}",
            self.shared.id,
            self.shared.name,
            request
        );
        let context = AlterContext {
            shared: &self.shared,
            wal: &self.wal,
            manifest: &self.manifest,
        };
        let writer = &self.writer;
        writer.alter(context, request).await
    }

    /// Closes the writer, optionally performs a forced flush that is waited for,
    /// and finally stops the manifest.
    async fn close(&self, ctx: &CloseContext) -> Result<()> {
        self.writer.close().await?;
        if ctx.flush {
            let flush_ctx = FlushContext {
                wait: true,
                reason: FlushReason::Manually,
                force: true,
            };
            self.flush(&flush_ctx).await?;
        }
        let manifest = &self.manifest;
        manifest.stop().await
    }

    /// Logs the drop, stops the manifest and lets the writer handle the drop.
    async fn drop_region(&self) -> Result<()> {
        logging::info!("Drop region {}, name: {}", self.shared.id, self.shared.name);
        let context = DropContext {
            shared: &self.shared,
            wal: &self.wal,
            manifest: &self.manifest,
            flush_scheduler: &self.flush_scheduler,
            compaction_scheduler: &self.compaction_scheduler,
            sst_layer: &self.sst_layer,
        };
        // The manifest is stopped before the writer processes the drop.
        self.manifest.stop().await?;
        let writer = &self.writer;
        writer.on_drop(context).await
    }

    /// Forwards a flush request to the writer.
    async fn flush(&self, ctx: &FlushContext) -> Result<()> {
        let context = WriterContext {
            shared: &self.shared,
            flush_strategy: &self.flush_strategy,
            flush_scheduler: &self.flush_scheduler,
            compaction_scheduler: &self.compaction_scheduler,
            sst_layer: &self.sst_layer,
            wal: &self.wal,
            writer: &self.writer,
            manifest: &self.manifest,
            compaction_picker: self.compaction_picker.clone(),
        };
        let writer = &self.writer;
        writer.flush(context, ctx).await
    }

    /// Compact the region manually.
    async fn compact(&self, compact_ctx: &CompactContext) -> Result<()> {
        let request = WriterCompactRequest {
            shared_data: self.shared.clone(),
            sst_layer: self.sst_layer.clone(),
            manifest: self.manifest.clone(),
            wal: self.wal.clone(),
            region_writer: self.writer.clone(),
            compact_ctx: *compact_ctx,
        };
        self.writer.compact(request).await
    }

    /// Logs the truncation and forwards it to the writer.
    async fn truncate(&self) -> Result<()> {
        logging::info!(
            "Truncate region {}, name: {}",
            self.shared.id,
            self.shared.name
        );
        let context = TruncateContext {
            shared: &self.shared,
            wal: &self.wal,
            manifest: &self.manifest,
            sst_layer: &self.sst_layer,
        };
        self.writer.truncate(&context).await?;
        Ok(())
    }
}

View File

@@ -1,833 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Region tests.
use std::collections::{HashMap, HashSet};
use arrow::compute::SortOptions;
use common_base::readable_size::ReadableSize;
use common_datasource::compression::CompressionType;
use common_recordbatch::OrderOption;
use common_telemetry::logging;
use common_test_util::temp_dir::{create_temp_dir, TempDir};
use datatypes::prelude::{LogicalTypeId, ScalarVector, WrapperType};
use datatypes::timestamp::TimestampMillisecond;
use datatypes::vectors::{
BooleanVector, Int64Vector, StringVector, TimestampMillisecondVector, VectorRef,
};
use log_store::raft_engine::log_store::RaftEngineLogStore;
use log_store::NoopLogStore;
use object_store::services::Fs;
use object_store::ObjectStore;
use store_api::manifest::{Manifest, MAX_VERSION};
use store_api::storage::{
Chunk, ChunkReader, FlushContext, FlushReason, ReadContext, Region, RegionMeta, ScanRequest,
SequenceNumber, Snapshot, WriteContext, WriteRequest,
};
use super::*;
use crate::chunk::ChunkReaderImpl;
use crate::compaction::noop::NoopCompactionScheduler;
use crate::engine;
use crate::engine::RegionMap;
use crate::file_purger::noop::NoopFilePurgeHandler;
use crate::flush::{FlushScheduler, PickerConfig, SizeBasedStrategy};
use crate::manifest::action::{RegionChange, RegionMetaActionList};
use crate::manifest::manifest_compress_type;
use crate::manifest::region::RegionManifest;
use crate::manifest::test_utils::*;
use crate::memtable::DefaultMemtableBuilder;
use crate::metadata::RegionMetadata;
use crate::region::{RegionImpl, StoreConfig};
use crate::scheduler::{LocalScheduler, SchedulerConfig};
use crate::sst::{FileId, FsAccessLayer};
use crate::test_util::descriptor_util::RegionDescBuilder;
use crate::test_util::{self, config_util, schema_util, write_batch_util};
mod alter;
mod basic;
mod close;
mod compact;
mod drop;
mod flush;
mod projection;
mod truncate;
/// Builds region metadata with id 123 and a single nullable string field `v0`;
/// together with the builder's timestamp column the schema is (timestamp, v0).
pub fn new_metadata(region_name: &str) -> RegionMetadata {
    let descriptor = RegionDescBuilder::new(region_name)
        .id(123)
        .push_field_column(("v0", LogicalTypeId::String, true))
        .build();
    let metadata: RegionMetadata = descriptor.try_into().unwrap();
    metadata
}
/// Test region with schema (timestamp, v0).
///
/// Bundles the region under test with the write and read contexts used by
/// every helper on this type.
pub struct TesterBase<S: LogStore> {
pub region: RegionImpl<S>,
pub write_ctx: WriteContext,
pub read_ctx: ReadContext,
}
impl<S: LogStore> TesterBase<S> {
pub fn with_region(region: RegionImpl<S>) -> TesterBase<S> {
TesterBase {
region,
write_ctx: WriteContext::default(),
read_ctx: ReadContext::default(),
}
}
pub async fn checkpoint_manifest(&self) {
let manifest = &self.region.inner.manifest;
manifest.set_flushed_manifest_version(manifest.last_version() - 1);
let _ = manifest.do_checkpoint().await.unwrap().unwrap();
}
pub async fn close(&self) {
self.region.inner.flush_scheduler.stop().await.unwrap();
self.region
.inner
.compaction_scheduler
.stop(true)
.await
.unwrap();
self.region.close(&CloseContext::default()).await.unwrap();
self.region.inner.wal.close().await.unwrap();
}
/// Put without version specified.
///
/// Format of data: (timestamp, v0), timestamp is key, v0 is value.
pub async fn put(&self, data: &[(i64, Option<String>)]) -> WriteResponse {
self.try_put(data).await.unwrap()
}
/// Put without version specified, returns [`Result<WriteResponse>`]
///
/// Format of data: (timestamp, v0), timestamp is key, v0 is value.
pub async fn try_put(&self, data: &[(i64, Option<String>)]) -> Result<WriteResponse> {
let data: Vec<(TimestampMillisecond, Option<String>)> =
data.iter().map(|(l, r)| ((*l).into(), r.clone())).collect();
// Build a batch without version.
let mut batch = new_write_batch_for_test(false);
let put_data = new_put_data(&data);
batch.put(put_data).unwrap();
self.region.write(&self.write_ctx, batch).await
}
/// Put without version specified directly to inner writer.
pub async fn put_inner(&self, data: &[(i64, Option<String>)]) -> WriteResponse {
let data: Vec<(TimestampMillisecond, Option<String>)> =
data.iter().map(|(l, r)| ((*l).into(), r.clone())).collect();
let mut batch = new_write_batch_for_test(false);
let put_data = new_put_data(&data);
batch.put(put_data).unwrap();
self.region
.write_inner(&self.write_ctx, batch)
.await
.unwrap()
}
pub async fn replay_inner(&self, recovered_metadata: RecoveredMetadataMap) {
self.region.replay_inner(recovered_metadata).await.unwrap()
}
/// Scan all data.
pub async fn full_scan(&self) -> Vec<(i64, Option<String>)> {
logging::info!("Full scan with ctx {:?}", self.read_ctx);
let snapshot = self.region.snapshot(&self.read_ctx).unwrap();
let resp = snapshot
.scan(&self.read_ctx, ScanRequest::default())
.await
.unwrap();
let mut reader = resp.reader;
let metadata = self.region.in_memory_metadata();
assert_eq!(metadata.schema(), reader.user_schema());
let mut dst = Vec::new();
while let Some(chunk) = reader.next_chunk().await.unwrap() {
let chunk = reader.project_chunk(chunk);
append_chunk_to(&chunk, &mut dst);
}
dst
}
pub async fn scan(&self, req: ScanRequest) -> Vec<(i64, Option<String>)> {
logging::info!("Full scan with ctx {:?}", self.read_ctx);
let snapshot = self.region.snapshot(&self.read_ctx).unwrap();
let resp = snapshot.scan(&self.read_ctx, req).await.unwrap();
let mut reader = resp.reader;
let metadata = self.region.in_memory_metadata();
assert_eq!(metadata.schema(), reader.user_schema());
let mut dst = Vec::new();
while let Some(chunk) = reader.next_chunk().await.unwrap() {
let chunk = reader.project_chunk(chunk);
append_chunk_to(&chunk, &mut dst);
}
dst
}
pub fn committed_sequence(&self) -> SequenceNumber {
self.region.committed_sequence()
}
/// Delete by keys (timestamp).
pub async fn delete(&self, keys: &[i64]) -> WriteResponse {
let keys: Vec<TimestampMillisecond> = keys.iter().map(|v| (*v).into()).collect();
// Build a batch without version.
let mut batch = new_write_batch_for_test(false);
let keys = new_delete_data(&keys);
batch.delete(keys).unwrap();
self.region.write(&self.write_ctx, batch).await.unwrap()
}
/// Returns a reader to scan all data.
pub async fn full_scan_reader(&self) -> ChunkReaderImpl {
let snapshot = self.region.snapshot(&self.read_ctx).unwrap();
let resp = snapshot
.scan(&self.read_ctx, ScanRequest::default())
.await
.unwrap();
resp.reader
}
/// Collect data from the reader.
pub async fn collect_reader(&self, mut reader: ChunkReaderImpl) -> Vec<(i64, Option<String>)> {
let mut dst = Vec::new();
while let Some(chunk) = reader.next_chunk().await.unwrap() {
let chunk = reader.project_chunk(chunk);
append_chunk_to(&chunk, &mut dst);
}
dst
}
}
pub type FileTesterBase = TesterBase<RaftEngineLogStore>;
/// Creates an empty write batch over the columns (timestamp, v0).
///
/// The two variants differ only in the last argument handed to
/// `write_batch_util::new_write_batch`: 2 when `enable_version_column` is set,
/// 1 otherwise.
fn new_write_batch_for_test(enable_version_column: bool) -> WriteBatch {
    let column_defs = [
        (
            test_util::TIMESTAMP_NAME,
            LogicalTypeId::TimestampMillisecond,
            false,
        ),
        ("v0", LogicalTypeId::String, true),
    ];
    // NOTE(review): presumably the end index of the row key columns — confirm
    // against `write_batch_util::new_write_batch`.
    let last_arg = if enable_version_column { 2 } else { 1 };
    write_batch_util::new_write_batch(&column_defs, Some(0), last_arg)
}
/// Converts (timestamp, v0) rows into the column map expected by a put:
/// one timestamp vector under `test_util::TIMESTAMP_NAME` and one string
/// vector under `"v0"`.
fn new_put_data(data: &[(TimestampMillisecond, Option<String>)]) -> HashMap<String, VectorRef> {
    let ts_column =
        TimestampMillisecondVector::from_vec(data.iter().map(|row| row.0.into()).collect());
    let v0_column = StringVector::from(data.iter().map(|row| row.1.clone()).collect::<Vec<_>>());

    let mut columns = HashMap::with_capacity(2);
    columns.insert(
        test_util::TIMESTAMP_NAME.to_string(),
        Arc::new(ts_column) as VectorRef,
    );
    columns.insert("v0".to_string(), Arc::new(v0_column) as VectorRef);
    columns
}
/// Converts timestamp keys into the single-column map expected by a delete,
/// keyed by `test_util::TIMESTAMP_NAME`.
fn new_delete_data(keys: &[TimestampMillisecond]) -> HashMap<String, VectorRef> {
    let ts_column =
        TimestampMillisecondVector::from_vec(keys.iter().map(|key| key.0.into()).collect());

    let mut columns = HashMap::with_capacity(1);
    columns.insert(
        test_util::TIMESTAMP_NAME.to_string(),
        Arc::new(ts_column) as VectorRef,
    );
    columns
}
/// Appends every row of a two-column chunk (timestamp, v0) to `dst`.
///
/// Panics when the chunk does not have exactly two columns, when a column has
/// an unexpected vector type, or when a timestamp is null.
fn append_chunk_to(chunk: &Chunk, dst: &mut Vec<(i64, Option<String>)>) {
    assert_eq!(2, chunk.columns.len());
    let ts_column = chunk.columns[0]
        .as_any()
        .downcast_ref::<TimestampMillisecondVector>()
        .unwrap();
    let v0_column = chunk.columns[1]
        .as_any()
        .downcast_ref::<StringVector>()
        .unwrap();

    let rows = ts_column
        .iter_data()
        .zip(v0_column.iter_data())
        .map(|(ts, v0)| (ts.unwrap().into_native(), v0.map(|s| s.to_string())));
    dst.extend(rows);
}
/// A freshly constructed region reports the name it was built with and a
/// schema of (k1, timestamp, v0).
#[tokio::test]
async fn test_new_region() {
    const REGION_NAME: &str = "region-0";

    let descriptor = RegionDescBuilder::new(REGION_NAME)
        .push_key_column(("k1", LogicalTypeId::Int32, false))
        .push_field_column(("v0", LogicalTypeId::Float32, true))
        .build();
    let region_metadata: RegionMetadata = descriptor.try_into().unwrap();

    // Keep the temp dir alive until the end of the test.
    let temp_dir = create_temp_dir("test_new_region");
    let store_config = config_util::new_store_config(
        REGION_NAME,
        temp_dir.path().to_str().unwrap(),
        EngineConfig::default(),
    )
    .await;

    let memtable = store_config
        .memtable_builder
        .build(region_metadata.schema().clone());
    let initial_version = Version::new(Arc::new(region_metadata), memtable);
    let region = RegionImpl::new(initial_version, store_config);

    let expected_columns = [
        ("k1", LogicalTypeId::Int32, false),
        (
            test_util::TIMESTAMP_NAME,
            LogicalTypeId::TimestampMillisecond,
            false,
        ),
        ("v0", LogicalTypeId::Float32, true),
    ];
    let expect_schema = schema_util::new_schema_ref(&expected_columns, Some(1));

    assert_eq!(REGION_NAME, region.name());
    assert_eq!(expect_schema, *region.in_memory_metadata().schema());
}
/// Runs the manifest recovery scenario with manifest compression enabled.
#[tokio::test]
async fn test_recover_region_manifets_compress() {
    let compress = true;
    test_recover_region_manifets(compress).await;
}
/// Runs the manifest recovery scenario with manifest compression disabled.
#[tokio::test]
async fn test_recover_region_manifets_uncompress() {
    let compress = false;
    test_recover_region_manifets(compress).await;
}
/// Shared body of the manifest recovery tests.
///
/// Recovers from an empty manifest, then writes a change, two edits and a
/// second change, and checks the recovered version both before and after a
/// manifest checkpoint. `compress` selects the manifest compression type via
/// `manifest_compress_type`.
async fn test_recover_region_manifets(compress: bool) {
common_telemetry::init_default_ut_logging();
let tmp_dir = create_temp_dir("test_recover_region_manifets");
let memtable_builder = Arc::new(DefaultMemtableBuilder::default()) as _;
let mut builder = Fs::default();
let _ = builder.root(&tmp_dir.path().to_string_lossy());
let object_store = ObjectStore::new(builder).unwrap().finish();
let manifest = RegionManifest::with_checkpointer(
"/manifest/",
object_store.clone(),
manifest_compress_type(compress),
None,
None,
);
let region_meta = Arc::new(build_region_meta());
let sst_layer = Arc::new(FsAccessLayer::new("sst", object_store)) as _;
let file_purger = Arc::new(LocalScheduler::new(
SchedulerConfig::default(),
NoopFilePurgeHandler,
));
// Recover from empty: no version is expected.
assert!(RegionImpl::<NoopLogStore>::recover_from_manifest(
&manifest,
&memtable_builder,
&sst_layer,
&file_purger,
)
.await
.unwrap()
.0
.is_none());
let file_id_a = FileId::random();
let file_id_b = FileId::random();
let file_id_c = FileId::random();
{
// Save some actions into the manifest: a change, two edits in one update, then another change.
assert!(manifest
.update(RegionMetaActionList::with_action(RegionMetaAction::Change(
RegionChange {
metadata: region_meta.as_ref().into(),
committed_sequence: 40,
},
)))
.await
.is_ok());
assert!(manifest
.update(RegionMetaActionList::new(vec![
RegionMetaAction::Edit(build_region_edit(1, &[file_id_a], &[])),
RegionMetaAction::Edit(build_region_edit(2, &[file_id_b, file_id_c], &[])),
]))
.await
.is_ok());
assert!(manifest
.update(RegionMetaActionList::with_action(RegionMetaAction::Change(
RegionChange {
metadata: region_meta.as_ref().into(),
committed_sequence: 42,
},
)))
.await
.is_ok());
}
// Try to recover from the plain action log.
let (version, recovered_metadata) = RegionImpl::<NoopLogStore>::recover_from_manifest(
&manifest,
&memtable_builder,
&sst_layer,
&file_purger,
)
.await
.unwrap();
assert_recovered_manifest(
version,
recovered_metadata,
&file_id_a,
&file_id_b,
&file_id_c,
&region_meta,
);
// Do a manifest checkpoint and verify it is the one reported as the last checkpoint.
let checkpoint = manifest.do_checkpoint().await.unwrap().unwrap();
assert_eq!(1, checkpoint.last_version);
assert_eq!(2, checkpoint.compacted_actions);
assert_eq!(
manifest.last_checkpoint().await.unwrap().unwrap(),
checkpoint
);
// Recover from checkpoint: the recovered state must match the pre-checkpoint recovery.
let (version, recovered_metadata) = RegionImpl::<NoopLogStore>::recover_from_manifest(
&manifest,
&memtable_builder,
&sst_layer,
&file_purger,
)
.await
.unwrap();
assert_recovered_manifest(
version,
recovered_metadata,
&file_id_a,
&file_id_b,
&file_id_c,
&region_meta,
);
// Check manifest state: only the trailing change action (version 2) is left to scan.
assert_eq!(3, manifest.last_version());
let mut iter = manifest.scan(0, MAX_VERSION).await.unwrap();
let (version, action) = iter.next_action().await.unwrap().unwrap();
assert_eq!(2, version);
assert!(matches!(action.actions[0], RegionMetaAction::Change(..)));
assert!(iter.next_action().await.unwrap().is_none());
}
/// Checks the outcome of a manifest recovery: the first recovered metadata key
/// is 42, the version carries `region_meta`, flushed sequence 2 and manifest
/// version 1, and level 0 holds exactly the three given SST files.
fn assert_recovered_manifest(
    version: Option<Version>,
    recovered_metadata: RecoveredMetadataMap,
    file_id_a: &FileId,
    file_id_b: &FileId,
    file_id_c: &FileId,
    region_meta: &Arc<RegionMetadata>,
) {
    let (first_key, _) = recovered_metadata.first_key_value().unwrap();
    assert_eq!(42, *first_key);

    let recovered = version.unwrap();
    assert_eq!(*recovered.metadata(), *region_meta);
    assert_eq!(recovered.flushed_sequence(), 2);
    assert_eq!(recovered.manifest_version(), 1);

    let level_metas = recovered.ssts();
    let actual_files = level_metas.levels()[0]
        .files()
        .map(|file| file.file_name())
        .collect::<HashSet<_>>();
    assert_eq!(3, actual_files.len());

    let expected_files = HashSet::from([
        file_id_a.as_parquet(),
        file_id_b.as_parquet(),
        file_id_c.as_parquet(),
    ]);
    assert_eq!(expected_files, actual_files);
}
/// Builds region metadata with three nullable field columns:
/// `v0` (int64), `v1` (string) and `v2` (boolean).
fn create_region_meta(region_name: &str) -> RegionMetadata {
    let descriptor = RegionDescBuilder::new(region_name)
        .push_field_column(("v0", LogicalTypeId::Int64, true))
        .push_field_column(("v1", LogicalTypeId::String, true))
        .push_field_column(("v2", LogicalTypeId::Boolean, true))
        .build();
    let metadata: RegionMetadata = descriptor.try_into().unwrap();
    metadata
}
/// Builds a `StoreConfig` rooted at `root` on the local file system, using a
/// no-op log store, no-op compaction scheduler and no-op file purge handler.
/// The manifest is started before the config is returned.
async fn create_store_config(region_name: &str, root: &str) -> StoreConfig<NoopLogStore> {
    let mut fs_builder = Fs::default();
    let _ = fs_builder.root(root);
    let store = ObjectStore::new(fs_builder).unwrap().finish();

    // SST and manifest directories are derived from an empty parent directory.
    let parent_dir = "";
    let sst_dir = engine::region_sst_dir(parent_dir, region_name);
    let manifest_dir = engine::region_manifest_dir(parent_dir, region_name);

    let sst_layer = Arc::new(FsAccessLayer::new(&sst_dir, store.clone()));
    let manifest = RegionManifest::with_checkpointer(
        &manifest_dir,
        store,
        CompressionType::Uncompressed,
        None,
        None,
    );
    manifest.start().await.unwrap();

    let compaction_scheduler = Arc::new(NoopCompactionScheduler::default());
    let region_map = Arc::new(RegionMap::new());
    let scheduler = FlushScheduler::new(
        SchedulerConfig::default(),
        compaction_scheduler.clone(),
        region_map,
        PickerConfig::default(),
    )
    .unwrap();
    let flush_scheduler = Arc::new(scheduler);

    let log_store = Arc::new(NoopLogStore);
    let purge_scheduler = LocalScheduler::new(SchedulerConfig::default(), NoopFilePurgeHandler);
    let file_purger = Arc::new(purge_scheduler);

    StoreConfig {
        log_store,
        sst_layer,
        manifest,
        memtable_builder: Arc::new(DefaultMemtableBuilder::default()),
        flush_scheduler,
        flush_strategy: Arc::new(SizeBasedStrategy::default()),
        compaction_scheduler,
        engine_config: Default::default(),
        file_purger,
        ttl: None,
        write_buffer_size: ReadableSize::mb(32).0 as usize,
        compaction_strategy: Default::default(),
    }
}
/// Test fixture that writes batches of (timestamp, v0, v1, v2) rows into a
/// region and compares an ordered scan against the expected rows.
struct WindowedReaderTester {
// Batches to write; the region is flushed after each batch.
data_written: Vec<Vec<(i64, i64, String, bool)>>,
// Rows the scan is expected to return, in order.
expected: Vec<(i64, i64, String, bool)>,
region: RegionImpl<NoopLogStore>,
// Held only to keep the temporary directory alive for the tester's lifetime.
_temp_dir: TempDir,
}
impl WindowedReaderTester {
    /// Creates the region in a fresh temp dir and writes `data_written` into it.
    async fn new(
        region_name: &'static str,
        data_written: Vec<Vec<(i64, i64, String, bool)>>,
        expected: Vec<(i64, i64, String, bool)>,
    ) -> Self {
        let temp_dir = create_temp_dir(&format!("write_and_read_windowed_{}", region_name));
        let store_config =
            create_store_config(region_name, temp_dir.path().to_str().unwrap()).await;
        let region = RegionImpl::create(create_region_meta(region_name), store_config)
            .await
            .unwrap();

        let tester = Self {
            data_written,
            expected,
            region,
            _temp_dir: temp_dir,
        };
        tester.prepare().await;
        tester
    }

    /// Writes every batch of `data_written`, flushing after each one.
    async fn prepare(&self) {
        for rows in &self.data_written {
            let mut write_batch = self.region.write_request();

            let ts = TimestampMillisecondVector::from_iterator(
                rows.iter()
                    .map(|(v, _, _, _)| TimestampMillisecond::new(*v)),
            );
            let v0 = Int64Vector::from_iterator(rows.iter().map(|(_, v, _, _)| *v));
            let v1 = StringVector::from_iterator(rows.iter().map(|(_, _, v, _)| v.as_str()));
            let v2 = BooleanVector::from_iterator(rows.iter().map(|(_, _, _, v)| *v));

            let columns: HashMap<String, VectorRef> = HashMap::from([
                ("timestamp".to_string(), Arc::new(ts) as VectorRef),
                ("v0".to_string(), Arc::new(v0) as VectorRef),
                ("v1".to_string(), Arc::new(v1) as VectorRef),
                ("v2".to_string(), Arc::new(v2) as VectorRef),
            ]);
            write_batch.put(columns).unwrap();

            assert!(self
                .region
                .write(&WriteContext {}, write_batch)
                .await
                .is_ok());

            // flush the region to ensure data resides across SST files.
            let flush_ctx = FlushContext {
                wait: true,
                reason: FlushReason::Others,
                ..Default::default()
            };
            self.region.flush(&flush_ctx).await.unwrap();
        }
    }

    /// Scans the region with the given output ordering and checks each column
    /// against `expected`.
    async fn check(&self, order_options: Vec<OrderOption>) {
        let read_context = ReadContext::default();
        let snapshot = self.region.snapshot(&read_context).unwrap();
        let request = ScanRequest {
            sequence: None,
            projection: None,
            filters: vec![],
            limit: None,
            output_ordering: Some(order_options),
        };
        let response = snapshot.scan(&read_context, request).await.unwrap();

        let row_count = self.expected.len();
        let mut actual_ts = Vec::with_capacity(row_count);
        let mut actual_v0 = Vec::with_capacity(row_count);
        let mut actual_v1 = Vec::with_capacity(row_count);
        let mut actual_v2 = Vec::with_capacity(row_count);

        let mut reader = response.reader;
        let ts_index = reader.user_schema().timestamp_index().unwrap();
        while let Some(chunk) = reader.next_chunk().await.unwrap() {
            // The timestamp column is located through the schema; the value
            // columns are read at fixed positions 1..=3.
            let ts_col = chunk.columns[ts_index]
                .as_any()
                .downcast_ref::<TimestampMillisecondVector>()
                .unwrap();
            let v0_col = chunk.columns[1]
                .as_any()
                .downcast_ref::<Int64Vector>()
                .unwrap();
            let v1_col = chunk.columns[2]
                .as_any()
                .downcast_ref::<StringVector>()
                .unwrap();
            let v2_col = chunk.columns[3]
                .as_any()
                .downcast_ref::<BooleanVector>()
                .unwrap();

            for ts in ts_col.iter_data() {
                actual_ts.push(ts.unwrap().0.value());
            }
            for value in v0_col.iter_data() {
                actual_v0.push(value.unwrap());
            }
            for value in v1_col.iter_data() {
                actual_v1.push(value.unwrap().to_string());
            }
            for value in v2_col.iter_data() {
                actual_v2.push(value.unwrap());
            }
        }

        let expected_ts = self
            .expected
            .iter()
            .map(|(v, _, _, _)| *v)
            .collect::<Vec<_>>();
        assert_eq!(actual_ts, expected_ts);

        let expected_v0 = self
            .expected
            .iter()
            .map(|(_, v, _, _)| *v)
            .collect::<Vec<_>>();
        assert_eq!(actual_v0, expected_v0);

        let expected_v1 = self
            .expected
            .iter()
            .map(|(_, _, v, _)| v.clone())
            .collect::<Vec<_>>();
        assert_eq!(actual_v1, expected_v1);

        let expected_v2 = self
            .expected
            .iter()
            .map(|(_, _, _, v)| *v)
            .collect::<Vec<_>>();
        assert_eq!(actual_v2, expected_v2);
    }
}
#[tokio::test]
async fn test_read_by_chunk_reader() {
common_telemetry::init_default_ut_logging();
WindowedReaderTester::new(
"test_region",
vec![vec![(1, 1, "1".to_string(), false)]],
vec![(1, 1, "1".to_string(), false)],
)
.await
.check(vec![OrderOption {
name: "timestamp".to_string(),
options: SortOptions {
descending: true,
nulls_first: true,
},
}])
.await;
WindowedReaderTester::new(
"test_region",
vec![
vec![
(1, 1, "1".to_string(), false),
(2, 2, "2".to_string(), false),
],
vec![
(3, 3, "3".to_string(), false),
(4, 4, "4".to_string(), false),
],
],
vec![
(4, 4, "4".to_string(), false),
(3, 3, "3".to_string(), false),
(2, 2, "2".to_string(), false),
(1, 1, "1".to_string(), false),
],
)
.await
.check(vec![OrderOption {
name: "timestamp".to_string(),
options: SortOptions {
descending: true,
nulls_first: true,
},
}])
.await;
WindowedReaderTester::new(
"test_region",
vec![
vec![
(1, 1, "1".to_string(), false),
(2, 2, "2".to_string(), false),
(60000, 60000, "60".to_string(), false),
],
vec![
(3, 3, "3".to_string(), false),
(61000, 61000, "61".to_string(), false),
],
],
vec![
(61000, 61000, "61".to_string(), false),
(60000, 60000, "60".to_string(), false),
(3, 3, "3".to_string(), false),
(2, 2, "2".to_string(), false),
(1, 1, "1".to_string(), false),
],
)
.await
.check(vec![OrderOption {
name: "timestamp".to_string(),
options: SortOptions {
descending: true,
nulls_first: true,
},
}])
.await;
WindowedReaderTester::new(
"test_region",
vec![
vec![
(1, 1, "1".to_string(), false),
(2, 2, "2".to_string(), false),
(60000, 60000, "60".to_string(), false),
],
vec![
(3, 3, "3".to_string(), false),
(61000, 61000, "61".to_string(), false),
],
],
vec![
(1, 1, "1".to_string(), false),
(2, 2, "2".to_string(), false),
(3, 3, "3".to_string(), false),
(60000, 60000, "60".to_string(), false),
(61000, 61000, "61".to_string(), false),
],
)
.await
.check(vec![OrderOption {
name: "timestamp".to_string(),
options: SortOptions {
descending: false,
nulls_first: true,
},
}])
.await;
}

View File

@@ -1,491 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::{BTreeMap, HashMap};
use std::sync::Arc;
use common_test_util::temp_dir::create_temp_dir;
use datatypes::prelude::*;
use datatypes::timestamp::TimestampMillisecond;
use datatypes::vectors::{Int64Vector, StringVector, TimestampMillisecondVector, VectorRef};
use log_store::raft_engine::log_store::RaftEngineLogStore;
use store_api::storage::{
AddColumn, AlterOperation, AlterRequest, Chunk, ChunkReader, ColumnDescriptor,
ColumnDescriptorBuilder, ColumnId, FlushContext, FlushReason, Region, RegionMeta, ScanRequest,
SchemaRef, Snapshot, WriteRequest,
};
use crate::config::EngineConfig;
use crate::region::tests::{self, FileTesterBase};
use crate::region::{OpenOptions, RawRegionMetadata, RegionImpl, RegionMetadata};
use crate::test_util;
use crate::test_util::config_util;
use crate::test_util::descriptor_util::RegionDescBuilder;
const REGION_NAME: &str = "region-alter-0";
/// Creates the region used by the alter tests under `store_dir`, using the
/// (timestamp, v0) metadata from `tests::new_metadata`.
async fn create_region_for_alter(store_dir: &str) -> RegionImpl<RaftEngineLogStore> {
    // Always disable version column in this test.
    let region_metadata = tests::new_metadata(REGION_NAME);
    let engine_config = EngineConfig::default();
    let store_config = config_util::new_store_config(REGION_NAME, store_dir, engine_config).await;
    let created = RegionImpl::create(region_metadata, store_config).await;
    created.unwrap()
}
/// Tester for region alter.
struct AlterTester {
// Directory the region is stored in; reused when the region is reopened.
store_dir: String,
// `None` only while the region is being reopened.
base: Option<FileTesterBase>,
}
/// One row of the altered schema (k0, timestamp, v0, v1) as used by the alter tests.
#[derive(Debug, Clone, PartialEq)]
struct DataRow {
key: Option<i64>,
ts: TimestampMillisecond,
v0: Option<String>,
v1: Option<i64>,
}
impl DataRow {
fn new_with_string(key: Option<i64>, ts: i64, v0: Option<String>, v1: Option<i64>) -> Self {
DataRow {
key,
ts: ts.into(),
v0,
v1,
}
}
fn new(key: Option<i64>, ts: i64, v0: Option<i64>, v1: Option<i64>) -> Self {
Self::new_with_string(key, ts, v0.map(|s| s.to_string()), v1)
}
}
/// Converts rows into the column map for a put with schema (k0, timestamp, v0, v1).
fn new_put_data(data: &[DataRow]) -> HashMap<String, VectorRef> {
    let k0_column = Int64Vector::from(data.iter().map(|row| row.key).collect::<Vec<_>>());
    let ts_column = TimestampMillisecondVector::from(
        data.iter()
            .map(|row| Some(row.ts.into_native()))
            .collect::<Vec<_>>(),
    );
    let v0_column = StringVector::from(data.iter().map(|row| row.v0.clone()).collect::<Vec<_>>());
    let v1_column = Int64Vector::from(data.iter().map(|row| row.v1).collect::<Vec<_>>());

    let mut columns = HashMap::with_capacity(4);
    columns.insert("k0".to_string(), Arc::new(k0_column) as VectorRef);
    columns.insert(
        test_util::TIMESTAMP_NAME.to_string(),
        Arc::new(ts_column) as VectorRef,
    );
    columns.insert("v0".to_string(), Arc::new(v0_column) as VectorRef);
    columns.insert("v1".to_string(), Arc::new(v1_column) as VectorRef);
    columns
}
impl AlterTester {
async fn new(store_dir: &str) -> AlterTester {
let region = create_region_for_alter(store_dir).await;
AlterTester {
base: Some(FileTesterBase::with_region(region)),
store_dir: store_dir.to_string(),
}
}
async fn reopen(&mut self) {
// Close the old region.
if let Some(base) = self.base.as_ref() {
base.close().await;
}
self.base = None;
// Reopen the region.
let store_config =
config_util::new_store_config(REGION_NAME, &self.store_dir, EngineConfig::default())
.await;
let opts = OpenOptions::default();
let region = RegionImpl::open(REGION_NAME.to_string(), store_config, &opts)
.await
.unwrap()
.unwrap();
self.base = Some(FileTesterBase::with_region(region));
}
async fn flush(&self, wait: Option<bool>) {
let ctx = wait
.map(|wait| FlushContext {
wait,
reason: FlushReason::Manually,
..Default::default()
})
.unwrap_or_default();
self.base().region.flush(&ctx).await.unwrap();
}
async fn checkpoint_manifest(&self) {
self.base().checkpoint_manifest().await
}
#[inline]
fn base(&self) -> &FileTesterBase {
self.base.as_ref().unwrap()
}
fn schema(&self) -> SchemaRef {
let metadata = self.base().region.in_memory_metadata();
metadata.schema().clone()
}
// Put with schema k0, ts, v0, v1
async fn put(&self, data: &[DataRow]) {
let mut batch = self.base().region.write_request();
let put_data = new_put_data(data);
batch.put(put_data).unwrap();
assert!(self
.base()
.region
.write(&self.base().write_ctx, batch)
.await
.is_ok());
}
/// Put data with initial schema.
async fn put_with_init_schema(&self, data: &[(i64, Option<i64>)]) {
// put of FileTesterBase always use initial schema version.
let data = data
.iter()
.map(|(ts, v0)| (*ts, v0.map(|v| v.to_string())))
.collect::<Vec<_>>();
let _ = self.base().put(&data).await;
}
/// Put data to inner writer with initial schema.
async fn put_inner_with_init_schema(&self, data: &[(i64, Option<i64>)]) {
let data = data
.iter()
.map(|(ts, v0)| (*ts, v0.map(|v| v.to_string())))
.collect::<Vec<_>>();
// put of FileTesterBase always use initial schema version.
let _ = self.base().put_inner(&data).await;
}
async fn alter(&self, mut req: AlterRequest) {
let version = self.version();
req.version = version;
self.base().region.alter(req).await.unwrap();
}
fn version(&self) -> u32 {
let metadata = self.base().region.in_memory_metadata();
metadata.version()
}
async fn full_scan_with_init_schema(&self) -> Vec<(i64, Option<String>)> {
self.base().full_scan().await
}
/// Scans a snapshot of the region with a default `ScanRequest` and collects every
/// projected chunk into `DataRow`s.
///
/// Asserts that the reader's user schema equals the schema of the region's in-memory
/// metadata before reading.
async fn full_scan(&self) -> Vec<DataRow> {
    let base = self.base();
    let snapshot = base.region.snapshot(&base.read_ctx).unwrap();
    let mut reader = snapshot
        .scan(&base.read_ctx, ScanRequest::default())
        .await
        .unwrap()
        .reader;

    let region_metadata = base.region.in_memory_metadata();
    assert_eq!(region_metadata.schema(), reader.user_schema());

    let mut rows = Vec::new();
    while let Some(raw_chunk) = reader.next_chunk().await.unwrap() {
        let projected = reader.project_chunk(raw_chunk);
        append_chunk_to(&projected, &mut rows);
    }
    rows
}
}
/// Converts a four-column chunk (k0, ts, v0, v1) into `DataRow`s and appends them to `dst`.
///
/// # Panics
///
/// Panics if the chunk does not have exactly four columns, if a column is not of the
/// expected vector type, or if a timestamp value is null.
fn append_chunk_to(chunk: &Chunk, dst: &mut Vec<DataRow>) {
    assert_eq!(4, chunk.columns.len());
    let cols = &chunk.columns;
    let keys = cols[0].as_any().downcast_ref::<Int64Vector>().unwrap();
    let timestamps = cols[1]
        .as_any()
        .downcast_ref::<TimestampMillisecondVector>()
        .unwrap();
    let v0s = cols[2].as_any().downcast_ref::<StringVector>().unwrap();
    let v1s = cols[3].as_any().downcast_ref::<Int64Vector>().unwrap();

    // The key column determines the number of rows.
    dst.extend((0..keys.len()).map(|row| {
        DataRow::new_with_string(
            keys.get_data(row),
            timestamps.get_data(row).unwrap().into(),
            v0s.get_data(row).map(|s| s.to_string()),
            v1s.get_data(row),
        )
    }));
}
/// Builds a nullable int64 column descriptor with the given id and name.
///
/// # Panics
///
/// Panics if the descriptor builder fails.
fn new_column_desc(id: ColumnId, name: &str) -> ColumnDescriptor {
ColumnDescriptorBuilder::new(id, name, ConcreteDataType::int64_datatype())
.is_nullable(true)
.build()
.unwrap()
}
/// Builds an `AddColumns` alter request from `(descriptor, is_key)` pairs.
///
/// The request version is left at 0.
fn add_column_req(desc_and_is_key: &[(ColumnDescriptor, bool)]) -> AlterRequest {
    let mut columns = Vec::with_capacity(desc_and_is_key.len());
    for (desc, is_key) in desc_and_is_key {
        columns.push(AddColumn {
            desc: desc.clone(),
            is_key: *is_key,
        });
    }
    AlterRequest {
        operation: AlterOperation::AddColumns { columns },
        version: 0,
    }
}
/// Builds a `DropColumns` alter request for the given column names.
///
/// The request version is left at 0.
fn drop_column_req(names: &[&str]) -> AlterRequest {
    AlterRequest {
        operation: AlterOperation::DropColumns {
            names: names.iter().map(|name| (*name).to_owned()).collect(),
        },
        version: 0,
    }
}
/// Asserts that `schema` has exactly the columns listed in `names`, in that order, and
/// that each of them can also be looked up by name.
fn check_schema_names(schema: &SchemaRef, names: &[&str]) {
assert_eq!(names.len(), schema.num_columns());
for (idx, name) in names.iter().enumerate() {
assert_eq!(*name, schema.column_name_by_index(idx));
// Lookup by name must succeed as well.
let _ = schema.column_schema_by_name(name).unwrap();
}
}
/// Runs the alter-and-reopen scenario twice: once with flush + manifest checkpoint
/// and once without.
#[tokio::test]
async fn test_alter_region_with_reopen() {
test_alter_region_with_reopen0(true).await;
test_alter_region_with_reopen0(false).await;
}
/// Alter-and-reopen scenario shared by `test_alter_region_with_reopen`.
///
/// When `flush_and_checkpoint` is true, the region is flushed and its manifest is
/// checkpointed before each of the two reopen steps.
async fn test_alter_region_with_reopen0(flush_and_checkpoint: bool) {
common_telemetry::init_default_ut_logging();
let dir = create_temp_dir("alter-region");
let store_dir = dir.path().to_str().unwrap();
let mut tester = AlterTester::new(store_dir).await;
// Write three rows with the initial schema.
let data = vec![(1000, Some(100)), (1001, Some(101)), (1002, Some(102))];
tester.put_with_init_schema(&data).await;
assert_eq!(3, tester.full_scan_with_init_schema().await.len());
let req = add_column_req(&[
(new_column_desc(4, "k0"), true), // key column k0
(new_column_desc(5, "v1"), false), // value column v1
]);
tester.alter(req).await;
let schema = tester.schema();
check_schema_names(&schema, &["k0", "timestamp", "v0", "v1"]);
// Put data after schema altered.
let data = vec![
DataRow::new(Some(10000), 1003, Some(103), Some(201)),
DataRow::new(Some(10001), 1004, Some(104), Some(202)),
DataRow::new(Some(10002), 1005, Some(105), Some(203)),
];
tester.put(&data).await;
if flush_and_checkpoint {
tester.flush(None).await;
tester.checkpoint_manifest().await;
}
// Scan with new schema before reopen.
// Rows written before the alter are expected to have `None` for the added columns.
let mut expect = vec![
DataRow::new(None, 1000, Some(100), None),
DataRow::new(None, 1001, Some(101), None),
DataRow::new(None, 1002, Some(102), None),
];
expect.extend_from_slice(&data);
let scanned = tester.full_scan().await;
assert_eq!(expect, scanned);
// Reopen and put more data.
tester.reopen().await;
let data = vec![
DataRow::new(Some(10003), 1006, Some(106), Some(204)),
DataRow::new(Some(10004), 1007, Some(107), Some(205)),
DataRow::new(Some(10005), 1008, Some(108), Some(206)),
];
tester.put(&data).await;
// Extend expected result.
expect.extend_from_slice(&data);
// Add columns, then remove them without writing data.
let req = add_column_req(&[
(new_column_desc(6, "v2"), false), // value column v2
(new_column_desc(7, "v3"), false), // value column v3
]);
tester.alter(req).await;
let req = drop_column_req(&["v2", "v3"]);
tester.alter(req).await;
if flush_and_checkpoint {
tester.flush(None).await;
tester.checkpoint_manifest().await;
}
// Reopen and write again.
tester.reopen().await;
// The add-then-drop round trip must leave the schema unchanged.
let schema = tester.schema();
check_schema_names(&schema, &["k0", "timestamp", "v0", "v1"]);
let data = vec![DataRow::new(Some(10006), 1009, Some(109), Some(207))];
tester.put(&data).await;
expect.extend_from_slice(&data);
// Scan with new schema after reopen and write.
let scanned = tester.full_scan().await;
assert_eq!(expect, scanned);
}
/// Checks the column names and order reported by the schema after a sequence of
/// add-column and drop-column alterations.
#[tokio::test]
async fn test_alter_region() {
let dir = create_temp_dir("alter-region");
let store_dir = dir.path().to_str().unwrap();
let tester = AlterTester::new(store_dir).await;
let data = vec![(1000, Some(100)), (1001, Some(101)), (1002, Some(102))];
tester.put_with_init_schema(&data).await;
// Initial schema.
let schema = tester.schema();
check_schema_names(&schema, &["timestamp", "v0"]);
let req = add_column_req(&[
(new_column_desc(4, "k0"), true), // key column k0
(new_column_desc(5, "v1"), false), // value column v1
]);
tester.alter(req).await;
// The key column k0 is listed before the timestamp, v1 after v0.
let schema = tester.schema();
check_schema_names(&schema, &["k0", "timestamp", "v0", "v1"]);
// Add two more value columns.
let req = add_column_req(&[
(new_column_desc(6, "v2"), false),
(new_column_desc(7, "v3"), false),
]);
tester.alter(req).await;
let schema = tester.schema();
check_schema_names(&schema, &["k0", "timestamp", "v0", "v1", "v2", "v3"]);
// Remove v0, v1
let req = drop_column_req(&["v0", "v1"]);
tester.alter(req).await;
let schema = tester.schema();
check_schema_names(&schema, &["k0", "timestamp", "v2", "v3"]);
}
/// Writes rows with the initial schema after the region has been altered and checks
/// that a scan with the new schema returns them with `None` in the added columns.
#[tokio::test]
async fn test_put_old_schema_after_alter() {
let dir = create_temp_dir("put-old");
let store_dir = dir.path().to_str().unwrap();
let tester = AlterTester::new(store_dir).await;
let data = vec![(1000, Some(100)), (1001, Some(101)), (1002, Some(102))];
tester.put_with_init_schema(&data).await;
let req = add_column_req(&[
(new_column_desc(4, "k0"), true), // key column k0
(new_column_desc(5, "v1"), false), // value column v1
]);
tester.alter(req).await;
// Put with old schema.
let data = vec![(1005, Some(105)), (1006, Some(106))];
tester.put_with_init_schema(&data).await;
// Put data with old schema directly to the inner writer, to check that the region
// writer can make the schema of the write batch compatible.
let data = vec![(1003, Some(103)), (1004, Some(104))];
tester.put_inner_with_init_schema(&data).await;
// The scan is expected to return rows ordered by timestamp, regardless of write order.
let expect = vec![
DataRow::new(None, 1000, Some(100), None),
DataRow::new(None, 1001, Some(101), None),
DataRow::new(None, 1002, Some(102), None),
DataRow::new(None, 1003, Some(103), None),
DataRow::new(None, 1004, Some(104), None),
DataRow::new(None, 1005, Some(105), None),
DataRow::new(None, 1006, Some(106), None),
];
let scanned = tester.full_scan().await;
assert_eq!(expect, scanned);
}
/// Feeds a newer region metadata to `replay_inner` after reopening the region and
/// checks that the region schema then matches that metadata.
#[tokio::test]
async fn test_replay_metadata_after_open() {
let dir = create_temp_dir("replay-metadata-after-open");
let store_dir = dir.path().to_str().unwrap();
let mut tester = AlterTester::new(store_dir).await;
let data = vec![(1000, Some(100)), (1001, Some(101)), (1002, Some(102))];
tester.put_with_init_schema(&data).await;
tester.reopen().await;
// Capture the state of the reopened region.
let committed_sequence = tester.base().committed_sequence();
let manifest_version = tester.base().region.current_manifest_version();
let version = tester.version();
// Build metadata with a different column set (key k1, field v0) ...
let desc = RegionDescBuilder::new(REGION_NAME)
.push_key_column(("k1", LogicalTypeId::Int32, false))
.push_field_column(("v0", LogicalTypeId::Float32, true))
.build();
let metadata: &RegionMetadata = &desc.try_into().unwrap();
let mut raw_metadata: RawRegionMetadata = metadata.into();
// ... and give it the next metadata version.
raw_metadata.version = version + 1;
// Keyed by the committed sequence, paired with the next manifest version.
let recovered_metadata =
BTreeMap::from([(committed_sequence, (manifest_version + 1, raw_metadata))]);
tester.base().replay_inner(recovered_metadata).await;
let schema = tester.schema();
check_schema_names(&schema, &["k1", "timestamp", "v0"]);
}

View File

@@ -1,288 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Region read/write tests.
use common_telemetry::info;
use common_test_util::temp_dir::create_temp_dir;
use log_store::raft_engine::log_store::RaftEngineLogStore;
use store_api::storage::{OpenOptions, SequenceNumber};
use crate::config::EngineConfig;
use crate::error::Result;
use crate::region::tests::{self, FileTesterBase};
use crate::region::RegionImpl;
use crate::test_util::config_util;
const REGION_NAME: &str = "region-basic-0";
/// Create a new region for basic tests.
///
/// The region uses the metadata from `tests::new_metadata(region_name)` and a store
/// config built for `store_dir` with the default `EngineConfig`.
///
/// # Panics
///
/// Panics if the region cannot be created.
async fn create_region_for_basic(
region_name: &str,
store_dir: &str,
) -> RegionImpl<RaftEngineLogStore> {
let metadata = tests::new_metadata(region_name);
let store_config =
config_util::new_store_config(region_name, store_dir, EngineConfig::default()).await;
RegionImpl::create(metadata, store_config).await.unwrap()
}
/// Tester for basic tests.
struct Tester {
// Name used to create and reopen the region.
region_name: String,
// Directory passed to the store config on create and reopen.
store_dir: String,
// Wrapper around the opened region; `None` when no region is open.
base: Option<FileTesterBase>,
}
impl Tester {
    /// Creates a tester backed by a freshly created region.
    async fn new(region_name: &str, store_dir: &str) -> Tester {
        let base = FileTesterBase::with_region(create_region_for_basic(region_name, store_dir).await);
        Tester {
            region_name: region_name.to_string(),
            store_dir: store_dir.to_string(),
            base: Some(base),
        }
    }

    /// Creates a tester that only remembers the region name and directory; no region
    /// is created or opened.
    async fn empty(region_name: &str, store_dir: &str) -> Tester {
        Tester {
            region_name: region_name.to_string(),
            store_dir: store_dir.to_string(),
            base: None,
        }
    }

    /// Reopens the region, panicking if opening returns an error.
    async fn reopen(&mut self) {
        let _ = self.try_reopen().await.unwrap();
    }

    /// Closes the current region (if any) and opens it again from `store_dir`.
    ///
    /// Returns `Ok(true)` when a region was opened and `Ok(false)` when
    /// `RegionImpl::open` found none.
    async fn try_reopen(&mut self) -> Result<bool> {
        // Close the old region.
        if let Some(old_base) = self.base.take() {
            info!("Reopen tester base");
            old_base.close().await;
        }

        // Reopen the region.
        let store_config = config_util::new_store_config(
            &self.region_name,
            &self.store_dir,
            EngineConfig::default(),
        )
        .await;
        let reopened = RegionImpl::open(
            self.region_name.clone(),
            store_config,
            &OpenOptions::default(),
        )
        .await?;
        let Some(region) = reopened else {
            return Ok(false);
        };
        self.base = Some(FileTesterBase::with_region(region));
        Ok(true)
    }

    /// Returns the tester base; panics if no region is open.
    #[inline]
    fn base(&self) -> &FileTesterBase {
        self.base.as_ref().unwrap()
    }

    /// Sets the batch size of the read context used by scans.
    #[inline]
    fn set_batch_size(&mut self, batch_size: usize) {
        let base = self.base.as_mut().unwrap();
        base.read_ctx.batch_size = batch_size;
    }

    /// Writes `(timestamp, v0)` rows.
    async fn put(&self, data: &[(i64, Option<String>)]) {
        let _ = self.base().put(data).await;
    }

    /// Scans all `(timestamp, v0)` rows.
    async fn full_scan(&self) -> Vec<(i64, Option<String>)> {
        self.base().full_scan().await
    }

    /// Returns the committed sequence reported by the tester base.
    fn committed_sequence(&self) -> SequenceNumber {
        self.base().committed_sequence()
    }

    /// Deletes the rows identified by `keys`.
    async fn delete(&self, keys: &[i64]) {
        let _ = self.base().delete(keys).await;
    }
}
/// Writes a handful of rows (including a null value) and expects a full scan to
/// return exactly those rows.
#[tokio::test]
async fn test_simple_put_scan() {
let dir = create_temp_dir("put-scan");
let store_dir = dir.path().to_str().unwrap();
let tester = Tester::new(REGION_NAME, store_dir).await;
let data = vec![
(1000, Some(100.to_string())),
(1001, Some(101.to_string())),
(1002, None),
(1003, Some(103.to_string())),
(1004, Some(104.to_string())),
];
tester.put(&data).await;
let output = tester.full_scan().await;
assert_eq!(data, output);
}
/// Checks that every single-row put advances the committed sequence by exactly one.
#[tokio::test]
async fn test_sequence_increase() {
let dir = create_temp_dir("sequence");
let store_dir = dir.path().to_str().unwrap();
let tester = Tester::new(REGION_NAME, store_dir).await;
// Locally tracked expectation, starting from the region's initial value.
let mut committed_sequence = tester.committed_sequence();
for i in 0..100 {
tester.put(&[(i, Some(1234.to_string()))]).await;
committed_sequence += 1;
assert_eq!(committed_sequence, tester.committed_sequence());
}
}
/// Repeatedly writes one row and reopens the region, checking that the scanned data
/// and the committed sequence are the same before and after each reopen.
#[tokio::test]
async fn test_reopen() {
common_telemetry::logging::init_default_ut_logging();
let dir = create_temp_dir("reopen");
let store_dir = dir.path().to_str().unwrap();
let mut tester = Tester::new(REGION_NAME, store_dir).await;
let mut all_data = Vec::new();
// Reopen region multiple times.
for i in 0..5 {
let data = (i, Some(i.to_string()));
tester.put(&[data.clone()]).await;
all_data.push(data.clone());
// Scan before reopen.
let output = tester.full_scan().await;
assert_eq!(all_data, output);
tester.reopen().await;
// Scan after reopen.
let output = tester.full_scan().await;
assert_eq!(all_data, output);
// Check committed sequence: one put per iteration so far.
assert_eq!(i + 1, tester.committed_sequence() as i64);
}
}
/// Opening a region in a directory where none was created must succeed and report
/// that no region was found (`Ok(false)`).
#[tokio::test]
async fn test_open_empty() {
let dir = create_temp_dir("open-empty");
let store_dir = dir.path().to_str().unwrap();
let mut tester = Tester::empty(REGION_NAME, store_dir).await;
let ret = tester.try_reopen().await;
assert!(!ret.unwrap());
}
/// Writes 2001 rows in chunks of 100 and checks that a full scan returns the same
/// rows for a range of read batch sizes.
#[tokio::test]
async fn test_scan_different_batch() {
let dir = create_temp_dir("different-batch");
let store_dir = dir.path().to_str().unwrap();
let mut tester = Tester::new(REGION_NAME, store_dir).await;
let data: Vec<_> = (0..=2000).map(|i| (i, Some(i.to_string()))).collect();
for chunk in data.chunks(100) {
tester.put(chunk).await;
}
let batch_sizes = [1, 2, 4, 16, 64, 128, 256, 512];
for batch_size in batch_sizes {
tester.set_batch_size(batch_size);
let output = tester.full_scan().await;
assert_eq!(data, output);
}
}
/// Deletes some of the written keys and checks that they are gone from the scan, both
/// immediately and after reopening the region.
#[tokio::test]
async fn test_put_delete_scan() {
common_telemetry::init_default_ut_logging();
let dir = create_temp_dir("put-delete-scan");
let store_dir = dir.path().to_str().unwrap();
let mut tester = Tester::new(REGION_NAME, store_dir).await;
let data = vec![
(1000, Some(100.to_string())),
(1001, Some(101.to_string())),
(1002, None),
(1003, None),
(1004, Some(104.to_string())),
];
tester.put(&data).await;
// Delete one row with a value and one row with a null value.
let keys = [1001, 1003];
tester.delete(&keys).await;
let output = tester.full_scan().await;
let expect = vec![
(1000, Some(100.to_string())),
(1002, None),
(1004, Some(104.to_string())),
];
assert_eq!(expect, output);
// Deletion is also persistent.
let _ = tester.try_reopen().await.unwrap();
let output = tester.full_scan().await;
assert_eq!(expect, output);
}
/// Deletes a mix of existing and absent keys and checks that only the existing ones
/// disappear from the scan, both immediately and after reopening the region.
#[tokio::test]
async fn test_put_delete_absent_key() {
let dir = create_temp_dir("put-delete-scan");
let store_dir = dir.path().to_str().unwrap();
let mut tester = Tester::new(REGION_NAME, store_dir).await;
let data = vec![
(1000, Some(100.to_string())),
(1001, Some(101.to_string())),
(1002, None),
(1003, None),
(1004, Some(104.to_string())),
];
tester.put(&data).await;
// 999 and 1006 are absent.
let keys = [999, 1002, 1004, 1006];
tester.delete(&keys).await;
let output = tester.full_scan().await;
let expect = vec![
(1000, Some(100.to_string())),
(1001, Some(101.to_string())),
(1003, None),
];
assert_eq!(expect, output);
// Deletion is also persistent.
let _ = tester.try_reopen().await.unwrap();
let output = tester.full_scan().await;
assert_eq!(expect, output);
}

View File

@@ -1,168 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Region close tests.
use std::sync::Arc;
use common_test_util::temp_dir::create_temp_dir;
use log_store::raft_engine::log_store::RaftEngineLogStore;
use store_api::storage::{
AlterOperation, AlterRequest, CloseContext, Region, RegionMeta, WriteResponse,
};
use crate::config::EngineConfig;
use crate::engine;
use crate::error::Error;
use crate::flush::FlushStrategyRef;
use crate::region::tests::{self, FileTesterBase};
use crate::region::RegionImpl;
use crate::test_util::config_util;
use crate::test_util::flush_switch::{has_parquet_file, FlushSwitch};
const REGION_NAME: &str = "region-close-0";
/// Tester for region close
struct CloseTester {
// Wrapper around the region under test; set to `Some` by `CloseTester::new`.
base: Option<FileTesterBase>,
}
/// Create a new region for close test
///
/// The store config is built for `store_dir` with the default `EngineConfig`, and its
/// flush strategy is replaced by `flush_strategy`.
///
/// # Panics
///
/// Panics if the region cannot be created.
async fn create_region_for_close(
store_dir: &str,
flush_strategy: FlushStrategyRef,
) -> RegionImpl<RaftEngineLogStore> {
let metadata = tests::new_metadata(REGION_NAME);
let mut store_config =
config_util::new_store_config(REGION_NAME, store_dir, EngineConfig::default()).await;
store_config.flush_strategy = flush_strategy;
RegionImpl::create(metadata, store_config).await.unwrap()
}
impl CloseTester {
    /// Creates a tester backed by a freshly created region that uses `flush_strategy`.
    async fn new(store_dir: &str, flush_strategy: FlushStrategyRef) -> CloseTester {
        // `flush_strategy` is not needed afterwards, so it is moved rather than cloned.
        let region = create_region_for_close(store_dir, flush_strategy).await;
        CloseTester {
            base: Some(FileTesterBase::with_region(region)),
        }
    }

    /// Returns the tester base; panics if `self.base` is `None`.
    #[inline]
    fn base(&self) -> &FileTesterBase {
        self.base.as_ref().unwrap()
    }

    /// Converts `(timestamp, v0)` rows into the string-valued rows that
    /// `FileTesterBase` accepts.
    fn to_rows(data: &[(i64, Option<i64>)]) -> Vec<(i64, Option<String>)> {
        data.iter()
            .map(|(ts, v0)| (*ts, v0.map(|v| v.to_string())))
            .collect()
    }

    /// Writes `data`, ignoring the write response.
    async fn put(&self, data: &[(i64, Option<i64>)]) {
        let data = Self::to_rows(data);
        let _ = self.base().put(&data).await;
    }

    /// Writes `data` and returns the result so that callers can inspect the error.
    async fn try_put(&self, data: &[(i64, Option<i64>)]) -> Result<WriteResponse, Error> {
        let data = Self::to_rows(data);
        self.base().try_put(&data).await
    }

    /// Applies `req` with its `version` overwritten by the region's current metadata
    /// version, returning the result of the alter operation.
    async fn try_alter(&self, mut req: AlterRequest) -> Result<(), Error> {
        req.version = self.version();
        self.base().region.alter(req).await
    }

    /// Returns the version of the region's in-memory metadata.
    fn version(&self) -> u32 {
        let metadata = self.base().region.in_memory_metadata();
        metadata.version()
    }
}
/// After the region is closed, both writing and altering must fail with the
/// closed-region error message.
#[tokio::test]
async fn test_close_basic() {
common_telemetry::init_default_ut_logging();
let dir = create_temp_dir("close-basic");
let store_dir = dir.path().to_str().unwrap();
let flush_switch = Arc::new(FlushSwitch::default());
let tester = CloseTester::new(store_dir, flush_switch).await;
// Close the region right away.
tester
.base()
.region
.close(&CloseContext::default())
.await
.unwrap();
let data = [(1000, Some(100))];
// Both operations are compared against this exact error text.
let closed_region_error = "Try to write the closed region".to_string();
// Put one element should return ClosedRegion error
assert_eq!(
tester.try_put(&data).await.unwrap_err().to_string(),
closed_region_error
);
// Alter table should return ClosedRegion error
assert_eq!(
tester
.try_alter(AlterRequest {
operation: AlterOperation::AddColumns {
columns: Vec::new(),
},
version: 0,
})
.await
.unwrap_err()
.to_string(),
closed_region_error
);
}
/// Enables flushing, writes one row, closes the region, and asserts that no parquet
/// file exists in the SST directory before or after the close.
///
/// NOTE(review): the test name says "wait flush done" while the inline comment and
/// the assertions expect the flush to be cancelled; confirm which is intended.
#[tokio::test]
async fn test_close_wait_flush_done() {
common_telemetry::init_default_ut_logging();
let dir = create_temp_dir("close-basic");
let store_dir = dir.path().to_str().unwrap();
let flush_switch = Arc::new(FlushSwitch::default());
let tester = CloseTester::new(store_dir, flush_switch.clone()).await;
let data = [(1000, Some(100))];
// Now set should flush to true to trigger flush.
flush_switch.set_should_flush(true);
// Put one element so we have content to flush.
tester.put(&data).await;
let sst_dir = format!("{}/{}", store_dir, engine::region_sst_dir("", REGION_NAME));
// No SST file has been written yet at this point.
assert!(!has_parquet_file(&sst_dir));
// Close should cancel the flush.
tester
.base()
.region
.close(&CloseContext::default())
.await
.unwrap();
assert!(!has_parquet_file(&sst_dir));
}

View File

@@ -1,458 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Region compaction tests.
use std::env;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
use common_telemetry::logging;
use common_test_util::temp_dir::create_temp_dir;
use log_store::raft_engine::log_store::RaftEngineLogStore;
use object_store::services::{Fs, S3};
use object_store::ObjectStore;
use store_api::storage::{FlushContext, FlushReason, OpenOptions, Region};
use tokio::sync::{Notify, RwLock};
use crate::compaction::CompactionHandler;
use crate::config::EngineConfig;
use crate::error::Result;
use crate::file_purger::{FilePurgeHandler, FilePurgeRequest};
use crate::region::tests::{self, FileTesterBase};
use crate::region::{CompactContext, FlushStrategyRef, RegionImpl};
use crate::scheduler::rate_limit::BoxedRateLimitToken;
use crate::scheduler::{Handler, LocalScheduler, SchedulerConfig};
use crate::test_util::config_util;
use crate::test_util::flush_switch::FlushSwitch;
const REGION_NAME: &str = "region-compact-0";
/// Builds the object store used by the compaction tests.
///
/// A non-empty `s3_bucket` selects an S3 store rooted at a random UUID, with
/// credentials and region read from `GT_S3_ACCESS_KEY_ID`, `GT_S3_ACCESS_KEY` and
/// `GT_S3_REGION`. Otherwise a local file system store rooted at `store_dir` is used.
///
/// # Panics
///
/// Panics if one of the S3 environment variables is missing or if the store cannot
/// be built.
fn new_object_store(store_dir: &str, s3_bucket: Option<String>) -> ObjectStore {
    match s3_bucket.filter(|bucket| !bucket.is_empty()) {
        Some(bucket) => {
            logging::info!("Use S3 object store");
            let root = uuid::Uuid::new_v4().to_string();
            let mut s3 = S3::default();
            let _ = s3
                .root(&root)
                .access_key_id(&env::var("GT_S3_ACCESS_KEY_ID").unwrap())
                .secret_access_key(&env::var("GT_S3_ACCESS_KEY").unwrap())
                .region(&env::var("GT_S3_REGION").unwrap())
                .bucket(&bucket);
            ObjectStore::new(s3).unwrap().finish()
        }
        None => {
            logging::info!("Use local fs object store");
            let mut fs = Fs::default();
            let _ = fs.root(store_dir);
            ObjectStore::new(fs).unwrap().finish()
        }
    }
}
/// Create a new region for compaction test
///
/// Returns the created region, the object store it uses, and the shared list of
/// pending compaction task handles that the compaction handler was built with.
///
/// # Panics
///
/// Panics if the region cannot be created.
async fn create_region_for_compaction<
H: Handler<Request = FilePurgeRequest> + Send + Sync + 'static,
>(
store_dir: &str,
engine_config: EngineConfig,
purge_handler: H,
flush_strategy: FlushStrategyRef,
s3_bucket: Option<String>,
) -> (
RegionImpl<RaftEngineLogStore>,
ObjectStore,
Arc<tokio::sync::RwLock<Vec<tokio::task::JoinHandle<()>>>>,
) {
let metadata = tests::new_metadata(REGION_NAME);
let object_store = new_object_store(store_dir, s3_bucket);
// The store config is built with the default engine config; the caller's config
// and flush strategy are installed right below.
let (mut store_config, _) = config_util::new_store_config_with_object_store(
REGION_NAME,
store_dir,
object_store.clone(),
EngineConfig::default(),
)
.await;
store_config.engine_config = Arc::new(engine_config);
store_config.flush_strategy = flush_strategy;
// Shared with the caller so it can access the spawned compaction tasks.
let pending_compaction_tasks = Arc::new(RwLock::new(vec![]));
let handler = CompactionHandler::new_with_pending_tasks(pending_compaction_tasks.clone());
let config = SchedulerConfig::default();
// Overwrite test compaction scheduler and file purger.
store_config.compaction_scheduler = Arc::new(LocalScheduler::new(config, handler));
store_config.file_purger = Arc::new(LocalScheduler::new(
SchedulerConfig {
max_inflight_tasks: store_config.engine_config.max_purge_tasks,
},
purge_handler,
));
(
RegionImpl::create(metadata, store_config).await.unwrap(),
object_store,
pending_compaction_tasks,
)
}
/// File purge handler for tests that counts how many purge requests it has handled.
#[derive(Debug, Default, Clone)]
struct MockFilePurgeHandler {
// Shared counter, so clones of the handler observe the same count.
num_deleted: Arc<AtomicUsize>,
}
#[async_trait::async_trait]
impl Handler for MockFilePurgeHandler {
    type Request = FilePurgeRequest;

    /// Logs the request, forwards it to the real `FilePurgeHandler`, and increments
    /// the deletion counter once the purge has succeeded.
    ///
    /// # Errors
    ///
    /// Returns the error of the underlying `FilePurgeHandler`; the counter is not
    /// incremented in that case.
    async fn handle_request(
        &self,
        req: Self::Request,
        token: BoxedRateLimitToken,
        finish_notifier: Arc<Notify>,
    ) -> Result<()> {
        logging::info!(
            "Try to delete file: {:?}, num_deleted: {:?}",
            req.file_id,
            self.num_deleted
        );
        let handler = FilePurgeHandler;
        // Propagate purge failures to the caller instead of panicking inside the
        // handler: this function already returns a `Result`.
        handler
            .handle_request(req, token, finish_notifier)
            .await?;
        let _ = self.num_deleted.fetch_add(1, Ordering::Relaxed);
        Ok(())
    }
}
impl MockFilePurgeHandler {
/// Returns the current value of the deletion counter.
fn num_deleted(&self) -> usize {
self.num_deleted.load(Ordering::Relaxed)
}
}
/// Tester for region compaction.
struct CompactionTester {
// Wrapper around the opened region; `None` while closed.
base: Option<FileTesterBase>,
// Purge handler whose counter the tests inspect.
purge_handler: MockFilePurgeHandler,
// Object store the region was created with; also used to clean up files.
object_store: ObjectStore,
// Directory passed to the store config on create and reopen.
store_dir: String,
// Engine config installed into the store config on create and reopen.
engine_config: EngineConfig,
// Flush strategy installed into the store config on create and reopen.
flush_strategy: FlushStrategyRef,
// Handles of compaction tasks, shared with the compaction handler created in
// `create_region_for_compaction`.
pending_tasks: Arc<RwLock<Vec<tokio::task::JoinHandle<()>>>>,
}
impl CompactionTester {
    /// Creates a region in `store_dir` (or in `s3_bucket` when it is non-empty) and
    /// wraps it in a tester.
    async fn new(
        store_dir: &str,
        engine_config: EngineConfig,
        flush_strategy: FlushStrategyRef,
        s3_bucket: Option<String>,
    ) -> CompactionTester {
        let purge_handler = MockFilePurgeHandler::default();
        let (region, object_store, pending_tasks) = create_region_for_compaction(
            store_dir,
            engine_config.clone(),
            purge_handler.clone(),
            flush_strategy.clone(),
            s3_bucket,
        )
        .await;
        CompactionTester {
            base: Some(FileTesterBase::with_region(region)),
            purge_handler,
            object_store,
            store_dir: store_dir.to_string(),
            engine_config,
            flush_strategy,
            pending_tasks,
        }
    }

    /// Returns the tester base; panics if the region is closed.
    #[inline]
    fn base(&self) -> &FileTesterBase {
        self.base.as_ref().unwrap()
    }

    /// Returns the tester base mutably; panics if the region is closed.
    #[inline]
    fn base_mut(&mut self) -> &mut FileTesterBase {
        self.base.as_mut().unwrap()
    }

    /// Writes `(timestamp, v0)` rows, stringifying each present `v0` value.
    async fn put(&self, data: &[(i64, Option<i64>)]) {
        let data = data
            .iter()
            .map(|(ts, v0)| (*ts, v0.map(|v| v.to_string())))
            .collect::<Vec<_>>();
        let _ = self.base().put(&data).await;
    }

    /// Flushes the region.
    ///
    /// `Some(wait)` builds a manual-flush context carrying that `wait` flag; `None`
    /// uses `FlushContext::default()`.
    async fn flush(&self, wait: Option<bool>) {
        let ctx = wait
            .map(|wait| FlushContext {
                wait,
                reason: FlushReason::Manually,
                ..Default::default()
            })
            .unwrap_or_default();
        self.base().region.flush(&ctx).await.unwrap();
    }

    /// Triggers a compaction with the default `CompactContext` and awaits the call.
    async fn compact(&self) {
        // Trigger compaction and wait until it is done.
        self.base()
            .region
            .compact(&CompactContext::default())
            .await
            .unwrap();
    }

    /// Close region and clean up files.
    async fn clean_up(mut self) {
        self.base = None;
        self.object_store.remove_all("/").await.unwrap();
    }

    /// Closes the current region (if any) and opens it again.
    ///
    /// The reopened region reuses the tester's object store, pending-task list and
    /// purge handler, so the tester keeps observing the region after the reopen.
    ///
    /// Returns `Ok(false)` when `RegionImpl::open` finds no region to open.
    async fn reopen(&mut self) -> Result<bool> {
        // Close the old region.
        if let Some(base) = self.base.take() {
            // Move the handles out first so the write lock is released before the
            // tasks are awaited.
            let pending: Vec<_> = self.pending_tasks.write().await.drain(..).collect();
            let _ = futures::future::join_all(pending).await;
            base.close().await;
        }

        // Reopen the region on the same object store it was created with.
        let (mut store_config, _) = config_util::new_store_config_with_object_store(
            REGION_NAME,
            &self.store_dir,
            self.object_store.clone(),
            EngineConfig {
                max_files_in_l0: usize::MAX,
                ..Default::default()
            },
        )
        .await;
        store_config.engine_config = Arc::new(self.engine_config.clone());
        store_config.flush_strategy = self.flush_strategy.clone();

        // Share the pending-task list with the new handler so that a later `reopen`
        // can still wait for compactions scheduled after this one.
        let handler = CompactionHandler::new_with_pending_tasks(self.pending_tasks.clone());
        let config = SchedulerConfig::default();
        // Overwrite test compaction scheduler and file purger.
        store_config.compaction_scheduler = Arc::new(LocalScheduler::new(config, handler));
        store_config.file_purger = Arc::new(LocalScheduler::new(
            SchedulerConfig {
                max_inflight_tasks: store_config.engine_config.max_purge_tasks,
            },
            // Clones share the counter, so `self.purge_handler` keeps counting.
            self.purge_handler.clone(),
        ));

        let Some(region) = RegionImpl::open(
            REGION_NAME.to_string(),
            store_config,
            &OpenOptions::default(),
        )
        .await?
        else {
            return Ok(false);
        };
        self.base = Some(FileTesterBase::with_region(region));
        Ok(true)
    }
}
/// Compacts the region while a scan reader is still open and checks that no file is
/// purged in the meantime and that the reader still returns every row.
///
/// `s3_bucket` is forwarded to `CompactionTester::new` to select the object store.
async fn compact_during_read(s3_bucket: Option<String>) {
let dir = create_temp_dir("compact_read");
let store_dir = dir.path().to_str().unwrap();
// Use a large max_files_in_l0 to avoid compaction automatically.
let mut tester = CompactionTester::new(
store_dir,
EngineConfig {
max_files_in_l0: 100,
..Default::default()
},
// Disable auto-flush.
Arc::new(FlushSwitch::default()),
s3_bucket,
)
.await;
let expect: Vec<_> = (0..200).map(|v| (v, Some(v))).collect();
// Put elements so we have content to flush (In SST1).
tester.put(&expect[0..100]).await;
// Flush content to SST1.
tester.flush(None).await;
// Put element (In SST2).
tester.put(&expect[100..200]).await;
// Flush content to SST2.
tester.flush(None).await;
// Read one row per batch.
tester.base_mut().read_ctx.batch_size = 1;
// Create a reader.
let reader = tester.base().full_scan_reader().await;
assert_eq!(0, tester.purge_handler.num_deleted());
// Trigger compaction.
tester.compact().await;
// The files are still referenced.
assert_eq!(0, tester.purge_handler.num_deleted());
// Read from the reader.
let output = tester.base().collect_reader(reader).await;
assert_eq!(expect.len(), output.len());
tester.clean_up().await;
}
/// Runs `compact_during_read` without an S3 bucket, i.e. on the local file system store.
#[tokio::test]
async fn test_compact_during_read_on_fs() {
common_telemetry::init_default_ut_logging();
compact_during_read(None).await;
}
/// Runs `compact_during_read` against S3.
///
/// The test body is skipped unless `GT_S3_BUCKET` is set to a non-empty value.
#[tokio::test]
async fn test_compact_during_read_on_s3() {
    common_telemetry::init_default_ut_logging();
    let bucket = match env::var("GT_S3_BUCKET") {
        Ok(bucket) if !bucket.is_empty() => bucket,
        _ => return,
    };
    compact_during_read(Some(bucket)).await;
}
#[tokio::test]
async fn test_persist_region_compaction_time_window() {
common_telemetry::init_default_ut_logging();
let dir = create_temp_dir("put-delete-scan");
let store_dir = dir.path().to_str().unwrap();
let mut tester = CompactionTester::new(
store_dir,
EngineConfig {
max_files_in_l0: 100,
..Default::default()
},
// Disable auto-flush.
Arc::new(FlushSwitch::default()),
None,
)
.await;
// initially the time window is not present since no compaction ever happened.
assert_eq!(
None,
tester
.base
.as_ref()
.unwrap()
.region
.inner
.shared
.version_control
.current()
.ssts()
.compaction_time_window()
);
// write some data with one hour span
for idx in 0..10 {
tester
.put(&[(idx * 1000, Some(idx)), ((idx + 360) * 1000, Some(idx))])
.await;
tester.flush(Some(true)).await;
}
tester.compact().await;
// the inferred and persisted compaction time window should be 3600 seconds.
assert_eq!(
3600,
tester
.base
.as_ref()
.unwrap()
.region
.inner
.shared
.version_control
.current()
.ssts()
.compaction_time_window()
.unwrap()
);
// try write data with a larger time window
for idx in 0..10 {
tester
.put(&[
(idx * 1000, Some(idx)),
((idx + 2 * 60 * 60) * 1000, Some(idx)),
])
.await;
tester.flush(Some(true)).await;
}
tester.compact().await;
// but we won't changed persisted compaction window for now, so it remains unchanged.
assert_eq!(
3600,
tester
.base
.as_ref()
.unwrap()
.region
.inner
.shared
.version_control
.current()
.ssts()
.compaction_time_window()
.unwrap()
);
let reopened = tester.reopen().await.unwrap();
assert!(reopened);
assert_eq!(
3600,
tester
.base
.as_ref()
.unwrap()
.region
.inner
.shared
.version_control
.current()
.ssts()
.compaction_time_window()
.unwrap()
);
}

View File

@@ -1,192 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Region drop tests.
use std::path::Path;
use std::sync::Arc;
use common_telemetry::info;
use common_test_util::temp_dir::create_temp_dir;
use log_store::raft_engine::log_store::RaftEngineLogStore;
use store_api::manifest::{Manifest, MetaAction};
use store_api::storage::{FlushContext, OpenOptions, Region};
use crate::config::EngineConfig;
use crate::engine;
use crate::flush::FlushStrategyRef;
use crate::manifest::action::{RegionMetaAction, RegionMetaActionList, RegionRemove};
use crate::region::tests::{self, FileTesterBase};
use crate::region::RegionImpl;
use crate::test_util::config_util;
use crate::test_util::flush_switch::{has_parquet_file, FlushSwitch};
const REGION_NAME: &str = "region-drop-0";
/// Create a new region for drop tests.
///
/// The store config is built for `store_dir` with the default `EngineConfig`, and its
/// flush strategy is replaced by `flush_strategy`.
///
/// # Panics
///
/// Panics if the region cannot be created.
async fn create_region_for_drop(
store_dir: &str,
flush_strategy: FlushStrategyRef,
) -> RegionImpl<RaftEngineLogStore> {
let metadata = tests::new_metadata(REGION_NAME);
let mut store_config =
config_util::new_store_config(REGION_NAME, store_dir, EngineConfig::default()).await;
store_config.flush_strategy = flush_strategy;
RegionImpl::create(metadata, store_config).await.unwrap()
}
/// Tester for drop tests.
struct DropTester {
// Wrapper around the region under test; taken (set to `None`) by `close`.
base: Option<FileTesterBase>,
}
impl DropTester {
    /// Creates a tester backed by a freshly created region that uses `flush_strategy`.
    async fn new(store_dir: &str, flush_strategy: FlushStrategyRef) -> DropTester {
        let base = FileTesterBase::with_region(create_region_for_drop(store_dir, flush_strategy).await);
        DropTester { base: Some(base) }
    }

    /// Returns the tester base; panics if the tester has been closed.
    #[inline]
    fn base(&self) -> &FileTesterBase {
        self.base.as_ref().unwrap()
    }

    /// Writes `(timestamp, v0)` rows, stringifying each present `v0` value.
    async fn put(&self, data: &[(i64, Option<i64>)]) {
        let mut rows = Vec::with_capacity(data.len());
        for &(ts, v0) in data {
            rows.push((ts, v0.map(|v| v.to_string())));
        }
        let _ = self.base().put(&rows).await;
    }

    /// Flushes the region with the default `FlushContext`; panics on failure.
    async fn flush(&self) {
        self.base()
            .region
            .flush(&FlushContext::default())
            .await
            .unwrap();
    }

    /// Closes the tester base if it is still present; does nothing otherwise.
    async fn close(&mut self) {
        let Some(base) = self.base.take() else {
            return;
        };
        base.close().await;
    }
}
/// Recursively collects the paths of all regular files below `path`.
///
/// Entries that are neither files nor directories are ignored. Panics if a
/// directory cannot be read, an entry cannot be inspected, or a visited path
/// is not valid UTF-8. The order of the returned paths follows the order in
/// which the file system yields directory entries.
fn get_all_files(path: &str) -> Vec<String> {
    let mut collected = Vec::new();
    let entries = std::fs::read_dir(path).unwrap();
    for item in entries {
        let child = item.unwrap().path();
        if child.is_file() {
            collected.push(child.to_str().unwrap().to_string());
            continue;
        }
        if child.is_dir() {
            // Descend and splice the nested results into the flat list.
            let nested = get_all_files(child.to_str().unwrap());
            collected.extend(nested);
        }
    }
    collected
}
/// Dropping a flushed region must remove its manifest directory.
#[tokio::test]
async fn test_drop_basic() {
    let temp_dir = create_temp_dir("drop-basic");
    common_telemetry::init_default_ut_logging();

    let store_dir = temp_dir.path().to_str().unwrap();
    let sst_dir = format!("{}/{}", store_dir, engine::region_sst_dir("", REGION_NAME));
    let manifest_path = engine::region_manifest_dir("", REGION_NAME);
    let manifest_dir = format!("{}/{}", store_dir, manifest_path);

    let flush_switch = Arc::new(FlushSwitch::default());
    let mut tester = DropTester::new(store_dir, flush_switch.clone()).await;

    // One row is enough to give the flush something to write.
    let rows = [(1000, Some(100))];
    tester.put(&rows).await;

    // Flush by hand and make sure an SST file showed up.
    tester.flush().await;
    assert!(has_parquet_file(&sst_dir));

    tester.base().checkpoint_manifest().await;
    let manifest_files = get_all_files(&manifest_dir);
    info!("manifest_files: {:?}", manifest_files);

    // Drop the region, close the tester, then verify the manifest directory is gone.
    tester.base().region.drop_region().await.unwrap();
    tester.close().await;
    let manifest_exists = Path::new(&manifest_dir).exists();
    assert!(!manifest_exists);
}
/// A region whose manifest ends with a `Remove` action must not be reopened.
///
/// The test flushes one row, checkpoints the manifest, appends a
/// `RegionRemove` action directly to the manifest, closes the region and then
/// expects `RegionImpl::open` to return `None` and the manifest directory to
/// be gone.
#[tokio::test]
async fn test_drop_reopen() {
    // Use a prefix of its own rather than the "drop-basic" one copy-pasted from
    // `test_drop_basic`, so leftover directories can be attributed to this test.
    let dir = create_temp_dir("drop-reopen");
    common_telemetry::init_default_ut_logging();
    let store_dir = dir.path().to_str().unwrap();

    let sst_dir = format!("{}/{}", store_dir, engine::region_sst_dir("", REGION_NAME));
    let manifest_dir = format!(
        "{}/{}",
        store_dir,
        engine::region_manifest_dir("", REGION_NAME)
    );
    let flush_switch = Arc::new(FlushSwitch::default());
    let mut tester = DropTester::new(store_dir, flush_switch.clone()).await;

    let data = [(1000, Some(100))];

    // Put one element so we have content to flush.
    tester.put(&data).await;
    // Manually trigger flush.
    tester.flush().await;
    assert!(has_parquet_file(&sst_dir));

    tester.base().checkpoint_manifest().await;

    // Record the removal in the manifest without going through `drop_region`.
    let version_control = tester.base().region.version_control();
    let mut action_list =
        RegionMetaActionList::with_action(RegionMetaAction::Remove(RegionRemove {
            region_id: tester.base().region.id(),
        }));
    let prev_version = version_control.current_manifest_version();
    action_list.set_prev_version(prev_version);
    let manifest = &tester.base().region.inner.manifest;
    let _ = manifest.update(action_list).await.unwrap();
    tester.close().await;

    // Reopen the region.
    let store_config = config_util::new_store_config(
        REGION_NAME,
        store_dir,
        EngineConfig {
            max_files_in_l0: usize::MAX,
            ..Default::default()
        },
    )
    .await;

    let opts = OpenOptions::default();
    let region = RegionImpl::open(REGION_NAME.to_string(), store_config, &opts)
        .await
        .unwrap();
    // Opening succeeds but yields no region, and the manifest directory is removed.
    assert!(region.is_none());
    assert!(!Path::new(&manifest_dir).exists());
}

View File

@@ -1,462 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Region flush tests.
use std::sync::Arc;
use std::time::Duration;
use arrow::compute::SortOptions;
use common_query::prelude::Expr;
use common_recordbatch::OrderOption;
use common_test_util::temp_dir::create_temp_dir;
use common_time::timestamp::TimeUnit;
use datafusion_common::Column;
use datatypes::value::timestamp_to_scalar_value;
use log_store::raft_engine::log_store::RaftEngineLogStore;
use store_api::storage::{FlushContext, FlushReason, OpenOptions, Region, ScanRequest};
use crate::config::EngineConfig;
use crate::engine::{self, RegionMap};
use crate::flush::{FlushStrategyRef, FlushType};
use crate::region::tests::{self, FileTesterBase};
use crate::region::RegionImpl;
use crate::test_util::config_util;
use crate::test_util::flush_switch::{has_parquet_file, FlushSwitch};
/// Region name used by every flush test in this file.
const REGION_NAME: &str = "region-flush-0";

/// Builds a brand-new region for the flush tests.
///
/// Returns the created region together with the region map handed out by
/// `config_util::new_store_config_and_region_map`. The engine config sets
/// `max_files_in_l0` to `usize::MAX` and the store config's flush strategy is
/// replaced by `flush_strategy`. Panics if region creation fails.
async fn create_region_for_flush(
    store_dir: &str,
    flush_strategy: FlushStrategyRef,
) -> (
    RegionImpl<RaftEngineLogStore>,
    Arc<RegionMap<RaftEngineLogStore>>,
) {
    let region_metadata = tests::new_metadata(REGION_NAME);
    let engine_config = EngineConfig {
        max_files_in_l0: usize::MAX,
        ..Default::default()
    };

    let (mut config, region_map) =
        config_util::new_store_config_and_region_map(REGION_NAME, store_dir, engine_config).await;
    config.flush_strategy = flush_strategy;

    let region = RegionImpl::create(region_metadata, config).await.unwrap();
    (region, region_map)
}
/// Tester for region flush.
struct FlushTester {
base: Option<FileTesterBase>,
store_dir: String,
flush_strategy: FlushStrategyRef,
regions: Arc<RegionMap<RaftEngineLogStore>>,
}
impl FlushTester {
async fn new(store_dir: &str, flush_strategy: FlushStrategyRef) -> FlushTester {
let (region, regions) = create_region_for_flush(store_dir, flush_strategy.clone()).await;
FlushTester {
base: Some(FileTesterBase::with_region(region)),
store_dir: store_dir.to_string(),
flush_strategy: flush_strategy.clone(),
regions,
}
}
async fn reopen(&mut self) {
self.regions.clear();
// Close the old region.
if let Some(base) = self.base.take() {
base.close().await;
}
// Reopen the region.
let mut store_config = config_util::new_store_config(
REGION_NAME,
&self.store_dir,
EngineConfig {
max_files_in_l0: usize::MAX,
..Default::default()
},
)
.await;
store_config.flush_strategy = self.flush_strategy.clone();
let opts = OpenOptions::default();
let region = RegionImpl::open(REGION_NAME.to_string(), store_config, &opts)
.await
.unwrap()
.unwrap();
self.base = Some(FileTesterBase::with_region(region));
}
#[inline]
fn base(&self) -> &FileTesterBase {
self.base.as_ref().unwrap()
}
async fn put(&self, data: &[(i64, Option<i64>)]) {
let data = data
.iter()
.map(|(ts, v0)| (*ts, v0.map(|v| v.to_string())))
.collect::<Vec<_>>();
let _ = self.base().put(&data).await;
}
async fn full_scan(&self) -> Vec<(i64, Option<String>)> {
self.base().full_scan().await
}
async fn scan(&self, req: ScanRequest) -> Vec<(i64, Option<String>)> {
self.base().scan(req).await
}
async fn flush(&self, wait: Option<bool>) {
let ctx = wait
.map(|wait| FlushContext {
wait,
reason: FlushReason::Manually,
..Default::default()
})
.unwrap_or_default();
self.base().region.flush(&ctx).await.unwrap();
}
}
impl Drop for FlushTester {
fn drop(&mut self) {
self.regions.clear();
}
}
/// Writes while the flush switch is on must end with a parquet file on disk.
#[tokio::test]
async fn test_flush_and_stall() {
    common_telemetry::init_default_ut_logging();

    let temp_dir = create_temp_dir("flush-stall");
    let store_dir = temp_dir.path().to_str().unwrap();

    let switch = Arc::new(FlushSwitch::default());
    let tester = FlushTester::new(store_dir, switch.clone()).await;

    let rows = [(1000, Some(100))];

    // First write only fills the memtable so there is something to flush.
    tester.put(&rows).await;

    // Turn flushing on, then write again to trigger the flush.
    switch.set_should_flush(true);
    tester.put(&rows).await;

    // A further write triggers the write stall and waits for the previous flush,
    // which guarantees at least one parquet file exists afterwards.
    tester.put(&rows).await;

    // Look for the parquet file in the region's SST directory.
    let sst_path = engine::region_sst_dir("", REGION_NAME);
    let sst_dir = format!("{}/{}", store_dir, sst_path);
    assert!(has_parquet_file(&sst_dir));
}
/// A parquet file must appear only after an explicit flush.
#[tokio::test]
async fn test_manual_flush() {
    common_telemetry::init_default_ut_logging();

    let temp_dir = create_temp_dir("manual_flush");
    let store_dir = temp_dir.path().to_str().unwrap();

    let switch = Arc::new(FlushSwitch::default());
    let tester = FlushTester::new(store_dir, switch.clone()).await;

    // Write a single row so the flush has content.
    let rows = [(1000, Some(100))];
    tester.put(&rows).await;

    // Nothing has been flushed yet, so no parquet file may exist.
    let sst_path = engine::region_sst_dir("", REGION_NAME);
    let sst_dir = format!("{}/{}", store_dir, sst_path);
    let flushed_before = has_parquet_file(&sst_dir);
    assert!(!flushed_before);

    // After the manual flush the file must be there.
    tester.flush(None).await;
    let flushed_after = has_parquet_file(&sst_dir);
    assert!(flushed_after);
}
/// The committed sequence must survive a flush followed by a reopen.
#[tokio::test]
async fn test_flush_and_reopen() {
    common_telemetry::init_default_ut_logging();
    // Use a prefix of its own rather than the "manual_flush" one copy-pasted from
    // `test_manual_flush`, so leftover directories can be attributed to this test.
    let dir = create_temp_dir("flush-reopen");
    let store_dir = dir.path().to_str().unwrap();

    let flush_switch = Arc::new(FlushSwitch::default());
    let mut tester = FlushTester::new(store_dir, flush_switch.clone()).await;

    tester.put(&[(1000, Some(100))]).await;
    // `Some(true)` sets `wait` on the flush context.
    tester.flush(Some(true)).await;
    tester.reopen().await;
    let i = tester
        .base()
        .region
        .inner
        .shared
        .version_control
        .committed_sequence();

    // we wrote a request and flushed the region (involving writing a manifest), thus
    // committed_sequence should be 2.
    assert_eq!(2, i);
}
/// Flushing an empty region writes no parquet file and later writes stay readable.
#[tokio::test]
async fn test_flush_empty() {
    let temp_dir = create_temp_dir("flush-empty");
    let store_dir = temp_dir.path().to_str().unwrap();

    let switch = Arc::new(FlushSwitch::default());
    let tester = FlushTester::new(store_dir, switch.clone()).await;

    // Flush while the region holds no data at all.
    tester.flush(None).await;

    // First write after the empty flush.
    let data = [(1000, Some(100))];
    tester.put(&data).await;

    // Second write.
    let data = [(2000, Some(200))];
    tester.put(&data).await;

    // Neither write may have produced a parquet file.
    let sst_path = engine::region_sst_dir("", REGION_NAME);
    let sst_dir = format!("{}/{}", store_dir, sst_path);
    let flushed = has_parquet_file(&sst_dir);
    assert!(!flushed);

    // Both rows are returned by a full scan.
    let expect = vec![(1000, Some(100.to_string())), (2000, Some(200.to_string()))];
    let output = tester.full_scan().await;
    assert_eq!(expect, output);
}
/// Rows written before and after a flush are all readable, also after a reopen.
#[tokio::test]
async fn test_read_after_flush_across_window() {
    common_telemetry::init_default_ut_logging();

    let temp_dir = create_temp_dir("read-flush");
    let store_dir = temp_dir.path().to_str().unwrap();

    let switch = Arc::new(FlushSwitch::default());
    let mut tester = FlushTester::new(store_dir, switch.clone()).await;

    // Two rows that end up in the flushed data.
    tester.put(&[(1000, Some(100))]).await;
    tester.put(&[(2000, Some(200))]).await;

    // Flush them.
    tester.flush(None).await;

    // One more row written after the flush.
    tester.put(&[(3000, Some(300))]).await;

    let expect = vec![
        (1000, Some(100.to_string())),
        (2000, Some(200.to_string())),
        (3000, Some(300.to_string())),
    ];

    let before_reopen = tester.full_scan().await;
    assert_eq!(expect, before_reopen);

    // Reopen the region and scan again.
    tester.reopen().await;

    let after_reopen = tester.full_scan().await;
    assert_eq!(expect, after_reopen);
}
/// A row written after a flush with a timestamp between the flushed rows is
/// returned in timestamp order, also after a reopen.
#[tokio::test]
async fn test_read_after_flush_same_window() {
    common_telemetry::init_default_ut_logging();

    let temp_dir = create_temp_dir("read-flush");
    let store_dir = temp_dir.path().to_str().unwrap();

    let switch = Arc::new(FlushSwitch::default());
    let mut tester = FlushTester::new(store_dir, switch.clone()).await;

    // Two rows that end up in the flushed data.
    tester.put(&[(1000, Some(100))]).await;
    tester.put(&[(2000, Some(200))]).await;

    // Flush them.
    tester.flush(None).await;

    // One more row, timestamped between the two flushed rows.
    tester.put(&[(1003, Some(300))]).await;

    let expect = vec![
        (1000, Some(100.to_string())),
        (1003, Some(300.to_string())),
        (2000, Some(200.to_string())),
    ];

    let before_reopen = tester.full_scan().await;
    assert_eq!(expect, before_reopen);

    // Reopen the region and scan again.
    tester.reopen().await;

    let after_reopen = tester.full_scan().await;
    assert_eq!(expect, after_reopen);
}
/// Overwrites spread over two flushes and the memtable resolve to the latest
/// value per timestamp, both before and after a reopen.
#[tokio::test]
async fn test_merge_read_after_flush() {
    let temp_dir = create_temp_dir("merge-read-flush");
    let store_dir = temp_dir.path().to_str().unwrap();

    let switch = Arc::new(FlushSwitch::default());
    let mut tester = FlushTester::new(store_dir, switch.clone()).await;

    // Rows destined for SST1.
    tester.put(&[(3000, Some(300))]).await;
    tester.put(&[(2000, Some(200))]).await;

    // First flush writes SST1.
    tester.flush(None).await;

    // Rows destined for SST2; timestamp 2000 is overwritten twice.
    tester.put(&[(2000, Some(201))]).await;
    tester.put(&[(2000, Some(202))]).await;
    tester.put(&[(1000, Some(100))]).await;

    // Second flush.
    tester.flush(None).await;

    // Final overwrite of timestamp 2000 stays in the memtable.
    tester.put(&[(2000, Some(203))]).await;

    let expect = vec![
        (1000, Some(100.to_string())),
        (2000, Some(203.to_string())),
        (3000, Some(300.to_string())),
    ];

    let before_reopen = tester.full_scan().await;
    assert_eq!(expect, before_reopen);

    // Reopen the region and scan again.
    tester.reopen().await;

    let after_reopen = tester.full_scan().await;
    assert_eq!(expect, after_reopen);
}
/// Checks that a write performed after switching the flush type to
/// `FlushType::Engine` results in a parquet file for the region.
///
/// The region is registered in the region map first, and the test starts from
/// a region whose `last_flush_millis()` is 0.
#[tokio::test]
async fn test_schedule_engine_flush() {
common_telemetry::init_default_ut_logging();
let dir = create_temp_dir("engine-flush");
let store_dir = dir.path().to_str().unwrap();
let flush_switch = Arc::new(FlushSwitch::default());
let tester = FlushTester::new(store_dir, flush_switch.clone()).await;
// A freshly created region has never been flushed.
assert_eq!(0, tester.base().region.last_flush_millis());
// Insert the region to the region map.
let _ = tester.regions.get_or_occupy_slot(
REGION_NAME,
engine::RegionSlot::Ready(tester.base().region.clone()),
);
// Put elements so we have content to flush.
tester.put(&[(1000, Some(100))]).await;
tester.put(&[(2000, Some(200))]).await;
flush_switch.set_flush_type(FlushType::Engine);
// Put element and trigger an engine level flush.
tester.put(&[(3000, Some(300))]).await;
// Wait for flush.
// Polls `last_flush_millis()` at most 50 times, sleeping 100 ms between polls
// (about 5 seconds in total); the loop itself does not fail on timeout.
let mut count = 0;
while tester.base().region.last_flush_millis() == 0 && count < 50 {
tokio::time::sleep(Duration::from_millis(100)).await;
count += 1;
}
// Check parquet files.
let sst_dir = format!("{}/{}", store_dir, engine::region_sst_dir("", REGION_NAME));
assert!(has_parquet_file(&sst_dir));
}
/// Flushes two overlapping batches and then runs a filtered, ordered, limited scan.
///
/// The scan result is discarded; the test only requires that the scan
/// completes (any panic inside `scan` would fail it).
#[tokio::test]
async fn test_flush_and_query_empty() {
common_telemetry::init_default_ut_logging();
let dir = create_temp_dir("flush_and_query_empty_range");
let store_dir = dir.path().to_str().unwrap();
let flush_switch = Arc::new(FlushSwitch::default());
let tester = FlushTester::new(store_dir, flush_switch.clone()).await;
// First batch: timestamps 20000..30000, each value equal to its timestamp.
tester
.put(
&(20000..30000)
.map(|v| (v as i64, Some(v as i64)))
.collect::<Vec<_>>(),
)
.await;
// Flush with `wait` set to true.
tester.flush(Some(true)).await;
// Second batch: timestamps 20100..20200, a sub-range of the first batch.
tester
.put(
&(20100..20200)
.map(|v| (v as i64, Some(v as i64)))
.collect::<Vec<_>>(),
)
.await;
tester.flush(Some(true)).await;
use datafusion_expr::Expr as DfExpr;
// Scan request: `timestamp >= 20000` (millisecond unit), ordered by
// `timestamp` descending with nulls first, limited to one row.
let req = ScanRequest {
sequence: None,
projection: None,
filters: vec![Expr::from(datafusion_expr::binary_expr(
DfExpr::Column(Column::from("timestamp")),
datafusion_expr::Operator::GtEq,
datafusion_expr::lit(timestamp_to_scalar_value(
TimeUnit::Millisecond,
Some(20000),
)),
))],
output_ordering: Some(vec![OrderOption {
name: "timestamp".to_string(),
options: SortOptions {
descending: true,
nulls_first: true,
},
}]),
limit: Some(1),
};
// The returned rows are intentionally ignored.
let _ = tester.scan(req).await;
}

View File

@@ -1,206 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::HashMap;
use std::sync::Arc;
use common_test_util::temp_dir::create_temp_dir;
use datatypes::data_type::ConcreteDataType;
use datatypes::prelude::ScalarVector;
use datatypes::type_id::LogicalTypeId;
use datatypes::vectors::{Int64Vector, TimestampMillisecondVector, VectorRef};
use log_store::raft_engine::log_store::RaftEngineLogStore;
use store_api::logstore::LogStore;
use store_api::storage::{
Chunk, ChunkReader, ReadContext, Region, ScanRequest, Snapshot, WriteContext, WriteRequest,
};
use crate::config::EngineConfig;
use crate::region::{RegionImpl, RegionMetadata};
use crate::test_util::{self, config_util, descriptor_util, write_batch_util};
use crate::write_batch::WriteBatch;
/// Builds region metadata with the schema (k0, timestamp, v0, v1).
///
/// The descriptor comes from `descriptor_util::desc_with_field_columns` with
/// two field columns; panics if it cannot be converted into metadata.
fn new_metadata(region_name: &str) -> RegionMetadata {
    descriptor_util::desc_with_field_columns(region_name, 2)
        .try_into()
        .unwrap()
}
/// Creates an empty write batch for the projection tests.
///
/// Columns are `k0` (Int64), the timestamp column (TimestampMillisecond),
/// `v0` and `v1` (both Int64); only `v0` and `v1` carry the `true` flag.
/// The remaining arguments passed to `new_write_batch` are `Some(1)` and `2`.
fn new_write_batch_for_test() -> WriteBatch {
    let column_defs = [
        ("k0", LogicalTypeId::Int64, false),
        (
            test_util::TIMESTAMP_NAME,
            LogicalTypeId::TimestampMillisecond,
            false,
        ),
        ("v0", LogicalTypeId::Int64, true),
        ("v1", LogicalTypeId::Int64, true),
    ];
    write_batch_util::new_write_batch(&column_defs, Some(1), 2)
}
/// Generates the column vectors for one put request of `len` rows.
///
/// ```text
/// k0: [key_start, key_start + 1, ... key_start + len - 1]
/// timestamp: [ts_start, ts_start + 1, ... ts_start + len - 1]
/// v0: [initial_value, ...., initial_value]
/// v1: [initial_value, ..., initial_value + len - 1]
/// ```
///
/// The returned map is keyed by column name.
fn new_put_data(
    len: usize,
    key_start: i64,
    ts_start: i64,
    initial_value: i64,
) -> HashMap<String, VectorRef> {
    let keys = (0..len).map(|offset| key_start + offset as i64);
    let timestamps = (0..len).map(|offset| ts_start + offset as i64);
    let constants = std::iter::repeat(initial_value).take(len);
    let increasing = (0..len).map(|offset| initial_value + offset as i64);

    let k0: VectorRef = Arc::new(Int64Vector::from_values(keys));
    let ts: VectorRef = Arc::new(TimestampMillisecondVector::from_values(timestamps));
    let v0: VectorRef = Arc::new(Int64Vector::from_values(constants));
    let v1: VectorRef = Arc::new(Int64Vector::from_values(increasing));

    let mut columns = HashMap::new();
    columns.insert("k0".to_string(), k0);
    columns.insert(test_util::TIMESTAMP_NAME.to_string(), ts);
    columns.insert("v0".to_string(), v0);
    columns.insert("v1".to_string(), v1);
    columns
}
/// Appends every row of `chunk` to `dst`, one `Vec<i64>` per row.
///
/// Values are pushed in the chunk's column order. Only `Int64` and `Timestamp`
/// columns are supported; timestamp columns are downcast to
/// `TimestampMillisecondVector` and converted with `into()`. Any other column
/// type reaches `unreachable!()`. Null values are not supported because
/// `get_data(i)` is unwrapped. A chunk without columns leaves `dst` untouched.
fn append_chunk_to(chunk: &Chunk, dst: &mut Vec<Vec<i64>>) {
    if chunk.columns.is_empty() {
        return;
    }
    let num_rows = chunk.columns[0].len();

    // Grow `dst` by this chunk's rows. Resizing it *to* `num_rows` (as the
    // previous code did) only works for the first chunk: a later chunk would be
    // merged into rows 0..num_rows, or would even truncate rows already collected.
    let first_new_row = dst.len();
    dst.resize(first_new_row + num_rows, Vec::new());

    // `i` is the row index inside the chunk, `row` the matching new row in `dst`.
    for (i, row) in dst[first_new_row..].iter_mut().enumerate() {
        for col in &chunk.columns {
            match col.data_type() {
                ConcreteDataType::Int64(_) => {
                    let val = col
                        .as_any()
                        .downcast_ref::<Int64Vector>()
                        .unwrap()
                        .get_data(i)
                        .unwrap();
                    row.push(val);
                }
                ConcreteDataType::Timestamp(_) => {
                    let val = col
                        .as_any()
                        .downcast_ref::<TimestampMillisecondVector>()
                        .unwrap()
                        .get_data(i)
                        .unwrap();
                    row.push(val.into());
                }
                _ => unreachable!(),
            }
        }
    }
}
/// Harness that writes to and scans a region for the projection tests.
struct ProjectionTester<S: LogStore> {
    /// Region under test.
    region: RegionImpl<S>,
    /// Context passed to every write.
    write_ctx: WriteContext,
    /// Context passed to every snapshot and scan.
    read_ctx: ReadContext,
}

impl<S: LogStore> ProjectionTester<S> {
    /// Wraps `region` with default write and read contexts.
    fn with_region(region: RegionImpl<S>) -> ProjectionTester<S> {
        let write_ctx = WriteContext::default();
        let read_ctx = ReadContext::default();
        ProjectionTester {
            region,
            write_ctx,
            read_ctx,
        }
    }

    /// Writes `len` rows generated by `new_put_data`; panics if the write fails.
    async fn put(&self, len: usize, key_start: i64, ts_start: i64, initial_value: i64) {
        let mut write_batch = new_write_batch_for_test();
        write_batch
            .put(new_put_data(len, key_start, ts_start, initial_value))
            .unwrap();
        let _ = self
            .region
            .write(&self.write_ctx, write_batch)
            .await
            .unwrap();
    }

    /// Scans the region with the given projection and returns the rows as
    /// vectors of `i64`, one inner vector per row.
    async fn scan(&self, projection: Option<Vec<usize>>) -> Vec<Vec<i64>> {
        let snapshot = self.region.snapshot(&self.read_ctx).unwrap();
        let scan_request = ScanRequest {
            projection,
            ..Default::default()
        };
        let mut reader = snapshot
            .scan(&self.read_ctx, scan_request)
            .await
            .unwrap()
            .reader;

        let mut rows = Vec::new();
        loop {
            let chunk = match reader.next_chunk().await.unwrap() {
                Some(chunk) => chunk,
                None => break,
            };
            // Apply the reader's projection before collecting the values.
            let projected = reader.project_chunk(chunk);
            append_chunk_to(&projected, &mut rows);
        }
        rows
    }
}
/// Region name used by every projection test in this file.
const REGION_NAME: &str = "region-projection-0";

/// Creates a region under `store_dir` with the default engine config and
/// wraps it in a [`ProjectionTester`]. Panics if region creation fails.
async fn new_tester(store_dir: &str) -> ProjectionTester<RaftEngineLogStore> {
    let region_metadata = new_metadata(REGION_NAME);
    let engine_config = EngineConfig::default();
    let config = config_util::new_store_config(REGION_NAME, store_dir, engine_config).await;

    let created = RegionImpl::create(region_metadata, config).await;
    ProjectionTester::with_region(created.unwrap())
}
/// Projecting columns in schema order (timestamp, v1) returns them in that order.
#[tokio::test]
async fn test_projection_ordered() {
    let temp_dir = create_temp_dir("projection-ordered");
    let store_dir = temp_dir.path().to_str().unwrap();
    let tester = new_tester(store_dir).await;

    // Four rows: keys from 1, timestamps from 10, values from 100.
    tester.put(4, 1, 10, 100).await;

    // Column indices 1 and 3 are timestamp and v1.
    let projection = Some(vec![1, 3]);
    let output = tester.scan(projection).await;

    let expect = vec![vec![10, 100], vec![11, 101], vec![12, 102], vec![13, 103]];
    assert_eq!(expect, output);
}
/// Projecting columns out of schema order (v1, k0) returns them in the requested order.
#[tokio::test]
async fn test_projection_unordered() {
    let temp_dir = create_temp_dir("projection-unordered");
    let store_dir = temp_dir.path().to_str().unwrap();
    let tester = new_tester(store_dir).await;

    // Four rows: keys from 1, timestamps from 10, values from 100.
    tester.put(4, 1, 10, 100).await;

    // Column indices 3 and 0 are v1 and k0.
    let projection = Some(vec![3, 0]);
    let output = tester.scan(projection).await;

    let expect = vec![vec![100, 1], vec![101, 2], vec![102, 3], vec![103, 4]];
    assert_eq!(expect, output);
}

View File

@@ -1,242 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Region truncate tests.
use std::sync::Arc;
use common_test_util::temp_dir::create_temp_dir;
use log_store::raft_engine::log_store::RaftEngineLogStore;
use store_api::manifest::{Manifest, MetaAction};
use store_api::storage::{FlushContext, OpenOptions, Region};
use crate::config::EngineConfig;
use crate::engine;
use crate::flush::FlushStrategyRef;
use crate::manifest::action::{RegionMetaAction, RegionMetaActionList, RegionTruncate};
use crate::region::tests::{self, FileTesterBase};
use crate::region::RegionImpl;
use crate::test_util::config_util;
use crate::test_util::flush_switch::{has_parquet_file, FlushSwitch};
/// Region name used by every truncate test in this file.
const REGION_NAME: &str = "region-truncate-0";

/// Builds a brand-new region for the truncate tests.
///
/// The store config is derived from `EngineConfig::default()` for `store_dir`
/// and its flush strategy is replaced by `flush_strategy` before the region
/// is created. Panics if region creation fails.
async fn create_region_for_truncate(
    store_dir: &str,
    flush_strategy: FlushStrategyRef,
) -> RegionImpl<RaftEngineLogStore> {
    let region_metadata = tests::new_metadata(REGION_NAME);
    let engine_config = EngineConfig::default();
    let mut config = config_util::new_store_config(REGION_NAME, store_dir, engine_config).await;
    config.flush_strategy = flush_strategy;

    let created = RegionImpl::create(region_metadata, config).await;
    created.unwrap()
}
/// Harness owning the region exercised by the truncate tests.
struct TruncateTester {
    /// Directory the region stores its data in; reused when reopening.
    store_dir: String,
    /// Wrapped region helper; temporarily `None` while the region is being reopened.
    base: Option<FileTesterBase>,
}

impl TruncateTester {
    /// Creates the region under `store_dir` and wraps it in a tester.
    async fn new(store_dir: &str, flush_strategy: FlushStrategyRef) -> TruncateTester {
        let base = FileTesterBase::with_region(
            create_region_for_truncate(store_dir, flush_strategy).await,
        );
        TruncateTester {
            store_dir: store_dir.to_string(),
            base: Some(base),
        }
    }

    /// Borrows the wrapped tester. Panics if no region is currently open.
    #[inline]
    fn base(&self) -> &FileTesterBase {
        let opened = self.base.as_ref();
        opened.unwrap()
    }

    /// Flushes the region with a default `FlushContext`; panics on failure.
    async fn flush(&self) {
        let flush_ctx = FlushContext::default();
        let outcome = self.base().region.flush(&flush_ctx).await;
        outcome.unwrap();
    }

    /// Truncates the region; panics on failure.
    async fn truncate(&self) {
        let outcome = self.base().region.truncate().await;
        outcome.unwrap();
    }

    /// Closes the current region and opens it again from `store_dir`.
    ///
    /// Panics if the region cannot be opened or no longer exists.
    async fn reopen(&mut self) {
        // Shut the previous region down and forget it.
        match self.base.take() {
            Some(previous) => {
                previous.close().await;
            }
            None => {}
        }

        // Open the region again with a fresh store config.
        let engine_config = EngineConfig {
            max_files_in_l0: usize::MAX,
            ..Default::default()
        };
        let config =
            config_util::new_store_config(REGION_NAME, &self.store_dir, engine_config).await;

        let open_options = OpenOptions::default();
        let reopened = RegionImpl::open(REGION_NAME.to_string(), config, &open_options)
            .await
            .unwrap()
            .unwrap();
        self.base = Some(FileTesterBase::with_region(reopened));
    }
}
/// Truncating a region that only holds memtable data leaves it empty.
#[tokio::test]
async fn test_truncate_basic() {
    let temp_dir = create_temp_dir("truncate-basic");
    common_telemetry::init_default_ut_logging();
    let store_dir = temp_dir.path().to_str().unwrap();

    let switch = Arc::new(FlushSwitch::default());
    let tester = TruncateTester::new(store_dir, switch.clone()).await;

    let rows = [
        (1000, Some("1000".to_string())),
        (1001, Some("1001".to_string())),
        (1002, Some("1002".to_string())),
        (1003, Some("1003".to_string())),
    ];

    // The rows stay in the memtable; no flush happens in this test.
    tester.base().put(&rows).await;
    let before = tester.base().full_scan().await;
    assert_eq!(4, before.len());

    // After truncation a full scan returns nothing.
    tester.truncate().await;
    let after = tester.base().full_scan().await;
    assert_eq!(0, after.len());
}
/// After truncating flushed and unflushed data, only newly written rows are visible.
#[tokio::test]
async fn test_put_data_after_truncate() {
    let temp_dir = create_temp_dir("put_data_after_truncate");
    common_telemetry::init_default_ut_logging();
    let store_dir = temp_dir.path().to_str().unwrap();
    let sst_path = engine::region_sst_dir("", REGION_NAME);
    let sst_dir = format!("{}/{}", store_dir, sst_path);

    let switch = Arc::new(FlushSwitch::default());
    let tester = TruncateTester::new(store_dir, switch.clone()).await;

    // First batch, flushed to an SST file below.
    let flushed_rows = [
        (1000, Some("1000".to_string())),
        (1001, Some("1001".to_string())),
        (1002, None),
        (1003, Some("1003".to_string())),
    ];
    tester.base().put(&flushed_rows).await;

    // Flush by hand and make sure an SST file showed up.
    tester.flush().await;
    assert!(has_parquet_file(&sst_dir));

    // Second batch, written after the flush.
    let unflushed_rows = [
        (1002, Some("1002".to_string())),
        (1004, Some("1004".to_string())),
        (1005, Some("1005".to_string())),
    ];
    tester.base().put(&unflushed_rows).await;

    // Truncation hides both batches.
    tester.truncate().await;
    let after_truncate = tester.base().full_scan().await;
    assert_eq!(0, after_truncate.len());

    // Rows written afterwards are the only ones returned.
    let new_data = [
        (1010, Some("0".to_string())),
        (1011, Some("1".to_string())),
        (1012, Some("2".to_string())),
        (1013, Some("3".to_string())),
    ];
    tester.base().put(&new_data).await;

    let after_put = tester.base().full_scan().await;
    assert_eq!(new_data, after_put.as_slice());
}
/// A `Truncate` action persisted in the manifest must take effect on reopen.
///
/// The test writes and flushes one batch, writes a second batch, appends a
/// `RegionTruncate` action directly to the manifest (without calling
/// `truncate()`), reopens the region and expects an empty scan. It then checks
/// that newly written rows are readable.
#[tokio::test]
async fn test_truncate_reopen() {
    // Use a prefix of its own rather than the "put_data_after_truncate" one
    // copy-pasted from the sibling test, so leftover directories can be
    // attributed to this test.
    let dir = create_temp_dir("truncate-reopen");
    common_telemetry::init_default_ut_logging();

    let store_dir = dir.path().to_str().unwrap();
    let flush_switch = Arc::new(FlushSwitch::default());
    let mut tester = TruncateTester::new(store_dir, flush_switch.clone()).await;

    let data = [
        (1000, Some("1000".to_string())),
        (1001, Some("1001".to_string())),
        (1002, None),
        (1003, Some("1003".to_string())),
    ];

    tester.base().put(&data).await;

    // Manually trigger flush.
    tester.flush().await;

    let data = [
        (1002, Some("1002".to_string())),
        (1004, Some("1004".to_string())),
        (1005, Some("1005".to_string())),
    ];
    tester.base().put(&data).await;

    let manifest = &tester.base().region.inner.manifest;
    let manifest_version = tester
        .base()
        .region
        .version_control()
        .current_manifest_version();

    let committed_sequence = tester.base().committed_sequence();
    // NOTE(review): the region id is hard-coded to 0 here, presumably matching
    // the id of the metadata built by `tests::new_metadata` — confirm.
    let mut action_list =
        RegionMetaActionList::with_action(RegionMetaAction::Truncate(RegionTruncate {
            region_id: 0.into(),
            committed_sequence,
        }));

    // Persist the meta action.
    let prev_version = manifest_version;
    action_list.set_prev_version(prev_version);
    manifest.update(action_list).await.unwrap();

    // Reopen and put data.
    tester.reopen().await;
    let res = tester.base().full_scan().await;
    assert_eq!(0, res.len());

    let new_data = [
        (0, Some("0".to_string())),
        (1, Some("1".to_string())),
        (2, Some("2".to_string())),
        (3, Some("3".to_string())),
    ];

    tester.base().put(&new_data).await;
    let res = tester.base().full_scan().await;
    assert_eq!(new_data, res.as_slice());
}

View File

@@ -1,984 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::sync::Arc;
use std::time::Duration;
use common_base::readable_size::ReadableSize;
use common_telemetry::logging;
use futures::TryStreamExt;
use snafu::{ensure, ResultExt};
use store_api::logstore::LogStore;
use store_api::manifest::{Manifest, ManifestLogStorage, ManifestVersion, MetaAction};
use store_api::storage::{
AlterRequest, FlushContext, FlushReason, SequenceNumber, WriteContext, WriteResponse,
};
use tokio::sync::{oneshot, Mutex};
use crate::compaction::{CompactionPickerRef, CompactionRequestImpl, CompactionSchedulerRef};
use crate::config::EngineConfig;
use crate::error::{self, Result};
use crate::flush::{
FlushHandle, FlushRegionRequest, FlushSchedulerRef, FlushStrategyRef, FlushType, RegionStatus,
};
use crate::manifest::action::{
RawRegionMetadata, RegionChange, RegionEdit, RegionMetaAction, RegionMetaActionList,
RegionRemove, RegionTruncate,
};
use crate::memtable::{Inserter, MemtableBuilderRef, MemtableId, MemtableRef, MemtableVersion};
use crate::metadata::RegionMetadataRef;
use crate::metrics::{FLUSH_REQUESTS_TOTAL, PREPROCESS_ELAPSED};
use crate::proto::wal::WalHeader;
use crate::region::{
CompactContext, RecoveredMetadata, RecoveredMetadataMap, RegionManifest, SharedDataRef,
};
use crate::schema::compat::CompatWrite;
use crate::sst::{AccessLayerRef, LevelMetas};
use crate::version::{VersionControl, VersionControlRef, VersionEdit};
use crate::wal::Wal;
use crate::write_batch::WriteBatch;
/// Reference-counted (`Arc`) handle to a [`RegionWriter`].
pub type RegionWriterRef<S> = Arc<RegionWriter<S>>;
// TODO(yingwen): Add benches for write and support group commit to improve write throughput.
/// Region writer manages all write operations to the region.
///
/// Two async mutexes are involved: `inner` serializes write operations and
/// `version_mutex` serializes updates to the region version. When both are
/// needed they must be taken in the order `inner` -> `version_mutex`.
#[derive(Debug)]
pub struct RegionWriter<S: LogStore> {
// To avoid dead lock, we need to ensure the lock order is: inner -> version_mutex.
/// Inner writer guarded by write lock, the write lock is used to ensure
/// all write operations are serialized.
inner: Mutex<WriterInner>,
/// Version lock, protects read-write-update to region `Version`.
///
/// Increasing committed sequence should be guarded by this lock.
version_mutex: Mutex<()>,
/// Compaction scheduler handed in at construction time.
compaction_scheduler: CompactionSchedulerRef<S>,
/// Compaction picker handed in at construction time.
compaction_picker: CompactionPickerRef<S>,
}
impl<S> RegionWriter<S>
where
S: LogStore,
{
/// Creates a region writer.
///
/// `memtable_builder`, `config`, `ttl` and `write_buffer_size` are forwarded
/// to the inner writer; the compaction scheduler and picker are stored on the
/// writer itself.
pub fn new(
    memtable_builder: MemtableBuilderRef,
    config: Arc<EngineConfig>,
    ttl: Option<Duration>,
    write_buffer_size: usize,
    compaction_scheduler: CompactionSchedulerRef<S>,
    compaction_picker: CompactionPickerRef<S>,
) -> RegionWriter<S> {
    let writer_inner = WriterInner::new(memtable_builder, config, ttl, write_buffer_size);

    RegionWriter {
        inner: Mutex::new(writer_inner),
        version_mutex: Mutex::new(()),
        compaction_scheduler,
        compaction_picker,
    }
}
/// Write to region in the write lock.
pub async fn write(
&self,
ctx: &WriteContext,
request: WriteBatch,
writer_ctx: WriterContext<'_, S>,
) -> Result<WriteResponse> {
let mut inner = self.inner.lock().await;
ensure!(!inner.is_closed(), error::ClosedRegionSnafu);
inner
.write(&self.version_mutex, ctx, request, writer_ctx)
.await
}
/// Replays data into the memtables while holding the write lock.
///
/// Delegates to the inner writer's `replay` with the recovered metadata and
/// returns its result.
pub async fn replay(
    &self,
    recovered_metadata: RecoveredMetadataMap,
    writer_ctx: WriterContext<'_, S>,
) -> Result<()> {
    let mut writer = self.inner.lock().await;

    let version_mutex = &self.version_mutex;
    writer
        .replay(version_mutex, recovered_metadata, writer_ctx)
        .await
}
/// Write and apply the region edit.
///
/// Steps, all performed while holding the version lock:
/// 1. persist `edit` to `manifest` as a `RegionMetaAction::Edit`, chained to
///    the current manifest version;
/// 2. if the edit carries a flushed sequence, record the new manifest version
///    as the flushed manifest version;
/// 3. apply the matching `VersionEdit` to `shared.version_control`;
/// 4. persist the new manifest version through `persist_manifest_version`.
///
/// `max_memtable_id` is copied into the `VersionEdit` unchanged.
///
/// # Errors
/// Returns the error of the manifest update, or of persisting the manifest
/// version to the WAL.
pub(crate) async fn write_edit_and_apply(
&self,
wal: &Wal<S>,
shared: &SharedDataRef,
manifest: &RegionManifest,
edit: RegionEdit,
max_memtable_id: Option<MemtableId>,
) -> Result<()> {
// Held until the end of the function; only the version lock is taken here.
let _lock = self.version_mutex.lock().await;
// HACK: We won't acquire the write lock here because write stall would hold
// write lock thus we have no chance to get the lock and apply the version edit.
// So we add a version lock to ensure modification to `VersionControl` is
// serialized.
let version_control = &shared.version_control;
let prev_version = version_control.current_manifest_version();
logging::debug!(
"Write region edit: {:?} to manifest, prev_version: {}.",
edit,
prev_version,
);
// Copy out the fields needed for the `VersionEdit` because `edit` is moved
// into the manifest action below.
let files_to_add = edit.files_to_add.clone();
let files_to_remove = edit.files_to_remove.clone();
let flushed_sequence = edit.flushed_sequence;
let compaction_time_window = edit.compaction_time_window;
// Persist the meta action.
let mut action_list = RegionMetaActionList::with_action(RegionMetaAction::Edit(edit));
action_list.set_prev_version(prev_version);
let manifest_version = manifest.update(action_list).await?;
// Notify checkpointer the flushed manifest version after flushing memtable
if flushed_sequence.is_some() {
manifest.set_flushed_manifest_version(manifest_version);
}
let version_edit = VersionEdit {
files_to_add,
files_to_remove,
flushed_sequence,
manifest_version,
max_memtable_id,
compaction_time_window,
};
// We could tolerate failure during persisting manifest version to the WAL, since it won't
// affect how we applying the edit to the version.
version_control.apply_edit(version_edit);
// TODO(yingwen): We should set the flush handle to `None`, but we can't acquire
// write lock here.
// Persist the manifest version to notify subscriber of the wal that the manifest has been
// updated. This should be done at the end of the method.
self.persist_manifest_version(wal, version_control, manifest_version)
.await
}
/// Alter schema of the region.
pub async fn alter(&self, alter_ctx: AlterContext<'_, S>, request: AlterRequest) -> Result<()> {
// To alter the schema, we need to acquire the write lock first, so we could
// avoid other writers write to the region and switch the memtable safely.
// Another potential benefit is that the write lock also protect against concurrent
// alter request to the region.
let inner = self.inner.lock().await;
ensure!(!inner.is_closed(), error::ClosedRegionSnafu);
let version_control = alter_ctx.version_control();
let old_metadata = version_control.metadata();
old_metadata
.validate_alter(&request)
.context(error::InvalidAlterRequestSnafu)?;
// The write lock protects us against other alter request, so we could build the new
// metadata struct outside of the version mutex.
let new_metadata = old_metadata
.alter(&request)
.context(error::AlterMetadataSnafu)?;
let raw = RawRegionMetadata::from(&new_metadata);
// Acquire the version lock before altering the metadata.
let _lock = self.version_mutex.lock().await;
let committed_sequence = version_control.committed_sequence();
let mut action_list =
RegionMetaActionList::with_action(RegionMetaAction::Change(RegionChange {
metadata: raw,
committed_sequence,
}));
let new_metadata = Arc::new(new_metadata);
// Persist the meta action.
let prev_version = version_control.current_manifest_version();
action_list.set_prev_version(prev_version);
logging::debug!(
"Try to alter schema of region {}, region_id: {}, action_list: {:?}",
new_metadata.name(),
new_metadata.id(),
action_list
);
let manifest_version = alter_ctx.manifest.update(action_list).await?;
// Now we could switch memtables and apply the new metadata to the version.
let new_mutable = inner.memtable_builder.build(new_metadata.schema().clone());
version_control.freeze_mutable_and_apply_metadata(
new_metadata,
manifest_version,
new_mutable,
);
self.persist_manifest_version(alter_ctx.wal, version_control, manifest_version)
.await
}
/// Bumps the committed sequence by one and writes a WAL entry, carrying only the manifest
/// version header, at that sequence.
///
/// Callers must hold the `version_mutex`.
async fn persist_manifest_version(
    &self,
    wal: &Wal<S>,
    version_control: &VersionControlRef,
    manifest_version: ManifestVersion,
) -> Result<()> {
    // The committed sequence is bumped before the WAL write and is not rolled back if the
    // write fails, so a `RegionMetaAction` never reuses a committed sequence by accident.
    let sequence = version_control.committed_sequence() + 1;
    version_control.set_committed_sequence(sequence);

    let manifest_header = WalHeader::with_last_manifest_version(manifest_version);
    let _ = wal.write_to_wal(sequence, manifest_header, None).await?;

    Ok(())
}
/// Closes the writer.
///
/// 1. Acquires the write lock.
/// 2. Sets the in-memory closed flag so later writes are rejected.
/// 3. Waits for the pending flush task, if any.
///
/// Closing an already closed writer is a no-op.
pub async fn close(&self) -> Result<()> {
    let already_closed = {
        let mut writer = self.inner.lock().await;
        let was_closed = writer.is_closed();
        if !was_closed {
            writer.mark_closed();
        }
        was_closed
    };
    // The writer lock is released at this point so that following write requests are
    // rejected immediately instead of queueing behind the flush wait.
    if already_closed {
        return Ok(());
    }

    // TODO: cancel the compaction task
    self.wait_flush().await
}
/// Drops the region: closes the writer, records the removal in the manifest, clears the
/// WAL, marks every SST deleted and finally deletes the manifest files.
///
/// Both the writer lock and the version mutex are held until the method returns.
pub async fn on_drop(&self, drop_ctx: DropContext<'_, S>) -> Result<()> {
// 1. Acquires the write lock.
// 2. Closes the writer to reject any potential writing.
// 3. Waits for the pending flush job, if any.
// 4. Add `RegionMetaAction::Remove` to recover from manifest in case of failure.
// The main task is to restore the cleaning of sst files. If there is a failure
// in the previous steps, it can be restored through the `Procedure` framework.
// 5. Mark all data obsolete in the WAL.
// 6. Delete the namespace of the region from the WAL.
// 7. Mark all SSTs deleted.
// 8. Remove all manifests.
let mut inner = self.inner.lock().await;
inner.mark_closed();
if let Some(handle) = inner.flush_handle.take() {
handle.wait().await?;
}
let version_control = drop_ctx.version_control();
let _lock = self.version_mutex.lock().await;
// Read the sequence and version under the version lock; both are used by the cleanup below.
let committed_sequence = version_control.committed_sequence();
let current_version = version_control.current();
let mut action_list =
RegionMetaActionList::with_action(RegionMetaAction::Remove(RegionRemove {
region_id: drop_ctx.shared.id,
}));
// Persist the meta action.
let prev_version = version_control.current_manifest_version();
action_list.set_prev_version(prev_version);
logging::info!(
"Try to remove region {}, action_list: {:?}",
drop_ctx.shared.id(),
action_list
);
// The version returned here is the one later passed to `delete_all`.
let remove_action_version = drop_ctx.manifest.update(action_list).await?;
// Mark all data obsolete and delete the namespace in the WAL
drop_ctx.wal.obsolete(committed_sequence).await?;
drop_ctx.wal.delete_namespace().await?;
logging::info!(
"Remove WAL entries in region: {}, committed sequence: {}",
drop_ctx.shared.id(),
committed_sequence
);
// Mark all SSTs deleted
let files = current_version.ssts().mark_all_files_deleted();
logging::info!(
"Try to remove all SSTs, region: {}, files: {:?}",
drop_ctx.shared.id(),
files
);
// Remove all manifests.
drop_ctx
.manifest
.manifest_store()
.delete_all(remove_action_version)
.await?;
Ok(())
}
/// Flush task manually
pub async fn flush(&self, writer_ctx: WriterContext<'_, S>, ctx: &FlushContext) -> Result<()> {
let mut inner = self.inner.lock().await;
if !ctx.force {
ensure!(!inner.is_closed(), error::ClosedRegionSnafu);
}
inner.manual_flush(writer_ctx, ctx.reason).await?;
if ctx.wait {
if let Some(handle) = inner.flush_handle.take() {
handle.wait().await?;
}
}
Ok(())
}
/// Compact manually.
pub async fn compact(&self, request: WriterCompactRequest<S>) -> Result<()> {
let mut inner = self.inner.lock().await;
ensure!(!inner.is_closed(), error::ClosedRegionSnafu);
let sst_write_buffer_size = ReadableSize::mb(8); // deprecated usage
inner
.manual_compact(
request,
self.compaction_picker.clone(),
self.compaction_scheduler.clone(),
sst_write_buffer_size,
)
.await
}
/// Waits for the pending flush task, if there is one.
///
/// The writer lock is held while waiting, and the stored flush handle is consumed.
async fn wait_flush(&self) -> Result<()> {
    let mut writer = self.inner.lock().await;
    let pending = writer.flush_handle.take();
    if let Some(flush_job) = pending {
        flush_job.wait().await?;
    }
    Ok(())
}
pub async fn truncate(&self, ctx: &TruncateContext<'_, S>) -> Result<()> {
// Acquires the write lock.
let mut inner = self.inner.lock().await;
ensure!(!inner.is_closed(), error::ClosedRegionSnafu);
if let Some(handle) = inner.flush_handle.take() {
handle.wait().await?;
}
let version_control = ctx.version_control();
let _lock = self.version_mutex.lock().await;
let committed_sequence = version_control.committed_sequence();
// Add `RegionMetaAction::Truncate` to recover from manifest in case of failure.
let mut action_list =
RegionMetaActionList::with_action(RegionMetaAction::Truncate(RegionTruncate {
region_id: ctx.shared.id,
committed_sequence,
}));
// Persist the meta action.
let current_version = version_control.current();
let manifest_version = version_control.current_manifest_version();
let prev_version = manifest_version;
action_list.set_prev_version(prev_version);
ctx.manifest.update(action_list).await?;
// Mark all data obsolete
ctx.wal.obsolete(committed_sequence).await?;
// Mark all SSTs deleted
let files = current_version.ssts().mark_all_files_deleted();
logging::info!(
"Try to remove all SSTs, region: {}, files: {:?}",
ctx.shared.id(),
files
);
// Reset version
let memtables = Arc::new(MemtableVersion::new(inner.alloc_memtable(version_control)));
let ssts = Arc::new(LevelMetas::new(
ctx.sst_layer.clone(),
current_version.ssts().file_purger(),
));
version_control.reset_version(manifest_version + 1, memtables, ssts);
Ok(())
}
}
// Test-only accessors.
#[cfg(test)]
impl<S: LogStore> RegionWriter<S> {
    /// Returns the write buffer size stored in the inner writer state.
    pub(crate) async fn write_buffer_size(&self) -> usize {
        let writer = self.inner.lock().await;
        writer.write_buffer_size
    }
}
/// Everything the writer needs to build and schedule a manual compaction request.
pub struct WriterCompactRequest<S: LogStore> {
// Shared region data; supplies the region id and the current SST compaction time window.
pub shared_data: SharedDataRef,
// Handed to the compaction request as its SST layer.
pub sst_layer: AccessLayerRef,
// Handed to the compaction request as its manifest.
pub manifest: RegionManifest,
// Handed to the compaction request as its WAL.
pub wal: Wal<S>,
// Handed to the compaction request as its writer.
pub region_writer: RegionWriterRef<S>,
// Compaction options; `wait` makes the caller block until the task reports a result.
pub compact_ctx: CompactContext,
}
/// Borrowed region components passed into write, replay and flush operations.
pub struct WriterContext<'a, S: LogStore> {
    /// Shared region data (id, name, version control).
    pub shared: &'a SharedDataRef,
    /// Strategy that decides whether a write should trigger a flush.
    pub flush_strategy: &'a FlushStrategyRef,
    /// Scheduler that accepts region and engine level flush requests.
    pub flush_scheduler: &'a FlushSchedulerRef<S>,
    /// Compaction scheduler of the region.
    pub compaction_scheduler: &'a CompactionSchedulerRef<S>,
    /// SST access layer, cloned into flush requests.
    pub sst_layer: &'a AccessLayerRef,
    /// WAL that writes go to and replay reads from.
    pub wal: &'a Wal<S>,
    /// The region writer, cloned into flush requests.
    pub writer: &'a RegionWriterRef<S>,
    /// Region manifest, cloned into flush requests.
    pub manifest: &'a RegionManifest,
    /// Compaction picker, cloned into flush requests.
    pub compaction_picker: CompactionPickerRef<S>,
}

impl<S: LogStore> WriterContext<'_, S> {
    /// Returns the version control of the shared region data.
    #[inline]
    fn version_control(&self) -> &VersionControlRef {
        let shared = self.shared;
        &shared.version_control
    }
}
/// Borrowed region components needed to alter the region schema.
pub struct AlterContext<'a, S: LogStore> {
    /// Shared region data (version control).
    pub shared: &'a SharedDataRef,
    /// WAL that receives the new manifest version.
    pub wal: &'a Wal<S>,
    /// Manifest that the metadata change is persisted to.
    pub manifest: &'a RegionManifest,
}

impl<S: LogStore> AlterContext<'_, S> {
    /// Returns the version control of the shared region data.
    #[inline]
    fn version_control(&self) -> &VersionControlRef {
        let shared = self.shared;
        &shared.version_control
    }
}
/// Borrowed region components needed to drop the region.
pub struct DropContext<'a, S: LogStore> {
    /// Shared region data (id, version control).
    pub shared: &'a SharedDataRef,
    /// WAL whose entries and namespace are removed.
    pub wal: &'a Wal<S>,
    /// Manifest that records the removal and is deleted afterwards.
    pub manifest: &'a RegionManifest,
    /// Flush scheduler of the region.
    pub flush_scheduler: &'a FlushSchedulerRef<S>,
    /// Compaction scheduler of the region.
    pub compaction_scheduler: &'a CompactionSchedulerRef<S>,
    /// SST access layer of the region.
    pub sst_layer: &'a AccessLayerRef,
}

impl<S: LogStore> DropContext<'_, S> {
    /// Returns the version control of the shared region data.
    #[inline]
    fn version_control(&self) -> &VersionControlRef {
        let shared = self.shared;
        &shared.version_control
    }
}
/// Borrowed region components needed to truncate the region.
pub struct TruncateContext<'a, S: LogStore> {
    /// Shared region data (id, version control).
    pub shared: &'a SharedDataRef,
    /// WAL whose data is marked obsolete.
    pub wal: &'a Wal<S>,
    /// Manifest that the truncate action is persisted to.
    pub manifest: &'a RegionManifest,
    /// SST access layer used to build the fresh, empty level metadata.
    pub sst_layer: &'a AccessLayerRef,
}

impl<S: LogStore> TruncateContext<'_, S> {
    /// Returns the version control of the shared region data.
    #[inline]
    fn version_control(&self) -> &VersionControlRef {
        let shared = self.shared;
        &shared.version_control
    }
}
/// Mutable writer state; `RegionWriter` keeps it behind its `inner` mutex.
#[derive(Debug)]
struct WriterInner {
/// Builds the mutable memtables allocated by this writer.
memtable_builder: MemtableBuilderRef,
/// Handle of the most recently scheduled flush job, if it has not been consumed yet.
flush_handle: Option<FlushHandle>,
/// `WriterInner` will reject any future writing, if the closed flag is set.
///
/// It should be protected by the upper mutex.
closed: bool,
/// Engine config, cloned into every flush request.
engine_config: Arc<EngineConfig>,
/// TTL passed on to flush and compaction requests.
ttl: Option<Duration>,
/// Size in bytes to freeze the mutable memtable.
write_buffer_size: usize,
}
impl WriterInner {
fn new(
memtable_builder: MemtableBuilderRef,
engine_config: Arc<EngineConfig>,
ttl: Option<Duration>,
write_buffer_size: usize,
) -> WriterInner {
WriterInner {
memtable_builder,
flush_handle: None,
engine_config,
closed: false,
ttl,
write_buffer_size,
}
}
/// Writes a `WriteBatch` to the region: first to the WAL, then into the mutable memtable.
///
/// The schema of the batch must already have been validated by the caller. Taking
/// `&mut self` guarantees that no other write runs at the same time.
async fn write<S: LogStore>(
    &mut self,
    version_mutex: &Mutex<()>,
    _ctx: &WriteContext,
    mut request: WriteBatch,
    writer_ctx: WriterContext<'_, S>,
) -> Result<WriteResponse> {
    self.preprocess_write(&writer_ctx).await?;

    let version_control = writer_ctx.version_control();
    let _version_guard = version_mutex.lock().await;

    // The schema may have been altered in the meantime, so the request is made compatible
    // with the current schema again. This has to happen before the WAL write: otherwise a
    // default constraint such as `current_timestamp()` would yield a different value
    // during replay.
    let metadata = version_control.metadata();
    request.compat_write(metadata.schema().user_schema())?;

    // Sequence of the current write batch.
    let sequence = version_control.committed_sequence() + 1;
    let version = version_control.current();

    let header = WalHeader::with_last_manifest_version(version.manifest_version());
    let _ = writer_ctx
        .wal
        .write_to_wal(sequence, header, Some(request.payload()))
        .await?;

    // Insert batch into memtable.
    let mut inserter = Inserter::new(sequence);
    inserter.insert_memtable(request.payload(), version.mutable_memtable())?;

    // Publishing the new committed sequence makes the batch visible. The `&mut self` of
    // `WriterInner` guarantees the writer is exclusive.
    version_control.set_committed_sequence(sequence);

    Ok(WriteResponse {})
}
/// Replays WAL entries written after the last flushed sequence into the mutable memtable.
///
/// Recovered metadata changes are applied in between: each one is applied right before the
/// first WAL entry whose sequence is greater than the metadata's sequence, and any that
/// remain are applied after the last WAL entry. The committed sequence is finally set to
/// the last replayed sequence. The version mutex is held during the replay.
async fn replay<S: LogStore>(
&mut self,
version_mutex: &Mutex<()>,
mut recovered_metadata: RecoveredMetadataMap,
writer_ctx: WriterContext<'_, S>,
) -> Result<()> {
let version_control = writer_ctx.version_control();
// Declared here so the values are still available for the summary log after the lock scope.
let (flushed_sequence, mut last_sequence);
let mut num_requests = 0;
let mut num_recovered_metadata = 0;
// The next metadata change waiting to be applied, if any.
let mut next_apply_metadata = recovered_metadata.pop_first();
{
let _lock = version_mutex.lock().await;
// Data after flushed sequence need to be recovered.
flushed_sequence = version_control.current().flushed_sequence();
last_sequence = flushed_sequence;
// Read starts from the first entry after last flushed entry, so the start sequence
// should be flushed_sequence + 1.
let mut stream = writer_ctx.wal.read_from_wal(flushed_sequence + 1).await?;
while let Some((req_sequence, _header, payload)) = stream.try_next().await? {
while let Some((sequence_before_alter, _)) = next_apply_metadata {
// There might be multiple metadata changes to be applied, so a loop is necessary.
if req_sequence > sequence_before_alter {
// This is the first request that use the new metadata.
self.apply_metadata(
&writer_ctx,
sequence_before_alter,
next_apply_metadata,
version_control,
)?;
num_recovered_metadata += 1;
next_apply_metadata = recovered_metadata.pop_first();
} else {
// Keep the next_apply_metadata until req_sequence > sequence_before_alter
break;
}
}
// Sequences must be strictly increasing; anything else aborts the replay.
if req_sequence > last_sequence {
last_sequence = req_sequence;
} else {
logging::error!(
"Sequence should not decrease during replay, found {} <= {}, \
region_id: {}, region_name: {}, flushed_sequence: {}, num_requests: {}",
req_sequence,
last_sequence,
writer_ctx.shared.id,
writer_ctx.shared.name,
flushed_sequence,
num_requests,
);
error::SequenceNotMonotonicSnafu {
prev: last_sequence,
given: req_sequence,
}
.fail()?;
}
// Entries without a payload only advance `last_sequence`.
if let Some(payload) = payload {
num_requests += 1;
// Note that memtables of `Version` may be updated during replay.
let version = version_control.current();
// TODO(yingwen): Trigger flush if the size of memtables reach the flush threshold to avoid
// out of memory during replay, but we need to do it carefully to avoid dead lock.
let mut inserter = Inserter::new(last_sequence);
inserter.insert_memtable(&payload, version.mutable_memtable())?;
}
}
// Apply metadata after last WAL entry
while let Some((sequence_before_alter, _)) = next_apply_metadata {
assert!(
sequence_before_alter >= last_sequence,
"The sequence in metadata after last WAL entry is less than last sequence, \
metadata sequence: {}, last_sequence: {}, region_id: {}, region_name: {}",
sequence_before_alter,
last_sequence,
writer_ctx.shared.id,
writer_ctx.shared.name
);
self.apply_metadata(
&writer_ctx,
sequence_before_alter,
next_apply_metadata,
version_control,
)?;
num_recovered_metadata += 1;
next_apply_metadata = recovered_metadata.pop_first();
}
version_control.set_committed_sequence(last_sequence);
}
logging::info!(
"Region replay finished, region_id: {}, region_name: {}, flushed_sequence: {}, last_sequence: {}, num_requests: {}, num_recovered_metadata: {}",
writer_ctx.shared.id,
writer_ctx.shared.name,
flushed_sequence,
last_sequence,
num_requests,
num_recovered_metadata,
);
Ok(())
}
fn apply_metadata<S: LogStore>(
&self,
writer_ctx: &WriterContext<'_, S>,
sequence: SequenceNumber,
mut metadata: Option<RecoveredMetadata>,
version_control: &VersionControl,
) -> Result<()> {
// It's safe to unwrap here, it's checked outside.
// Move out metadata to avoid cloning it.
let (_, (manifest_version, metadata)) = metadata.take().unwrap();
let region_metadata: RegionMetadataRef =
Arc::new(metadata.try_into().context(error::InvalidRawRegionSnafu {
region: &writer_ctx.shared.name,
})?);
let new_mutable = self
.memtable_builder
.build(region_metadata.schema().clone());
version_control.freeze_mutable_and_apply_metadata(
region_metadata,
manifest_version,
new_mutable,
);
logging::debug!(
"Applied metadata to region: {} when replaying WAL: sequence={} manifest={} ",
writer_ctx.shared.name,
sequence,
manifest_version
);
Ok(())
}
/// Preprocess before write.
///
/// Asks the flush strategy whether a flush is needed and, if so, either triggers a flush
/// of this region or schedules an engine level flush.
async fn preprocess_write<S: LogStore>(
    &mut self,
    writer_ctx: &WriterContext<'_, S>,
) -> Result<()> {
    let _timer = PREPROCESS_ELAPSED.start_timer();

    let version_control = writer_ctx.version_control();
    // This check has to come first, since switching memtables will clear all mutable
    // memtables.
    let decision = self.should_flush(
        writer_ctx.shared,
        version_control,
        writer_ctx.flush_strategy,
    );

    match decision {
        Some(FlushType::Region) => {
            // Trigger flush for current region.
            self.trigger_flush(writer_ctx, FlushReason::MemtableFull)
                .await?;
        }
        Some(FlushType::Engine) => {
            // Engine level flush: this wakes up the flush handler, which picks the
            // region to flush.
            writer_ctx.flush_scheduler.schedule_engine_flush()?;
        }
        None => {}
    }

    Ok(())
}
/// Builds a new mutable memtable using the schema of the current version.
fn alloc_memtable(&self, version_control: &VersionControlRef) -> MemtableRef {
    let current = version_control.current();
    let schema = current.schema().clone();
    self.memtable_builder.build(schema)
}
/// Asks `flush_strategy` whether a flush is needed, given the bytes allocated by the
/// mutable memtables and the configured write buffer size.
fn should_flush(
    &self,
    shared: &SharedDataRef,
    version_control: &VersionControlRef,
    flush_strategy: &FlushStrategyRef,
) -> Option<FlushType> {
    let version = version_control.current();
    let region_status = RegionStatus {
        region_id: shared.id(),
        bytes_mutable: version.memtables().mutable_bytes_allocated(),
        write_buffer_size: self.write_buffer_size,
    };

    flush_strategy.should_flush(region_status)
}
/// Freezes the mutable memtable and schedules a flush job for the frozen memtables.
///
/// If a previous flush job is still pending, this waits for it first. A failure of that
/// previous job is returned as the error of this call. When there is nothing to flush,
/// only the flush time of the region is updated.
///
/// On success the handle of the newly scheduled job is stored in `self.flush_handle`.
async fn trigger_flush<S: LogStore>(
    &mut self,
    ctx: &WriterContext<'_, S>,
    reason: FlushReason,
) -> Result<()> {
    let version_control = &ctx.shared.version_control;
    let new_mutable = self.alloc_memtable(version_control);
    // Freeze all mutable memtables so we can flush them later.
    version_control.freeze_mutable(new_mutable);

    FLUSH_REQUESTS_TOTAL
        .with_label_values(&[reason.as_str()])
        .inc();

    if let Some(flush_handle) = self.flush_handle.take() {
        // Previous flush job is incomplete, wait until it is finished.
        // However the last flush job may fail, in which case, we just return error
        // and abort current write request. The flush handle is left empty, so the next
        // time we still have chance to trigger a new flush.
        // TODO(yingwen): We should release the write lock during waiting flush done, which
        // needs something like async condvar.
        flush_handle.wait().await.map_err(|e| {
            logging::error!(e; "Previous flush job failed, region: {}", ctx.shared.name);
            e
        })?;
    }

    let current_version = version_control.current();
    let (max_memtable_id, mem_to_flush) = current_version.memtables().memtables_to_flush();
    // Bind the id and handle the "nothing to flush" case in one place, rather than
    // checking `is_none()` and unwrapping later.
    let max_memtable_id = match max_memtable_id {
        Some(id) => id,
        None => {
            // We still update the flush time to avoid the picker picks this region again.
            ctx.shared.update_flush_millis();
            logging::info!("No memtables to flush in region: {}", ctx.shared.name);
            return Ok(());
        }
    };

    let flush_req = FlushRegionRequest {
        max_memtable_id,
        memtables: mem_to_flush,
        // In write thread, safe to use current committed sequence.
        flush_sequence: version_control.committed_sequence(),
        shared: ctx.shared.clone(),
        sst_layer: ctx.sst_layer.clone(),
        writer: ctx.writer.clone(),
        wal: ctx.wal.clone(),
        manifest: ctx.manifest.clone(),
        engine_config: self.engine_config.clone(),
        ttl: self.ttl,
        compaction_time_window: current_version.ssts().compaction_time_window(),
        compaction_picker: ctx.compaction_picker.clone(),
    };

    let flush_handle = ctx
        .flush_scheduler
        .schedule_region_flush(flush_req)
        .map_err(|e| {
            logging::error!(e; "Failed to schedule flush request");
            e
        })?;
    self.flush_handle = Some(flush_handle);

    Ok(())
}
/// Builds a compaction request from `request` and hands it to `compaction_scheduler`.
///
/// When `request.compact_ctx.wait` is set and the request was actually scheduled, this
/// waits for the compaction task to report its result and returns that result. A request
/// that is not scheduled is not treated as an error.
async fn manual_compact<S: LogStore>(
    &mut self,
    request: WriterCompactRequest<S>,
    compaction_picker: CompactionPickerRef<S>,
    compaction_scheduler: CompactionSchedulerRef<S>,
    sst_write_buffer_size: ReadableSize,
) -> Result<()> {
    let region_id = request.shared_data.id();
    let compaction_time_window = request
        .shared_data
        .version_control
        .current()
        .ssts()
        .compaction_time_window();

    let mut compaction_request = CompactionRequestImpl {
        region_id,
        sst_layer: request.sst_layer,
        writer: request.region_writer,
        shared: request.shared_data.clone(),
        manifest: request.manifest,
        wal: request.wal,
        ttl: self.ttl,
        compaction_time_window,
        sender: None,
        picker: compaction_picker,
        sst_write_buffer_size,
        // manual compaction does not reschedule itself.
        reschedule_on_finish: false,
    };

    logging::info!(
        "Manual compact, region_id: {}, compact_ctx: {:?}",
        region_id,
        request.compact_ctx
    );

    // Only create the result channel when the caller wants to wait for the outcome.
    let receiver = if request.compact_ctx.wait {
        let (sender, receiver) = oneshot::channel();
        compaction_request.sender = Some(sender);
        Some(receiver)
    } else {
        None
    };

    // `compaction_scheduler` is already owned, so it is moved instead of cloned.
    let scheduled = schedule_compaction(
        request.shared_data,
        compaction_scheduler,
        compaction_request,
    );

    if scheduled {
        if let Some(receiver) = receiver {
            // The outer `?` reports a cancelled task (sender dropped), the inner one the
            // compaction result itself.
            receiver
                .await
                .context(error::CompactTaskCancelSnafu { region_id })??;
        }
    }

    Ok(())
}
/// Triggers a flush of the region for the given `reason`.
async fn manual_flush<S: LogStore>(
    &mut self,
    writer_ctx: WriterContext<'_, S>,
    reason: FlushReason,
) -> Result<()> {
    let ctx = &writer_ctx;
    self.trigger_flush(ctx, reason).await
}
/// Returns whether the writer has been marked closed.
#[inline]
fn is_closed(&self) -> bool {
self.closed
}
/// Sets the closed flag; nothing in this impl resets it.
#[inline]
fn mark_closed(&mut self) {
self.closed = true;
}
}
/// Hands `compaction_request` to the scheduler and reports whether it was scheduled.
///
/// A scheduler error is logged and reported as `false` instead of being propagated.
pub(crate) fn schedule_compaction<S: LogStore>(
    shared_data: SharedDataRef,
    compaction_scheduler: CompactionSchedulerRef<S>,
    compaction_request: CompactionRequestImpl<S>,
) -> bool {
    let region_id = shared_data.id();

    compaction_scheduler
        .schedule(compaction_request)
        .map(|scheduled| {
            logging::info!(
                "Schedule region {} compaction request result: {}",
                region_id,
                scheduled
            );
            scheduled
        })
        .unwrap_or_else(|e| {
            logging::error!(e;"Failed to schedule region compaction request {}", region_id);
            false
        })
}

View File

@@ -1,652 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::fmt::{Debug, Formatter};
use std::hash::Hash;
use std::sync::atomic::{AtomicU8, Ordering};
use std::sync::{Arc, Mutex, RwLock};
use async_trait::async_trait;
use common_telemetry::{debug, error, info};
use snafu::{ensure, ResultExt};
use tokio::sync::Notify;
use tokio::task::JoinHandle;
use tokio_util::sync::CancellationToken;
use crate::error::{IllegalSchedulerStateSnafu, Result, StopSchedulerSnafu};
use crate::scheduler::dedup_deque::DedupDeque;
use crate::scheduler::rate_limit::{
BoxedRateLimitToken, CascadeRateLimiter, MaxInflightTaskLimiter, RateLimiter,
};
pub mod dedup_deque;
pub mod rate_limit;
/// Request that can be scheduled.
/// It must contain a key for deduplication.
pub trait Request: Send + Sync + 'static {
/// Type of request key.
type Key: Eq + Hash + Clone + Debug + Send + Sync;
/// Returns the request key.
fn key(&self) -> Self::Key;
/// Notifies the request of its result, consuming the request.
fn complete(self, result: Result<()>);
}
/// Handles the requests that the scheduler's dispatch loop polls from its queue.
#[async_trait::async_trait]
pub trait Handler {
/// Type of request this handler accepts.
type Request;
/// Handles `req`.
///
/// `token` is the rate limit token acquired for this request, and `finish_notifier` is
/// the notifier the dispatch loop waits on before polling more requests.
async fn handle_request(
&self,
req: Self::Request,
token: BoxedRateLimitToken,
finish_notifier: Arc<Notify>,
) -> Result<()>;
}
/// [Scheduler] defines a set of APIs to schedule requests.
#[async_trait]
pub trait Scheduler: Debug {
/// Type of request this scheduler accepts.
type Request;
/// Schedules a request.
/// Returns true if request is scheduled. Returns false if task queue already
/// contains the request with same key.
fn schedule(&self, request: Self::Request) -> Result<bool>;
/// Stops scheduler. If `await_termination` is set to true, the scheduler will
/// wait until all queued requests are processed.
async fn stop(&self, await_termination: bool) -> Result<()>;
}
/// Scheduler config.
#[derive(Debug)]
pub struct SchedulerConfig {
    /// Upper bound on the number of tasks that may be in flight at the same time.
    pub max_inflight_tasks: usize,
}

impl Default for SchedulerConfig {
    /// Allows four in-flight tasks by default.
    fn default() -> Self {
        let max_inflight_tasks = 4;
        SchedulerConfig { max_inflight_tasks }
    }
}
// Scheduler states, stored in an `AtomicU8`.
// The scheduler accepts requests and the dispatch loop keeps running.
const STATE_RUNNING: u8 = 0;
// The scheduler is stopped.
const STATE_STOP: u8 = 1;
// The scheduler is stopping, but the dispatch loop polls the queue once more before it stops.
const STATE_AWAIT_TERMINATION: u8 = 2;
/// Request scheduler based on local state.
///
/// Requests are pushed into a deduplicating queue and dispatched by a background
/// handler loop that is spawned when the scheduler is created.
pub struct LocalScheduler<R: Request> {
/// Request FIFO with key deduplication.
request_queue: Arc<RwLock<DedupDeque<R::Key, R>>>,
/// Token used to halt the scheduler.
cancel_token: CancellationToken,
/// Tasks use a cooperative manner to notify scheduler that another request can be scheduled.
task_notifier: Arc<Notify>,
/// Join handle of spawned request handling loop. Taken (and awaited) by `stop`.
join_handle: Mutex<Option<JoinHandle<()>>>,
/// State of scheduler, one of the `STATE_*` constants.
state: Arc<AtomicU8>,
}
impl<R: Request + Send + Sync> Debug for LocalScheduler<R> {
    /// Formats the scheduler showing only its `state` field.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        let mut repr = f.debug_struct("LocalScheduler");
        repr.field("state", &self.state);
        repr.finish()
    }
}
impl<R> Drop for LocalScheduler<R>
where
    R: Request,
{
    /// Marks the scheduler stopped, cancels the handler loop and discards every request
    /// that is still queued.
    fn drop(&mut self) {
        self.state.store(STATE_STOP, Ordering::Relaxed);
        self.cancel_token.cancel();
        // Clear all requests. A poisoned lock means another thread panicked while holding
        // it. Panicking here too could abort the process if this drop runs during
        // unwinding, so recover the guard and clear the queue anyway.
        self.request_queue
            .write()
            .unwrap_or_else(|poisoned| poisoned.into_inner())
            .clear();
    }
}
#[async_trait]
impl<R> Scheduler for LocalScheduler<R>
where
R: Request + Send,
{
type Request = R;
/// Queues `request` and wakes up the handler loop.
///
/// Fails when the scheduler is not running. The returned flag is the result of pushing
/// the request into the deduplicating queue.
fn schedule(&self, request: Self::Request) -> Result<bool> {
    ensure!(self.running(), IllegalSchedulerStateSnafu);
    debug!(
        "Schedule request: {:?}, queue size: {}",
        request.key(),
        self.remaining_requests()
    );

    // The write guard stays alive until the function returns, as before.
    let mut pending = self.request_queue.write().unwrap();
    let key = request.key();
    let inserted = pending.push_back(key, request);
    self.task_notifier.notify_one();

    Ok(inserted)
}
/// Stops the scheduler and waits for the handler loop to exit.
///
/// With `await_termination` the loop is asked to poll the queue once more before it
/// stops. Calling `stop` again after the join handle has been taken only updates the
/// state and cancels the token.
async fn stop(&self, await_termination: bool) -> Result<()> {
    let target_state = if await_termination {
        STATE_AWAIT_TERMINATION
    } else {
        STATE_STOP
    };
    self.state.store(target_state, Ordering::Relaxed);
    self.cancel_token.cancel();

    // Take the handle inside its own scope so the std mutex guard is gone before awaiting.
    let loop_handle = { self.join_handle.lock().unwrap().take() };
    match loop_handle {
        Some(handle) => handle.await.context(StopSchedulerSnafu),
        None => Ok(()),
    }
}
}
impl<R> LocalScheduler<R>
where
R: Request,
{
/// Creates a scheduler from `config` and spawns its request handling loop on the
/// background runtime, with `handler` processing the dispatched requests.
pub fn new<H>(config: SchedulerConfig, handler: H) -> Self
where
    H: Handler<Request = R> + Send + Sync + 'static,
{
    let state = Arc::new(AtomicU8::new(STATE_RUNNING));
    let task_notifier = Arc::new(Notify::new());
    let cancel_token = CancellationToken::new();
    let request_queue = Arc::new(RwLock::new(DedupDeque::default()));

    // The loop shares the queue, notifier and state with the scheduler and listens on a
    // child of the cancellation token.
    let dispatch_loop = HandlerLoop {
        task_notifier: task_notifier.clone(),
        req_queue: request_queue.clone(),
        cancel_token: cancel_token.child_token(),
        limiter: Arc::new(CascadeRateLimiter::new(vec![Box::new(
            MaxInflightTaskLimiter::new(config.max_inflight_tasks),
        )])),
        request_handler: handler,
        state: state.clone(),
    };
    let loop_handle = common_runtime::spawn_bg(async move {
        debug!("Task handler loop spawned");
        dispatch_loop.run().await;
    });

    Self {
        request_queue,
        cancel_token,
        task_notifier,
        join_handle: Mutex::new(Some(loop_handle)),
        state,
    }
}
/// Returns the number of requests currently waiting in the queue.
#[inline]
fn remaining_requests(&self) -> usize {
    let queue = self.request_queue.read().unwrap();
    queue.len()
}
/// Returns whether the scheduler state is still `STATE_RUNNING`.
#[inline]
fn running(&self) -> bool {
self.state.load(Ordering::Relaxed) == STATE_RUNNING
}
}
/// Dispatch loop that polls requests from the queue and passes them to the handler.
pub struct HandlerLoop<R: Request, H: Handler> {
/// Queue the loop polls requests from.
pub req_queue: Arc<RwLock<DedupDeque<R::Key, R>>>,
/// Cancelling this token breaks the loop.
pub cancel_token: CancellationToken,
/// Each notification makes the loop poll the queue again.
pub task_notifier: Arc<Notify>,
/// Handler that receives the polled requests.
pub request_handler: H,
/// Rate limiter asked for a token before each request is handled.
pub limiter: Arc<CascadeRateLimiter<R>>,
/// Scheduler state, one of the `STATE_*` constants.
pub state: Arc<AtomicU8>,
}
impl<R, H> HandlerLoop<R, H>
where
R: Request,
H: Handler<Request = R>,
{
/// Runs scheduled requests dispatch loop.
///
/// While the state is `STATE_RUNNING`, the loop waits for either a task notification
/// (then polls and executes requests) or cancellation (then exits). If the state is
/// `STATE_AWAIT_TERMINATION` after the loop, requests are polled once more before the
/// state is set to `STATE_STOP`.
pub async fn run(&self) {
let limiter = self.limiter.clone();
while self.running() {
tokio::select! {
_ = self.task_notifier.notified() => {
debug!("Notified, queue size: {:?}",self.req_queue.read().unwrap().len());
self.poll_and_execute(&limiter).await;
}
_ = self.cancel_token.cancelled() => {
info!("Task scheduler cancelled.");
break;
}
}
}
// For correctness, we need to poll requests from fifo again.
// NOTE(review): `poll_and_execute` stops at the first rate-limited request, so this single
// pass may leave requests in the queue — confirm this is the intended termination behavior.
if self.state.load(Ordering::Relaxed) == STATE_AWAIT_TERMINATION {
info!("Waiting for all pending tasks to finish.");
self.poll_and_execute(&limiter).await;
self.state.store(STATE_STOP, Ordering::Relaxed);
}
info!("Task scheduler stopped");
}
/// Polls requests from the queue and submits them to the handler until the queue is
/// empty or the rate limiter refuses a token.
///
/// A rate-limited request is put back at the front of the queue.
async fn poll_and_execute(&self, limiter: &Arc<CascadeRateLimiter<R>>) {
    while let Some((task_key, req)) = self.poll_task().await {
        let token = match limiter.acquire_token(&req) {
            Ok(token) => token,
            Err(_) => {
                // rate limited, put back to req queue to wait for next schedule
                debug!(
                    "Put back request {:?}, queue size: {}",
                    task_key,
                    self.req_queue.read().unwrap().len()
                );
                self.put_back_req(task_key, req).await;
                break;
            }
        };

        debug!("Executing request: {:?}", task_key);
        let submitted = self
            .handle_request(req, token, self.task_notifier.clone())
            .await;
        match submitted {
            Ok(()) => {
                info!("Submitted task: {:?}", task_key);
            }
            Err(e) => {
                error!(e; "Failed to submit request: {:?}", task_key);
            }
        }
    }
}
/// Pops the request at the front of the queue, if any.
#[inline]
async fn poll_task(&self) -> Option<(R::Key, R)> {
    self.req_queue.write().unwrap().pop_front()
}
/// Puts the request back at the front of the request queue; the result of the push is
/// ignored.
#[inline]
async fn put_back_req(&self, key: R::Key, req: R) {
    let _ = self.req_queue.write().unwrap().push_front(key, req);
}
/// Forwards the request, its rate limit token and the finish notifier to the request
/// handler and returns the handler's result.
// NOTE(review): the original comment says the handler submits the task to the bg runtime —
// that depends on the `Handler` implementation; confirm.
async fn handle_request(
&self,
req: R,
token: BoxedRateLimitToken,
finish_notifier: Arc<Notify>,
) -> Result<()> {
self.request_handler
.handle_request(req, token, finish_notifier)
.await
}
/// Returns whether the shared scheduler state is still `STATE_RUNNING`.
#[inline]
fn running(&self) -> bool {
self.state.load(Ordering::Relaxed) == STATE_RUNNING
}
}
#[cfg(test)]
mod tests {
use std::sync::atomic::{AtomicBool, AtomicI32};
use std::time::Duration;
use futures_util::future::BoxFuture;
use store_api::storage::RegionId;
use super::*;
use crate::scheduler::dedup_deque::DedupDeque;
use crate::scheduler::rate_limit::{
BoxedRateLimitToken, CascadeRateLimiter, MaxInflightTaskLimiter,
};
use crate::scheduler::{HandlerLoop, LocalScheduler, Scheduler, SchedulerConfig};
/// Test helper: a counter that fires a notification once it has been counted down to zero.
struct CountdownLatch {
    counter: std::sync::Mutex<usize>,
    notify: Notify,
}

impl CountdownLatch {
    /// Creates a latch that needs `size` countdowns before it notifies.
    fn new(size: usize) -> Self {
        let counter = std::sync::Mutex::new(size);
        let notify = Notify::new();
        Self { counter, notify }
    }

    /// Decrements the counter; the decrement that reaches zero notifies the waiter.
    /// Counting down a latch that is already at zero does nothing.
    fn countdown(&self) {
        let mut remaining = self.counter.lock().unwrap();
        if *remaining == 0 {
            return;
        }
        *remaining -= 1;
        if *remaining == 0 {
            self.notify.notify_one();
        }
    }

    /// Users should only call this once.
    async fn wait(&self) {
        self.notify.notified().await
    }
}
// Drives a `HandlerLoop` directly: two requests are pushed into the queue, each followed
// by a notification, and both must reach the mock handler within one second.
#[tokio::test]
async fn test_schedule_handler() {
common_telemetry::init_default_ut_logging();
let queue = Arc::new(std::sync::RwLock::new(DedupDeque::default()));
// The latch fires after the handler callback has run twice.
let latch = Arc::new(CountdownLatch::new(2));
let latch_cloned = latch.clone();
let handler = Arc::new(HandlerLoop {
req_queue: queue.clone(),
cancel_token: Default::default(),
task_notifier: Arc::new(Default::default()),
request_handler: MockHandler {
cb: move || {
latch_cloned.countdown();
},
},
limiter: Arc::new(CascadeRateLimiter::new(vec![Box::new(
MaxInflightTaskLimiter::new(3),
)])),
// The default `AtomicU8` value is 0, which equals `STATE_RUNNING`.
state: Arc::new(AtomicU8::default()),
});
let handler_cloned = handler.clone();
let _handle = common_runtime::spawn_bg(async move { handler_cloned.run().await });
let _ = queue
.write()
.unwrap()
.push_back(1.into(), MockRequest::default());
handler.task_notifier.notify_one();
let _ = queue
.write()
.unwrap()
.push_back(2.into(), MockRequest::default());
handler.task_notifier.notify_one();
// Fails the test if the two callbacks have not run within one second.
tokio::time::timeout(Duration::from_secs(1), latch.wait())
.await
.unwrap();
}
/// Minimal request used by the scheduler tests; it carries only the region id that
/// serves as its deduplication key.
#[derive(Default, Debug)]
struct MockRequest {
region_id: RegionId,
}
/// Test handler that invokes a synchronous callback for every request it receives.
struct MockHandler<F> {
// Callback run once per handled request.
cb: F,
}
/// `Handler` implementation for the synchronous mock: runs the callback, releases the
/// rate-limit token, signals `finish_notifier`, and always returns `Ok(())`.
#[async_trait::async_trait]
impl<F> Handler for MockHandler<F>
where
F: Fn() + Send + Sync,
{
type Request = MockRequest;
async fn handle_request(
&self,
_req: Self::Request,
token: BoxedRateLimitToken,
finish_notifier: Arc<Notify>,
) -> Result<()> {
(self.cb)();
// Release the token before signalling completion.
token.try_release();
finish_notifier.notify_one();
Ok(())
}
}
/// Keys mock requests by their region id; completion results are ignored.
impl Request for MockRequest {
type Key = RegionId;
fn key(&self) -> Self::Key {
self.region_id
}
// No-op: the tests do not inspect the completion result.
fn complete(self, _result: Result<()>) {}
}
/// Schedules two requests with distinct region ids on a `LocalScheduler` and expects
/// the handler callback to run twice within one second.
#[tokio::test]
async fn test_scheduler() {
let latch = Arc::new(CountdownLatch::new(2));
let latch_cloned = latch.clone();
let handler = MockHandler {
cb: move || {
latch_cloned.countdown();
},
};
let scheduler: LocalScheduler<MockRequest> = LocalScheduler::new(
SchedulerConfig {
max_inflight_tasks: 3,
},
handler,
);
let _ = scheduler
.schedule(MockRequest {
region_id: 1.into(),
})
.unwrap();
let _ = scheduler
.schedule(MockRequest {
region_id: 2.into(),
})
.unwrap();
tokio::time::timeout(Duration::from_secs(1), latch.wait())
.await
.unwrap();
}
/// Schedules 100 requests with distinct region ids while only 3 tasks may be inflight,
/// and expects all of them to be handled within three seconds.
#[tokio::test]
async fn test_scheduler_many() {
common_telemetry::init_default_ut_logging();
let task_size = 100;
let latch = Arc::new(CountdownLatch::new(task_size));
let latch_clone = latch.clone();
let handler = MockHandler {
cb: move || {
latch_clone.countdown();
},
};
let config = SchedulerConfig {
max_inflight_tasks: 3,
};
let scheduler = LocalScheduler::new(config, handler);
for i in 0..task_size {
assert!(scheduler
.schedule(MockRequest {
region_id: RegionId::from(i as u64),
})
.is_ok());
}
tokio::time::timeout(Duration::from_secs(3), latch.wait())
.await
.unwrap();
}
/// Schedules 100 requests in two halves separated by a 100ms pause and expects all of
/// them to be handled within six seconds.
#[tokio::test]
async fn test_scheduler_interval() {
common_telemetry::init_default_ut_logging();
let task_size = 100;
let latch = Arc::new(CountdownLatch::new(task_size));
let latch_clone = latch.clone();
let handler = MockHandler {
cb: move || {
latch_clone.countdown();
},
};
let config = SchedulerConfig {
max_inflight_tasks: 3,
};
let scheduler = LocalScheduler::new(config, handler);
// First half of the requests.
for i in 0..task_size / 2 {
assert!(scheduler
.schedule(MockRequest {
region_id: RegionId::from(i as u64),
})
.is_ok());
}
// Pause before submitting the second half.
tokio::time::sleep(Duration::from_millis(100)).await;
for i in task_size / 2..task_size {
assert!(scheduler
.schedule(MockRequest {
region_id: RegionId::from(i as u64),
})
.is_ok());
}
tokio::time::timeout(Duration::from_secs(6), latch.wait())
.await
.unwrap();
}
/// Test handler whose callback produces a future that is awaited for every request.
struct MockAsyncHandler<F> {
// Factory for the future awaited while handling a request.
cb: F,
}
/// `Handler` implementation for the async mock: awaits the future produced by the
/// callback, then releases the token, signals `finish_notifier` and returns `Ok(())`.
#[async_trait::async_trait]
impl<F> Handler for MockAsyncHandler<F>
where
F: Fn() -> BoxFuture<'static, ()> + Send + Sync,
{
type Request = MockRequest;
async fn handle_request(
&self,
_req: Self::Request,
token: BoxedRateLimitToken,
finish_notifier: Arc<Notify>,
) -> Result<()> {
let fut = (self.cb)();
// The token stays held until the callback's future completes.
fut.await;
token.try_release();
finish_notifier.notify_one();
Ok(())
}
}
/// Submits the same region id ten times while the handler is blocked on a watch
/// channel, and asserts that fewer than ten submissions were reported as scheduled.
#[tokio::test]
async fn test_schedule_duplicate_tasks() {
common_telemetry::init_default_ut_logging();
// Sending `true` on this channel unblocks the handler futures.
let (tx, rx) = tokio::sync::watch::channel(false);
let handler = MockAsyncHandler {
cb: move || {
let mut rx = rx.clone();
Box::pin(async move {
// Block the handler so it can't handle more requests.
loop {
rx.changed().await.unwrap();
if *rx.borrow() {
break;
}
}
}) as _ // Casts the Pin<Box<async block>> to Pin<Box<dyn Future>>
},
};
let config = SchedulerConfig {
max_inflight_tasks: 30,
};
let scheduler = LocalScheduler::new(config, handler);
// Counts the submissions for which `schedule` returned `true`.
let mut scheduled_task = 0;
for _ in 0..10 {
if scheduler
.schedule(MockRequest {
region_id: 1.into(),
})
.unwrap()
{
scheduled_task += 1;
}
}
// Unblock the handler, then stop the scheduler (called with `true`).
tx.send(true).unwrap();
scheduler.stop(true).await.unwrap();
debug!("Schedule tasks: {}", scheduled_task);
assert!(scheduled_task < 10);
}
/// Stops the scheduler while a background task is still submitting requests and checks
/// that the number of handled requests equals the number reported as scheduled.
#[tokio::test]
async fn test_await_termination() {
common_telemetry::init_default_ut_logging();
// Incremented by the handler callback for each handled request.
let finished = Arc::new(AtomicI32::new(0));
let finished_clone = finished.clone();
let handler = MockHandler {
cb: move || {
let _ = finished_clone.fetch_add(1, Ordering::Relaxed);
},
};
let config = SchedulerConfig {
max_inflight_tasks: 3,
};
let scheduler = Arc::new(LocalScheduler::new(config, handler));
let scheduler_cloned = scheduler.clone();
// Incremented by the producer for each submission that returned `Ok(true)`.
let task_scheduled = Arc::new(AtomicI32::new(0));
let task_scheduled_cloned = task_scheduled.clone();
// Cleared after `stop` returns so the producer loop exits early.
let scheduling = Arc::new(AtomicBool::new(true));
let scheduling_clone = scheduling.clone();
let handle = common_runtime::spawn_write(async move {
for i in 0..10000 {
if let Ok(res) = scheduler_cloned.schedule(MockRequest {
region_id: RegionId::from(i as u64),
}) {
if res {
let _ = task_scheduled_cloned.fetch_add(1, Ordering::Relaxed);
}
}
if !scheduling_clone.load(Ordering::Relaxed) {
break;
}
}
});
scheduler.stop(true).await.unwrap();
scheduling.store(false, Ordering::Relaxed);
// Snapshot the handled count right after `stop`, before joining the producer.
let finished = finished.load(Ordering::Relaxed);
handle.await.unwrap();
assert_eq!(finished, task_scheduled.load(Ordering::Relaxed));
}
}

View File

@@ -1,124 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::hash_map::Entry;
use std::collections::{HashMap, VecDeque};
use std::fmt::{Debug, Formatter};
use std::hash::Hash;
/// Deque with key deduplication.
///
/// `deque` records the order of the queued keys while `existing` maps every queued key
/// to its value. Both collections always hold the same number of entries.
pub struct DedupDeque<K, V> {
    deque: VecDeque<K>,
    existing: HashMap<K, V>,
}

impl<K, V> Default for DedupDeque<K, V> {
    /// Creates an empty deque.
    fn default() -> Self {
        Self {
            deque: VecDeque::new(),
            existing: HashMap::new(),
        }
    }
}

impl<K: Eq + Hash + Clone, V> DedupDeque<K, V> {
    /// Pushes a key value to the back of deque.
    /// Returns true if the deque does not already contain value with the same key, otherwise
    /// returns false (the existing entry is kept and `value` is dropped).
    pub fn push_back(&mut self, key: K, value: V) -> bool {
        debug_assert_eq!(self.deque.len(), self.existing.len());
        match self.existing.entry(key.clone()) {
            Entry::Vacant(slot) => {
                let _ = slot.insert(value);
                self.deque.push_back(key);
                true
            }
            Entry::Occupied(_) => false,
        }
    }

    /// Pushes a key value to the front of deque.
    /// Returns true if the deque does not already contain value with the same key, otherwise
    /// returns false (the existing entry is kept and `value` is dropped).
    pub fn push_front(&mut self, key: K, value: V) -> bool {
        // Same invariant check as the other mutating methods.
        debug_assert_eq!(self.deque.len(), self.existing.len());
        match self.existing.entry(key.clone()) {
            Entry::Vacant(slot) => {
                let _ = slot.insert(value);
                self.deque.push_front(key);
                true
            }
            Entry::Occupied(_) => false,
        }
    }

    /// Pops a pair from the front of deque. Returns [None] if the deque is empty.
    pub fn pop_front(&mut self) -> Option<(K, V)> {
        debug_assert_eq!(self.deque.len(), self.existing.len());
        let key = self.deque.pop_front()?;
        let value = self.existing.remove(&key)?;
        Some((key, value))
    }

    /// Returns the number of queued key-value pairs.
    #[inline]
    pub fn len(&self) -> usize {
        debug_assert_eq!(self.deque.len(), self.existing.len());
        self.deque.len()
    }

    /// Returns true if nothing is queued.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.deque.is_empty()
    }

    /// Removes every queued key and value.
    #[inline]
    pub fn clear(&mut self) {
        self.deque.clear();
        self.existing.clear();
    }
}

impl<K, V> Debug for DedupDeque<K, V>
where
    K: Debug,
    V: Debug,
{
    /// Formats both internal collections under the struct name `DedupDeque`.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("DedupDeque")
            .field("deque", &self.deque)
            .field("existing", &self.existing)
            .finish()
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Covers back insertion, FIFO popping, duplicate rejection and `clear`.
    #[test]
    fn test_dedup_deque() {
        let mut deque = DedupDeque::default();
        assert!(deque.push_back(1, "hello".to_string()));
        assert_eq!(1, deque.len());
        assert!(deque.push_back(2, "world".to_string()));
        assert_eq!(2, deque.len());
        assert_eq!((1, "hello".to_string()), deque.pop_front().unwrap());
        assert_eq!(1, deque.len());
        assert_eq!((2, "world".to_string()), deque.pop_front().unwrap());
        assert_eq!(0, deque.len());
        // insert duplicated item
        assert!(deque.push_back(1, "hello".to_string()));
        assert!(!deque.push_back(1, "world".to_string()));
        assert_eq!((1, "hello".to_string()), deque.pop_front().unwrap());
        deque.clear();
        assert!(deque.is_empty());
    }

    /// Covers front insertion, its duplicate rejection, and popping an empty deque.
    #[test]
    fn test_dedup_deque_push_front() {
        let mut deque = DedupDeque::default();
        assert!(deque.pop_front().is_none());
        assert!(deque.push_back(1, "hello".to_string()));
        assert!(deque.push_front(2, "world".to_string()));
        // A key that is already queued is rejected from either end.
        assert!(!deque.push_front(1, "dup".to_string()));
        assert_eq!(2, deque.len());
        assert_eq!((2, "world".to_string()), deque.pop_front().unwrap());
        assert_eq!((1, "hello".to_string()), deque.pop_front().unwrap());
        assert!(deque.pop_front().is_none());
        assert!(deque.is_empty());
    }
}

View File

@@ -1,185 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::marker::PhantomData;
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
use std::sync::Arc;
use crate::error::{RateLimitedSnafu, Result};
/// A permit handed out by a [RateLimiter]; it is given back through `try_release`.
pub trait RateLimitToken {
/// Releases the token.
/// ### Note
/// Implementation should guarantee the idempotency.
fn try_release(&self);
}
/// Type-erased, thread-safe rate limit token.
pub type BoxedRateLimitToken = Box<dyn RateLimitToken + Send + Sync>;
/// Lets a boxed token be used wherever a [RateLimitToken] is expected by delegating to
/// the boxed value.
impl<T: RateLimitToken + ?Sized> RateLimitToken for Box<T> {
fn try_release(&self) {
(**self).try_release()
}
}
/// Rate limiter: decides, per request, whether a token can be handed out.
pub trait RateLimiter {
/// Type of the requests this limiter inspects.
type Request;
/// Acquires a token from rate limiter. Returns `Err` on failure.
fn acquire_token(&self, req: &Self::Request) -> Result<BoxedRateLimitToken>;
}
/// Type-erased, thread-safe rate limiter for requests of type `R`.
pub type BoxedRateLimiter<R> = Box<dyn RateLimiter<Request = R> + Send + Sync>;
/// Rate limiter that caps how many tasks may be inflight at once.
pub struct MaxInflightTaskLimiter<R> {
    /// Upper bound on concurrently held tokens.
    max_inflight_tasks: usize,
    /// Number of tokens currently handed out; shared with every token.
    inflight_tasks: Arc<AtomicUsize>,
    /// Ties the limiter to its request type without storing a request.
    _phantom_data: PhantomData<R>,
}

impl<R> MaxInflightTaskLimiter<R> {
    /// Creates a limiter allowing at most `max_inflight_tasks` tokens, starting at zero.
    pub fn new(max_inflight_tasks: usize) -> Self {
        let inflight_tasks = Arc::new(AtomicUsize::new(0));
        Self {
            max_inflight_tasks,
            inflight_tasks,
            _phantom_data: PhantomData,
        }
    }
}
impl<R> RateLimiter for MaxInflightTaskLimiter<R> {
    type Request = R;

    /// Reserves one inflight slot and returns a token that gives it back on release.
    ///
    /// Fails with a rate-limited error when `max_inflight_tasks` slots are already taken.
    /// The request itself is not inspected.
    fn acquire_token(&self, _: &Self::Request) -> Result<BoxedRateLimitToken> {
        let max = self.max_inflight_tasks;
        // Increment only while below the limit, as a single atomic update. The counter
        // therefore never exceeds `max`, and a rejected caller cannot make a concurrent
        // caller observe an inflated count.
        let reserved = self.inflight_tasks.fetch_update(
            Ordering::Relaxed,
            Ordering::Relaxed,
            |current| {
                if current < max {
                    Some(current + 1)
                } else {
                    None
                }
            },
        );
        if let Err(current) = reserved {
            // `current` is the exact value that caused the rejection.
            return RateLimitedSnafu {
                msg: format!(
                    "Max inflight task num exceeds, current: {}, max: {}",
                    current, max
                ),
            }
            .fail();
        }
        Ok(Box::new(MaxInflightLimiterToken::new(
            self.inflight_tasks.clone(),
        )))
    }
}
/// Token handed out by the max-inflight limiter.
pub struct MaxInflightLimiterToken {
    /// Shared inflight counter this token is counted against.
    counter: Arc<AtomicUsize>,
    /// Set once the token has been released.
    released: AtomicBool,
}

impl MaxInflightLimiterToken {
    /// Wraps `counter` in a token that has not been released yet.
    pub fn new(counter: Arc<AtomicUsize>) -> Self {
        let released = AtomicBool::new(false);
        Self { counter, released }
    }
}
impl RateLimitToken for MaxInflightLimiterToken {
    /// Decrements the shared counter the first time it is called; later calls do nothing.
    fn try_release(&self) {
        let first_release = self
            .released
            .compare_exchange(false, true, Ordering::Acquire, Ordering::Relaxed)
            .is_ok();
        if !first_release {
            return;
        }
        let _ = self.counter.fetch_sub(1, Ordering::Relaxed);
    }
}
/// A composite rate limiter that allows token acquisition only when all internal limiters allow.
pub struct CascadeRateLimiter<T> {
// Inner limiters, consulted in vector order.
limits: Vec<BoxedRateLimiter<T>>,
}
impl<T> CascadeRateLimiter<T> {
/// Creates a composite limiter from `limits`.
pub fn new(limits: Vec<BoxedRateLimiter<T>>) -> Self {
Self { limits }
}
}
impl<T> RateLimiter for CascadeRateLimiter<T> {
    type Request = T;

    /// Asks every inner limiter, in order, for a token.
    ///
    /// On the first refusal, the tokens already obtained are released and that error is
    /// returned. Otherwise all tokens are bundled into one composite token.
    fn acquire_token(&self, req: &Self::Request) -> Result<BoxedRateLimitToken> {
        let mut acquired = vec![];
        for limiter in self.limits.iter() {
            let token = match limiter.acquire_token(req) {
                Ok(token) => token,
                Err(e) => {
                    // Roll back what was obtained from the earlier limiters.
                    for held in &acquired {
                        RateLimitToken::try_release(held);
                    }
                    return Err(e);
                }
            };
            acquired.push(token);
        }
        Ok(Box::new(CompositeToken { tokens: acquired }))
    }
}
/// Token bundling several inner tokens; releasing it releases each of them.
pub struct CompositeToken {
    /// Inner tokens, in the order they were acquired.
    tokens: Vec<BoxedRateLimitToken>,
}

impl RateLimitToken for CompositeToken {
    /// Calls `try_release` on every inner token.
    fn try_release(&self) {
        self.tokens.iter().for_each(|token| token.try_release());
    }
}
#[cfg(test)]
mod tests {
use super::*;
/// With a limit of 3: three acquisitions succeed, the fourth fails, and releasing one
/// token frees a slot for another acquisition.
#[test]
fn test_max_inflight_limiter() {
let limiter = MaxInflightTaskLimiter::new(3);
let t1 = limiter.acquire_token(&1).unwrap();
assert_eq!(1, limiter.inflight_tasks.load(Ordering::Relaxed));
let _t2 = limiter.acquire_token(&1).unwrap();
assert_eq!(2, limiter.inflight_tasks.load(Ordering::Relaxed));
let _t3 = limiter.acquire_token(&1).unwrap();
assert_eq!(3, limiter.inflight_tasks.load(Ordering::Relaxed));
// Limit reached: the next acquisition must fail.
assert!(limiter.acquire_token(&1).is_err());
t1.try_release();
assert_eq!(2, limiter.inflight_tasks.load(Ordering::Relaxed));
let _t4 = limiter.acquire_token(&1).unwrap();
}
/// Same scenario through a cascade limiter wrapping a single max-inflight limiter.
#[test]
fn test_cascade_limiter() {
let limiter: CascadeRateLimiter<usize> =
CascadeRateLimiter::new(vec![Box::new(MaxInflightTaskLimiter::new(3))]);
let t1 = limiter.acquire_token(&1).unwrap();
let _t2 = limiter.acquire_token(&1).unwrap();
let _t3 = limiter.acquire_token(&1).unwrap();
assert!(limiter.acquire_token(&1).is_err());
t1.try_release();
let _t4 = limiter.acquire_token(&1).unwrap();
}
}

View File

@@ -1,59 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
pub mod compat;
mod projected;
mod region;
mod store;
pub use crate::schema::projected::{ProjectedSchema, ProjectedSchemaRef};
pub use crate::schema::region::{RegionSchema, RegionSchemaRef};
pub use crate::schema::store::{StoreSchema, StoreSchemaRef};
#[cfg(test)]
mod tests {
    use std::sync::Arc;

    use datatypes::vectors::{
        Int64Vector, TimestampMillisecondVector, UInt64Vector, UInt8Vector, VectorRef,
    };

    use crate::read::Batch;

    /// Region name shared by the schema tests.
    pub const REGION_NAME: &str = "test";

    /// Builds a three-row batch with a single field column.
    pub(crate) fn new_batch() -> Batch {
        new_batch_with_num_values(1)
    }

    /// Builds a three-row batch laid out as: one `Int64` column, one millisecond timestamp
    /// column, `num_field_columns` `Int64` columns, then a `UInt64` and a `UInt8` column.
    ///
    /// The i-th field column holds the value `i` in every row.
    pub(crate) fn new_batch_with_num_values(num_field_columns: usize) -> Batch {
        let key = Int64Vector::from_slice([1, 2, 3]);
        let ts = TimestampMillisecondVector::from_vec(vec![4, 5, 6]);
        let mut columns: Vec<VectorRef> = vec![Arc::new(key), Arc::new(ts)];

        columns.extend((0..num_field_columns).map(|i| {
            let value = i as i64;
            Arc::new(Int64Vector::from_slice([value, value, value])) as VectorRef
        }));

        // The two trailing columns come last, in this order.
        columns.push(Arc::new(UInt64Vector::from_slice([100, 100, 100])));
        columns.push(Arc::new(UInt8Vector::from_slice([0, 0, 0])));
        Batch::new(columns)
    }
}

View File

@@ -1,611 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Utilities for resolving schema compatibility problems.
use datatypes::arrow::record_batch::RecordBatch;
use datatypes::schema::SchemaRef;
use datatypes::vectors::{Helper, VectorRef};
use snafu::{ensure, OptionExt, ResultExt};
use crate::error::{self, Result};
use crate::metadata::ColumnMetadata;
use crate::read::Batch;
use crate::schema::{ProjectedSchemaRef, StoreSchemaRef};
/// Makes the schema of a value compatible with another schema, so the value can be
/// written to a target that uses that schema.
pub trait CompatWrite {
/// Makes the schema of `self` compatible with `dest_schema`.
///
/// For each column in `dest_schema` but not in `self`, this method inserts a
/// vector with the default value.
///
/// If `self` has columns that are not in `dest_schema`, an error is returned.
fn compat_write(&mut self, dest_schema: &SchemaRef) -> Result<()>;
}
/// Decides whether data stored under `source_column` can be read as `dest_column`.
///
/// Returns
/// - `Ok(true)` if the ids, data types and nullability allow reading the source column
///   with the destination column as schema.
/// - `Ok(false)` if the column ids differ, i.e. they are considered different columns.
/// - `Err` if the names differ, the data types differ, or a nullable source would be read
///   as a non-null destination.
fn is_source_column_compatible(
    source_column: &ColumnMetadata,
    dest_column: &ColumnMetadata,
) -> Result<bool> {
    let names_match = source_column.name() == dest_column.name();
    ensure!(
        names_match,
        error::CompatReadSnafu {
            reason: format!(
                "try to use column in {} for column {}",
                source_column.name(),
                dest_column.name()
            ),
        }
    );

    // Same name but a different id: not the same column, yet not an error either.
    let same_id = source_column.id() == dest_column.id();
    if !same_id {
        return Ok(false);
    }

    let source_type = &source_column.desc.data_type;
    let dest_type = &dest_column.desc.data_type;
    ensure!(
        source_type == dest_type,
        error::CompatReadSnafu {
            reason: format!(
                "could not read column {} from {:?} type as {:?} type",
                dest_column.name(),
                source_type,
                dest_type
            ),
        }
    );

    // Only the combination "non-null destination, nullable source" is rejected.
    let nullable_into_non_null =
        !dest_column.desc.is_nullable() && source_column.desc.is_nullable();
    ensure!(
        !nullable_into_non_null,
        error::CompatReadSnafu {
            reason: format!(
                "unable to read nullable data for non null column {}",
                dest_column.name()
            ),
        }
    );

    Ok(true)
}
/// Adapter that helps read data stored with a source schema as data with a dest schema.
#[derive(Debug)]
pub struct ReadAdapter {
/// Schema of data source.
source_schema: StoreSchemaRef,
/// Schema user expects to read.
dest_schema: ProjectedSchemaRef,
/// For each column in dest schema, stores the index in read result for
/// this column, or None if the column is not in result.
///
/// This vec is left empty if `source_version == dest_version`.
indices_in_result: Vec<Option<usize>>,
/// For each column in source schema, stores whether we need to read that column. All
/// columns are needed by default.
is_source_needed: Vec<bool>,
}
impl ReadAdapter {
/// Builds a [ReadAdapter] converting data stored with `source_schema` into data shaped
/// like `dest_schema`.
///
/// Which construction path is taken depends on whether the source version equals the
/// version of the schema to read.
pub fn new(
    source_schema: StoreSchemaRef,
    dest_schema: ProjectedSchemaRef,
) -> Result<ReadAdapter> {
    let same_version = source_schema.version() == dest_schema.schema_to_read().version();
    if !same_version {
        return ReadAdapter::from_different_version(source_schema, dest_schema);
    }
    ReadAdapter::from_same_version(source_schema, dest_schema)
}
/// Builds an adapter for schemas of the same version.
///
/// No index remapping is recorded (`indices_in_result` stays empty). When the column
/// counts differ, source field columns that `dest_schema` does not need are marked as
/// not needed.
fn from_same_version(
    source_schema: StoreSchemaRef,
    dest_schema: ProjectedSchemaRef,
) -> Result<ReadAdapter> {
    let mut is_source_needed = vec![true; source_schema.num_columns()];

    // A differing column count means `dest_schema` is projected.
    let projected = source_schema.num_columns() != dest_schema.schema_to_read().num_columns();
    if projected {
        for (offset, field_column) in source_schema.field_columns().iter().enumerate() {
            if dest_schema.is_needed(field_column.id()) {
                continue;
            }
            // Translate the field offset into an index over all source columns.
            let column_idx = source_schema.field_column_index_by_offset(offset);
            is_source_needed[column_idx] = false;
        }
    }

    Ok(ReadAdapter {
        source_schema,
        dest_schema,
        indices_in_result: Vec::new(),
        is_source_needed,
    })
}
/// Builds an adapter for schemas of different versions.
///
/// Every source column is looked up by name in the schema to read. Compatible columns
/// get a slot in the read result, in source order. Columns that are absent from the
/// destination, or that share a name but are a different column, are marked as not needed.
/// Incompatibilities reported by `is_source_column_compatible` are returned as errors.
fn from_different_version(
    source_schema: StoreSchemaRef,
    dest_schema: ProjectedSchemaRef,
) -> Result<ReadAdapter> {
    let schema_to_read = dest_schema.schema_to_read();
    let mut indices_in_result = vec![None; schema_to_read.num_columns()];
    let mut is_source_needed = vec![true; source_schema.num_columns()];
    // Position the next readable column will have in the read result. Skipped source
    // columns do not occupy a position, so the source index cannot be used directly.
    let mut next_result_idx = 0;

    for (source_idx, source_column) in source_schema.columns().iter().enumerate() {
        let found = schema_to_read
            .schema()
            .column_index_by_name(source_column.name());
        // `Some(dest_idx)` only when the column exists in the destination AND is compatible.
        let readable_as = match found {
            Some(dest_idx) => {
                let dest_column = &schema_to_read.columns()[dest_idx];
                if is_source_column_compatible(source_column, dest_column)? {
                    Some(dest_idx)
                } else {
                    None
                }
            }
            None => None,
        };

        match readable_as {
            Some(dest_idx) => {
                indices_in_result[dest_idx] = Some(next_result_idx);
                next_result_idx += 1;
            }
            // Not read from the source; a destination column without a slot is later
            // filled with its default value.
            None => is_source_needed[source_idx] = false,
        }
    }

    Ok(ReadAdapter {
        source_schema,
        dest_schema,
        indices_in_result,
        is_source_needed,
    })
}
/// Returns, for each key column of the source, whether it needs to be read.
///
/// This is the part of `is_source_needed` before `row_key_end()`.
#[inline]
pub fn source_key_needed(&self) -> &[bool] {
    let key_end = self.source_schema.row_key_end();
    &self.is_source_needed[..key_end]
}
/// Returns, for each value column of the source, whether it needs to be read.
///
/// This is the part of `is_source_needed` between `row_key_end()` and `user_column_end()`.
#[inline]
pub fn source_value_needed(&self) -> &[bool] {
    let value_start = self.source_schema.row_key_end();
    let value_end = self.source_schema.user_column_end();
    &self.is_source_needed[value_start..value_end]
}
/// Assembles a [Batch] from row key columns, field columns, sequences and op types,
/// in that order.
///
/// When the source and destination versions differ, the columns are additionally
/// rearranged / padded to match the destination schema.
///
/// NOTE(review): earlier docs stated this panics when an input `VectorRef` is empty;
/// nothing in this method shows that — confirm against `Batch::new`.
pub fn batch_from_parts(
    &self,
    row_key_columns: Vec<VectorRef>,
    mut field_columns: Vec<VectorRef>,
    sequences: VectorRef,
    op_types: VectorRef,
) -> Result<Batch> {
    // All vectors are expected to have the same length, so the length of `sequences`
    // is used as the row count.
    let num_rows = sequences.len();

    let mut source = row_key_columns;
    // Make room for the field columns plus the two internal columns.
    source.reserve(field_columns.len() + 2);
    source.append(&mut field_columns);
    // Internal columns go last: sequence first, then op_type.
    source.push(sequences);
    source.push(op_types);

    if self.need_compat() {
        self.source_columns_to_batch(source, num_rows)
    } else {
        Ok(Batch::new(source))
    }
}
/// Returns the indices of the source columns that need to be read from the parquet file,
/// in ascending order.
pub fn fields_to_read(&self) -> Vec<usize> {
    let mut indices = Vec::new();
    for (idx, needed) in self.is_source_needed.iter().enumerate() {
        if *needed {
            indices.push(idx);
        }
    }
    indices
}
/// Converts a [RecordBatch] read from the parquet file into a [Batch].
///
/// The [RecordBatch] should have the same schema as [`ReadAdapter::fields_to_read()`].
///
/// Returns an error if a column cannot be converted into a vector, or if building a
/// default column for the destination schema fails.
pub fn arrow_record_batch_to_batch(&self, record_batch: &RecordBatch) -> Result<Batch> {
// Names of the needed source columns, used to label conversion errors. The iterator
// is lazy and is paired with the record batch columns below.
let names = self
.source_schema
.schema()
.column_schemas()
.iter()
.zip(self.is_source_needed.iter())
.filter_map(|(column_schema, is_needed)| {
if *is_needed {
Some(&column_schema.name)
} else {
None
}
});
let source = record_batch
.columns()
.iter()
.zip(names)
.map(|(column, name)| {
Helper::try_into_vector(column.clone()).context(error::ConvertChunkSnafu { name })
})
.collect::<Result<_>>()?;
// Same version, or no rows at all: the converted columns are returned as they are,
// without padding for the destination schema.
if !self.need_compat() || record_batch.num_rows() == 0 {
return Ok(Batch::new(source));
}
let num_rows = record_batch.num_rows();
self.source_columns_to_batch(source, num_rows)
}
/// Returns true when the source schema version differs from the version of the schema
/// to read.
#[inline]
fn need_compat(&self) -> bool {
    let source_version = self.source_schema.version();
    let dest_version = self.dest_schema.schema_to_read().version();
    source_version != dest_version
}
/// Arranges `source` columns into the column order of the schema to read.
///
/// For each destination column, `indices_in_result` either names the position of the
/// matching column in `source`, or is `None`, in which case a default vector of
/// `num_rows` rows is created from the column schema. Fails if the default vector
/// cannot be created or the column has no default.
fn source_columns_to_batch(&self, source: Vec<VectorRef>, num_rows: usize) -> Result<Batch> {
let column_schemas = self.dest_schema.schema_to_read().schema().column_schemas();
let columns = self
.indices_in_result
.iter()
.zip(column_schemas)
.map(|(index_opt, column_schema)| {
if let Some(idx) = index_opt {
Ok(source[*idx].clone())
} else {
// Two failure modes, each with its own error context: creating the default
// vector failed, or the column has no default (presumably `Result<Option<_>>`
// — confirm against `create_default_vector`).
let vector = column_schema
.create_default_vector(num_rows)
.context(error::CreateDefaultToReadSnafu {
column: &column_schema.name,
})?
.context(error::NoDefaultToReadSnafu {
column: &column_schema.name,
})?;
Ok(vector)
}
})
.collect::<Result<Vec<_>>>()?;
Ok(Batch::new(columns))
}
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use datatypes::data_type::ConcreteDataType;
use datatypes::schema::Schema;
use store_api::storage::ColumnDescriptorBuilder;
use super::*;
use crate::error::Error;
use crate::metadata::RegionMetadata;
use crate::schema::{tests, ProjectedSchema, RegionSchema};
use crate::test_util::{descriptor_util, schema_util};
/// Splits `batch` into key, value, sequence and op_type parts and feeds them to
/// `ReadAdapter::batch_from_parts`, panicking on error.
///
/// The first two columns are treated as the key, followed by `num_field_columns` value
/// columns, then the sequence column and the op_type column.
fn call_batch_from_parts(
    adapter: &ReadAdapter,
    batch: &Batch,
    num_field_columns: usize,
) -> Batch {
    const NUM_KEY_COLUMNS: usize = 2;
    let fields_end = NUM_KEY_COLUMNS + num_field_columns;

    let key = batch.columns()[..NUM_KEY_COLUMNS].to_vec();
    let value = batch.columns()[NUM_KEY_COLUMNS..fields_end].to_vec();
    let sequence = batch.column(fields_end).clone();
    let op_type = batch.column(fields_end + 1).clone();

    adapter
        .batch_from_parts(key, value, sequence, op_type)
        .unwrap()
}
/// Asserts that rebuilding `batch` through `batch_from_parts` yields an equal batch.
fn check_batch_from_parts_without_padding(
    adapter: &ReadAdapter,
    batch: &Batch,
    num_field_columns: usize,
) {
    let rebuilt = call_batch_from_parts(adapter, batch, num_field_columns);
    assert_eq!(*batch, rebuilt);
}
/// Converts `batch` into an arrow `RecordBatch` whose schema consists of the source
/// columns the adapter marks as needed, then runs it through
/// `ReadAdapter::arrow_record_batch_to_batch`, panicking on error.
fn call_arrow_chunk_to_batch(adapter: &ReadAdapter, batch: &Batch) -> Batch {
// Column schemas of the needed source columns only.
let columns_schema = adapter
.source_schema
.columns()
.iter()
.zip(adapter.is_source_needed.iter())
.filter_map(|(field, is_needed)| {
if *is_needed {
Some(field.to_column_schema().unwrap())
} else {
None
}
})
.collect::<Vec<_>>();
let arrow_schema = Schema::try_new(columns_schema)
.unwrap()
.arrow_schema()
.clone();
let arrays = batch.columns().iter().map(|v| v.to_arrow_array()).collect();
let chunk = RecordBatch::try_new(arrow_schema, arrays).unwrap();
adapter.arrow_record_batch_to_batch(&chunk).unwrap()
}
/// Asserts that a round trip through an arrow record batch yields a batch equal to `batch`.
fn check_arrow_chunk_to_batch_without_padding(adapter: &ReadAdapter, batch: &Batch) {
    let round_tripped = call_arrow_chunk_to_batch(adapter, batch);
    assert_eq!(*batch, round_tripped);
}
/// Asserts that `new_batch` is `batch` with all-null columns inserted at the positions
/// listed in `null_columns`.
fn check_batch_with_null_padding(batch: &Batch, new_batch: &Batch, null_columns: &[usize]) {
    let expected_columns = batch.num_columns() + null_columns.len();
    assert_eq!(expected_columns, new_batch.num_columns());

    // Drop the padded positions; what remains must be the original columns.
    let mut columns_from_source = Vec::new();
    for (i, v) in new_batch.columns().iter().enumerate() {
        if !null_columns.contains(&i) {
            columns_from_source.push(v.clone());
        }
    }
    assert_eq!(batch.columns(), &columns_from_source);

    for &idx in null_columns {
        assert!(new_batch.column(idx).only_null());
    }
}
/// Same schema on both sides and no projection: every column is needed and batches pass
/// through unchanged.
#[test]
fn test_compat_same_schema() {
// (k0, timestamp, v0, v1) with version 0.
let region_schema = Arc::new(schema_util::new_region_schema(0, 2));
let projected_schema = Arc::new(ProjectedSchema::no_projection(region_schema.clone()));
let source_schema = region_schema.store_schema().clone();
let adapter = ReadAdapter::new(source_schema, projected_schema).unwrap();
assert_eq!(&[true, true], adapter.source_key_needed());
assert_eq!(&[true, true], adapter.source_value_needed());
let batch = tests::new_batch_with_num_values(2);
check_batch_from_parts_without_padding(&adapter, &batch, 2);
assert_eq!(&adapter.fields_to_read(), &[0, 1, 2, 3, 4, 5],);
check_arrow_chunk_to_batch_without_padding(&adapter, &batch);
}
/// Same version with a projection: the value column that is not projected is skipped.
#[test]
fn test_compat_same_version_with_projection() {
// (k0, timestamp, v0, v1) with version 0.
let region_schema = Arc::new(schema_util::new_region_schema(0, 2));
// Just read v0, k0.
let projected_schema =
Arc::new(ProjectedSchema::new(region_schema.clone(), Some(vec![2, 0])).unwrap());
let source_schema = region_schema.store_schema().clone();
let adapter = ReadAdapter::new(source_schema, projected_schema).unwrap();
assert_eq!(&[true, true], adapter.source_key_needed());
assert_eq!(&[true, false], adapter.source_value_needed());
// One value column has been filtered out, so the result batch should only contain one value column.
let batch = tests::new_batch_with_num_values(1);
check_batch_from_parts_without_padding(&adapter, &batch, 1);
assert_eq!(&adapter.fields_to_read(), &[0, 1, 2, 4, 5]);
check_arrow_chunk_to_batch_without_padding(&adapter, &batch);
}
/// Reads data written with version 0 through a version 1 schema that has the same
/// number of field columns: nothing is skipped and nothing is padded.
#[test]
fn test_compat_old_column() {
// (k0, timestamp, v0) with version 0.
let region_schema_old = Arc::new(schema_util::new_region_schema(0, 1));
// Version 1, built with a single field column like the old schema.
// NOTE(review): this comment previously listed (k0, timestamp, v0, v1), but the call
// below requests one field column — confirm which was intended.
let region_schema_new = Arc::new(schema_util::new_region_schema(1, 1));
// Just read v0, k0
let projected_schema =
Arc::new(ProjectedSchema::new(region_schema_new, Some(vec![2, 0])).unwrap());
let source_schema = region_schema_old.store_schema().clone();
let adapter = ReadAdapter::new(source_schema, projected_schema).unwrap();
assert_eq!(&[true, true], adapter.source_key_needed());
assert_eq!(&[true], adapter.source_value_needed());
let batch = tests::new_batch_with_num_values(1);
check_batch_from_parts_without_padding(&adapter, &batch, 1);
assert_eq!(&adapter.fields_to_read(), &[0, 1, 2, 3, 4],);
check_arrow_chunk_to_batch_without_padding(&adapter, &batch);
}
/// Reads old data through a newer schema that has an extra column: the column missing
/// from the source is padded with nulls.
#[test]
fn test_compat_new_column() {
// (k0, timestamp, v0, v1) with version 0.
let region_schema_old = Arc::new(schema_util::new_region_schema(0, 2));
// (k0, timestamp, v0, v1, v2) with version 1.
let region_schema_new = Arc::new(schema_util::new_region_schema(1, 3));
// Just read v2, v0, k0
let projected_schema =
Arc::new(ProjectedSchema::new(region_schema_new, Some(vec![4, 2, 0])).unwrap());
let source_schema = region_schema_old.store_schema().clone();
let adapter = ReadAdapter::new(source_schema, projected_schema).unwrap();
assert_eq!(&[true, true], adapter.source_key_needed());
assert_eq!(&[true, false], adapter.source_value_needed());
// Only read one value column from source.
let batch = tests::new_batch_with_num_values(1);
// The source batch contains k0, timestamp, v0, sequence, op_type.
let new_batch = call_batch_from_parts(&adapter, &batch, 1);
// v2 is filled by null.
check_batch_with_null_padding(&batch, &new_batch, &[3]);
assert_eq!(&adapter.fields_to_read(), &[0, 1, 2, 4, 5],);
let new_batch = call_arrow_chunk_to_batch(&adapter, &batch);
check_batch_with_null_padding(&batch, &new_batch, &[3]);
}
/// A destination column with the same name but a different column id is treated as a
/// different column: the source column is skipped and the destination is null-padded.
#[test]
fn test_compat_different_column() {
// (k0, timestamp, v0, v1) with version 0.
let region_schema_old = Arc::new(schema_util::new_region_schema(0, 2));
let mut descriptor = descriptor_util::desc_with_field_columns(tests::REGION_NAME, 2);
// Assign a much larger column id to v0.
descriptor.default_cf.columns[0].id = descriptor.default_cf.columns.last().unwrap().id + 10;
let metadata: RegionMetadata = descriptor.try_into().unwrap();
let columns = metadata.columns;
// (k0, timestamp, v0, v1) with version 2, and v0 has different column id.
let region_schema_new = Arc::new(RegionSchema::new(columns, 2).unwrap());
let projected_schema = Arc::new(ProjectedSchema::no_projection(region_schema_new));
let source_schema = region_schema_old.store_schema().clone();
let adapter = ReadAdapter::new(source_schema, projected_schema).unwrap();
assert_eq!(&[true, true], adapter.source_key_needed());
// v0 is discarded as it has different column id than new schema's.
assert_eq!(&[false, true], adapter.source_value_needed());
// The source batch should contain k0, timestamp, v1, sequence, op_type, so we need to
// remove v0 from the created batch.
let batch = tests::new_batch_with_num_values(2);
let mut columns = batch.columns().to_vec();
// Remove v0.
let _ = columns.remove(2);
let batch = Batch::new(columns);
let new_batch = call_batch_from_parts(&adapter, &batch, 1);
// v0 is filled by null.
check_batch_with_null_padding(&batch, &new_batch, &[2]);
assert_eq!(&adapter.fields_to_read(), &[0, 1, 3, 4, 5],);
let new_batch = call_arrow_chunk_to_batch(&adapter, &batch);
check_batch_with_null_padding(&batch, &new_batch, &[2]);
}
/// Returns a builder for an `int32` column named "test" with id 10.
#[inline]
fn new_column_desc_builder() -> ColumnDescriptorBuilder {
    let data_type = ConcreteDataType::int32_datatype();
    ColumnDescriptorBuilder::new(10, "test", data_type)
}
/// A column is compatible with itself; a column with a different id is reported as a
/// different column (`Ok(false)`).
#[test]
fn test_is_source_column_compatible() {
let desc = new_column_desc_builder().build().unwrap();
let source = ColumnMetadata { cf_id: 1, desc };
// Same column is always compatible, also tests read nullable column
// as a nullable column.
assert!(is_source_column_compatible(&source, &source).unwrap());
// Different id.
let desc = new_column_desc_builder()
.id(source.desc.id + 1)
.build()
.unwrap();
let dest = ColumnMetadata { cf_id: 1, desc };
assert!(!is_source_column_compatible(&source, &dest).unwrap());
}
/// Reading a nullable source column as a non-nullable column must fail with
/// `Error::CompatRead`.
#[test]
fn test_nullable_column_read_by_not_null() {
    let nullable_desc = new_column_desc_builder().build().unwrap();
    assert!(nullable_desc.is_nullable());
    let source = ColumnMetadata {
        cf_id: 1,
        desc: nullable_desc,
    };

    let dest = ColumnMetadata {
        cf_id: 1,
        desc: new_column_desc_builder()
            .is_nullable(false)
            .build()
            .unwrap(),
    };

    let err = is_source_column_compatible(&source, &dest).unwrap_err();
    assert!(
        matches!(err, Error::CompatRead { .. }),
        "{err:?} is not CompatRead",
    );
}
/// A non-nullable source column can be read both as a non-nullable and as a
/// nullable column.
#[test]
fn test_read_not_null_column() {
    // Builds a fresh non-nullable column on every call.
    let not_null_column = || ColumnMetadata {
        cf_id: 1,
        desc: new_column_desc_builder()
            .is_nullable(false)
            .build()
            .unwrap(),
    };

    let source = not_null_column();
    let not_null_dest = not_null_column();
    assert!(is_source_column_compatible(&source, &not_null_dest).unwrap());

    let null_dest = ColumnMetadata {
        cf_id: 1,
        desc: new_column_desc_builder().build().unwrap(),
    };
    assert!(is_source_column_compatible(&source, &null_dest).unwrap());
}
/// Two columns that differ only by name must be rejected with
/// `Error::CompatRead`.
#[test]
fn test_read_column_with_different_name() {
    let source = ColumnMetadata {
        cf_id: 1,
        desc: new_column_desc_builder().build().unwrap(),
    };

    let renamed = format!("{}_other", source.desc.name);
    let dest = ColumnMetadata {
        cf_id: 1,
        desc: new_column_desc_builder().name(renamed).build().unwrap(),
    };

    let err = is_source_column_compatible(&source, &dest).unwrap_err();
    assert!(
        matches!(err, Error::CompatRead { .. }),
        "{err:?} is not CompatRead",
    );
}
}

View File

@@ -1,590 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::cmp::Ordering;
use std::collections::{BTreeSet, HashMap};
use std::sync::Arc;
use api::v1::OpType;
use common_base::BitVec;
use datatypes::prelude::ScalarVector;
use datatypes::schema::{SchemaBuilder, SchemaRef};
use datatypes::vectors::{BooleanVector, UInt8Vector};
use snafu::{ensure, ResultExt};
use store_api::storage::{Chunk, ColumnId};
use crate::error;
use crate::metadata::{self, Result};
use crate::read::{Batch, BatchOp};
use crate::schema::{RegionSchema, RegionSchemaRef, StoreSchema, StoreSchemaRef};
/// Metadata about a projection over a region schema.
///
/// Built by `Projection::new` from the column indices requested by the caller.
/// All indices stored here are indices into the region schema unless stated
/// otherwise.
#[derive(Debug, Default)]
struct Projection {
/// Column indices of the projection, in the order given by the caller.
projected_columns: Vec<usize>,
/// Sorted and deduplicated indices of the columns to read. Includes all row key
/// columns and the internal columns (sequence, op_type), which come last.
///
/// We use these indices to read from data sources.
columns_to_read: Vec<usize>,
/// Maps a column id to its index in `columns_to_read`.
///
/// Used to ask whether the column with the given column id is needed in the projection.
id_to_read_idx: HashMap<ColumnId, usize>,
/// Maps an index of `projected_columns` to the index of that column in `columns_to_read`.
///
/// Invariant:
/// - `projected_idx_to_read_idx.len() == projected_columns.len()`
projected_idx_to_read_idx: Vec<usize>,
/// Number of user columns to read, i.e. the projected columns plus the row key
/// columns, counted before the internal columns are added.
num_user_columns: usize,
}
impl Projection {
    /// Computes the projection metadata for `projected_columns` over `region_schema`.
    ///
    /// The columns to read are the projected columns plus every row key column,
    /// followed by the two internal columns (sequence and op_type).
    ///
    /// # Panics
    /// Panics if the internal columns do not end up as the last two entries of
    /// the columns to read, i.e. if the region schema does not keep its internal
    /// columns after all user columns.
    fn new(region_schema: &RegionSchema, projected_columns: Vec<usize>) -> Projection {
        // A `BTreeSet` yields the indices sorted and deduplicated.
        let mut read_set: BTreeSet<usize> = BTreeSet::new();
        read_set.extend(projected_columns.iter().copied());
        read_set.extend(region_schema.row_key_indices());
        // Only user columns have been collected up to this point.
        let num_user_columns = read_set.len();

        // Internal columns are always read.
        let sequence_index = region_schema.sequence_index();
        let op_type_index = region_schema.op_type_index();
        read_set.insert(sequence_index);
        read_set.insert(op_type_index);

        let columns_to_read: Vec<usize> = read_set.into_iter().collect();
        // The region schema guarantees that the last two columns are the internal ones.
        assert_eq!(sequence_index, columns_to_read[num_user_columns]);
        assert_eq!(op_type_index, columns_to_read[num_user_columns + 1]);

        // <column id> => <index in `columns_to_read`>
        let mut id_to_read_idx = HashMap::new();
        for (read_idx, &col_idx) in columns_to_read.iter().enumerate() {
            id_to_read_idx.insert(region_schema.column_metadata(col_idx).id(), read_idx);
        }

        // Resolve every projected column to its position in `columns_to_read`
        // through its column id.
        let projected_idx_to_read_idx = projected_columns
            .iter()
            .map(|&col_idx| {
                let column_id = region_schema.column_metadata(col_idx).id();
                // `columns_to_read` is a superset of `projected_columns`, so the
                // lookup cannot miss.
                *id_to_read_idx.get(&column_id).unwrap()
            })
            .collect();

        Projection {
            projected_columns,
            columns_to_read,
            id_to_read_idx,
            projected_idx_to_read_idx,
            num_user_columns,
        }
    }
}
/// Schema with projection info.
///
/// Bundles the schema used to read from data sources with the schema that is
/// exposed to the user after the projection is applied.
#[derive(Debug)]
pub struct ProjectedSchema {
/// Projection info, `None` means no projection is needed (all columns are read).
projection: Option<Projection>,
/// Schema used to read from data sources.
schema_to_read: StoreSchemaRef,
/// User schema after projection.
projected_user_schema: SchemaRef,
}
/// Shared, reference-counted [ProjectedSchema].
pub type ProjectedSchemaRef = Arc<ProjectedSchema>;
impl ProjectedSchema {
    /// Creates a `ProjectedSchema` for the given `projected_columns`.
    ///
    /// `None` reads every column. `Some(indices)` reads only the user columns
    /// at those indices of the region schema.
    ///
    /// # Errors
    /// Returns `Err` if the index list is empty or contains an index that is
    /// not a user column, or if the resulting schemas cannot be built.
    pub fn new(
        region_schema: RegionSchemaRef,
        projected_columns: Option<Vec<usize>>,
    ) -> Result<ProjectedSchema> {
        let indices = match projected_columns {
            None => return Ok(ProjectedSchema::no_projection(region_schema)),
            Some(indices) => indices,
        };

        Self::validate_projection(&region_schema, &indices)?;
        let projection = Projection::new(&region_schema, indices);
        let schema_to_read = Self::build_schema_to_read(&region_schema, &projection)?;
        let projected_user_schema =
            Self::build_projected_user_schema(&region_schema, &projection)?;

        Ok(ProjectedSchema {
            projection: Some(projection),
            schema_to_read,
            projected_user_schema,
        })
    }

    /// Creates a `ProjectedSchema` that reads all columns.
    pub fn no_projection(region_schema: RegionSchemaRef) -> ProjectedSchema {
        // Without a projection the store schema and the user schema of the
        // region can be reused as they are.
        let schema_to_read = region_schema.store_schema().clone();
        let projected_user_schema = region_schema.user_schema().clone();

        ProjectedSchema {
            projection: None,
            schema_to_read,
            projected_user_schema,
        }
    }

    /// Returns the user schema after projection.
    #[inline]
    pub fn projected_user_schema(&self) -> &SchemaRef {
        &self.projected_user_schema
    }

    /// Returns the schema used to read from data sources.
    #[inline]
    pub fn schema_to_read(&self) -> &StoreSchemaRef {
        &self.schema_to_read
    }

    /// Converts a [Batch] into a [Chunk], dropping all internal columns.
    ///
    /// The input `batch` should have the same schema as
    /// [`self.schema_to_read()`](ProjectedSchema::schema_to_read); the output
    /// [Chunk] has the same schema as
    /// [`self.projected_user_schema()`](ProjectedSchema::projected_user_schema).
    pub fn batch_to_chunk(&self, batch: &Batch) -> Chunk {
        let columns = if let Some(projection) = &self.projection {
            // Pick the projected columns in the order requested by the user.
            projection
                .projected_idx_to_read_idx
                .iter()
                .map(|&read_idx| batch.column(read_idx).clone())
                .collect()
        } else {
            // No projection: the user columns are the leading columns of the batch.
            let num_user_columns = self.projected_user_schema.num_columns();
            batch
                .columns()
                .iter()
                .take(num_user_columns)
                .cloned()
                .collect()
        };

        Chunk::new(columns)
    }

    /// Returns true if the column with the given `column_id` has to be read.
    ///
    /// Every column is needed when there is no projection.
    pub fn is_needed(&self, column_id: ColumnId) -> bool {
        match &self.projection {
            Some(projection) => projection.id_to_read_idx.contains_key(&column_id),
            None => true,
        }
    }

    /// Builds the store schema that contains exactly the columns to read.
    fn build_schema_to_read(
        region_schema: &RegionSchema,
        projection: &Projection,
    ) -> Result<StoreSchemaRef> {
        // Columns in the order given by `columns_to_read`.
        let columns: Vec<_> = projection
            .columns_to_read
            .iter()
            .map(|&col_idx| region_schema.column_metadata(col_idx).clone())
            .collect();

        // All row key columns are kept in this schema, so the `row_key_end` of
        // the region schema is still valid here.
        StoreSchema::new(
            columns,
            region_schema.version(),
            region_schema.row_key_end(),
            projection.num_user_columns,
        )
        .map(Arc::new)
    }

    /// Builds the user-visible schema that contains only the projected columns,
    /// in projection order.
    fn build_projected_user_schema(
        region_schema: &RegionSchema,
        projection: &Projection,
    ) -> Result<SchemaRef> {
        let mut column_schemas = Vec::with_capacity(projection.projected_columns.len());
        for &col_idx in &projection.projected_columns {
            let column = region_schema.column_metadata(col_idx);
            column_schemas.push(column.desc.to_column_schema());
        }

        let schema = SchemaBuilder::try_from(column_schemas)
            .context(metadata::ConvertSchemaSnafu)?
            .version(region_schema.version())
            .build()
            .context(metadata::InvalidSchemaSnafu)?;

        Ok(Arc::new(schema))
    }

    /// Checks that `indices` is a non-empty list of user column indices.
    fn validate_projection(region_schema: &RegionSchema, indices: &[usize]) -> Result<()> {
        // An empty projection is rejected: at least the timestamp column is
        // always read, and the `StoreSchema` requires it as well.
        ensure!(
            !indices.is_empty(),
            metadata::InvalidProjectionSnafu {
                msg: "at least one column should be read",
            }
        );

        // Only user columns may be projected for now.
        let num_user_columns = region_schema.user_schema().num_columns();
        for &index in indices {
            ensure!(
                index < num_user_columns,
                metadata::InvalidProjectionSnafu {
                    msg: format!(
                        "index {} out of bound, only contains {} columns",
                        index, num_user_columns
                    ),
                }
            );
        }

        Ok(())
    }
}
impl BatchOp for ProjectedSchema {
fn compare_row(&self, left: &Batch, i: usize, right: &Batch, j: usize) -> Ordering {
// Ordered by (row_key asc, sequence desc, op_type desc).
let indices = self.schema_to_read.row_key_indices();
for idx in indices {
let (left_col, right_col) = (left.column(idx), right.column(idx));
// Comparison of vector is done by virtual method calls currently. Consider using
// enum dispatch if this becomes bottleneck.
let order = left_col.get_ref(i).cmp(&right_col.get_ref(j));
if order != Ordering::Equal {
return order;
}
}
let (sequence_index, op_type_index) = (
self.schema_to_read.sequence_index(),
self.schema_to_read.op_type_index(),
);
right
.column(sequence_index)
.get_ref(j)
.cmp(&left.column(sequence_index).get_ref(i))
.then_with(|| {
right
.column(op_type_index)
.get_ref(j)
.cmp(&left.column(op_type_index).get_ref(i))
})
}
fn find_unique(&self, batch: &Batch, selected: &mut BitVec, prev: Option<&Batch>) {
if let Some(prev) = prev {
assert_eq!(batch.num_columns(), prev.num_columns());
}
let indices = self.schema_to_read.row_key_indices();
for idx in indices {
let (current, prev_col) = (
batch.column(idx),
prev.map(|prev| prev.column(idx).as_ref()),
);
current.find_unique(selected, prev_col);
}
}
fn filter(&self, batch: &Batch, filter: &BooleanVector) -> error::Result<Batch> {
let columns = batch
.columns()
.iter()
.enumerate()
.map(|(i, v)| {
v.filter(filter).context(error::FilterColumnSnafu {
name: self.schema_to_read.column_name(i),
})
})
.collect::<error::Result<Vec<_>>>()?;
Ok(Batch::new(columns))
}
fn unselect_deleted(&self, batch: &Batch, selected: &mut BitVec) {
let op_types = batch.column(self.schema_to_read.op_type_index());
// Safety: We expect the batch has the same schema as `self.schema_to_read`. The
// read procedure should guarantee this, otherwise this is a critical bug and it
// should be fine to panic.
let op_types = op_types
.as_any()
.downcast_ref::<UInt8Vector>()
.unwrap_or_else(|| {
panic!(
"Expect op_type (UInt8) column at index {}, given {:?}",
self.schema_to_read.op_type_index(),
op_types.data_type()
);
});
for (i, op_type) in op_types.iter_data().enumerate() {
if op_type == Some(OpType::Delete as u8) {
selected.set(i, false);
}
}
}
}
#[cfg(test)]
mod tests {
    use api::v1::OpType;
    use datatypes::prelude::ScalarVector;
    use datatypes::type_id::LogicalTypeId;
    use datatypes::vectors::{TimestampMillisecondVector, VectorRef};

    use super::*;
    use crate::metadata::Error;
    use crate::schema::tests;
    use crate::test_util::{read_util, schema_util};

    /// Checks the metadata computed by `Projection::new` for ordered,
    /// unordered and empty projections.
    #[test]
    fn test_projection() {
        // Two value columns, so the user schema is (k0, timestamp, v0, v1).
        let region_schema = schema_util::new_region_schema(0, 2);

        // Case 1: the projection keeps the column order -> (timestamp, v0).
        let in_order = vec![1, 2];
        let projection = Projection::new(&region_schema, in_order.clone());
        assert_eq!(in_order, projection.projected_columns);
        // Reads (k0, timestamp, v0, sequence, op_type).
        assert_eq!(&[0, 1, 2, 4, 5], &projection.columns_to_read[..]);
        assert_eq!(5, projection.id_to_read_idx.len());
        // Positions of timestamp and v0 in `columns_to_read`.
        assert_eq!(&[1, 2], &projection.projected_idx_to_read_idx[..]);
        // User columns: k0, timestamp, v0.
        assert_eq!(3, projection.num_user_columns);

        // Case 2: unordered projection -> (timestamp, v1, k0).
        let unordered = vec![1, 3, 0];
        let projection = Projection::new(&region_schema, unordered.clone());
        assert_eq!(unordered, projection.projected_columns);
        // Reads (k0, timestamp, v1, sequence, op_type).
        assert_eq!(&[0, 1, 3, 4, 5], &projection.columns_to_read[..]);
        assert_eq!(5, projection.id_to_read_idx.len());
        // Positions of timestamp, v1 and k0 in `columns_to_read`.
        assert_eq!(&[1, 2, 0], &projection.projected_idx_to_read_idx[..]);
        // User columns: k0, timestamp, v1.
        assert_eq!(3, projection.num_user_columns);

        // Case 3: empty projection.
        let projection = Projection::new(&region_schema, Vec::new());
        assert!(projection.projected_columns.is_empty());
        // The row keys and internal columns are still read.
        assert_eq!(&[0, 1, 4, 5], &projection.columns_to_read[..]);
        assert_eq!(4, projection.id_to_read_idx.len());
        assert!(projection.projected_idx_to_read_idx.is_empty());
        assert_eq!(2, projection.num_user_columns);
    }

    /// Checks the user schema, `is_needed`, the schema to read and the chunk
    /// conversion of a schema built with a projection.
    #[test]
    fn test_projected_schema_with_projection() {
        // (k0, timestamp, v0, v1, v2)
        let region_schema = Arc::new(schema_util::new_region_schema(123, 3));

        // Projected to (v1, timestamp).
        let projected_schema =
            ProjectedSchema::new(region_schema.clone(), Some(vec![3, 1])).unwrap();
        let expect_user = schema_util::new_schema_with_version(
            &[
                ("v1", LogicalTypeId::Int64, true),
                ("timestamp", LogicalTypeId::TimestampMillisecond, false),
            ],
            Some(1),
            123,
        );
        assert_eq!(expect_user, **projected_schema.projected_user_schema());

        // Collect the indices of all columns reported as needed.
        let needed: Vec<_> = region_schema
            .columns()
            .iter()
            .enumerate()
            .filter(|(_, column_meta)| projected_schema.is_needed(column_meta.id()))
            .map(|(idx, _)| idx)
            .collect();
        // (k0, timestamp, v1, sequence, op_type)
        assert_eq!(&[0, 1, 3, 5, 6], &needed[..]);

        // A second projection: (v0, timestamp).
        let projected_schema = ProjectedSchema::new(region_schema, Some(vec![2, 1])).unwrap();
        // The schema to read must equal the store schema of a region with
        // (k0, timestamp, v0). `new_schema_with_version()` cannot be used here
        // because the StoreSchema carries extra metadata it cannot produce.
        let expect_schema = schema_util::new_region_schema(123, 1);
        assert_eq!(
            expect_schema.store_schema(),
            projected_schema.schema_to_read()
        );

        // (k0, timestamp, v0, sequence, op_type)
        let batch = tests::new_batch();
        // Batch -> Chunk yields (v0, timestamp).
        let chunk = projected_schema.batch_to_chunk(&batch);
        assert_eq!(2, chunk.columns.len());
        assert_eq!(&chunk.columns[0], batch.column(2));
        assert_eq!(&chunk.columns[1], batch.column(1));
    }

    /// Without a projection the region's schemas are reused and every column
    /// is needed.
    #[test]
    fn test_projected_schema_no_projection() {
        // (k0, timestamp, v0)
        let region_schema = Arc::new(schema_util::new_region_schema(123, 1));
        let projected_schema = ProjectedSchema::no_projection(region_schema.clone());

        assert_eq!(
            region_schema.user_schema(),
            projected_schema.projected_user_schema()
        );
        assert_eq!(
            region_schema.store_schema(),
            projected_schema.schema_to_read()
        );
        for column_meta in region_schema.columns() {
            assert!(projected_schema.is_needed(column_meta.id()));
        }

        // (k0, timestamp, v0, sequence, op_type)
        let batch = tests::new_batch();
        // Batch -> Chunk yields (k0, timestamp, v0).
        let chunk = projected_schema.batch_to_chunk(&batch);
        assert_eq!(3, chunk.columns.len());
    }

    /// An empty projection is rejected with `Error::InvalidProjection`.
    #[test]
    fn test_projected_schema_empty_projection() {
        // (k0, timestamp, v0)
        let region_schema = Arc::new(schema_util::new_region_schema(123, 1));
        let err = ProjectedSchema::new(region_schema, Some(Vec::new()))
            .err()
            .unwrap();
        assert!(matches!(err, Error::InvalidProjection { .. }));
    }

    /// Checks the three possible outcomes of `compare_row`.
    #[test]
    fn test_compare_batch() {
        let schema = read_util::new_projected_schema();
        let left = read_util::new_full_kv_batch(&[(1000, 1, 1000, OpType::Put)]);
        let right = read_util::new_full_kv_batch(&[
            (999, 1, 1000, OpType::Put),
            (1000, 1, 999, OpType::Put),
            (1000, 1, 1000, OpType::Put),
        ]);

        assert_eq!(Ordering::Greater, schema.compare_row(&left, 0, &right, 0));
        assert_eq!(Ordering::Less, schema.compare_row(&left, 0, &right, 1));
        assert_eq!(Ordering::Equal, schema.compare_row(&left, 0, &right, 2));
    }

    /// Checks `find_unique` with and without a previous batch.
    #[test]
    fn test_batch_find_unique() {
        let schema = read_util::new_projected_schema();
        let batch = read_util::new_kv_batch(&[(1000, Some(1)), (2000, Some(2)), (2000, Some(2))]);

        // No previous batch.
        let mut selected = BitVec::repeat(false, 3);
        schema.find_unique(&batch, &mut selected, None);
        assert!(selected[0]);
        assert!(selected[1]);
        assert!(!selected[2]);

        // With a previous batch whose row has the same key as row 0.
        let mut selected = BitVec::repeat(false, 3);
        let prev = read_util::new_kv_batch(&[(1000, Some(1))]);
        schema.find_unique(&batch, &mut selected, Some(&prev));
        assert!(!selected[0]);
        assert!(selected[1]);
        assert!(!selected[2]);
    }

    /// Checks `find_unique` on a batch that carries sequences and op types.
    #[test]
    fn test_find_unique_with_op() {
        let schema = read_util::new_projected_schema();
        let batch = read_util::new_full_kv_batch(&[
            (1001, 1, 3, OpType::Put),
            (1000, 1, 2, OpType::Delete),
            (1000, 1, 1, OpType::Put),
        ]);

        let mut selected = BitVec::repeat(false, 3);
        schema.find_unique(&batch, &mut selected, None);
        assert!(selected[0]);
        assert!(selected[1]);
        assert!(!selected[2]);
    }

    /// Filtering keeps only the rows whose filter value is true.
    #[test]
    fn test_filter_batch() {
        let schema = read_util::new_projected_schema();
        let batch = read_util::new_kv_batch(&[(1000, Some(1)), (2000, Some(2)), (3000, Some(3))]);
        let filter = BooleanVector::from_slice(&[true, false, true]);

        let filtered = schema.filter(&batch, &filter).unwrap();
        let expect: VectorRef = Arc::new(TimestampMillisecondVector::from_values([1000, 3000]));
        assert_eq!(expect, *filtered.column(0));
    }

    /// Rows with `OpType::Delete` are unselected, all other rows stay selected.
    #[test]
    fn test_unselect_deleted() {
        let schema = read_util::new_projected_schema();
        let batch = read_util::new_full_kv_batch(&[
            (100, 1, 1000, OpType::Put),
            (101, 1, 999, OpType::Delete),
            (102, 1, 1000, OpType::Put),
            (103, 1, 999, OpType::Put),
            (104, 1, 1000, OpType::Delete),
        ]);

        let mut selected = BitVec::repeat(true, batch.num_rows());
        schema.unselect_deleted(&batch, &mut selected);
        assert_eq!(
            BitVec::from_iter([true, false, true, true, false]),
            selected
        );
    }
}

View File

@@ -1,214 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::fmt;
use std::sync::Arc;
use datatypes::schema::{Schema, SchemaBuilder, SchemaRef};
use snafu::ResultExt;
use crate::metadata::{self, ColumnMetadata, ColumnsMetadata, ColumnsMetadataRef, Result};
use crate::schema::{StoreSchema, StoreSchemaRef};
/// Schema of a region.
///
/// The `RegionSchema` has the knowledge of reserved and internal columns.
/// Reserved columns are columns whose names and ids are reserved by the storage
/// engine and cannot be used by the user. Reserved columns usually have
/// special usage. Reserved columns except the version columns are also
/// called internal columns (though the version could also be thought of as a
/// special kind of internal column); they are not visible to the user, such as
/// our internal sequence and op_type columns.
///
/// The user schema is the schema that only contains the columns the user can
/// access, i.e. the schema the user created.
#[derive(PartialEq, Eq)]
pub struct RegionSchema {
/// Schema that only contains columns that the user defined, excluding internal columns
/// that are reserved and used by the storage engine.
///
/// Holding a [SchemaRef] to allow converting into `SchemaRef`/`arrow::SchemaRef`
/// conveniently. The fields order in `SchemaRef` **must** be consistent with
/// columns order in [ColumnsMetadata] to ensure the projection index of a field
/// is correct.
user_schema: SchemaRef,
/// Store schema that contains all columns of the region, including all internal columns.
store_schema: StoreSchemaRef,
/// Metadata of columns.
columns: ColumnsMetadataRef,
}
impl fmt::Debug for RegionSchema {
    /// Formats the schema as a struct named `RegionSchema` that shows only the
    /// `columns` field; the two derived schemas are left out.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let mut debug = f.debug_struct("RegionSchema");
        debug.field("columns", &self.columns);
        debug.finish()
    }
}
impl RegionSchema {
    /// Builds the user schema and the store schema of a region from its
    /// `columns` metadata, both tagged with `version`.
    ///
    /// # Errors
    /// Returns `Err` if either schema cannot be built.
    pub fn new(columns: ColumnsMetadataRef, version: u32) -> Result<RegionSchema> {
        let user_schema = build_user_schema(&columns, version).map(Arc::new)?;
        let store_schema = StoreSchema::from_columns_metadata(&columns, version).map(Arc::new)?;

        // Both schemas must carry the requested version.
        debug_assert_eq!(user_schema.version(), store_schema.version());
        debug_assert_eq!(version, user_schema.version());

        Ok(RegionSchema {
            user_schema,
            store_schema,
            columns,
        })
    }

    /// Returns the schema of the region without the internal columns used by
    /// the storage engine.
    #[inline]
    pub fn user_schema(&self) -> &SchemaRef {
        &self.user_schema
    }

    /// Returns the schema that is actually stored, which also contains all
    /// internal columns.
    #[inline]
    pub fn store_schema(&self) -> &StoreSchemaRef {
        &self.store_schema
    }

    /// Iterates over the metadata of the row key columns.
    #[inline]
    pub fn row_key_columns(&self) -> impl Iterator<Item = &ColumnMetadata> {
        self.columns.iter_row_key_columns()
    }

    /// Iterates over the metadata of the field columns.
    #[inline]
    pub fn field_columns(&self) -> impl Iterator<Item = &ColumnMetadata> {
        self.columns.iter_field_columns()
    }

    /// Returns the number of row key columns.
    #[inline]
    pub fn num_row_key_columns(&self) -> usize {
        self.columns.num_row_key_columns()
    }

    /// Returns the number of field columns.
    #[inline]
    pub fn num_field_columns(&self) -> usize {
        self.columns.num_field_columns()
    }

    /// Returns the schema version, taken from the user schema.
    #[inline]
    pub fn version(&self) -> u32 {
        self.user_schema.version()
    }

    /// Returns the row key end reported by the columns metadata.
    #[inline]
    pub(crate) fn row_key_end(&self) -> usize {
        self.columns.row_key_end()
    }

    /// Returns the index of the sequence column in the store schema.
    #[inline]
    pub(crate) fn sequence_index(&self) -> usize {
        self.store_schema.sequence_index()
    }

    /// Returns the index of the op_type column in the store schema.
    #[inline]
    pub(crate) fn op_type_index(&self) -> usize {
        self.store_schema.op_type_index()
    }

    /// Iterates over the indices of the row key columns in the store schema.
    #[inline]
    pub(crate) fn row_key_indices(&self) -> impl Iterator<Item = usize> {
        self.store_schema.row_key_indices()
    }

    /// Returns the index of the timestamp column in the store schema.
    #[inline]
    pub fn timestamp_index(&self) -> usize {
        self.store_schema.timestamp_index()
    }

    /// Returns the name of the timestamp column.
    #[inline]
    pub(crate) fn timestamp_column_name(&self) -> &str {
        let timestamp_index = self.timestamp_index();
        self.store_schema.column_name(timestamp_index)
    }

    /// Iterates over the indices of the value columns in the store schema.
    #[inline]
    pub(crate) fn value_indices(&self) -> impl Iterator<Item = usize> {
        self.store_schema.value_indices()
    }

    /// Returns the metadata of the column at `idx`, as resolved by the
    /// columns metadata.
    #[inline]
    pub fn column_metadata(&self, idx: usize) -> &ColumnMetadata {
        self.columns.column_metadata(idx)
    }

    /// Returns the metadata of all columns (test only).
    #[cfg(test)]
    pub(crate) fn columns(&self) -> &[ColumnMetadata] {
        self.columns.columns()
    }
}

/// Shared, reference-counted [RegionSchema].
pub type RegionSchemaRef = Arc<RegionSchema>;
/// Builds the user schema from the user columns of `columns`, tagged with `version`.
///
/// Unlike the store schema, the user schema currently carries no extra metadata.
fn build_user_schema(columns: &ColumnsMetadata, version: u32) -> Result<Schema> {
    let mut column_schemas = Vec::new();
    for column in columns.iter_user_columns() {
        column_schemas.push(column.desc.to_column_schema());
    }

    SchemaBuilder::try_from(column_schemas)
        .context(metadata::ConvertSchemaSnafu)?
        .version(version)
        .build()
        .context(metadata::InvalidSchemaSnafu)
}
#[cfg(test)]
mod tests {
    use datatypes::type_id::LogicalTypeId;

    use super::*;
    use crate::test_util::schema_util;

    /// Checks the user schema, the row key / field column accessors and the
    /// version of a region schema with one value column.
    #[test]
    fn test_region_schema() {
        let region_schema = Arc::new(schema_util::new_region_schema(123, 1));

        // User schema.
        let expect_schema = schema_util::new_schema_with_version(
            &[
                ("k0", LogicalTypeId::Int64, false),
                ("timestamp", LogicalTypeId::TimestampMillisecond, false),
                ("v0", LogicalTypeId::Int64, true),
            ],
            Some(1),
            123,
        );
        assert_eq!(expect_schema, **region_schema.user_schema());

        // Row key columns: k0 followed by timestamp.
        let mut row_key_iter = region_schema.row_key_columns();
        assert_eq!("k0", row_key_iter.next().unwrap().desc.name);
        assert_eq!("timestamp", row_key_iter.next().unwrap().desc.name);
        assert_eq!(None, row_key_iter.next());
        assert_eq!(2, region_schema.num_row_key_columns());

        // Field columns: only v0.
        let mut field_iter = region_schema.field_columns();
        assert_eq!("v0", field_iter.next().unwrap().desc.name);
        assert_eq!(None, field_iter.next());
        assert_eq!(1, region_schema.num_field_columns());

        // Version.
        assert_eq!(123, region_schema.version());
    }
}

View File

@@ -1,323 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::HashMap;
use std::sync::Arc;
use datatypes::arrow::datatypes::Schema as ArrowSchema;
use datatypes::arrow::record_batch::RecordBatch;
use datatypes::schema::{Schema, SchemaBuilder, SchemaRef};
use snafu::{ensure, OptionExt, ResultExt};
use store_api::storage::consts;
use crate::error::NewRecordBatchSnafu;
use crate::metadata::{self, ColumnMetadata, ColumnsMetadata, Error, Result};
use crate::read::Batch;
// Schema metadata key under which `StoreSchema::row_key_end` is stored.
const ROW_KEY_END_KEY: &str = "greptime:storage:row_key_end";
// Schema metadata key under which `StoreSchema::user_column_end` is stored.
const USER_COLUMN_END_KEY: &str = "greptime:storage:user_column_end";
/// Schema that contains storage engine specific metadata, such as internal columns.
///
/// Used internally, contains all row key columns, internal columns and a subset of
/// value columns in a region. The columns are organized in `key, value, internal` order.
#[derive(Debug, PartialEq, Eq)]
pub struct StoreSchema {
// Metadata of every column in the schema, in schema order.
columns: Vec<ColumnMetadata>,
// The schema built from `columns`; its metadata also records `row_key_end`
// and `user_column_end`.
schema: SchemaRef,
// Ending index (exclusive) of the row key columns.
row_key_end: usize,
// Ending index (exclusive) of the user columns; the sequence column sits at
// this index and the op_type column right after it.
user_column_end: usize,
}
/// Shared, reference-counted [StoreSchema].
pub type StoreSchemaRef = Arc<StoreSchema>;
impl StoreSchema {
#[inline]
pub fn version(&self) -> u32 {
self.schema.version()
}
#[inline]
pub fn schema(&self) -> &SchemaRef {
&self.schema
}
#[inline]
pub fn arrow_schema(&self) -> &Arc<ArrowSchema> {
self.schema.arrow_schema()
}
// TODO(yingwen): Remove this method.
pub fn batch_to_arrow_record_batch(
&self,
batch: &Batch,
) -> std::result::Result<RecordBatch, crate::error::Error> {
assert_eq!(self.schema.num_columns(), batch.num_columns(),);
RecordBatch::try_new(
self.schema.arrow_schema().clone(),
batch.columns().iter().map(|v| v.to_arrow_array()).collect(),
)
.context(NewRecordBatchSnafu)
}
/// Returns the ending index of row key columns.
///
/// The ending index has the same value as the number of the row key columns.
#[inline]
pub fn row_key_end(&self) -> usize {
self.row_key_end
}
/// Returns the index of timestamp column.
/// We always assume that timestamp is the last column in [StoreSchema].
#[inline]
pub fn timestamp_index(&self) -> usize {
self.row_key_end - 1
}
pub(crate) fn contains_column(&self, name: &str) -> bool {
self.schema.column_schema_by_name(name).is_some()
}
pub(crate) fn is_key_column(&self, name: &str) -> bool {
self.schema
.column_index_by_name(name)
.map(|idx| idx < self.row_key_end)
.unwrap_or(false)
}
pub(crate) fn is_user_column(&self, name: &str) -> bool {
self.schema
.column_index_by_name(name)
.map(|idx| idx < self.user_column_end)
.unwrap_or(false)
}
pub(crate) fn from_columns_metadata(
columns: &ColumnsMetadata,
version: u32,
) -> Result<StoreSchema> {
StoreSchema::new(
columns.columns().to_vec(),
version,
columns.row_key_end(),
columns.user_column_end(),
)
}
pub(crate) fn new(
columns: Vec<ColumnMetadata>,
version: u32,
row_key_end: usize,
user_column_end: usize,
) -> Result<StoreSchema> {
let column_schemas = columns
.iter()
.map(|meta| meta.to_column_schema())
.collect::<Result<Vec<_>>>()?;
let schema = SchemaBuilder::try_from(column_schemas)
.context(metadata::ConvertSchemaSnafu)?
.version(version)
.add_metadata(ROW_KEY_END_KEY, row_key_end.to_string())
.add_metadata(USER_COLUMN_END_KEY, user_column_end.to_string())
.build()
.context(metadata::InvalidSchemaSnafu)?;
assert_eq!(
consts::SEQUENCE_COLUMN_NAME,
schema.column_schemas()[user_column_end].name
);
assert_eq!(
consts::OP_TYPE_COLUMN_NAME,
schema.column_schemas()[user_column_end + 1].name
);
Ok(StoreSchema {
columns,
schema: Arc::new(schema),
row_key_end,
user_column_end,
})
}
#[inline]
pub(crate) fn sequence_index(&self) -> usize {
self.user_column_end
}
#[inline]
pub(crate) fn op_type_index(&self) -> usize {
self.user_column_end + 1
}
#[inline]
pub(crate) fn row_key_indices(&self) -> impl Iterator<Item = usize> {
0..self.row_key_end
}
#[inline]
pub(crate) fn value_indices(&self) -> impl Iterator<Item = usize> {
self.row_key_end..self.user_column_end
}
#[inline]
pub(crate) fn column_name(&self, idx: usize) -> &str {
&self.schema.column_schemas()[idx].name
}
/// # Panic
/// Panics if `name` is not a valid column name.
#[inline]
pub(crate) fn column_index(&self, name: &str) -> usize {
self.schema.column_index_by_name(name).unwrap()
}
#[inline]
pub(crate) fn num_columns(&self) -> usize {
self.schema.num_columns()
}
#[inline]
pub(crate) fn user_column_end(&self) -> usize {
self.user_column_end
}
#[inline]
pub(crate) fn field_columns(&self) -> &[ColumnMetadata] {
&self.columns[self.row_key_end..self.user_column_end]
}
/// Returns the index of the value column according its `offset`.
#[inline]
pub(crate) fn field_column_index_by_offset(&self, offset: usize) -> usize {
self.row_key_end + offset
}
#[inline]
pub(crate) fn columns(&self) -> &[ColumnMetadata] {
&self.columns
}
}
impl TryFrom<Arc<ArrowSchema>> for StoreSchema {
    type Error = Error;

    /// Recovers a `StoreSchema` from an arrow schema produced by `StoreSchema`.
    ///
    /// `row_key_end` and `user_column_end` are read back from the schema
    /// metadata, and the column metadata is rebuilt from the column schemas.
    ///
    /// # Errors
    /// - The arrow schema cannot be converted into a [Schema].
    /// - A metadata entry is missing or is not a valid integer.
    /// - The columns at `user_column_end` and `user_column_end + 1` are not
    ///   the sequence and op_type columns, or those indices are out of range
    ///   (`InvalidIndex`).
    /// - A column's metadata cannot be recovered from its column schema.
    fn try_from(arrow_schema: Arc<ArrowSchema>) -> std::result::Result<Self, Self::Error> {
        let schema = Schema::try_from(arrow_schema).context(metadata::ConvertArrowSchemaSnafu)?;
        // Recover other metadata from schema.
        let row_key_end = parse_index_from_metadata(schema.metadata(), ROW_KEY_END_KEY)?;
        let user_column_end = parse_index_from_metadata(schema.metadata(), USER_COLUMN_END_KEY)?;

        // The indices come from (possibly corrupted) file metadata, so look the
        // columns up with checked access instead of indexing: an out-of-range
        // index must be reported as an error, not as a panic.
        let column_schemas = schema.column_schemas();
        let has_name_at = |idx: Option<usize>, expected: &str| {
            idx.and_then(|idx| column_schemas.get(idx))
                .map_or(false, |column| column.name == expected)
        };

        // There should be sequence and op_type columns.
        ensure!(
            has_name_at(Some(user_column_end), consts::SEQUENCE_COLUMN_NAME),
            metadata::InvalidIndexSnafu
        );
        ensure!(
            has_name_at(user_column_end.checked_add(1), consts::OP_TYPE_COLUMN_NAME),
            metadata::InvalidIndexSnafu
        );

        // Recover ColumnMetadata from schema.
        let columns = column_schemas
            .iter()
            .map(ColumnMetadata::from_column_schema)
            .collect::<Result<_>>()?;

        Ok(StoreSchema {
            columns,
            schema: Arc::new(schema),
            row_key_end,
            user_column_end,
        })
    }
}
impl TryFrom<ArrowSchema> for StoreSchema {
    type Error = Error;

    /// Wraps the owned arrow schema in an [Arc] and delegates to the
    /// conversion from `Arc<ArrowSchema>`.
    fn try_from(arrow_schema: ArrowSchema) -> std::result::Result<StoreSchema, Self::Error> {
        let shared_schema = Arc::new(arrow_schema);
        StoreSchema::try_from(shared_schema)
    }
}
/// Looks up `key` in `metadata` and parses its value as a `usize` index.
///
/// # Errors
/// Returns `MetaNotFound` if `key` is absent, and `ParseMetaInt` (carrying the
/// `key=value` pair) if the value is not a valid integer.
fn parse_index_from_metadata(metadata: &HashMap<String, String>, key: &str) -> Result<usize> {
    let value = metadata
        .get(key)
        .context(metadata::MetaNotFoundSnafu { key })?;

    let parsed: std::result::Result<usize, _> = value.parse();
    parsed.with_context(|_| metadata::ParseMetaIntSnafu {
        key_value: format!("{key}={value}"),
    })
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::read::Batch;
    use crate::schema::tests;
    use crate::test_util::schema_util;

    /// Asserts that `record_batch` has 5 columns and 3 rows, and that each
    /// column equals the arrow array of the matching column of `batch`.
    fn check_chunk_batch(record_batch: &RecordBatch, batch: &Batch) {
        assert_eq!(5, record_batch.num_columns());
        assert_eq!(3, record_batch.num_rows());

        for i in 0..5 {
            assert_eq!(record_batch.column(i), &batch.column(i).to_arrow_array());
        }
    }

    /// Checks the arrow round trip, the column layout and the record batch
    /// conversion of a store schema with one value column.
    #[test]
    fn test_store_schema() {
        let region_schema = Arc::new(schema_util::new_region_schema(123, 1));

        // Checks StoreSchema.
        let store_schema = region_schema.store_schema();
        assert_eq!(123, store_schema.version());

        // Converting to an arrow schema and back must be lossless.
        let sst_arrow_schema = store_schema.arrow_schema();
        let round_tripped = StoreSchema::try_from((**sst_arrow_schema).clone()).unwrap();
        assert_eq!(**store_schema, round_tripped);

        let mut column_schemas = Vec::new();
        for column_meta in region_schema.columns() {
            column_schemas.push(column_meta.to_column_schema().unwrap());
        }
        let expect_schema = SchemaBuilder::try_from(column_schemas)
            .unwrap()
            .version(123)
            .build()
            .unwrap();
        // Compare only the column schemas: the SchemaRef inside StoreSchema
        // carries extra metadata that is used by StoreSchema alone.
        assert_eq!(
            expect_schema.column_schemas(),
            store_schema.schema().column_schemas(),
        );

        // Layout: (k0, timestamp, v0, sequence, op_type).
        assert_eq!(3, store_schema.sequence_index());
        assert_eq!(4, store_schema.op_type_index());
        let row_key_indices: Vec<_> = store_schema.row_key_indices().collect();
        assert_eq!([0, 1], &row_key_indices[..]);
        let value_indices: Vec<_> = store_schema.value_indices().collect();
        assert_eq!([2], &value_indices[..]);

        // Convert a batch into an arrow record batch and compare the columns.
        let batch = tests::new_batch();
        let record_batch = store_schema.batch_to_arrow_record_batch(&batch).unwrap();
        check_chunk_batch(&record_batch, &batch);
    }
}

View File

@@ -1,103 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::cmp;
use async_trait::async_trait;
use store_api::storage::{
GetRequest, GetResponse, ReadContext, ScanRequest, ScanResponse, SchemaRef, SequenceNumber,
Snapshot,
};
use crate::chunk::{ChunkReaderBuilder, ChunkReaderImpl};
use crate::error::{Error, Result};
use crate::sst::AccessLayerRef;
use crate::version::VersionRef;
/// [Snapshot] implementation.
///
/// Reads are served from the memtables and SSTs of the captured `version`.
pub struct SnapshotImpl {
/// Version whose schema, memtables and SSTs are read by this snapshot.
version: VersionRef,
/// Max sequence number (inclusive) visible to user.
visible_sequence: SequenceNumber,
/// Access layer handed to the chunk reader builder when scanning.
sst_layer: AccessLayerRef,
}
#[async_trait]
impl Snapshot for SnapshotImpl {
type Error = Error;
type Reader = ChunkReaderImpl;
/// Returns the user schema of the snapshot's version.
fn schema(&self) -> &SchemaRef {
self.version.user_schema()
}
/// Builds a chunk reader over the mutable memtable, all immutable memtables and all SSTs
/// of the version, applying the projection, filters and output ordering of `request`.
///
/// The sequence used for reading never exceeds the snapshot's visible sequence.
async fn scan(
&self,
ctx: &ReadContext,
request: ScanRequest,
) -> Result<ScanResponse<ChunkReaderImpl>> {
// Clamp the requested sequence to what this snapshot is allowed to see.
let visible_sequence = self.sequence_to_read(request.sequence);
let memtable_version = self.version.memtables();
let mutables = memtable_version.mutable_memtable();
let immutables = memtable_version.immutable_memtables();
let mut builder = ChunkReaderBuilder::new(
self.version.metadata().id(),
self.version.schema().clone(),
self.sst_layer.clone(),
)
.reserve_num_memtables(memtable_version.num_memtables())
.projection(request.projection)
.filters(request.filters)
.batch_size(ctx.batch_size)
.output_ordering(request.output_ordering)
.visible_sequence(visible_sequence)
.pick_memtables(mutables.clone())
.use_chain_reader(true);
// Immutable memtables are picked after the mutable one.
for memtable in immutables {
builder = builder.pick_memtables(memtable.clone());
}
let reader = builder.pick_all_ssts(self.version.ssts())?.build().await?;
Ok(ScanResponse { reader })
}
/// Point lookups are not implemented; calling this panics via `unimplemented!()`.
async fn get(&self, _ctx: &ReadContext, _request: GetRequest) -> Result<GetResponse> {
unimplemented!()
}
}
impl SnapshotImpl {
pub fn new(
version: VersionRef,
visible_sequence: SequenceNumber,
sst_layer: AccessLayerRef,
) -> SnapshotImpl {
SnapshotImpl {
version,
visible_sequence,
sst_layer,
}
}
#[inline]
fn sequence_to_read(&self, request_sequence: Option<SequenceNumber>) -> SequenceNumber {
request_sequence
.map(|s| cmp::min(s, self.visible_sequence))
.unwrap_or(self.visible_sequence)
}
}

View File

@@ -1,830 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
pub(crate) mod parquet;
mod pruning;
mod stream_writer;
use std::collections::HashMap;
use std::fmt;
use std::fmt::{Debug, Formatter};
use std::str::FromStr;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use async_trait::async_trait;
use common_base::readable_size::ReadableSize;
use common_recordbatch::SendableRecordBatchStream;
use common_telemetry::{debug, error};
use common_time::range::TimestampRange;
use common_time::Timestamp;
use datatypes::schema::SchemaRef;
use futures_util::StreamExt;
use object_store::{util, ObjectStore};
use serde::{Deserialize, Deserializer, Serialize};
use snafu::{ResultExt, Snafu};
use store_api::storage::{ChunkReader, RegionId};
use table::predicate::Predicate;
use uuid::Uuid;
use crate::chunk::ChunkReaderImpl;
use crate::error;
use crate::error::{DeleteSstSnafu, Result};
use crate::file_purger::{FilePurgeRequest, FilePurgerRef};
use crate::memtable::BoxedBatchIterator;
use crate::read::{Batch, BatchReader, BoxedBatchReader};
use crate::scheduler::Scheduler;
use crate::schema::ProjectedSchemaRef;
use crate::sst::parquet::{ChunkStream, ParquetReader, ParquetWriter};
/// Maximum level of SSTs.
///
/// Used as the number of levels held by [LevelMetas], so valid levels are `0..MAX_LEVEL`.
pub const MAX_LEVEL: u8 = 2;
/// SST level number.
pub type Level = u8;
pub use crate::sst::stream_writer::BufferedWriter;
// We only have a fixed number of levels, so we use an array to hold the elements. This
// implementation detail of LevelMetaVec should not be exposed to the users of [LevelMetas].
type LevelMetaVec = [LevelMeta; MAX_LEVEL as usize];
/// Metadata of all SSTs under a region.
///
/// Files are organized into multiple levels, though there may be only one level.
#[derive(Clone)]
pub struct LevelMetas {
/// Per-level file metadata, indexed by level.
levels: LevelMetaVec,
/// Access layer passed to every [FileHandle] created by `merge`.
sst_layer: AccessLayerRef,
/// File purger passed to every [FileHandle] created by `merge`.
file_purger: FilePurgerRef,
/// Compaction time window in seconds
compaction_time_window: Option<i64>,
}
impl std::fmt::Debug for LevelMetas {
    /// Formats only `levels` and `compaction_time_window`; the access layer and the file
    /// purger are omitted from the output.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut out = f.debug_struct("LevelMetas");
        out.field("levels", &self.levels);
        out.field("compaction_time_window", &self.compaction_time_window);
        out.finish()
    }
}
impl LevelMetas {
    /// Creates a new `LevelMetas` with every level initialized to an empty [LevelMeta] and
    /// no compaction time window.
    pub fn new(sst_layer: AccessLayerRef, file_purger: FilePurgerRef) -> LevelMetas {
        let levels = new_level_meta_vec();
        LevelMetas {
            levels,
            sst_layer,
            file_purger,
            compaction_time_window: None,
        }
    }

    /// Returns the total number of levels.
    #[inline]
    pub fn level_num(&self) -> usize {
        self.levels.len()
    }

    /// Returns the compaction time window in seconds, if one has been set.
    pub fn compaction_time_window(&self) -> Option<i64> {
        self.compaction_time_window
    }

    /// Returns the metadata of the given `level`.
    ///
    /// # Panics
    /// Panics if `level` is not smaller than [MAX_LEVEL].
    #[inline]
    pub fn level(&self, level: Level) -> &LevelMeta {
        let index = level as usize;
        &self.levels[index]
    }

    /// Merges `self` with the files to add/remove and returns the result as a new
    /// [LevelMetas]; `self` is left untouched.
    ///
    /// Files that are actually removed from a level are marked as deleted.
    ///
    /// # Panics
    /// Panics if the level of any file is greater than or equal to [MAX_LEVEL].
    pub fn merge(
        &self,
        files_to_add: impl Iterator<Item = FileMeta>,
        files_to_remove: impl Iterator<Item = FileMeta>,
        compaction_time_window: Option<i64>,
    ) -> LevelMetas {
        let mut merged = self.clone();

        for meta in files_to_add {
            let index = meta.level as usize;
            let handle = FileHandle::new(meta, self.sst_layer.clone(), self.file_purger.clone());
            merged.levels[index].add_file(handle);
        }

        for meta in files_to_remove {
            let index = meta.level as usize;
            let removed = merged.levels[index].remove_file(meta.file_id);
            if let Some(handle) = removed {
                handle.mark_deleted();
            }
        }

        // The region's compaction time window is only updated when it is not set yet and
        // the incoming window is present; an existing window always wins.
        merged.compaction_time_window = merged.compaction_time_window.or(compaction_time_window);

        merged
    }

    /// Marks every file of every level as deleted and returns the ids of those files.
    pub fn mark_all_files_deleted(&self) -> Vec<FileId> {
        self.levels
            .iter()
            .flat_map(|level| level.files())
            .map(|file| {
                file.mark_deleted();
                file.file_id()
            })
            .collect()
    }

    /// Returns the metadata of all levels.
    pub fn levels(&self) -> &[LevelMeta] {
        &self.levels
    }

    /// Returns a clone of the file purger handle.
    pub fn file_purger(&self) -> FilePurgerRef {
        self.file_purger.clone()
    }
}
/// Metadata of files in the same SST level.
#[derive(Default, Clone)]
pub struct LevelMeta {
/// The level this metadata describes.
level: Level,
/// Handles to the files in this level, keyed by file id.
// TODO(yingwen): Now for simplicity, files are unordered, maybe sort the files by time range
// or use another structure to hold them.
files: HashMap<FileId, FileHandle>,
}
impl std::fmt::Debug for LevelMeta {
    /// Formats the level together with the ids of its files; the file handles themselves
    /// are not printed.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut out = f.debug_struct("LevelMeta");
        out.field("level", &self.level);
        out.field("files", &self.files.keys());
        out.finish()
    }
}
impl LevelMeta {
    /// Creates an empty metadata for `level`.
    pub fn new(level: Level) -> Self {
        let files = HashMap::new();
        Self { level, files }
    }

    /// Registers `file` under its file id, replacing any handle stored with the same id.
    fn add_file(&mut self, file: FileHandle) {
        let file_id = file.file_id();
        let _ = self.files.insert(file_id, file);
    }

    /// Removes the file with the given id and returns its handle if it was present.
    fn remove_file(&mut self, file_to_remove: FileId) -> Option<FileHandle> {
        self.files.remove(&file_to_remove)
    }

    /// Returns the level of level meta.
    #[inline]
    pub fn level(&self) -> Level {
        self.level
    }

    /// Returns number of SST files in level.
    #[inline]
    pub fn file_num(&self) -> usize {
        self.files.len()
    }

    /// Returns the SSTs of this level whose time range ends before `expire_time`.
    ///
    /// Files without a time range are never considered expired.
    pub fn get_expired_files(&self, expire_time: &Timestamp) -> Vec<FileHandle> {
        self.files
            .values()
            .filter(|handle| matches!(handle.time_range(), Some((_, end)) if end < expire_time))
            .cloned()
            .collect()
    }

    /// Iterates over the handles of all files in this level, in no particular order.
    pub fn files(&self) -> impl Iterator<Item = &FileHandle> {
        self.files.values()
    }
}
fn new_level_meta_vec() -> LevelMetaVec {
(0u8..MAX_LEVEL)
.map(LevelMeta::new)
.collect::<Vec<_>>()
.try_into()
.unwrap() // safety: LevelMetaVec is a fixed length array with length MAX_LEVEL
}
/// Cloneable handle to an SST file; all clones share the same [FileHandleInner].
#[derive(Clone)]
pub struct FileHandle {
inner: Arc<FileHandleInner>,
}
impl Debug for FileHandle {
    /// Formats the file meta fields and the `compacting`/`deleted` flags of the handle.
    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
        let inner = &self.inner;
        let meta = &inner.meta;
        let mut out = f.debug_struct("FileHandle");
        out.field("file_id", &meta.file_id);
        out.field("region_id", &meta.region_id);
        out.field("time_range", &meta.time_range);
        out.field("size", &meta.file_size);
        out.field("level", &meta.level);
        out.field("compacting", &inner.compacting);
        out.field("deleted", &inner.deleted);
        out.finish()
    }
}
impl FileHandle {
    /// Creates a handle for the file described by `meta`.
    pub fn new(
        meta: FileMeta,
        sst_layer: AccessLayerRef,
        file_purger: FilePurgerRef,
    ) -> FileHandle {
        let inner = FileHandleInner::new(meta, sst_layer, file_purger);
        FileHandle {
            inner: Arc::new(inner),
        }
    }

    /// Returns the SST level of the file.
    #[inline]
    pub fn level(&self) -> Level {
        self.inner.meta.level
    }

    /// Returns the file name, i.e. the file id with the `.parquet` extension.
    #[inline]
    pub fn file_name(&self) -> String {
        let file_id = self.inner.meta.file_id;
        file_id.as_parquet()
    }

    /// Returns the path of the file as resolved by the SST access layer.
    #[inline]
    pub fn file_path(&self) -> String {
        let file_name = self.inner.meta.file_id.as_parquet();
        self.inner.sst_layer.sst_file_path(&file_name)
    }

    /// Returns the id of the file.
    #[inline]
    pub fn file_id(&self) -> FileId {
        self.inner.meta.file_id
    }

    /// Returns the timestamp range of the file, if known.
    #[inline]
    pub fn time_range(&self) -> &Option<(Timestamp, Timestamp)> {
        &self.inner.meta.time_range
    }

    /// Returns true if current file is under compaction.
    #[inline]
    pub fn compacting(&self) -> bool {
        let flag = &self.inner.compacting;
        flag.load(Ordering::Relaxed)
    }

    /// Sets the compacting flag.
    #[inline]
    pub fn mark_compacting(&self, compacting: bool) {
        let flag = &self.inner.compacting;
        flag.store(compacting, Ordering::Relaxed);
    }

    /// Returns true if the file has been marked as deleted.
    #[inline]
    pub fn deleted(&self) -> bool {
        let flag = &self.inner.deleted;
        flag.load(Ordering::Relaxed)
    }

    /// Marks the file as deleted.
    #[inline]
    pub fn mark_deleted(&self) {
        let flag = &self.inner.deleted;
        flag.store(true, Ordering::Relaxed);
    }

    /// Returns a copy of the file's metadata.
    #[inline]
    pub fn meta(&self) -> FileMeta {
        let meta = &self.inner.meta;
        meta.clone()
    }

    /// Returns the size of the file as recorded in its metadata.
    #[inline]
    pub fn file_size(&self) -> u64 {
        self.inner.meta.file_size
    }
}
/// Actual data of [FileHandle].
///
/// Contains meta of the file, and other mutable info like metrics.
struct FileHandleInner {
/// Immutable metadata of the file.
meta: FileMeta,
/// Whether the file is currently under compaction.
compacting: AtomicBool,
/// Whether the file has been marked as deleted; checked when the inner data is dropped.
deleted: AtomicBool,
/// Access layer used to resolve the file path and included in the purge request.
sst_layer: AccessLayerRef,
/// Purger that receives the purge request when a deleted file is dropped.
file_purger: FilePurgerRef,
}
impl fmt::Debug for FileHandleInner {
    /// Formats the meta and the two flags; the access layer and purger are omitted.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let mut out = f.debug_struct("FileHandleInner");
        out.field("meta", &self.meta);
        out.field("compacting", &self.compacting);
        out.field("deleted", &self.deleted);
        out.finish()
    }
}
impl Drop for FileHandleInner {
    /// Schedules a purge of the underlying SST when the last handle of a file that was
    /// marked as deleted goes away. Files not marked as deleted are left alone.
    fn drop(&mut self) {
        if !self.deleted.load(Ordering::Relaxed) {
            return;
        }

        let file_id = self.meta.file_id;
        let region_id = self.meta.region_id;
        let request = FilePurgeRequest {
            sst_layer: self.sst_layer.clone(),
            file_id,
            region_id,
        };

        // Scheduling failures are only logged since `drop` cannot report errors.
        match self.file_purger.schedule(request) {
            Ok(res) => {
                debug!(
                    "Scheduled SST purge task, region: {}, name: {}, res: {}",
                    region_id,
                    file_id.as_parquet(),
                    res
                );
            }
            Err(e) => {
                error!(e; "Failed to schedule SST purge task, region: {}, name: {}",
                    region_id, file_id.as_parquet());
            }
        }
    }
}
impl FileHandleInner {
fn new(
meta: FileMeta,
sst_layer: AccessLayerRef,
file_purger: FilePurgerRef,
) -> FileHandleInner {
FileHandleInner {
meta,
compacting: AtomicBool::new(false),
deleted: AtomicBool::new(false),
sst_layer,
file_purger,
}
}
}
/// Error returned when a string cannot be parsed into a [FileId].
#[derive(Debug, Snafu, PartialEq)]
pub struct ParseIdError {
/// The underlying UUID parse error.
source: uuid::Error,
}
/// Unique id for an SST file, backed by a UUID.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
pub struct FileId(Uuid);
impl FileId {
/// Returns a new unique [FileId] randomly.
pub fn random() -> FileId {
FileId(Uuid::new_v4())
}
/// Parses id from string.
pub fn parse_str(input: &str) -> std::result::Result<FileId, ParseIdError> {
Uuid::parse_str(input).map(FileId).context(ParseIdSnafu)
}
/// Append `.parquet` to file id to make a complete file name
pub fn as_parquet(&self) -> String {
format!("{}{}", self.0.hyphenated(), ".parquet")
}
}
impl fmt::Display for FileId {
    /// Writes the inner UUID using its `Display` representation.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let uuid = &self.0;
        f.write_fmt(format_args!("{}", uuid))
    }
}
impl FromStr for FileId {
type Err = ParseIdError;
fn from_str(s: &str) -> std::result::Result<FileId, ParseIdError> {
FileId::parse_str(s)
}
}
/// Immutable metadata of an SST file.
///
/// When deserializing, the file id may be given under the key `file_id` or its alias
/// `file_name`, and a trailing `.parquet` extension is stripped before parsing.
#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
#[serde(default)]
pub struct FileMeta {
/// Region of file.
pub region_id: RegionId,
/// Compared to normal file names, FileId ignores the extension.
#[serde(deserialize_with = "deserialize_from_string")]
#[serde(alias = "file_name")]
pub file_id: FileId,
/// Timestamp range of file.
pub time_range: Option<(Timestamp, Timestamp)>,
/// SST level of the file.
pub level: Level,
/// Size of the file.
pub file_size: u64,
}
/// Deserializes a [FileId] from a string, accepting both a bare id and a file name that
/// carries the `.parquet` extension.
///
/// # Errors
/// Returns a custom deserialization error if the (stripped) string is not a valid file id.
fn deserialize_from_string<'de, D>(deserializer: D) -> std::result::Result<FileId, D::Error>
where
    D: Deserializer<'de>,
{
    // Deserialize into an owned `String` rather than a borrowed `&str`: a borrowed string
    // can only be produced when the deserializer is able to hand out a slice of its input,
    // which is not the case for reader-based deserializers or strings containing escapes.
    let s = String::deserialize(deserializer)?;
    let stripped = s.strip_suffix(".parquet").unwrap_or(&s); // strip parquet suffix if needed.
    FileId::from_str(stripped).map_err(<D::Error as serde::de::Error>::custom)
}
/// Options for writing an SST file.
#[derive(Debug)]
pub struct WriteOptions {
// TODO(yingwen): [flush] row group size.
/// Size of the buffer used by the SST writer.
pub sst_write_buffer_size: ReadableSize,
}
impl Default for WriteOptions {
fn default() -> Self {
Self {
sst_write_buffer_size: ReadableSize::mb(8),
}
}
}
/// Options for reading an SST file.
pub struct ReadOptions {
/// Suggested size of each batch.
pub batch_size: usize,
/// The schema that the user expects to read, which might not be the same as the
/// schema of the SST file.
pub projected_schema: ProjectedSchemaRef,
/// Predicate passed to the SST reader.
pub predicate: Predicate,
/// Timestamp range passed to the SST reader.
pub time_range: TimestampRange,
}
/// Information about a written SST file.
#[derive(Debug, PartialEq)]
pub struct SstInfo {
/// Timestamp range of the rows in the file, if it could be determined.
pub time_range: Option<(Timestamp, Timestamp)>,
/// Size of the file.
pub file_size: u64,
/// Number of rows written to the file.
pub num_rows: usize,
}
/// SST access layer.
///
/// Abstracts how SST files are located, written, read and deleted.
#[async_trait]
pub trait AccessLayer: Send + Sync + std::fmt::Debug {
/// Returns the sst file path.
fn sst_file_path(&self, file_name: &str) -> String;
/// Writes SST file with given `file_id` and returns the SST info.
/// If source does not contain any data, `write_sst` will return `Ok(None)`.
async fn write_sst(
&self,
file_id: FileId,
source: Source,
opts: &WriteOptions,
) -> Result<Option<SstInfo>>;
/// Read SST file with given `file_handle` and schema.
async fn read_sst(
&self,
file_handle: FileHandle,
opts: &ReadOptions,
) -> Result<BoxedBatchReader>;
/// Deletes an SST file with the given file id.
async fn delete_sst(&self, file_id: FileId) -> Result<()>;
}
/// Shared, reference-counted [AccessLayer] trait object.
pub type AccessLayerRef = Arc<dyn AccessLayer>;
/// Parquet writer data source.
pub enum Source {
/// Writes rows from memtable to parquet.
Iter(BoxedBatchIterator),
/// Writes rows from ChunkReaderImpl (maybe a set of SSTs) to parquet.
Reader(ChunkReaderImpl),
/// Record batch stream yielded by table scan.
Stream(SendableRecordBatchStream),
}
impl Source {
async fn next_batch(&mut self) -> Result<Option<Batch>> {
match self {
Source::Iter(iter) => iter.next().transpose(),
Source::Reader(reader) => reader
.next_chunk()
.await
.map(|p| p.map(|chunk| Batch::new(chunk.columns))),
Source::Stream(stream) => stream
.next()
.await
.transpose()
.map(|r| r.map(|r| Batch::new(r.columns().to_vec())))
.context(error::CreateRecordBatchSnafu),
}
}
fn schema(&self) -> SchemaRef {
match self {
Source::Iter(iter) => {
let projected_schema = iter.schema();
projected_schema.schema_to_read().schema().clone()
}
Source::Reader(reader) => reader.projected_schema().schema_to_read().schema().clone(),
Source::Stream(stream) => stream.schema(),
}
}
}
/// SST access layer backed by an [ObjectStore].
pub struct FsAccessLayer {
/// Normalized directory under which SST files are stored.
sst_dir: String,
/// Object store used to write, read and delete SST files.
object_store: ObjectStore,
}
impl fmt::Debug for FsAccessLayer {
    /// Formats only the SST directory; the object store is omitted.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let mut out = f.debug_struct("FsAccessLayer");
        out.field("sst_dir", &self.sst_dir);
        out.finish()
    }
}
impl FsAccessLayer {
    /// Creates an access layer that stores SST files under the normalized `sst_dir` of
    /// `object_store`.
    pub fn new(sst_dir: &str, object_store: ObjectStore) -> FsAccessLayer {
        let sst_dir = util::normalize_dir(sst_dir);
        Self {
            sst_dir,
            object_store,
        }
    }
}
#[async_trait]
impl AccessLayer for FsAccessLayer {
    /// Returns the SST directory followed directly by `file_name`.
    fn sst_file_path(&self, file_name: &str) -> String {
        let mut path = String::with_capacity(self.sst_dir.len() + file_name.len());
        path.push_str(&self.sst_dir);
        path.push_str(file_name);
        path
    }

    /// Writes SST file with given `file_id`.
    async fn write_sst(
        &self,
        file_id: FileId,
        source: Source,
        opts: &WriteOptions,
    ) -> Result<Option<SstInfo>> {
        // Only the parquet format is supported for now. The caller may be allowed to
        // specify the SST format in WriteOptions in the future.
        let file_path = self.sst_file_path(&file_id.as_parquet());
        let object_store = self.object_store.clone();
        ParquetWriter::new(&file_path, source, object_store)
            .write_sst(opts)
            .await
    }

    /// Read SST file with given `file_handle` and schema.
    ///
    /// The returned reader is lazy: the parquet file is only opened on the first
    /// `next_batch` call.
    async fn read_sst(
        &self,
        file_handle: FileHandle,
        opts: &ReadOptions,
    ) -> Result<BoxedBatchReader> {
        let object_store = self.object_store.clone();
        let projected_schema = opts.projected_schema.clone();
        let predicate = opts.predicate.clone();
        let parquet_reader = ParquetReader::new(
            file_handle,
            object_store,
            projected_schema,
            predicate,
            opts.time_range,
        );
        let lazy_reader = LazyParquetBatchReader::new(parquet_reader);
        Ok(Box::new(lazy_reader))
    }

    /// Deletes a SST file with given file id.
    async fn delete_sst(&self, file_id: FileId) -> Result<()> {
        let path = self.sst_file_path(&file_id.as_parquet());
        let deleted = self.object_store.delete(&path).await;
        deleted.context(DeleteSstSnafu)
    }
}
/// Batch reader that defers creating the parquet chunk stream until the first batch is
/// requested.
struct LazyParquetBatchReader {
/// Reader used to create the chunk stream on first use.
inner: ParquetReader,
/// The chunk stream; `None` until the first `next_batch` call creates it.
stream: Option<ChunkStream>,
}
impl LazyParquetBatchReader {
fn new(inner: ParquetReader) -> Self {
Self {
inner,
stream: None,
}
}
}
#[async_trait]
impl BatchReader for LazyParquetBatchReader {
    /// Returns the next batch, creating the chunk stream on the first call.
    ///
    /// If creating the stream fails, the error is returned and creation is retried on the
    /// next call.
    async fn next_batch(&mut self) -> Result<Option<Batch>> {
        match self.stream.as_mut() {
            Some(stream) => stream.next_batch().await,
            None => {
                let mut opened = self.inner.chunk_stream().await?;
                let first_batch = opened.next_batch().await;
                // Keep the stream even if reading the first batch failed.
                self.stream = Some(opened);
                first_batch
            }
        }
    }
}
#[cfg(test)]
mod tests {
use std::collections::HashSet;
use super::*;
use crate::file_purger::noop::NoopFilePurgeHandler;
use crate::scheduler::{LocalScheduler, SchedulerConfig};
// A random file id round-trips through its string form via both `parse_str` and `FromStr`.
#[test]
fn test_file_id() {
let id = FileId::random();
let uuid_str = id.to_string();
assert_eq!(id.0.to_string(), uuid_str);
let parsed = FileId::parse_str(&uuid_str).unwrap();
assert_eq!(id, parsed);
let parsed = uuid_str.parse().unwrap();
assert_eq!(id, parsed);
}
// A file id serializes to a JSON string of its display form and deserializes back.
#[test]
fn test_file_id_serialization() {
let id = FileId::random();
let json = serde_json::to_string(&id).unwrap();
assert_eq!(format!("\"{id}\""), json);
let parsed = serde_json::from_str(&json).unwrap();
assert_eq!(id, parsed);
}
// A file meta round-trips through JSON.
#[test]
fn test_deserialize_file_meta() {
let file_meta = create_file_meta(FileId::random(), 0);
let serialized_file_meta = serde_json::to_string(&file_meta).unwrap();
let deserialized_file_meta = serde_json::from_str(&serialized_file_meta);
assert_eq!(file_meta, deserialized_file_meta.unwrap());
}
// The `file_id` key accepts a bare id; the missing `file_size` falls back to its default.
#[test]
fn test_deserialize_from_string() {
let json_file_meta = "{\"region_id\":0,\"file_id\":\"bc5896ec-e4d8-4017-a80d-f2de73188d55\",\"time_range\":null,\"level\":0}";
let file_meta = create_file_meta(
FileId::from_str("bc5896ec-e4d8-4017-a80d-f2de73188d55").unwrap(),
0,
);
let deserialized_file_meta: FileMeta = serde_json::from_str(json_file_meta).unwrap();
assert_eq!(file_meta, deserialized_file_meta);
}
// The `file_id` key also accepts an id carrying the `.parquet` extension.
#[test]
fn test_deserialize_from_string_parquet() {
let json_file_meta = "{\"region_id\":0,\"file_id\":\"bc5896ec-e4d8-4017-a80d-f2de73188d55.parquet\",\"time_range\":null,\"level\":0}";
let file_meta = create_file_meta(
FileId::from_str("bc5896ec-e4d8-4017-a80d-f2de73188d55").unwrap(),
0,
);
let deserialized_file_meta: FileMeta = serde_json::from_str(json_file_meta).unwrap();
assert_eq!(file_meta, deserialized_file_meta);
}
// The `file_name` alias is accepted in place of `file_id`.
#[test]
fn test_deserialize_from_string_parquet_file_name() {
let json_file_meta = "{\"region_id\":0,\"file_name\":\"bc5896ec-e4d8-4017-a80d-f2de73188d55.parquet\",\"time_range\":null,\"level\":0}";
let file_meta = create_file_meta(
FileId::from_str("bc5896ec-e4d8-4017-a80d-f2de73188d55").unwrap(),
0,
);
let deserialized_file_meta: FileMeta = serde_json::from_str(json_file_meta).unwrap();
assert_eq!(file_meta, deserialized_file_meta);
}
// `as_parquet` appends `.parquet` to the hyphenated id.
#[test]
fn test_file_id_as_parquet() {
let id = FileId::from_str("67e55044-10b1-426f-9247-bb680e5fe0c8").unwrap();
assert_eq!(
"67e55044-10b1-426f-9247-bb680e5fe0c8.parquet",
id.as_parquet()
);
}
// Builds a file meta for region 0 with no time range and zero size.
fn create_file_meta(file_id: FileId, level: Level) -> FileMeta {
FileMeta {
region_id: 0.into(),
file_id,
time_range: None,
level,
file_size: 0,
}
}
// Adds files to level 0 and level 1, then removes them, checking the content of each level
// after every merge.
#[test]
fn test_level_metas_add_and_remove() {
let layer = Arc::new(crate::test_util::access_layer_util::MockAccessLayer {});
let purger = Arc::new(LocalScheduler::new(
SchedulerConfig::default(),
NoopFilePurgeHandler,
));
let file_ids = [
FileId::random(),
FileId::random(),
FileId::random(),
FileId::random(),
];
let metas = LevelMetas::new(layer, purger);
// Add two files to level 0.
let merged = metas.merge(
vec![
create_file_meta(file_ids[0], 0),
create_file_meta(file_ids[1], 0),
]
.into_iter(),
vec![].into_iter(),
None,
);
assert_eq!(
HashSet::from([file_ids[0], file_ids[1]]),
merged.level(0).files().map(|f| f.file_id()).collect()
);
// Add two files to level 1; level 0 must be unchanged.
let merged1 = merged.merge(
vec![
create_file_meta(file_ids[2], 1),
create_file_meta(file_ids[3], 1),
]
.into_iter(),
vec![].into_iter(),
None,
);
assert_eq!(
HashSet::from([file_ids[0], file_ids[1]]),
merged1.level(0).files().map(|f| f.file_id()).collect()
);
assert_eq!(
HashSet::from([file_ids[2], file_ids[3]]),
merged1.level(1).files().map(|f| f.file_id()).collect()
);
// Removing file 2 with the wrong level (0) does not remove it from level 1.
let removed1 = merged1.merge(
vec![].into_iter(),
vec![
create_file_meta(file_ids[0], 0),
create_file_meta(file_ids[2], 0),
]
.into_iter(),
None,
);
assert_eq!(
HashSet::from([file_ids[1]]),
removed1.level(0).files().map(|f| f.file_id()).collect()
);
assert_eq!(
HashSet::from([file_ids[2], file_ids[3]]),
removed1.level(1).files().map(|f| f.file_id()).collect()
);
// Removing both level 1 files with the right level empties level 1.
let removed2 = removed1.merge(
vec![].into_iter(),
vec![
create_file_meta(file_ids[2], 1),
create_file_meta(file_ids[3], 1),
]
.into_iter(),
None,
);
assert_eq!(
HashSet::from([file_ids[1]]),
removed2.level(0).files().map(|f| f.file_id()).collect()
);
assert_eq!(
HashSet::new(),
removed2.level(1).files().map(|f| f.file_id()).collect()
);
}
}

View File

@@ -1,819 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Parquet sst format.
use std::collections::HashMap;
use std::pin::Pin;
use std::sync::Arc;
use async_compat::CompatExt;
use async_stream::try_stream;
use async_trait::async_trait;
use common_telemetry::{debug, error};
use common_time::range::TimestampRange;
use common_time::Timestamp;
use datatypes::arrow::record_batch::RecordBatch;
use datatypes::prelude::ConcreteDataType;
use futures_util::{Stream, StreamExt, TryStreamExt};
use object_store::ObjectStore;
use parquet::arrow::{ParquetRecordBatchStreamBuilder, ProjectionMask};
use parquet::basic::{Compression, Encoding, ZstdLevel};
use parquet::file::metadata::KeyValue;
use parquet::file::properties::WriterProperties;
use parquet::format::FileMetaData;
use parquet::schema::types::ColumnPath;
use snafu::{OptionExt, ResultExt};
use store_api::storage::consts::SEQUENCE_COLUMN_NAME;
use table::predicate::Predicate;
use tokio::io::BufReader;
use crate::error::{self, DecodeParquetTimeRangeSnafu, ReadObjectSnafu, ReadParquetSnafu, Result};
use crate::read::{Batch, BatchReader};
use crate::schema::compat::ReadAdapter;
use crate::schema::{ProjectedSchemaRef, StoreSchema};
use crate::sst;
use crate::sst::pruning::build_row_filter;
use crate::sst::stream_writer::BufferedWriter;
use crate::sst::{FileHandle, Source, SstInfo};
/// Parquet sst writer.
pub struct ParquetWriter<'a> {
/// Path of the SST file to write.
file_path: &'a str,
/// Source of the batches to write.
source: Source,
/// Object store the file is written to.
object_store: ObjectStore,
/// Maximum row group size passed to the parquet writer properties.
max_row_group_size: usize,
}
impl<'a> ParquetWriter<'a> {
    /// Creates a writer that writes `source` to `file_path` in `object_store`, using a
    /// maximum row group size of 4096.
    pub fn new(file_path: &'a str, source: Source, object_store: ObjectStore) -> ParquetWriter {
        // TODO(hl): make this configurable
        let max_row_group_size = 4096;
        ParquetWriter {
            file_path,
            source,
            object_store,
            max_row_group_size,
        }
    }

    /// Writes the source to the SST file without extra key-value metadata.
    ///
    /// Returns `Ok(None)` if the source yields no rows.
    pub async fn write_sst(self, opts: &sst::WriteOptions) -> Result<Option<SstInfo>> {
        let no_extra_meta = None;
        self.write_rows(no_extra_meta, opts).await
    }

    /// Drains the source and writes all of its batches to the Parquet file.
    ///
    /// `extra_meta`, if present, is stored as key-value metadata of the file. Returns
    /// `Ok(None)` when no row was written, otherwise the info of the written SST.
    async fn write_rows(
        mut self,
        extra_meta: Option<HashMap<String, String>>,
        opts: &sst::WriteOptions,
    ) -> Result<Option<SstInfo>> {
        let schema = self.source.schema();

        let key_value_meta = extra_meta.map(|meta| {
            meta.iter()
                .map(|(key, value)| KeyValue::new(key.clone(), value.clone()))
                .collect::<Vec<_>>()
        });
        let sequence_column = || ColumnPath::new(vec![SEQUENCE_COLUMN_NAME.to_string()]);
        // The sequence column uses delta encoding with the dictionary disabled.
        let base_props = WriterProperties::builder()
            .set_compression(Compression::ZSTD(ZstdLevel::default()))
            .set_encoding(Encoding::PLAIN)
            .set_max_row_group_size(self.max_row_group_size)
            .set_key_value_metadata(key_value_meta)
            .set_column_encoding(sequence_column(), Encoding::DELTA_BINARY_PACKED)
            .set_column_dictionary_enabled(sequence_column(), false);
        // The timestamp column, if any, is delta encoded as well.
        let props_builder = match schema.timestamp_column() {
            Some(ts_col) => base_props.set_column_encoding(
                ColumnPath::new(vec![ts_col.name.clone()]),
                Encoding::DELTA_BINARY_PACKED,
            ),
            None => base_props,
        };
        let writer_props = props_builder.build();

        let buffer_size = opts.sst_write_buffer_size.as_bytes() as usize;
        let mut buffered_writer = BufferedWriter::try_new(
            self.file_path.to_string(),
            self.object_store.clone(),
            &schema,
            Some(writer_props),
            buffer_size,
        )
        .await?;

        let mut rows_written = 0;
        while let Some(batch) = self.source.next_batch().await? {
            buffered_writer.write(&batch).await?;
            rows_written += batch.num_rows();
        }

        if rows_written == 0 {
            debug!("No data written, try abort writer: {}", self.file_path);
            let _ = buffered_writer.close().await?;
            return Ok(None);
        }

        let (file_meta, file_size) = buffered_writer.close().await?;
        // Failing to decode the time range is not fatal; the range is simply left unset.
        let time_range = decode_timestamp_range(&file_meta, &schema).ok().flatten();

        // object_store.write will make sure all bytes are written or an error is raised.
        let sst_info = SstInfo {
            time_range,
            file_size,
            num_rows: rows_written,
        };
        Ok(Some(sst_info))
    }
}
/// Decodes the timestamp range of an SST from its parquet file metadata.
///
/// Returns `Ok(None)` when `schema` has no timestamp column.
fn decode_timestamp_range(
    file_meta: &FileMetaData,
    schema: &datatypes::schema::SchemaRef,
) -> Result<Option<(Timestamp, Timestamp)>> {
    let timestamp_info = schema.timestamp_index().zip(schema.timestamp_column());
    match timestamp_info {
        Some((ts_col_idx, ts_col)) => {
            decode_timestamp_range_inner(file_meta, ts_col_idx, &ts_col.data_type)
        }
        None => Ok(None),
    }
}
/// Decodes the overall `(min, max)` timestamp range of an SST from the statistics of the
/// timestamp column (located at `ts_index`) in every row group of `file_meta`.
///
/// Returns `Ok(None)` when the file has no row groups, or when any row group lacks column
/// metadata, statistics, or min/max values for the timestamp column.
///
/// # Errors
/// Returns `DecodeParquetTimeRange` if `ts_datatype` is not a timestamp type, if a row
/// group has no column at `ts_index`, or if a min/max statistic is shorter than 8 bytes.
///
/// # Panics
/// Panics if the decoded range is inverted, i.e. `start > end`.
fn decode_timestamp_range_inner(
    file_meta: &FileMetaData,
    ts_index: usize,
    ts_datatype: &ConcreteDataType,
) -> Result<Option<(Timestamp, Timestamp)>> {
    let unit = match ts_datatype {
        ConcreteDataType::Timestamp(type_) => type_.unit(),
        _ => {
            return DecodeParquetTimeRangeSnafu {
                msg: format!("Unexpected timestamp column datatype: {ts_datatype:?}"),
            }
            .fail();
        }
    };

    // Without row groups there are no statistics to aggregate, and the sentinel values
    // below would otherwise trip the range assertion.
    if file_meta.row_groups.is_empty() {
        return Ok(None);
    }

    let mut start = i64::MAX;
    let mut end = i64::MIN;
    for rg in &file_meta.row_groups {
        let Some(ref metadata) = rg
            .columns
            .get(ts_index)
            .context(DecodeParquetTimeRangeSnafu {
                msg: format!("Cannot find ts column by index: {ts_index}"),
            })?
            .meta_data
        else {
            return Ok(None);
        };
        let Some(stats) = &metadata.statistics else {
            return Ok(None);
        };
        let (Some(min_value), Some(max_value)) = (&stats.min_value, &stats.max_value) else {
            return Ok(None);
        };

        // according to [parquet's spec](https://parquet.apache.org/docs/file-format/data-pages/encodings/), min/max value in stats uses plain encoding with little endian.
        // also see https://github.com/apache/arrow-rs/blob/5fb337db04a1a19f7d40da46f19b7b5fd4051593/parquet/src/file/statistics.rs#L172
        let min = i64::from_le_bytes(leading_i64_bytes(min_value).map_err(|e| {
            error!(
                "Failed to decode min value from stats, bytes: {:?}, source: {:?}",
                min_value, e
            );
            DecodeParquetTimeRangeSnafu {
                msg: "decode min value",
            }
            .build()
        })?);
        let max = i64::from_le_bytes(leading_i64_bytes(max_value).map_err(|e| {
            error!(
                "Failed to decode max value from stats, bytes: {:?}, source: {:?}",
                max_value, e
            );
            DecodeParquetTimeRangeSnafu {
                msg: "decode max value",
            }
            .build()
        })?);
        start = start.min(min);
        end = end.max(max);
    }

    assert!(
        start <= end,
        "Illegal timestamp range decoded from SST file {:?}, start: {}, end: {}",
        file_meta,
        start,
        end
    );
    Ok(Some((
        Timestamp::new(start, unit),
        Timestamp::new(end, unit),
    )))
}

/// Returns the first 8 bytes of `bytes` as a fixed-size array, or an error (instead of
/// panicking) when fewer than 8 bytes are available.
fn leading_i64_bytes(bytes: &[u8]) -> std::result::Result<[u8; 8], std::array::TryFromSliceError> {
    // When the slice is too short, fall back to the whole slice so that the conversion
    // fails with a proper error rather than the range indexing panicking.
    bytes.get(..8).unwrap_or(bytes).try_into()
}
/// Reader of a parquet SST file.
pub struct ParquetReader {
// Holds the file handle to prevent the file purger from purging the file.
file_handle: FileHandle,
/// Object store the file is read from.
object_store: ObjectStore,
/// Schema the caller expects to read.
projected_schema: ProjectedSchemaRef,
/// Predicate used to prune row groups and to build the row filter.
predicate: Predicate,
/// Timestamp range used to build the row filter.
time_range: TimestampRange,
}
impl ParquetReader {
/// Creates a reader for the file behind `file_handle`; no I/O happens until
/// `chunk_stream` is called.
pub fn new(
file_handle: FileHandle,
object_store: ObjectStore,
projected_schema: ProjectedSchemaRef,
predicate: Predicate,
time_range: TimestampRange,
) -> ParquetReader {
ParquetReader {
file_handle,
object_store,
projected_schema,
predicate,
time_range,
}
}
/// Opens the parquet file and returns a stream of its record batches.
///
/// The stream only reads the row groups that pass predicate pruning and only the columns
/// required by the read adapter; a row filter is applied when one can be built.
pub async fn chunk_stream(&self) -> Result<ChunkStream> {
let file_path = self.file_handle.file_path();
let operator = self.object_store.clone();
let reader = operator
.reader(&file_path)
.await
.context(ReadObjectSnafu { path: &file_path })?
.compat();
let buf_reader = BufReader::new(reader);
let builder = ParquetRecordBatchStreamBuilder::new(buf_reader)
.await
.context(ReadParquetSnafu { file: &file_path })?;
// Recover the store schema from the arrow schema stored in the file.
let arrow_schema = builder.schema().clone();
let store_schema = Arc::new(
StoreSchema::try_from(arrow_schema)
.context(error::ConvertStoreSchemaSnafu { file: &file_path })?,
);
let adapter = ReadAdapter::new(store_schema.clone(), self.projected_schema.clone())?;
// Keep the indices of the row groups that the predicate marks as valid.
let pruned_row_groups = self
.predicate
.prune_row_groups(
builder.metadata().row_groups(),
store_schema.schema().clone(),
)
.into_iter()
.enumerate()
.filter_map(|(idx, valid)| if valid { Some(idx) } else { None })
.collect::<Vec<_>>();
let parquet_schema_desc = builder.metadata().file_metadata().schema_descr_ptr();
// Only read the root columns requested by the adapter.
let projection_mask = ProjectionMask::roots(&parquet_schema_desc, adapter.fields_to_read());
let mut builder = builder
.with_projection(projection_mask.clone())
.with_row_groups(pruned_row_groups);
if let Some(row_filter) = build_row_filter(
self.time_range,
&self.predicate,
&store_schema,
&parquet_schema_desc,
projection_mask,
) {
builder = builder.with_row_filter(row_filter);
}
let mut stream = builder
.build()
.context(ReadParquetSnafu { file: &file_path })?;
// Wrap the parquet stream so that read errors carry the file path.
let chunk_stream = try_stream!({
while let Some(res) = stream.next().await {
yield res.context(ReadParquetSnafu { file: &file_path })?
}
});
ChunkStream::new(self.file_handle.clone(), adapter, Box::pin(chunk_stream))
}
}
/// A pinned, boxed and `Send` stream whose items are `Result<RecordBatch>`.
pub type SendableChunkStream = Pin<Box<dyn Stream<Item = Result<RecordBatch>> + Send>>;
/// A stream of record batches read from a file, paired with the adapter
/// that converts each record batch into a `Batch`.
pub struct ChunkStream {
// Holds the file handle in the stream to prevent the purger from purging the file.
_file_handle: FileHandle,
// Converts the arrow record batches yielded by `stream` into batches.
adapter: ReadAdapter,
// The underlying stream of record batches.
stream: SendableChunkStream,
}
impl ChunkStream {
pub fn new(
file_handle: FileHandle,
adapter: ReadAdapter,
stream: SendableChunkStream,
) -> Result<Self> {
Ok(Self {
_file_handle: file_handle,
adapter,
stream,
})
}
}
#[async_trait]
impl BatchReader for ChunkStream {
    /// Pulls the next record batch from the stream and converts it with the adapter.
    ///
    /// Returns `Ok(None)` once the underlying stream is exhausted; errors from
    /// the stream or from the conversion are propagated.
    async fn next_batch(&mut self) -> Result<Option<Batch>> {
        match self.stream.try_next().await? {
            Some(record_batch) => self
                .adapter
                .arrow_record_batch_to_batch(&record_batch)
                .map(Some),
            None => Ok(None),
        }
    }
}
#[cfg(test)]
mod tests {
use std::ops::Range;
use std::sync::Arc;
use api::v1::OpType;
use common_base::readable_size::ReadableSize;
use common_test_util::temp_dir::create_temp_dir;
use common_time::timestamp::TimeUnit;
use datatypes::arrow::array::{Array, UInt64Array, UInt8Array};
use datatypes::prelude::{ScalarVector, Vector};
use datatypes::types::{TimestampMillisecondType, TimestampType};
use datatypes::vectors::TimestampMillisecondVector;
use object_store::services::Fs;
use super::*;
use crate::file_purger::noop::new_noop_file_purger;
use crate::memtable::{
tests as memtable_tests, DefaultMemtableBuilder, IterContext, MemtableBuilder,
};
use crate::schema::ProjectedSchema;
use crate::sst::{FileId, FileMeta};
/// Builds an `ObjectStore` backed by the local file system rooted at `root`.
///
/// Panics if the store cannot be created.
fn create_object_store(root: &str) -> ObjectStore {
    let mut fs_builder = Fs::default();
    let _ = fs_builder.root(root);
    let store_builder = ObjectStore::new(fs_builder).unwrap();
    store_builder.finish()
}
/// Writes a small memtable to a parquet file and checks the content of every
/// column in the first record batch read back from that file.
#[tokio::test]
async fn test_parquet_writer() {
    common_telemetry::init_default_ut_logging();
    let schema = memtable_tests::schema_for_test();
    let memtable = DefaultMemtableBuilder::default().build(schema);
    memtable_tests::write_kvs(
        &*memtable,
        10, // sequence
        OpType::Put,
        &[1000, 1002, 2002, 2003, 2003, 1001], // keys
        &[
            (Some(1), Some(1234)),
            (Some(2), Some(1234)),
            (Some(7), Some(1234)),
            (Some(8), Some(1234)),
            (Some(9), Some(1234)),
            (Some(3), Some(1234)),
        ], // values
    );
    let dir = create_temp_dir("write_parquet");
    let path = dir.path().to_str().unwrap();
    let object_store = create_object_store(path);
    let sst_file_name = "test-flush.parquet";
    let iter = memtable.iter(IterContext::default()).unwrap();
    let writer = ParquetWriter::new(sst_file_name, Source::Iter(iter), object_store.clone());
    // Unwrap the result instead of asserting `is_ok()` so that a failure
    // reports the underlying error.
    writer
        .write_sst(&sst::WriteOptions::default())
        .await
        .expect("failed to write the SST file");

    // verify parquet file
    let reader = BufReader::new(object_store.reader(sst_file_name).await.unwrap().compat());
    let builder = ParquetRecordBatchStreamBuilder::new(reader).await.unwrap();
    let mut stream = builder.build().unwrap();
    // chunk schema: timestamp, v0, v1, __sequence, __op_type
    let chunk = stream.next().await.unwrap().unwrap();
    assert_eq!(5, chunk.columns().len());

    // timestamp: rows are expected in timestamp order and the key 2003,
    // which was written twice, is expected only once.
    assert_eq!(
        &TimestampMillisecondVector::from_slice([
            1000.into(),
            1001.into(),
            1002.into(),
            2002.into(),
            2003.into(),
        ])
        .to_arrow_array(),
        chunk.column(0)
    );
    // v0: for the duplicated key 2003 the later value (9) is expected.
    assert_eq!(
        &(Arc::new(UInt64Array::from(vec![1, 3, 2, 7, 9])) as Arc<dyn Array>),
        chunk.column(1)
    );
    // v1
    assert_eq!(
        &(Arc::new(UInt64Array::from(vec![1234; 5])) as Arc<dyn Array>),
        chunk.column(2)
    );
    // sequence
    assert_eq!(
        &(Arc::new(UInt64Array::from(vec![10; 5])) as Arc<dyn Array>),
        chunk.column(3)
    );
    // op_type
    assert_eq!(
        &(Arc::new(UInt8Array::from(vec![1; 5])) as Arc<dyn Array>),
        chunk.column(4)
    );
}
/// Writes 16 batches of 1024 rows with a small write buffer and checks the
/// reported file size and row count against the written file.
#[tokio::test]
async fn test_write_large_data() {
    common_telemetry::init_default_ut_logging();
    let schema = memtable_tests::schema_for_test();
    let memtable = DefaultMemtableBuilder::default().build(schema);

    let mut total_rows = 0;
    for seq in 0..16 {
        // Each batch covers its own, non-overlapping block of 1024 keys.
        let ts_range: Range<i64> = seq * 1024..(seq + 1) * 1024;
        let keys: Vec<_> = ts_range.clone().collect();
        let values: Vec<_> = ts_range
            .map(|ts| (Some(ts as u64), Some(ts as u64)))
            .collect();
        memtable_tests::write_kvs(&*memtable, seq as u64, OpType::Put, &keys, &values);
        total_rows += keys.len();
    }

    let dir = create_temp_dir("write_large_parquet");
    let object_store = create_object_store(dir.path().to_str().unwrap());
    let sst_file_name = "test-large.parquet";
    let source = Source::Iter(memtable.iter(IterContext::default()).unwrap());
    let writer = ParquetWriter::new(sst_file_name, source, object_store.clone());
    let options = sst::WriteOptions {
        sst_write_buffer_size: ReadableSize::kb(4),
    };
    let sst_info = writer.write_sst(&options).await.unwrap().unwrap();

    let stat = object_store.stat(sst_file_name).await.unwrap();
    assert!(stat.is_file());
    assert_eq!(sst_info.file_size, stat.content_length());
    assert_eq!(total_rows, sst_info.num_rows);
}
#[tokio::test]
async fn test_parquet_read_large_batch() {
common_telemetry::init_default_ut_logging();
let schema = memtable_tests::schema_for_test();
let memtable = DefaultMemtableBuilder::default().build(schema.clone());
let rows_total = 4096 * 4;
let mut keys_vec = Vec::with_capacity(rows_total);
let mut values_vec = Vec::with_capacity(rows_total);
for i in 0..rows_total {
keys_vec.push(i as i64);
values_vec.push((Some(i as u64), Some(i as u64)));
}
memtable_tests::write_kvs(
&*memtable,
10, // sequence
OpType::Put,
&keys_vec, // keys
&values_vec, // values
);
let dir = create_temp_dir("write_parquet");
let path = dir.path().to_str().unwrap();
let object_store = create_object_store(path);
let sst_file_handle = new_file_handle(FileId::random());
let sst_file_name = sst_file_handle.file_name();
let iter = memtable.iter(IterContext::default()).unwrap();
let writer = ParquetWriter::new(&sst_file_name, Source::Iter(iter), object_store.clone());
let SstInfo {
time_range,
file_size,
..
} = writer
.write_sst(&sst::WriteOptions::default())
.await
.unwrap()
.unwrap();
assert_eq!(
Some((
Timestamp::new_millisecond(0),
Timestamp::new_millisecond((rows_total - 1) as i64)
)),
time_range
);
assert_ne!(file_size, 0);
let operator = create_object_store(dir.path().to_str().unwrap());
let projected_schema = Arc::new(ProjectedSchema::new(schema, Some(vec![1])).unwrap());
let reader = ParquetReader::new(
sst_file_handle,
operator,
projected_schema,
Predicate::empty(),
TimestampRange::min_to_max(),
);
let mut rows_fetched = 0;
let mut stream = reader.chunk_stream().await.unwrap();
while let Some(res) = stream.next_batch().await.unwrap() {
rows_fetched += res.num_rows();
}
assert_eq!(rows_total, rows_fetched);
}
/// Creates a `FileHandle` for `file_id` with fixed test metadata, a no-op
/// file purger and a mock access layer.
fn new_file_handle(file_id: FileId) -> FileHandle {
    let purger = new_noop_file_purger();
    let access_layer = Arc::new(crate::test_util::access_layer_util::MockAccessLayer {});
    let time_range = Some((
        Timestamp::new_millisecond(0),
        Timestamp::new_millisecond(1000),
    ));
    let meta = FileMeta {
        region_id: 0.into(),
        file_id,
        time_range,
        level: 0,
        file_size: 0,
    };
    FileHandle::new(meta, access_layer, purger)
}
#[tokio::test]
async fn test_parquet_reader() {
common_telemetry::init_default_ut_logging();
let schema = memtable_tests::schema_for_test();
let memtable = DefaultMemtableBuilder::default().build(schema.clone());
memtable_tests::write_kvs(
&*memtable,
10, // sequence
OpType::Put,
&[1000, 1002, 2002, 2003, 2003, 1001], // keys
&[
(Some(1), Some(1234)),
(Some(2), Some(1234)),
(Some(7), Some(1234)),
(Some(8), Some(1234)),
(Some(9), Some(1234)),
(Some(3), Some(1234)),
], // values
);
let dir = create_temp_dir("write_parquet");
let path = dir.path().to_str().unwrap();
let object_store = create_object_store(path);
let file_handle = new_file_handle(FileId::random());
let sst_file_name = file_handle.file_name();
let iter = memtable.iter(IterContext::default()).unwrap();
let writer = ParquetWriter::new(&sst_file_name, Source::Iter(iter), object_store.clone());
let SstInfo {
time_range,
file_size,
..
} = writer
.write_sst(&sst::WriteOptions::default())
.await
.unwrap()
.unwrap();
assert_eq!(
Some((
Timestamp::new_millisecond(1000),
Timestamp::new_millisecond(2003)
)),
time_range
);
assert_ne!(file_size, 0);
let operator = create_object_store(dir.path().to_str().unwrap());
let projected_schema = Arc::new(ProjectedSchema::new(schema, Some(vec![1])).unwrap());
let reader = ParquetReader::new(
file_handle,
operator,
projected_schema,
Predicate::empty(),
TimestampRange::min_to_max(),
);
let mut stream = reader.chunk_stream().await.unwrap();
assert_eq!(
5,
stream
.next_batch()
.await
.transpose()
.unwrap()
.unwrap()
.num_rows()
);
}
/// Reads the first batch of the file restricted to `range` and asserts that
/// the timestamps in its first column equal `expect`.
///
/// When the reader yields no batch at all, `expect` must be empty.
async fn check_range_read(
    file_handle: FileHandle,
    object_store: ObjectStore,
    schema: ProjectedSchemaRef,
    range: TimestampRange,
    expect: Vec<i64>,
) {
    let reader =
        ParquetReader::new(file_handle, object_store, schema, Predicate::empty(), range);
    let mut stream = reader.chunk_stream().await.unwrap();

    let batch = match stream.next_batch().await.unwrap() {
        Some(batch) => batch,
        None => {
            // No batch means no row matched the range.
            assert!(expect.is_empty());
            return;
        }
    };

    let ts_column = batch.column(0);
    assert_eq!(
        ConcreteDataType::Timestamp(TimestampType::Millisecond(TimestampMillisecondType)),
        ts_column.data_type()
    );
    let ts_vector = ts_column
        .as_any()
        .downcast_ref::<TimestampMillisecondVector>()
        .unwrap();
    let actual: Vec<_> = ts_vector
        .iter_data()
        .map(|t| t.unwrap().0.value())
        .collect();
    assert_eq!(expect, actual);
}
#[tokio::test]
async fn test_parquet_reader_with_time_range_filter() {
common_telemetry::init_default_ut_logging();
let schema = memtable_tests::schema_for_test();
let memtable = DefaultMemtableBuilder::default().build(schema.clone());
memtable_tests::write_kvs(
&*memtable,
10, // sequence
OpType::Put,
&[1000, 1002, 2002, 2003, 2003, 1001, 3001], // keys
&[
(Some(1), Some(1234)),
(Some(2), Some(1234)),
(Some(7), Some(1234)),
(Some(8), Some(1234)),
(Some(9), Some(1234)),
(Some(3), Some(1234)),
(Some(7), Some(1234)),
], // values
);
let dir = create_temp_dir("read-parquet-by-range");
let path = dir.path().to_str().unwrap();
let object_store = create_object_store(path);
let sst_file_handle = new_file_handle(FileId::random());
let sst_file_name = sst_file_handle.file_name();
let iter = memtable.iter(IterContext::default()).unwrap();
let writer = ParquetWriter::new(&sst_file_name, Source::Iter(iter), object_store.clone());
let SstInfo {
time_range,
file_size,
..
} = writer
.write_sst(&sst::WriteOptions::default())
.await
.unwrap()
.unwrap();
assert_eq!(
Some((
Timestamp::new_millisecond(1000),
Timestamp::new_millisecond(3001)
)),
time_range
);
assert_ne!(file_size, 0);
let projected_schema = Arc::new(ProjectedSchema::new(schema, Some(vec![1, 0, 2])).unwrap());
check_range_read(
sst_file_handle.clone(),
object_store.clone(),
projected_schema.clone(),
TimestampRange::with_unit(1000, 2003, TimeUnit::Millisecond).unwrap(),
vec![1000, 1001, 1002, 2002],
)
.await;
check_range_read(
sst_file_handle.clone(),
object_store.clone(),
projected_schema.clone(),
TimestampRange::with_unit(2002, 3001, TimeUnit::Millisecond).unwrap(),
vec![2002, 2003],
)
.await;
// read a range without any rows.
check_range_read(
sst_file_handle.clone(),
object_store.clone(),
projected_schema.clone(),
TimestampRange::with_unit(3002, 3003, TimeUnit::Millisecond).unwrap(),
vec![],
)
.await;
//
check_range_read(
sst_file_handle.clone(),
object_store.clone(),
projected_schema.clone(),
TimestampRange::with_unit(1000, 3000, TimeUnit::Millisecond).unwrap(),
vec![1000, 1001, 1002, 2002, 2003],
)
.await;
// read full range
check_range_read(
sst_file_handle,
object_store,
projected_schema,
TimestampRange::min_to_max(),
vec![1000, 1001, 1002, 2002, 2003, 3001],
)
.await;
}
#[tokio::test]
async fn test_write_empty_file() {
common_telemetry::init_default_ut_logging();
let schema = memtable_tests::schema_for_test();
let memtable = DefaultMemtableBuilder::default().build(schema.clone());
let dir = create_temp_dir("write-empty-file");
let path = dir.path().to_str().unwrap();
let mut builder = Fs::default();
let _ = builder.root(path);
let object_store = ObjectStore::new(builder).unwrap().finish();
let sst_file_name = "test-empty.parquet";
let iter = memtable.iter(IterContext::default()).unwrap();
let writer = ParquetWriter::new(sst_file_name, Source::Iter(iter), object_store.clone());
let sst_info_opt = writer
.write_sst(&sst::WriteOptions::default())
.await
.unwrap();
assert!(sst_info_opt.is_none());
// The file should not exist when no row has been written.
assert!(!object_store.is_exist(sst_file_name).await.unwrap());
}
}

Some files were not shown because too many files have changed in this diff Show More