fix: unify all sst_write_buffer_size usage (#2712)

* fix: unify all sst_write_buffer_size usage

* fix: some CR comments

* fix: logs
This commit is contained in:
Lei, HUANG
2023-11-09 19:22:47 +08:00
committed by GitHub
parent 0cd6dacb45
commit b53537e69b
15 changed files with 80 additions and 28 deletions

View File

@@ -105,6 +105,9 @@ global_write_buffer_reject_size = "2GB"
sst_meta_cache_size = "128MB"
# Cache size for vectors and arrow arrays (default 512MB). Setting it to 0 to disable the cache.
vector_cache_size = "512MB"
# Buffer size for SST writing.
sst_write_buffer_size = "8MB"
# Log options
# [logging]

View File

@@ -191,7 +191,6 @@ mod tests {
use std::io::Write;
use std::time::Duration;
use common_base::readable_size::ReadableSize;
use common_test_util::temp_dir::create_named_temp_file;
use datanode::config::{CompactionConfig, FileConfig, ObjectStoreConfig, RegionManifestConfig};
use servers::heartbeat_options::HeartbeatOptions;
@@ -300,7 +299,6 @@ mod tests {
max_inflight_tasks: 3,
max_files_in_level0: 7,
max_purge_tasks: 32,
sst_write_buffer_size: ReadableSize::mb(8),
},
options.storage.compaction,
);

View File

@@ -250,8 +250,6 @@ pub struct CompactionConfig {
pub max_files_in_level0: usize,
/// Max task number for SST purge task after compaction.
pub max_purge_tasks: usize,
/// Buffer threshold while writing SST files
pub sst_write_buffer_size: ReadableSize,
}
impl Default for CompactionConfig {
@@ -260,7 +258,6 @@ impl Default for CompactionConfig {
max_inflight_tasks: 4,
max_files_in_level0: 8,
max_purge_tasks: 32,
sst_write_buffer_size: ReadableSize::mb(8),
}
}
}
@@ -312,7 +309,6 @@ impl From<&DatanodeOptions> for StorageEngineConfig {
manifest_gc_duration: value.storage.manifest.gc_duration,
max_files_in_l0: value.storage.compaction.max_files_in_level0,
max_purge_tasks: value.storage.compaction.max_purge_tasks,
sst_write_buffer_size: value.storage.compaction.sst_write_buffer_size,
max_flush_tasks: value.storage.flush.max_flush_tasks,
region_write_buffer_size: value.storage.flush.region_write_buffer_size,
picker_schedule_interval: value.storage.flush.picker_schedule_interval,

View File

@@ -22,6 +22,7 @@ use std::collections::HashMap;
use std::sync::Arc;
use std::time::Instant;
use common_base::readable_size::ReadableSize;
use common_telemetry::{debug, error};
pub use picker::CompactionPickerRef;
use snafu::ResultExt;
@@ -30,6 +31,7 @@ use tokio::sync::mpsc::{self, Sender};
use crate::access_layer::AccessLayerRef;
use crate::compaction::twcs::TwcsPicker;
use crate::config::MitoConfig;
use crate::error::{
CompactRegionSnafu, Error, RegionClosedSnafu, RegionDroppedSnafu, RegionTruncatedSnafu, Result,
};
@@ -51,6 +53,8 @@ pub struct CompactionRequest {
pub(crate) file_purger: FilePurgerRef,
/// Start time of compaction task.
pub(crate) start_time: Instant,
/// Buffering threshold while writing SST files.
pub(crate) sst_write_buffer_size: ReadableSize,
}
impl CompactionRequest {
@@ -103,6 +107,7 @@ impl CompactionScheduler {
access_layer: &AccessLayerRef,
file_purger: &FilePurgerRef,
waiter: OptionOutputTx,
engine_config: Arc<MitoConfig>,
) -> Result<()> {
if let Some(status) = self.region_status.get_mut(&region_id) {
// Region is compacting. Add the waiter to pending list.
@@ -117,19 +122,27 @@ impl CompactionScheduler {
access_layer.clone(),
file_purger.clone(),
);
let request = status.new_compaction_request(self.request_sender.clone(), waiter);
let request =
status.new_compaction_request(self.request_sender.clone(), waiter, engine_config);
self.region_status.insert(region_id, status);
self.schedule_compaction_request(request)
}
/// Notifies the scheduler that the compaction job is finished successfully.
pub(crate) fn on_compaction_finished(&mut self, region_id: RegionId) {
pub(crate) fn on_compaction_finished(
&mut self,
region_id: RegionId,
engine_config: Arc<MitoConfig>,
) {
let Some(status) = self.region_status.get_mut(&region_id) else {
return;
};
// We should always try to compact the region until picker returns None.
let request =
status.new_compaction_request(self.request_sender.clone(), OptionOutputTx::none());
let request = status.new_compaction_request(
self.request_sender.clone(),
OptionOutputTx::none(),
engine_config,
);
// Try to schedule next compaction task for this region.
if let Err(e) = self.schedule_compaction_request(request) {
error!(e; "Failed to schedule next compaction for region {}", region_id);
@@ -138,7 +151,7 @@ impl CompactionScheduler {
/// Notifies the scheduler that the compaction job is failed.
pub(crate) fn on_compaction_failed(&mut self, region_id: RegionId, err: Arc<Error>) {
error!(err; "Region {} failed to flush, cancel all pending tasks", region_id);
error!(err; "Region {} failed to compact, cancel all pending tasks", region_id);
// Remove this region.
let Some(status) = self.region_status.remove(&region_id) else {
return;
@@ -236,7 +249,7 @@ impl PendingCompaction {
}
}
/// Send flush error to waiter.
/// Send compaction error to waiter.
fn on_failure(&mut self, region_id: RegionId, err: Arc<Error>) {
for waiter in self.waiters.drain(..) {
waiter.send(Err(err.clone()).context(CompactRegionSnafu { region_id }));
@@ -300,6 +313,7 @@ impl CompactionStatus {
&mut self,
request_sender: Sender<WorkerRequest>,
waiter: OptionOutputTx,
engine_config: Arc<MitoConfig>,
) -> CompactionRequest {
let current_version = self.version_control.current().version;
let start_time = Instant::now();
@@ -310,6 +324,7 @@ impl CompactionStatus {
waiters: Vec::new(),
file_purger: self.file_purger.clone(),
start_time,
sst_write_buffer_size: engine_config.sst_write_buffer_size,
};
if let Some(pending) = self.pending_compaction.take() {
@@ -352,6 +367,7 @@ mod tests {
&env.access_layer,
&purger,
waiter,
Arc::new(MitoConfig::default()),
)
.unwrap();
let output = output_rx.await.unwrap().unwrap();
@@ -369,6 +385,7 @@ mod tests {
&env.access_layer,
&purger,
waiter,
Arc::new(MitoConfig::default()),
)
.unwrap();
let output = output_rx.await.unwrap().unwrap();
@@ -427,6 +444,7 @@ mod tests {
&env.access_layer,
&purger,
OptionOutputTx::none(),
Arc::new(MitoConfig::default()),
)
.unwrap();
// Should schedule 1 compaction.
@@ -454,6 +472,7 @@ mod tests {
&env.access_layer,
&purger,
OptionOutputTx::none(),
Arc::new(MitoConfig::default()),
)
.unwrap();
assert_eq!(1, scheduler.region_status.len());
@@ -466,7 +485,7 @@ mod tests {
.is_some());
// On compaction finished and schedule next compaction.
scheduler.on_compaction_finished(region_id);
scheduler.on_compaction_finished(region_id, Arc::new(MitoConfig::default()));
assert_eq!(1, scheduler.region_status.len());
assert_eq!(2, job_scheduler.num_jobs());
// 5 files for next compaction.
@@ -484,6 +503,7 @@ mod tests {
&env.access_layer,
&purger,
OptionOutputTx::none(),
Arc::new(MitoConfig::default()),
)
.unwrap();
assert_eq!(2, job_scheduler.num_jobs());

View File

@@ -120,6 +120,7 @@ impl Picker for TwcsPicker {
waiters,
file_purger,
start_time,
sst_write_buffer_size,
} = req;
let region_metadata = current_version.metadata.clone();
@@ -167,7 +168,7 @@ impl Picker for TwcsPicker {
sst_layer: access_layer,
outputs,
expired_ssts,
sst_write_buffer_size: ReadableSize::mb(4),
sst_write_buffer_size,
compaction_time_window: Some(time_window_size),
request_sender,
waiters,
@@ -355,7 +356,7 @@ impl CompactionTask for TwcsCompactionTask {
Ok((added, deleted)) => {
info!(
"Compacted SST files, input: {:?}, output: {:?}, window: {:?}",
added, deleted, self.compaction_time_window
deleted, added, self.compaction_time_window
);
BackgroundNotify::CompactionFinished(CompactionFinished {

View File

@@ -26,6 +26,8 @@ const DEFAULT_NUM_WORKERS: usize = 1;
/// Default max running background job.
const DEFAULT_MAX_BG_JOB: usize = 4;
const MULTIPART_UPLOAD_MINIMUM_SIZE: ReadableSize = ReadableSize::mb(5);
/// Configuration for [MitoEngine](crate::engine::MitoEngine).
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
#[serde(default)]
@@ -63,6 +65,8 @@ pub struct MitoConfig {
pub sst_meta_cache_size: ReadableSize,
/// Cache size for vectors and arrow arrays (default 512MB). Setting it to 0 to disable the cache.
pub vector_cache_size: ReadableSize,
/// Buffer size for SST writing.
pub sst_write_buffer_size: ReadableSize,
}
impl Default for MitoConfig {
@@ -79,6 +83,7 @@ impl Default for MitoConfig {
global_write_buffer_reject_size: ReadableSize::gb(2),
sst_meta_cache_size: ReadableSize::mb(128),
vector_cache_size: ReadableSize::mb(512),
sst_write_buffer_size: ReadableSize::mb(8),
}
}
}
@@ -117,5 +122,13 @@ impl MitoConfig {
self.global_write_buffer_reject_size
);
}
if self.sst_write_buffer_size < MULTIPART_UPLOAD_MINIMUM_SIZE {
self.sst_write_buffer_size = MULTIPART_UPLOAD_MINIMUM_SIZE;
warn!(
"Sanitize sst write buffer size to {}",
self.sst_write_buffer_size
);
}
}
}

View File

@@ -26,6 +26,7 @@ use strum::IntoStaticStr;
use tokio::sync::mpsc;
use crate::access_layer::AccessLayerRef;
use crate::config::MitoConfig;
use crate::error::{
Error, FlushRegionSnafu, RegionClosedSnafu, RegionDroppedSnafu, RegionTruncatedSnafu, Result,
};
@@ -198,6 +199,7 @@ pub(crate) struct RegionFlushTask {
pub(crate) memtable_builder: MemtableBuilderRef,
pub(crate) file_purger: FilePurgerRef,
pub(crate) listener: WorkerListener,
pub(crate) engine_config: Arc<MitoConfig>,
pub(crate) row_group_size: Option<usize>,
}
@@ -289,8 +291,10 @@ impl RegionFlushTask {
.with_label_values(&["flush_memtables"])
.start_timer();
// TODO(yingwen): Make it configurable.
let mut write_opts = WriteOptions::default();
let mut write_opts = WriteOptions {
write_buffer_size: self.engine_config.sst_write_buffer_size,
..Default::default()
};
if let Some(row_group_size) = self.row_group_size {
write_opts.row_group_size = row_group_size;
}
@@ -723,6 +727,7 @@ mod tests {
memtable_builder: builder.memtable_builder(),
file_purger: builder.file_purger(),
listener: WorkerListener::default(),
engine_config: Arc::new(MitoConfig::default()),
row_group_size: None,
};
task.push_sender(OptionOutputTx::from(output_tx));

View File

@@ -80,7 +80,7 @@ impl<S> RegionWorkerLoop<S> {
info!("Flush region: {} before alteration", region_id);
// Try to submit a flush task.
let task = self.new_flush_task(&region, FlushReason::Alter, None);
let task = self.new_flush_task(&region, FlushReason::Alter, None, self.config.clone());
if let Err(e) =
self.flush_scheduler
.schedule_flush(region.region_id, &region.version_control, task)

View File

@@ -38,6 +38,7 @@ impl<S: LogStore> RegionWorkerLoop<S> {
&region.access_layer,
&region.file_purger,
sender,
self.config.clone(),
) {
error!(e; "Failed to schedule compaction task for region: {}", region_id);
} else {
@@ -86,8 +87,10 @@ impl<S: LogStore> RegionWorkerLoop<S> {
}
// compaction finished.
request.on_success();
// Schedule next compaction if necessary.
self.compaction_scheduler.on_compaction_finished(region_id);
self.compaction_scheduler
.on_compaction_finished(region_id, self.config.clone());
}
/// When compaction fails, we simply log the error.

View File

@@ -14,12 +14,15 @@
//! Handling flush related requests.
use std::sync::Arc;
use common_telemetry::{error, info, warn};
use common_time::util::current_time_millis;
use store_api::logstore::LogStore;
use store_api::region_request::RegionFlushRequest;
use store_api::storage::RegionId;
use crate::config::MitoConfig;
use crate::error::{RegionTruncatedSnafu, Result};
use crate::flush::{FlushReason, RegionFlushTask};
use crate::manifest::action::{RegionEdit, RegionMetaAction, RegionMetaActionList};
@@ -39,7 +42,12 @@ impl<S> RegionWorkerLoop<S> {
return;
};
let mut task = self.new_flush_task(&region, FlushReason::Manual, request.row_group_size);
let mut task = self.new_flush_task(
&region,
FlushReason::Manual,
request.row_group_size,
self.config.clone(),
);
task.push_sender(sender);
if let Err(e) =
self.flush_scheduler
@@ -94,7 +102,8 @@ impl<S> RegionWorkerLoop<S> {
if region.last_flush_millis() < min_last_flush_time {
// If flush time of this region is earlier than `min_last_flush_time`, we can flush this region.
let task = self.new_flush_task(region, FlushReason::EngineFull, None);
let task =
self.new_flush_task(region, FlushReason::EngineFull, None, self.config.clone());
self.flush_scheduler.schedule_flush(
region.region_id,
&region.version_control,
@@ -107,7 +116,8 @@ impl<S> RegionWorkerLoop<S> {
// TODO(yingwen): Maybe flush more tables to reduce write buffer size.
if let Some(region) = max_mem_region {
if !self.flush_scheduler.is_flush_requested(region.region_id) {
let task = self.new_flush_task(region, FlushReason::EngineFull, None);
let task =
self.new_flush_task(region, FlushReason::EngineFull, None, self.config.clone());
self.flush_scheduler.schedule_flush(
region.region_id,
&region.version_control,
@@ -125,6 +135,7 @@ impl<S> RegionWorkerLoop<S> {
region: &MitoRegionRef,
reason: FlushReason,
row_group_size: Option<usize>,
engine_config: Arc<MitoConfig>,
) -> RegionFlushTask {
// TODO(yingwen): metrics for flush requested.
RegionFlushTask {
@@ -136,6 +147,7 @@ impl<S> RegionWorkerLoop<S> {
memtable_builder: self.memtable_builder.clone(),
file_purger: region.file_purger.clone(),
listener: self.listener.clone(),
engine_config,
row_group_size,
}
}
@@ -220,6 +232,7 @@ impl<S: LogStore> RegionWorkerLoop<S> {
&region.access_layer,
&region.file_purger,
OptionOutputTx::none(),
self.config.clone(),
) {
warn!(
"Failed to schedule compaction after flush, region: {}, err: {}",

View File

@@ -34,7 +34,6 @@ pub struct EngineConfig {
pub manifest_gc_duration: Option<Duration>,
pub max_files_in_l0: usize,
pub max_purge_tasks: usize,
pub sst_write_buffer_size: ReadableSize,
/// Max inflight flush tasks.
pub max_flush_tasks: usize,
/// Default write buffer size for a region.
@@ -59,7 +58,6 @@ impl Default for EngineConfig {
manifest_gc_duration: Some(Duration::from_secs(30)),
max_files_in_l0: 8,
max_purge_tasks: 32,
sst_write_buffer_size: ReadableSize::mb(8),
max_flush_tasks: DEFAULT_MAX_FLUSH_TASKS,
region_write_buffer_size: DEFAULT_REGION_WRITE_BUFFER_SIZE,
picker_schedule_interval: Duration::from_millis(

View File

@@ -18,6 +18,7 @@ mod scheduler;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
use common_base::readable_size::ReadableSize;
use common_telemetry::logging;
pub use picker::{FlushPicker, PickerConfig};
pub use scheduler::{
@@ -269,7 +270,7 @@ impl<S: LogStore> FlushJob<S> {
let iter = m.iter(iter_ctx.clone())?;
let sst_layer = self.sst_layer.clone();
let write_options = WriteOptions {
sst_write_buffer_size: self.engine_config.sst_write_buffer_size,
sst_write_buffer_size: ReadableSize::mb(8), // deprecated usage
};
futures.push(async move {
Ok(sst_layer

View File

@@ -16,6 +16,7 @@ use std::sync::Arc;
use std::time::Duration;
use async_trait::async_trait;
use common_base::readable_size::ReadableSize;
use common_runtime::{RepeatedTask, TaskFunction};
use common_telemetry::logging;
use snafu::{ensure, ResultExt};
@@ -147,7 +148,7 @@ impl<S: LogStore> From<&FlushRegionRequest<S>> for CompactionRequestImpl<S> {
compaction_time_window: req.compaction_time_window,
sender: None,
picker: req.compaction_picker.clone(),
sst_write_buffer_size: req.engine_config.sst_write_buffer_size,
sst_write_buffer_size: ReadableSize::mb(8), // deprecated usage
// compaction triggered by flush always reschedules
reschedule_on_finish: true,
}

View File

@@ -374,7 +374,7 @@ where
let mut inner = self.inner.lock().await;
ensure!(!inner.is_closed(), error::ClosedRegionSnafu);
let sst_write_buffer_size = inner.engine_config.sst_write_buffer_size;
let sst_write_buffer_size = ReadableSize::mb(8); // deprecated usage
inner
.manual_compact(

View File

@@ -706,7 +706,6 @@ type = "{}"
max_inflight_tasks = 4
max_files_in_level0 = 8
max_purge_tasks = 32
sst_write_buffer_size = "8MiB"
[datanode.storage.manifest]
checkpoint_margin = 10
@@ -733,6 +732,7 @@ global_write_buffer_size = "1GiB"
global_write_buffer_reject_size = "2GiB"
sst_meta_cache_size = "128MiB"
vector_cache_size = "512MiB"
sst_write_buffer_size = "8MiB"
[[datanode.region_engine]]