feat: add compaction metrics (#2560)

* feat: add compaction metrics

* feat: add compaction request total count

* fix: CR comments
This commit is contained in:
Lei, HUANG
2023-10-10 15:38:39 +08:00
committed by GitHub
parent ed725d030f
commit 6b39f5923d
5 changed files with 72 additions and 29 deletions

View File

@@ -20,8 +20,9 @@ mod twcs;
use std::collections::HashMap;
use std::sync::Arc;
use std::time::Instant;
use common_telemetry::{debug, error};
use common_telemetry::{debug, error, timer};
pub use picker::CompactionPickerRef;
use snafu::ResultExt;
use store_api::storage::RegionId;
@@ -32,6 +33,7 @@ use crate::compaction::twcs::TwcsPicker;
use crate::error::{
CompactRegionSnafu, Error, RegionClosedSnafu, RegionDroppedSnafu, RegionTruncatedSnafu, Result,
};
use crate::metrics::{COMPACTION_STAGE_ELAPSED, STAGE_LABEL};
use crate::region::options::CompactionOptions;
use crate::region::version::{VersionControlRef, VersionRef};
use crate::request::{OptionOutputTx, OutputTx, WorkerRequest};
@@ -47,6 +49,8 @@ pub struct CompactionRequest {
/// Waiters of the compaction request.
pub(crate) waiters: Vec<OutputTx>,
pub(crate) file_purger: FilePurgerRef,
/// Start time of compaction task.
pub(crate) start_time: Instant,
}
impl CompactionRequest {
@@ -175,11 +179,14 @@ impl CompactionScheduler {
"Pick compaction strategy {:?} for region: {}",
picker, region_id
);
let pick_timer = timer!(COMPACTION_STAGE_ELAPSED, &[(STAGE_LABEL, "pick")]);
let Some(mut task) = picker.pick(request) else {
// Nothing to compact, remove it from the region status map.
self.region_status.remove(&region_id);
return Ok(());
};
drop(pick_timer);
// Submit the compaction task.
self.scheduler
@@ -188,10 +195,8 @@ impl CompactionScheduler {
}))
.map_err(|e| {
error!(e; "Failed to submit compaction request for region {}", region_id);
// If failed to submit the job, we need to remove the region from the scheduler.
self.region_status.remove(&region_id);
e
})
}
@@ -295,12 +300,14 @@ impl CompactionStatus {
waiter: OptionOutputTx,
) -> CompactionRequest {
let current_version = self.version_control.current().version;
let start_time = Instant::now();
let mut req = CompactionRequest {
current_version,
access_layer: self.access_layer.clone(),
request_sender: request_sender.clone(),
waiters: Vec::new(),
file_purger: self.file_purger.clone(),
start_time,
};
if let Some(pending) = self.pending_compaction.take() {

View File

@@ -15,14 +15,15 @@
use std::collections::BTreeMap;
use std::fmt::{Debug, Formatter};
use std::sync::Arc;
use std::time::Duration;
use std::time::{Duration, Instant};
use common_base::readable_size::ReadableSize;
use common_query::Output;
use common_telemetry::{debug, error, info};
use common_telemetry::{debug, error, info, timer};
use common_time::timestamp::TimeUnit;
use common_time::timestamp_millis::BucketAligned;
use common_time::Timestamp;
use metrics::increment_counter;
use snafu::ResultExt;
use store_api::metadata::RegionMetadataRef;
use store_api::storage::RegionId;
@@ -34,6 +35,7 @@ use crate::compaction::picker::{CompactionTask, Picker};
use crate::compaction::CompactionRequest;
use crate::error;
use crate::error::CompactRegionSnafu;
use crate::metrics::{COMPACTION_FAILURE_COUNT, COMPACTION_STAGE_ELAPSED, STAGE_LABEL};
use crate::request::{
BackgroundNotify, CompactionFailed, CompactionFinished, OutputTx, WorkerRequest,
};
@@ -118,6 +120,7 @@ impl Picker for TwcsPicker {
request_sender,
waiters,
file_purger,
start_time,
} = req;
let region_metadata = current_version.metadata.clone();
@@ -170,6 +173,7 @@ impl Picker for TwcsPicker {
request_sender,
waiters,
file_purger,
start_time,
};
Some(Box::new(task))
}
@@ -228,6 +232,8 @@ pub(crate) struct TwcsCompactionTask {
pub(crate) request_sender: mpsc::Sender<WorkerRequest>,
/// Senders that are used to notify waiters waiting for pending compaction tasks.
pub waiters: Vec<OutputTx>,
/// Start time of compaction task
pub start_time: Instant,
}
impl Debug for TwcsCompactionTask {
@@ -310,8 +316,10 @@ impl TwcsCompactionTask {
async fn handle_compaction(&mut self) -> error::Result<(Vec<FileMeta>, Vec<FileMeta>)> {
self.mark_files_compacting(true);
let merge_timer = timer!(COMPACTION_STAGE_ELAPSED, &[(STAGE_LABEL, "merge")]);
let (output, mut compacted) = self.merge_ssts().await.map_err(|e| {
error!(e; "Failed to compact region: {}", self.region_id);
merge_timer.discard();
e
})?;
compacted.extend(self.expired_ssts.iter().map(FileHandle::meta));
@@ -320,6 +328,7 @@ impl TwcsCompactionTask {
/// Handles compaction failure, notifies all waiters.
fn on_failure(&mut self, err: Arc<error::Error>) {
increment_counter!(COMPACTION_FAILURE_COUNT);
for waiter in self.waiters.drain(..) {
waiter.send(Err(err.clone()).context(CompactRegionSnafu {
region_id: self.region_id,
@@ -357,6 +366,7 @@ impl CompactionTask for TwcsCompactionTask {
compaction_time_window: self
.compaction_time_window
.map(|seconds| Duration::from_secs(seconds as u64)),
start_time: self.start_time,
})
}
Err(e) => {

View File

@@ -12,6 +12,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.
/// Stage label.
pub const STAGE_LABEL: &str = "stage";
/// Global write buffer size in bytes.
pub const WRITE_BUFFER_BYTES: &str = "mito.write_buffer_bytes";
/// Type label.
@@ -42,8 +45,16 @@ pub const WRITE_STALL_TOTAL: &str = "mito.write.stall_total";
pub const WRITE_REJECT_TOTAL: &str = "mito.write.reject_total";
/// Elapsed time of each write stage.
pub const WRITE_STAGE_ELAPSED: &str = "mito.write.stage_elapsed";
/// Stage label.
pub const STAGE_LABEL: &str = "stage";
/// Counter of rows to write.
pub const WRITE_ROWS_TOTAL: &str = "mito.write.rows_total";
// ------ End of write related metrics
// Compaction metrics
/// Timer of different stages in compaction.
pub const COMPACTION_STAGE_ELAPSED: &str = "mito.compaction.stage_elapsed";
/// Timer of whole compaction task.
pub const COMPACTION_ELAPSED_TOTAL: &str = "mito.compaction.total_elapsed";
/// Counter of all requested compaction task.
pub const COMPACTION_REQUEST_COUNT: &str = "mito.compaction.requests_total";
/// Counter of failed compaction task.
pub const COMPACTION_FAILURE_COUNT: &str = "mito.compaction.failure_total";

View File

@@ -16,7 +16,7 @@
use std::collections::HashMap;
use std::sync::Arc;
use std::time::Duration;
use std::time::{Duration, Instant};
use api::helper::{
is_column_type_value_eq, is_semantic_type_eq, proto_value_type, to_column_data_type,
@@ -29,6 +29,7 @@ use common_telemetry::metric::Timer;
use common_telemetry::tracing::log::info;
use common_telemetry::warn;
use datatypes::prelude::DataType;
use metrics::histogram;
use prost::Message;
use smallvec::SmallVec;
use snafu::{ensure, OptionExt, ResultExt};
@@ -45,6 +46,7 @@ use crate::error::{
InvalidRequestSnafu, Result,
};
use crate::memtable::MemtableId;
use crate::metrics::COMPACTION_ELAPSED_TOTAL;
use crate::sst::file::FileMeta;
use crate::sst::file_purger::{FilePurgerRef, PurgeRequest};
use crate::wal::EntryId;
@@ -646,10 +648,15 @@ pub(crate) struct CompactionFinished {
pub(crate) file_purger: FilePurgerRef,
/// Inferred Compaction time window.
pub(crate) compaction_time_window: Option<Duration>,
/// Start time of compaction task.
pub(crate) start_time: Instant,
}
impl CompactionFinished {
pub fn on_success(self) {
// only update compaction time on success
histogram!(COMPACTION_ELAPSED_TOTAL, self.start_time.elapsed());
for sender in self.senders {
sender.send(Ok(AffectedRows(0)));
}

View File

@@ -12,11 +12,13 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use common_telemetry::{error, info};
use common_telemetry::{error, info, timer};
use metrics::increment_counter;
use store_api::logstore::LogStore;
use store_api::storage::RegionId;
use crate::manifest::action::{RegionEdit, RegionMetaAction, RegionMetaActionList};
use crate::metrics::{COMPACTION_REQUEST_COUNT, COMPACTION_STAGE_ELAPSED, STAGE_LABEL};
use crate::request::{CompactionFailed, CompactionFinished, OnFailure, OptionOutputTx};
use crate::worker::RegionWorkerLoop;
@@ -30,7 +32,7 @@ impl<S: LogStore> RegionWorkerLoop<S> {
let Some(region) = self.regions.writable_region_or(region_id, &mut sender) else {
return;
};
increment_counter!(COMPACTION_REQUEST_COUNT);
if let Err(e) = self.compaction_scheduler.schedule_compaction(
region.region_id,
&region.version_control,
@@ -57,27 +59,33 @@ impl<S: LogStore> RegionWorkerLoop<S> {
return;
};
// Write region edit to manifest.
let edit = RegionEdit {
files_to_add: std::mem::take(&mut request.compaction_outputs),
files_to_remove: std::mem::take(&mut request.compacted_files),
compaction_time_window: request.compaction_time_window,
flushed_entry_id: None,
flushed_sequence: None,
};
let action_list = RegionMetaActionList::with_action(RegionMetaAction::Edit(edit.clone()));
if let Err(e) = region.manifest_manager.update(action_list).await {
error!(e; "Failed to update manifest, region: {}", region_id);
request.on_failure(e);
return;
{
let manifest_timer =
timer!(COMPACTION_STAGE_ELAPSED, &[(STAGE_LABEL, "write_manifest")]);
// Write region edit to manifest.
let edit = RegionEdit {
files_to_add: std::mem::take(&mut request.compaction_outputs),
files_to_remove: std::mem::take(&mut request.compacted_files),
compaction_time_window: request.compaction_time_window,
flushed_entry_id: None,
flushed_sequence: None,
};
let action_list =
RegionMetaActionList::with_action(RegionMetaAction::Edit(edit.clone()));
if let Err(e) = region.manifest_manager.update(action_list).await {
error!(e; "Failed to update manifest, region: {}", region_id);
manifest_timer.discard();
request.on_failure(e);
return;
}
// Apply edit to region's version.
region
.version_control
.apply_edit(edit, &[], region.file_purger.clone());
}
// Apply edit to region's version.
region
.version_control
.apply_edit(edit, &[], region.file_purger.clone());
// compaction finished.
request.on_success();
// Schedule next compaction if necessary.
self.compaction_scheduler.on_compaction_finished(region_id);
}