mirror of
https://github.com/GreptimeTeam/greptimedb.git
synced 2026-06-01 04:40:39 +00:00
chore: more gc metrics (#7661)
* chore: more gc metrics Signed-off-by: discord9 <discord9@163.com> * clippy Signed-off-by: discord9 <discord9@163.com> * refactor: simple metrics Signed-off-by: discord9 <discord9@163.com> * unused metrics Signed-off-by: discord9 <discord9@163.com> * fix(meta-srv): count need-retry regions in GC failure metric Signed-off-by: discord9 <discord9@163.com> * chore: better bucketing Signed-off-by: discord9 <discord9@163.com> --------- Signed-off-by: discord9 <discord9@163.com>
This commit is contained in:
@@ -32,6 +32,7 @@ use crate::gc::candidate::GcCandidate;
|
||||
use crate::gc::dropped::DroppedRegionCollector;
|
||||
use crate::gc::scheduler::{GcJobReport, GcScheduler};
|
||||
use crate::gc::tracker::RegionGcInfo;
|
||||
use crate::metrics::METRIC_META_GC_CANDIDATE_REGIONS;
|
||||
|
||||
impl GcScheduler {
|
||||
pub(crate) async fn trigger_gc(&self) -> Result<GcJobReport> {
|
||||
@@ -51,6 +52,13 @@ impl GcScheduler {
|
||||
let dropped_collector =
|
||||
DroppedRegionCollector::new(self.ctx.as_ref(), &self.config, &self.region_gc_tracker);
|
||||
let dropped_assignment = dropped_collector.collect_and_assign(&table_reparts).await?;
|
||||
let candidate_count: usize = per_table_candidates.values().map(|c| c.len()).sum();
|
||||
let dropped_count: usize = dropped_assignment
|
||||
.regions_by_peer
|
||||
.values()
|
||||
.map(|regions| regions.len())
|
||||
.sum();
|
||||
METRIC_META_GC_CANDIDATE_REGIONS.set((candidate_count + dropped_count) as i64);
|
||||
|
||||
if per_table_candidates.is_empty() && dropped_assignment.regions_by_peer.is_empty() {
|
||||
info!("No GC candidates found, skipping GC cycle");
|
||||
|
||||
@@ -40,6 +40,7 @@ use crate::error::{self, KvBackendSnafu, Result, SerializeToJsonSnafu, TableMeta
|
||||
use crate::gc::util::table_route_to_region;
|
||||
use crate::gc::{Peer2Regions, Region2Peers};
|
||||
use crate::handler::HeartbeatMailbox;
|
||||
use crate::metrics::{METRIC_META_GC_DATANODE_CALLS_TOTAL, METRIC_META_GC_FAILED_REGIONS_TOTAL};
|
||||
use crate::service::mailbox::{Channel, MailboxRef};
|
||||
|
||||
/// Helper function to send GetFileRefs instruction and wait for reply.
|
||||
@@ -49,6 +50,27 @@ async fn send_get_file_refs(
|
||||
peer: &Peer,
|
||||
instruction: GetFileRefs,
|
||||
timeout: Duration,
|
||||
) -> Result<GetFileRefsReply> {
|
||||
let result = send_get_file_refs_inner(mailbox, server_addr, peer, instruction, timeout).await;
|
||||
|
||||
match &result {
|
||||
Ok(_) => METRIC_META_GC_DATANODE_CALLS_TOTAL
|
||||
.with_label_values(&["get_file_refs", "success"])
|
||||
.inc(),
|
||||
Err(_) => METRIC_META_GC_DATANODE_CALLS_TOTAL
|
||||
.with_label_values(&["get_file_refs", "error"])
|
||||
.inc(),
|
||||
}
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
async fn send_get_file_refs_inner(
|
||||
mailbox: &MailboxRef,
|
||||
server_addr: &str,
|
||||
peer: &Peer,
|
||||
instruction: GetFileRefs,
|
||||
timeout: Duration,
|
||||
) -> Result<GetFileRefsReply> {
|
||||
let instruction = instruction::Instruction::GetFileRefs(instruction);
|
||||
let msg = MailboxMessage::json_message(
|
||||
@@ -96,6 +118,43 @@ async fn send_gc_regions(
|
||||
server_addr: &str,
|
||||
timeout: Duration,
|
||||
description: &str,
|
||||
) -> Result<GcReport> {
|
||||
let failed_region_count = gc_regions.regions.len() as u64;
|
||||
let result =
|
||||
send_gc_regions_inner(mailbox, peer, gc_regions, server_addr, timeout, description).await;
|
||||
|
||||
match result {
|
||||
Ok(report) => {
|
||||
METRIC_META_GC_DATANODE_CALLS_TOTAL
|
||||
.with_label_values(&["gc_regions", "success"])
|
||||
.inc();
|
||||
|
||||
let need_retry_count = report.need_retry_regions.len() as u64;
|
||||
if need_retry_count > 0 {
|
||||
METRIC_META_GC_FAILED_REGIONS_TOTAL.inc_by(need_retry_count);
|
||||
}
|
||||
|
||||
Ok(report)
|
||||
}
|
||||
Err(e) => {
|
||||
METRIC_META_GC_DATANODE_CALLS_TOTAL
|
||||
.with_label_values(&["gc_regions", "error"])
|
||||
.inc();
|
||||
if failed_region_count > 0 {
|
||||
METRIC_META_GC_FAILED_REGIONS_TOTAL.inc_by(failed_region_count);
|
||||
}
|
||||
Err(e)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn send_gc_regions_inner(
|
||||
mailbox: &MailboxRef,
|
||||
peer: &Peer,
|
||||
gc_regions: GcRegions,
|
||||
server_addr: &str,
|
||||
timeout: Duration,
|
||||
description: &str,
|
||||
) -> Result<GcReport> {
|
||||
let instruction = instruction::Instruction::GcRegions(gc_regions.clone());
|
||||
let msg = MailboxMessage::json_message(
|
||||
|
||||
@@ -30,6 +30,9 @@ use crate::error::{Error, Result};
|
||||
use crate::gc::ctx::{DefaultGcSchedulerCtx, SchedulerCtx};
|
||||
use crate::gc::options::{GcSchedulerOptions, TICKER_INTERVAL};
|
||||
use crate::gc::tracker::RegionGcTracker;
|
||||
use crate::metrics::{
|
||||
METRIC_META_GC_SCHEDULER_CYCLES_TOTAL, METRIC_META_GC_SCHEDULER_DURATION_SECONDS,
|
||||
};
|
||||
use crate::service::mailbox::MailboxRef;
|
||||
|
||||
/// Report for a GC job.
|
||||
@@ -141,6 +144,8 @@ impl GcScheduler {
|
||||
}
|
||||
|
||||
pub(crate) async fn handle_tick(&self) -> Result<GcJobReport> {
|
||||
METRIC_META_GC_SCHEDULER_CYCLES_TOTAL.inc();
|
||||
let _timer = METRIC_META_GC_SCHEDULER_DURATION_SECONDS.start_timer();
|
||||
info!("Start to trigger gc");
|
||||
let report = self.trigger_gc().await?;
|
||||
|
||||
|
||||
@@ -90,4 +90,41 @@ lazy_static! {
|
||||
/// The topic estimated replay size.
|
||||
pub static ref METRIC_META_TOPIC_ESTIMATED_REPLAY_SIZE: IntGaugeVec =
|
||||
register_int_gauge_vec!("meta_topic_estimated_replay_size", "meta topic estimated replay size", &["topic_name"]).unwrap();
|
||||
|
||||
/// Total GC scheduler cycles.
|
||||
pub static ref METRIC_META_GC_SCHEDULER_CYCLES_TOTAL: IntCounter = register_int_counter!(
|
||||
"greptime_metasrv_gc_scheduler_cycles_total",
|
||||
"Total GC scheduler cycles",
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
/// Time for a full GC scheduler cycle.
|
||||
pub static ref METRIC_META_GC_SCHEDULER_DURATION_SECONDS: Histogram = register_histogram!(
|
||||
"greptime_metasrv_gc_scheduler_duration_seconds",
|
||||
"Time for a full GC scheduler cycle",
|
||||
vec![0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 30.0, 60.0, 120.0, 300.0],
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
/// Number of GC candidate regions.
|
||||
pub static ref METRIC_META_GC_CANDIDATE_REGIONS: IntGauge = register_int_gauge!(
|
||||
"greptime_metasrv_gc_candidate_regions",
|
||||
"Number of GC candidate regions",
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
/// Calls to datanodes for GC.
|
||||
pub static ref METRIC_META_GC_DATANODE_CALLS_TOTAL: IntCounterVec = register_int_counter_vec!(
|
||||
"greptime_metasrv_gc_datanode_calls_total",
|
||||
"Calls to datanodes for GC",
|
||||
&["operation", "status"],
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
/// Total regions that failed GC.
|
||||
pub static ref METRIC_META_GC_FAILED_REGIONS_TOTAL: IntCounter = register_int_counter!(
|
||||
"greptime_metasrv_gc_failed_regions_total",
|
||||
"Total regions that failed GC",
|
||||
)
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
@@ -45,7 +45,10 @@ use crate::error::{
|
||||
TooManyGcJobsSnafu, UnexpectedSnafu,
|
||||
};
|
||||
use crate::manifest::action::{RegionManifest, RemovedFile};
|
||||
use crate::metrics::{GC_DELETE_FILE_CNT, GC_ORPHANED_INDEX_FILES, GC_SKIPPED_UNPARSABLE_FILES};
|
||||
use crate::metrics::{
|
||||
GC_DELETE_FILE_CNT, GC_DURATION_SECONDS, GC_ERRORS_TOTAL, GC_FILES_DELETED_TOTAL,
|
||||
GC_ORPHANED_INDEX_FILES, GC_RUNS_TOTAL, GC_SKIPPED_UNPARSABLE_FILES,
|
||||
};
|
||||
use crate::region::{MitoRegionRef, RegionRoleState};
|
||||
use crate::sst::file::{RegionFileId, RegionIndexId, delete_files, delete_index};
|
||||
use crate::sst::location::{self};
|
||||
@@ -267,6 +270,9 @@ impl LocalGcWorker {
|
||||
/// TODO(discord9): consider instead running in parallel mode
|
||||
pub async fn run(self) -> Result<GcReport> {
|
||||
info!("LocalGcWorker started");
|
||||
let _timer = GC_DURATION_SECONDS
|
||||
.with_label_values(&["total"])
|
||||
.start_timer();
|
||||
let now = std::time::Instant::now();
|
||||
|
||||
let mut deleted_files = HashMap::new();
|
||||
@@ -336,6 +342,12 @@ impl LocalGcWorker {
|
||||
region: Option<MitoRegionRef>,
|
||||
tmp_ref_files: &HashSet<FileRef>,
|
||||
) -> Result<Vec<RemovedFile>> {
|
||||
let mode = if self.full_file_listing {
|
||||
"full_listing"
|
||||
} else {
|
||||
"fast"
|
||||
};
|
||||
GC_RUNS_TOTAL.with_label_values(&[mode]).inc();
|
||||
debug!(
|
||||
"Doing gc for region {}, {}",
|
||||
region_id,
|
||||
@@ -370,6 +382,9 @@ impl LocalGcWorker {
|
||||
region.region_id(),
|
||||
manifest.manifest_version
|
||||
);
|
||||
GC_ERRORS_TOTAL
|
||||
.with_label_values(&["manifest_mismatch"])
|
||||
.inc();
|
||||
return Ok(vec![]);
|
||||
}
|
||||
Some(manifest)
|
||||
@@ -460,6 +475,9 @@ impl LocalGcWorker {
|
||||
region_id
|
||||
);
|
||||
|
||||
let _delete_timer = GC_DURATION_SECONDS
|
||||
.with_label_values(&["delete_files"])
|
||||
.start_timer();
|
||||
self.delete_files(region_id, &deletable_files).await?;
|
||||
|
||||
debug!(
|
||||
@@ -467,6 +485,9 @@ impl LocalGcWorker {
|
||||
unused_file_cnt, region_id
|
||||
);
|
||||
if let Some(region) = ®ion {
|
||||
let _update_timer = GC_DURATION_SECONDS
|
||||
.with_label_values(&["update_manifest"])
|
||||
.start_timer();
|
||||
self.update_manifest_removed_files(region, deletable_files.clone())
|
||||
.await?;
|
||||
}
|
||||
@@ -497,12 +518,26 @@ impl LocalGcWorker {
|
||||
)
|
||||
.await?;
|
||||
|
||||
for index_id in index_ids {
|
||||
delete_index(index_id, &self.access_layer, &self.cache_manager).await?;
|
||||
if !file_pairs.is_empty() {
|
||||
let deleted_count = file_pairs.len() as u64;
|
||||
GC_FILES_DELETED_TOTAL
|
||||
.with_label_values(&["parquet"])
|
||||
.inc_by(deleted_count);
|
||||
GC_DELETE_FILE_CNT.inc_by(deleted_count);
|
||||
}
|
||||
|
||||
// FIXME(discord9): if files are already deleted before calling delete_files, the metric will be inaccurate, no clean way to fix it now
|
||||
GC_DELETE_FILE_CNT.add(removed_files.len() as i64);
|
||||
for index_id in index_ids {
|
||||
match delete_index(index_id, &self.access_layer, &self.cache_manager).await {
|
||||
Ok(()) => {
|
||||
GC_FILES_DELETED_TOTAL.with_label_values(&["index"]).inc();
|
||||
GC_DELETE_FILE_CNT.inc();
|
||||
}
|
||||
Err(err) => {
|
||||
GC_ERRORS_TOTAL.with_label_values(&["delete_failed"]).inc();
|
||||
return Err(err);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -588,16 +623,29 @@ impl LocalGcWorker {
|
||||
region_id: RegionId,
|
||||
file_cnt_hint: usize,
|
||||
) -> Result<Vec<Entry>> {
|
||||
let _timer = GC_DURATION_SECONDS
|
||||
.with_label_values(&["list_files"])
|
||||
.start_timer();
|
||||
let start = tokio::time::Instant::now();
|
||||
let concurrency = (file_cnt_hint / Self::CONCURRENCY_LIST_PER_FILES)
|
||||
.max(1)
|
||||
.min(self.opt.max_concurrent_lister_per_gc_job);
|
||||
|
||||
let listers = self.partition_region_files(region_id, concurrency).await?;
|
||||
let listers = self
|
||||
.partition_region_files(region_id, concurrency)
|
||||
.await
|
||||
.inspect_err(|_| {
|
||||
GC_ERRORS_TOTAL.with_label_values(&["list_failed"]).inc();
|
||||
})?;
|
||||
let lister_cnt = listers.len();
|
||||
|
||||
// Step 2: Concurrently list all files in the region directory
|
||||
let all_entries = self.list_region_files_concurrent(listers).await?;
|
||||
let all_entries = self
|
||||
.list_region_files_concurrent(listers)
|
||||
.await
|
||||
.inspect_err(|_| {
|
||||
GC_ERRORS_TOTAL.with_label_values(&["list_failed"]).inc();
|
||||
})?;
|
||||
let cnt = all_entries.len();
|
||||
info!(
|
||||
"gc: full listing mode cost {} secs using {lister_cnt} lister for {cnt} files in region {}.",
|
||||
|
||||
@@ -508,8 +508,8 @@ lazy_static! {
|
||||
.unwrap();
|
||||
|
||||
/// Counter for the number of files deleted by the GC worker.
|
||||
pub static ref GC_DELETE_FILE_CNT: IntGauge =
|
||||
register_int_gauge!(
|
||||
pub static ref GC_DELETE_FILE_CNT: IntCounter =
|
||||
register_int_counter!(
|
||||
"greptime_mito_gc_delete_file_count",
|
||||
"mito gc deleted file count",
|
||||
).unwrap();
|
||||
@@ -528,6 +528,35 @@ lazy_static! {
|
||||
"mito gc orphaned index files count",
|
||||
).unwrap();
|
||||
|
||||
/// Histogram for GC operation duration by stage.
|
||||
pub static ref GC_DURATION_SECONDS: HistogramVec = register_histogram_vec!(
|
||||
"greptime_mito_gc_duration_seconds",
|
||||
"GC operation duration by stage",
|
||||
&[STAGE_LABEL],
|
||||
exponential_buckets(0.01, 10.0, 6).unwrap(),
|
||||
).unwrap();
|
||||
|
||||
/// Counter for GC runs by mode.
|
||||
pub static ref GC_RUNS_TOTAL: IntCounterVec = register_int_counter_vec!(
|
||||
"greptime_mito_gc_runs_total",
|
||||
"Total GC runs by mode",
|
||||
&["mode"],
|
||||
).unwrap();
|
||||
|
||||
/// Counter for GC errors by type.
|
||||
pub static ref GC_ERRORS_TOTAL: IntCounterVec = register_int_counter_vec!(
|
||||
"greptime_mito_gc_errors_total",
|
||||
"Total GC errors by type",
|
||||
&["error_type"],
|
||||
).unwrap();
|
||||
|
||||
/// Counter for total files deleted by GC, labeled by file type.
|
||||
pub static ref GC_FILES_DELETED_TOTAL: IntCounterVec = register_int_counter_vec!(
|
||||
"greptime_mito_gc_files_deleted_total",
|
||||
"Total files deleted by GC",
|
||||
&[FILE_TYPE_LABEL],
|
||||
).unwrap();
|
||||
|
||||
/// Total number of files downloaded during cache fill on region open.
|
||||
pub static ref CACHE_FILL_DOWNLOADED_FILES: IntCounter = register_int_counter!(
|
||||
"mito_cache_fill_downloaded_files",
|
||||
|
||||
Reference in New Issue
Block a user