chore: more gc metrics (#7661)

* chore: more gc metrics

Signed-off-by: discord9 <discord9@163.com>

* clippy

Signed-off-by: discord9 <discord9@163.com>

* refactor: simple metrics

Signed-off-by: discord9 <discord9@163.com>

* unused metrics

Signed-off-by: discord9 <discord9@163.com>

* fix(meta-srv): count need-retry regions in GC failure metric

Signed-off-by: discord9 <discord9@163.com>

* chore: better bucketing

Signed-off-by: discord9 <discord9@163.com>

---------

Signed-off-by: discord9 <discord9@163.com>
This commit is contained in:
discord9
2026-02-24 16:21:10 +08:00
committed by GitHub
parent 0e0a892f6b
commit 279b009583
6 changed files with 195 additions and 9 deletions

View File

@@ -32,6 +32,7 @@ use crate::gc::candidate::GcCandidate;
use crate::gc::dropped::DroppedRegionCollector;
use crate::gc::scheduler::{GcJobReport, GcScheduler};
use crate::gc::tracker::RegionGcInfo;
use crate::metrics::METRIC_META_GC_CANDIDATE_REGIONS;
impl GcScheduler {
pub(crate) async fn trigger_gc(&self) -> Result<GcJobReport> {
@@ -51,6 +52,13 @@ impl GcScheduler {
let dropped_collector =
DroppedRegionCollector::new(self.ctx.as_ref(), &self.config, &self.region_gc_tracker);
let dropped_assignment = dropped_collector.collect_and_assign(&table_reparts).await?;
let candidate_count: usize = per_table_candidates.values().map(|c| c.len()).sum();
let dropped_count: usize = dropped_assignment
.regions_by_peer
.values()
.map(|regions| regions.len())
.sum();
METRIC_META_GC_CANDIDATE_REGIONS.set((candidate_count + dropped_count) as i64);
if per_table_candidates.is_empty() && dropped_assignment.regions_by_peer.is_empty() {
info!("No GC candidates found, skipping GC cycle");

View File

@@ -40,6 +40,7 @@ use crate::error::{self, KvBackendSnafu, Result, SerializeToJsonSnafu, TableMeta
use crate::gc::util::table_route_to_region;
use crate::gc::{Peer2Regions, Region2Peers};
use crate::handler::HeartbeatMailbox;
use crate::metrics::{METRIC_META_GC_DATANODE_CALLS_TOTAL, METRIC_META_GC_FAILED_REGIONS_TOTAL};
use crate::service::mailbox::{Channel, MailboxRef};
/// Helper function to send GetFileRefs instruction and wait for reply.
@@ -49,6 +50,27 @@ async fn send_get_file_refs(
peer: &Peer,
instruction: GetFileRefs,
timeout: Duration,
) -> Result<GetFileRefsReply> {
let result = send_get_file_refs_inner(mailbox, server_addr, peer, instruction, timeout).await;
match &result {
Ok(_) => METRIC_META_GC_DATANODE_CALLS_TOTAL
.with_label_values(&["get_file_refs", "success"])
.inc(),
Err(_) => METRIC_META_GC_DATANODE_CALLS_TOTAL
.with_label_values(&["get_file_refs", "error"])
.inc(),
}
result
}
async fn send_get_file_refs_inner(
mailbox: &MailboxRef,
server_addr: &str,
peer: &Peer,
instruction: GetFileRefs,
timeout: Duration,
) -> Result<GetFileRefsReply> {
let instruction = instruction::Instruction::GetFileRefs(instruction);
let msg = MailboxMessage::json_message(
@@ -96,6 +118,43 @@ async fn send_gc_regions(
server_addr: &str,
timeout: Duration,
description: &str,
) -> Result<GcReport> {
let failed_region_count = gc_regions.regions.len() as u64;
let result =
send_gc_regions_inner(mailbox, peer, gc_regions, server_addr, timeout, description).await;
match result {
Ok(report) => {
METRIC_META_GC_DATANODE_CALLS_TOTAL
.with_label_values(&["gc_regions", "success"])
.inc();
let need_retry_count = report.need_retry_regions.len() as u64;
if need_retry_count > 0 {
METRIC_META_GC_FAILED_REGIONS_TOTAL.inc_by(need_retry_count);
}
Ok(report)
}
Err(e) => {
METRIC_META_GC_DATANODE_CALLS_TOTAL
.with_label_values(&["gc_regions", "error"])
.inc();
if failed_region_count > 0 {
METRIC_META_GC_FAILED_REGIONS_TOTAL.inc_by(failed_region_count);
}
Err(e)
}
}
}
async fn send_gc_regions_inner(
mailbox: &MailboxRef,
peer: &Peer,
gc_regions: GcRegions,
server_addr: &str,
timeout: Duration,
description: &str,
) -> Result<GcReport> {
let instruction = instruction::Instruction::GcRegions(gc_regions.clone());
let msg = MailboxMessage::json_message(

View File

@@ -30,6 +30,9 @@ use crate::error::{Error, Result};
use crate::gc::ctx::{DefaultGcSchedulerCtx, SchedulerCtx};
use crate::gc::options::{GcSchedulerOptions, TICKER_INTERVAL};
use crate::gc::tracker::RegionGcTracker;
use crate::metrics::{
METRIC_META_GC_SCHEDULER_CYCLES_TOTAL, METRIC_META_GC_SCHEDULER_DURATION_SECONDS,
};
use crate::service::mailbox::MailboxRef;
/// Report for a GC job.
@@ -141,6 +144,8 @@ impl GcScheduler {
}
pub(crate) async fn handle_tick(&self) -> Result<GcJobReport> {
METRIC_META_GC_SCHEDULER_CYCLES_TOTAL.inc();
let _timer = METRIC_META_GC_SCHEDULER_DURATION_SECONDS.start_timer();
info!("Start to trigger gc");
let report = self.trigger_gc().await?;

View File

@@ -90,4 +90,41 @@ lazy_static! {
/// The topic estimated replay size.
pub static ref METRIC_META_TOPIC_ESTIMATED_REPLAY_SIZE: IntGaugeVec =
register_int_gauge_vec!("meta_topic_estimated_replay_size", "meta topic estimated replay size", &["topic_name"]).unwrap();
/// Total GC scheduler cycles.
pub static ref METRIC_META_GC_SCHEDULER_CYCLES_TOTAL: IntCounter = register_int_counter!(
"greptime_metasrv_gc_scheduler_cycles_total",
"Total GC scheduler cycles",
)
.unwrap();
/// Time for a full GC scheduler cycle.
pub static ref METRIC_META_GC_SCHEDULER_DURATION_SECONDS: Histogram = register_histogram!(
"greptime_metasrv_gc_scheduler_duration_seconds",
"Time for a full GC scheduler cycle",
vec![0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 30.0, 60.0, 120.0, 300.0],
)
.unwrap();
/// Number of GC candidate regions.
pub static ref METRIC_META_GC_CANDIDATE_REGIONS: IntGauge = register_int_gauge!(
"greptime_metasrv_gc_candidate_regions",
"Number of GC candidate regions",
)
.unwrap();
/// Calls to datanodes for GC.
pub static ref METRIC_META_GC_DATANODE_CALLS_TOTAL: IntCounterVec = register_int_counter_vec!(
"greptime_metasrv_gc_datanode_calls_total",
"Calls to datanodes for GC",
&["operation", "status"],
)
.unwrap();
/// Total regions that failed GC.
pub static ref METRIC_META_GC_FAILED_REGIONS_TOTAL: IntCounter = register_int_counter!(
"greptime_metasrv_gc_failed_regions_total",
"Total regions that failed GC",
)
.unwrap();
}

View File

@@ -45,7 +45,10 @@ use crate::error::{
TooManyGcJobsSnafu, UnexpectedSnafu,
};
use crate::manifest::action::{RegionManifest, RemovedFile};
use crate::metrics::{GC_DELETE_FILE_CNT, GC_ORPHANED_INDEX_FILES, GC_SKIPPED_UNPARSABLE_FILES};
use crate::metrics::{
GC_DELETE_FILE_CNT, GC_DURATION_SECONDS, GC_ERRORS_TOTAL, GC_FILES_DELETED_TOTAL,
GC_ORPHANED_INDEX_FILES, GC_RUNS_TOTAL, GC_SKIPPED_UNPARSABLE_FILES,
};
use crate::region::{MitoRegionRef, RegionRoleState};
use crate::sst::file::{RegionFileId, RegionIndexId, delete_files, delete_index};
use crate::sst::location::{self};
@@ -267,6 +270,9 @@ impl LocalGcWorker {
/// TODO(discord9): consider instead running in parallel mode
pub async fn run(self) -> Result<GcReport> {
info!("LocalGcWorker started");
let _timer = GC_DURATION_SECONDS
.with_label_values(&["total"])
.start_timer();
let now = std::time::Instant::now();
let mut deleted_files = HashMap::new();
@@ -336,6 +342,12 @@ impl LocalGcWorker {
region: Option<MitoRegionRef>,
tmp_ref_files: &HashSet<FileRef>,
) -> Result<Vec<RemovedFile>> {
let mode = if self.full_file_listing {
"full_listing"
} else {
"fast"
};
GC_RUNS_TOTAL.with_label_values(&[mode]).inc();
debug!(
"Doing gc for region {}, {}",
region_id,
@@ -370,6 +382,9 @@ impl LocalGcWorker {
region.region_id(),
manifest.manifest_version
);
GC_ERRORS_TOTAL
.with_label_values(&["manifest_mismatch"])
.inc();
return Ok(vec![]);
}
Some(manifest)
@@ -460,6 +475,9 @@ impl LocalGcWorker {
region_id
);
let _delete_timer = GC_DURATION_SECONDS
.with_label_values(&["delete_files"])
.start_timer();
self.delete_files(region_id, &deletable_files).await?;
debug!(
@@ -467,6 +485,9 @@ impl LocalGcWorker {
unused_file_cnt, region_id
);
if let Some(region) = &region {
let _update_timer = GC_DURATION_SECONDS
.with_label_values(&["update_manifest"])
.start_timer();
self.update_manifest_removed_files(region, deletable_files.clone())
.await?;
}
@@ -497,12 +518,26 @@ impl LocalGcWorker {
)
.await?;
for index_id in index_ids {
delete_index(index_id, &self.access_layer, &self.cache_manager).await?;
if !file_pairs.is_empty() {
let deleted_count = file_pairs.len() as u64;
GC_FILES_DELETED_TOTAL
.with_label_values(&["parquet"])
.inc_by(deleted_count);
GC_DELETE_FILE_CNT.inc_by(deleted_count);
}
// FIXME(discord9): if files are already deleted before calling delete_files, the metric will be inaccurate, no clean way to fix it now
GC_DELETE_FILE_CNT.add(removed_files.len() as i64);
for index_id in index_ids {
match delete_index(index_id, &self.access_layer, &self.cache_manager).await {
Ok(()) => {
GC_FILES_DELETED_TOTAL.with_label_values(&["index"]).inc();
GC_DELETE_FILE_CNT.inc();
}
Err(err) => {
GC_ERRORS_TOTAL.with_label_values(&["delete_failed"]).inc();
return Err(err);
}
}
}
Ok(())
}
@@ -588,16 +623,29 @@ impl LocalGcWorker {
region_id: RegionId,
file_cnt_hint: usize,
) -> Result<Vec<Entry>> {
let _timer = GC_DURATION_SECONDS
.with_label_values(&["list_files"])
.start_timer();
let start = tokio::time::Instant::now();
let concurrency = (file_cnt_hint / Self::CONCURRENCY_LIST_PER_FILES)
.max(1)
.min(self.opt.max_concurrent_lister_per_gc_job);
let listers = self.partition_region_files(region_id, concurrency).await?;
let listers = self
.partition_region_files(region_id, concurrency)
.await
.inspect_err(|_| {
GC_ERRORS_TOTAL.with_label_values(&["list_failed"]).inc();
})?;
let lister_cnt = listers.len();
// Step 2: Concurrently list all files in the region directory
let all_entries = self.list_region_files_concurrent(listers).await?;
let all_entries = self
.list_region_files_concurrent(listers)
.await
.inspect_err(|_| {
GC_ERRORS_TOTAL.with_label_values(&["list_failed"]).inc();
})?;
let cnt = all_entries.len();
info!(
"gc: full listing mode cost {} secs using {lister_cnt} lister for {cnt} files in region {}.",

View File

@@ -508,8 +508,8 @@ lazy_static! {
.unwrap();
/// Counter for the number of files deleted by the GC worker.
pub static ref GC_DELETE_FILE_CNT: IntGauge =
register_int_gauge!(
pub static ref GC_DELETE_FILE_CNT: IntCounter =
register_int_counter!(
"greptime_mito_gc_delete_file_count",
"mito gc deleted file count",
).unwrap();
@@ -528,6 +528,35 @@ lazy_static! {
"mito gc orphaned index files count",
).unwrap();
/// Histogram for GC operation duration by stage.
pub static ref GC_DURATION_SECONDS: HistogramVec = register_histogram_vec!(
"greptime_mito_gc_duration_seconds",
"GC operation duration by stage",
&[STAGE_LABEL],
exponential_buckets(0.01, 10.0, 6).unwrap(),
).unwrap();
/// Counter for GC runs by mode.
pub static ref GC_RUNS_TOTAL: IntCounterVec = register_int_counter_vec!(
"greptime_mito_gc_runs_total",
"Total GC runs by mode",
&["mode"],
).unwrap();
/// Counter for GC errors by type.
pub static ref GC_ERRORS_TOTAL: IntCounterVec = register_int_counter_vec!(
"greptime_mito_gc_errors_total",
"Total GC errors by type",
&["error_type"],
).unwrap();
/// Counter for total files deleted by GC, labeled by file type.
pub static ref GC_FILES_DELETED_TOTAL: IntCounterVec = register_int_counter_vec!(
"greptime_mito_gc_files_deleted_total",
"Total files deleted by GC",
&[FILE_TYPE_LABEL],
).unwrap();
/// Total number of files downloaded during cache fill on region open.
pub static ref CACHE_FILL_DOWNLOADED_FILES: IntCounter = register_int_counter!(
"mito_cache_fill_downloaded_files",