diff --git a/src/meta-srv/src/gc/handler.rs b/src/meta-srv/src/gc/handler.rs index d140e40d9f..32e67e1d71 100644 --- a/src/meta-srv/src/gc/handler.rs +++ b/src/meta-srv/src/gc/handler.rs @@ -32,6 +32,7 @@ use crate::gc::candidate::GcCandidate; use crate::gc::dropped::DroppedRegionCollector; use crate::gc::scheduler::{GcJobReport, GcScheduler}; use crate::gc::tracker::RegionGcInfo; +use crate::metrics::METRIC_META_GC_CANDIDATE_REGIONS; impl GcScheduler { pub(crate) async fn trigger_gc(&self) -> Result { @@ -51,6 +52,13 @@ impl GcScheduler { let dropped_collector = DroppedRegionCollector::new(self.ctx.as_ref(), &self.config, &self.region_gc_tracker); let dropped_assignment = dropped_collector.collect_and_assign(&table_reparts).await?; + let candidate_count: usize = per_table_candidates.values().map(|c| c.len()).sum(); + let dropped_count: usize = dropped_assignment + .regions_by_peer + .values() + .map(|regions| regions.len()) + .sum(); + METRIC_META_GC_CANDIDATE_REGIONS.set((candidate_count + dropped_count) as i64); if per_table_candidates.is_empty() && dropped_assignment.regions_by_peer.is_empty() { info!("No GC candidates found, skipping GC cycle"); diff --git a/src/meta-srv/src/gc/procedure.rs b/src/meta-srv/src/gc/procedure.rs index b612e0fe8f..bf56a2f4cd 100644 --- a/src/meta-srv/src/gc/procedure.rs +++ b/src/meta-srv/src/gc/procedure.rs @@ -40,6 +40,7 @@ use crate::error::{self, KvBackendSnafu, Result, SerializeToJsonSnafu, TableMeta use crate::gc::util::table_route_to_region; use crate::gc::{Peer2Regions, Region2Peers}; use crate::handler::HeartbeatMailbox; +use crate::metrics::{METRIC_META_GC_DATANODE_CALLS_TOTAL, METRIC_META_GC_FAILED_REGIONS_TOTAL}; use crate::service::mailbox::{Channel, MailboxRef}; /// Helper function to send GetFileRefs instruction and wait for reply. @@ -49,6 +50,27 @@ async fn send_get_file_refs( peer: &Peer, instruction: GetFileRefs, timeout: Duration, +) -> Result { + let result = send_get_file_refs_inner(mailbox, server_addr, peer, instruction, timeout).await; + + match &result { + Ok(_) => METRIC_META_GC_DATANODE_CALLS_TOTAL + .with_label_values(&["get_file_refs", "success"]) + .inc(), + Err(_) => METRIC_META_GC_DATANODE_CALLS_TOTAL + .with_label_values(&["get_file_refs", "error"]) + .inc(), + } + + result +} + +async fn send_get_file_refs_inner( + mailbox: &MailboxRef, + server_addr: &str, + peer: &Peer, + instruction: GetFileRefs, + timeout: Duration, ) -> Result { let instruction = instruction::Instruction::GetFileRefs(instruction); let msg = MailboxMessage::json_message( @@ -96,6 +118,43 @@ async fn send_gc_regions( server_addr: &str, timeout: Duration, description: &str, +) -> Result { + let failed_region_count = gc_regions.regions.len() as u64; + let result = + send_gc_regions_inner(mailbox, peer, gc_regions, server_addr, timeout, description).await; + + match result { + Ok(report) => { + METRIC_META_GC_DATANODE_CALLS_TOTAL + .with_label_values(&["gc_regions", "success"]) + .inc(); + + let need_retry_count = report.need_retry_regions.len() as u64; + if need_retry_count > 0 { + METRIC_META_GC_FAILED_REGIONS_TOTAL.inc_by(need_retry_count); + } + + Ok(report) + } + Err(e) => { + METRIC_META_GC_DATANODE_CALLS_TOTAL + .with_label_values(&["gc_regions", "error"]) + .inc(); + if failed_region_count > 0 { + METRIC_META_GC_FAILED_REGIONS_TOTAL.inc_by(failed_region_count); + } + Err(e) + } + } +} + +async fn send_gc_regions_inner( + mailbox: &MailboxRef, + peer: &Peer, + gc_regions: GcRegions, + server_addr: &str, + timeout: Duration, + description: &str, ) -> Result { let instruction = instruction::Instruction::GcRegions(gc_regions.clone()); let msg = MailboxMessage::json_message( diff --git a/src/meta-srv/src/gc/scheduler.rs b/src/meta-srv/src/gc/scheduler.rs index 394af9006a..648539f27f 100644 --- a/src/meta-srv/src/gc/scheduler.rs +++ b/src/meta-srv/src/gc/scheduler.rs @@ -30,6 +30,9 @@ use crate::error::{Error, Result}; use crate::gc::ctx::{DefaultGcSchedulerCtx, SchedulerCtx}; use crate::gc::options::{GcSchedulerOptions, TICKER_INTERVAL}; use crate::gc::tracker::RegionGcTracker; +use crate::metrics::{ + METRIC_META_GC_SCHEDULER_CYCLES_TOTAL, METRIC_META_GC_SCHEDULER_DURATION_SECONDS, +}; use crate::service::mailbox::MailboxRef; /// Report for a GC job. @@ -141,6 +144,8 @@ impl GcScheduler { } pub(crate) async fn handle_tick(&self) -> Result { + METRIC_META_GC_SCHEDULER_CYCLES_TOTAL.inc(); + let _timer = METRIC_META_GC_SCHEDULER_DURATION_SECONDS.start_timer(); info!("Start to trigger gc"); let report = self.trigger_gc().await?; diff --git a/src/meta-srv/src/metrics.rs b/src/meta-srv/src/metrics.rs index 30576fbd11..879b1de206 100644 --- a/src/meta-srv/src/metrics.rs +++ b/src/meta-srv/src/metrics.rs @@ -90,4 +90,41 @@ lazy_static! { /// The topic estimated replay size. pub static ref METRIC_META_TOPIC_ESTIMATED_REPLAY_SIZE: IntGaugeVec = register_int_gauge_vec!("meta_topic_estimated_replay_size", "meta topic estimated replay size", &["topic_name"]).unwrap(); + + /// Total GC scheduler cycles. + pub static ref METRIC_META_GC_SCHEDULER_CYCLES_TOTAL: IntCounter = register_int_counter!( + "greptime_metasrv_gc_scheduler_cycles_total", + "Total GC scheduler cycles", + ) + .unwrap(); + + /// Time for a full GC scheduler cycle. + pub static ref METRIC_META_GC_SCHEDULER_DURATION_SECONDS: Histogram = register_histogram!( + "greptime_metasrv_gc_scheduler_duration_seconds", + "Time for a full GC scheduler cycle", + vec![0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 30.0, 60.0, 120.0, 300.0], + ) + .unwrap(); + + /// Number of GC candidate regions. + pub static ref METRIC_META_GC_CANDIDATE_REGIONS: IntGauge = register_int_gauge!( + "greptime_metasrv_gc_candidate_regions", + "Number of GC candidate regions", + ) + .unwrap(); + + /// Calls to datanodes for GC. + pub static ref METRIC_META_GC_DATANODE_CALLS_TOTAL: IntCounterVec = register_int_counter_vec!( + "greptime_metasrv_gc_datanode_calls_total", + "Calls to datanodes for GC", + &["operation", "status"], + ) + .unwrap(); + + /// Total regions that failed GC. + pub static ref METRIC_META_GC_FAILED_REGIONS_TOTAL: IntCounter = register_int_counter!( + "greptime_metasrv_gc_failed_regions_total", + "Total regions that failed GC", + ) + .unwrap(); } diff --git a/src/mito2/src/gc.rs b/src/mito2/src/gc.rs index 6dffb710f6..3866c8892f 100644 --- a/src/mito2/src/gc.rs +++ b/src/mito2/src/gc.rs @@ -45,7 +45,10 @@ use crate::error::{ TooManyGcJobsSnafu, UnexpectedSnafu, }; use crate::manifest::action::{RegionManifest, RemovedFile}; -use crate::metrics::{GC_DELETE_FILE_CNT, GC_ORPHANED_INDEX_FILES, GC_SKIPPED_UNPARSABLE_FILES}; +use crate::metrics::{ + GC_DELETE_FILE_CNT, GC_DURATION_SECONDS, GC_ERRORS_TOTAL, GC_FILES_DELETED_TOTAL, + GC_ORPHANED_INDEX_FILES, GC_RUNS_TOTAL, GC_SKIPPED_UNPARSABLE_FILES, +}; use crate::region::{MitoRegionRef, RegionRoleState}; use crate::sst::file::{RegionFileId, RegionIndexId, delete_files, delete_index}; use crate::sst::location::{self}; @@ -267,6 +270,9 @@ impl LocalGcWorker { /// TODO(discord9): consider instead running in parallel mode pub async fn run(self) -> Result { info!("LocalGcWorker started"); + let _timer = GC_DURATION_SECONDS + .with_label_values(&["total"]) + .start_timer(); let now = std::time::Instant::now(); let mut deleted_files = HashMap::new(); @@ -336,6 +342,12 @@ impl LocalGcWorker { region: Option, tmp_ref_files: &HashSet, ) -> Result> { + let mode = if self.full_file_listing { + "full_listing" + } else { + "fast" + }; + GC_RUNS_TOTAL.with_label_values(&[mode]).inc(); debug!( "Doing gc for region {}, {}", region_id, @@ -370,6 +382,9 @@ impl LocalGcWorker { region.region_id(), manifest.manifest_version ); + GC_ERRORS_TOTAL + .with_label_values(&["manifest_mismatch"]) + .inc(); return Ok(vec![]); } Some(manifest) @@ -460,6 +475,9 @@ impl LocalGcWorker { region_id ); + let _delete_timer = GC_DURATION_SECONDS + .with_label_values(&["delete_files"]) + .start_timer(); self.delete_files(region_id, &deletable_files).await?; debug!( @@ -467,6 +485,9 @@ impl LocalGcWorker { unused_file_cnt, region_id ); if let Some(region) = ®ion { + let _update_timer = GC_DURATION_SECONDS + .with_label_values(&["update_manifest"]) + .start_timer(); self.update_manifest_removed_files(region, deletable_files.clone()) .await?; } @@ -497,12 +518,26 @@ impl LocalGcWorker { ) .await?; - for index_id in index_ids { - delete_index(index_id, &self.access_layer, &self.cache_manager).await?; + if !file_pairs.is_empty() { + let deleted_count = file_pairs.len() as u64; + GC_FILES_DELETED_TOTAL + .with_label_values(&["parquet"]) + .inc_by(deleted_count); + GC_DELETE_FILE_CNT.inc_by(deleted_count); } - // FIXME(discord9): if files are already deleted before calling delete_files, the metric will be inaccurate, no clean way to fix it now - GC_DELETE_FILE_CNT.add(removed_files.len() as i64); + for index_id in index_ids { + match delete_index(index_id, &self.access_layer, &self.cache_manager).await { + Ok(()) => { + GC_FILES_DELETED_TOTAL.with_label_values(&["index"]).inc(); + GC_DELETE_FILE_CNT.inc(); + } + Err(err) => { + GC_ERRORS_TOTAL.with_label_values(&["delete_failed"]).inc(); + return Err(err); + } + } + } Ok(()) } @@ -588,16 +623,29 @@ impl LocalGcWorker { region_id: RegionId, file_cnt_hint: usize, ) -> Result> { + let _timer = GC_DURATION_SECONDS + .with_label_values(&["list_files"]) + .start_timer(); let start = tokio::time::Instant::now(); let concurrency = (file_cnt_hint / Self::CONCURRENCY_LIST_PER_FILES) .max(1) .min(self.opt.max_concurrent_lister_per_gc_job); - let listers = self.partition_region_files(region_id, concurrency).await?; + let listers = self + .partition_region_files(region_id, concurrency) + .await + .inspect_err(|_| { + GC_ERRORS_TOTAL.with_label_values(&["list_failed"]).inc(); + })?; let lister_cnt = listers.len(); // Step 2: Concurrently list all files in the region directory - let all_entries = self.list_region_files_concurrent(listers).await?; + let all_entries = self + .list_region_files_concurrent(listers) + .await + .inspect_err(|_| { + GC_ERRORS_TOTAL.with_label_values(&["list_failed"]).inc(); + })?; let cnt = all_entries.len(); info!( "gc: full listing mode cost {} secs using {lister_cnt} lister for {cnt} files in region {}.", diff --git a/src/mito2/src/metrics.rs b/src/mito2/src/metrics.rs index c29f021aa0..c0537567f9 100644 --- a/src/mito2/src/metrics.rs +++ b/src/mito2/src/metrics.rs @@ -508,8 +508,8 @@ lazy_static! { .unwrap(); /// Counter for the number of files deleted by the GC worker. - pub static ref GC_DELETE_FILE_CNT: IntGauge = - register_int_gauge!( + pub static ref GC_DELETE_FILE_CNT: IntCounter = + register_int_counter!( "greptime_mito_gc_delete_file_count", "mito gc deleted file count", ).unwrap(); @@ -528,6 +528,35 @@ lazy_static! { "mito gc orphaned index files count", ).unwrap(); + /// Histogram for GC operation duration by stage. + pub static ref GC_DURATION_SECONDS: HistogramVec = register_histogram_vec!( + "greptime_mito_gc_duration_seconds", + "GC operation duration by stage", + &[STAGE_LABEL], + exponential_buckets(0.01, 10.0, 6).unwrap(), + ).unwrap(); + + /// Counter for GC runs by mode. + pub static ref GC_RUNS_TOTAL: IntCounterVec = register_int_counter_vec!( + "greptime_mito_gc_runs_total", + "Total GC runs by mode", + &["mode"], + ).unwrap(); + + /// Counter for GC errors by type. + pub static ref GC_ERRORS_TOTAL: IntCounterVec = register_int_counter_vec!( + "greptime_mito_gc_errors_total", + "Total GC errors by type", + &["error_type"], + ).unwrap(); + + /// Counter for total files deleted by GC, labeled by file type. + pub static ref GC_FILES_DELETED_TOTAL: IntCounterVec = register_int_counter_vec!( + "greptime_mito_gc_files_deleted_total", + "Total files deleted by GC", + &[FILE_TYPE_LABEL], + ).unwrap(); + /// Total number of files downloaded during cache fill on region open. pub static ref CACHE_FILL_DOWNLOADED_FILES: IntCounter = register_int_counter!( "mito_cache_fill_downloaded_files",