From abcfbd7f414c62b21f1806c65d33efebe54bd684 Mon Sep 17 00:00:00 2001 From: "Lei, HUANG" <6406592+v0y4g3r@users.noreply.github.com> Date: Tue, 4 Nov 2025 23:23:40 -0800 Subject: [PATCH] chore(metrics): add region server requests failures count metrics (#7173) * chore/add-region-insert-failure-metric: Add metric for failed insert requests to region server in datanode module Signed-off-by: Lei, HUANG * chore/add-region-insert-failure-metric: Add metric for tracking failed region server requests - Introduce a new metric `REGION_SERVER_REQUEST_FAILURE_COUNT` to count failed region server requests. - Update `REGION_SERVER_INSERT_FAIL_COUNT` metric description for consistency. - Implement error handling in `RegionServerHandler` to increment the new failure metric on request errors. Signed-off-by: Lei, HUANG --------- Signed-off-by: Lei, HUANG --- src/datanode/src/metrics.rs | 16 ++++++++++++++++ src/datanode/src/region_server.rs | 10 ++++++++++ 2 files changed, 26 insertions(+) diff --git a/src/datanode/src/metrics.rs b/src/datanode/src/metrics.rs index 1b0e513375..4e763f5858 100644 --- a/src/datanode/src/metrics.rs +++ b/src/datanode/src/metrics.rs @@ -75,4 +75,20 @@ lazy_static! { &[RESULT_TYPE] ) .unwrap(); + + /// Total count of failed region server requests. + pub static ref REGION_SERVER_REQUEST_FAILURE_COUNT: IntCounterVec = register_int_counter_vec!( + "greptime_datanode_region_request_fail_count", + "failed region server requests count", + &[REGION_REQUEST_TYPE] + ) + .unwrap(); + + /// Total count of failed insert requests to region server. + pub static ref REGION_SERVER_INSERT_FAIL_COUNT: IntCounterVec = register_int_counter_vec!( + "greptime_datanode_region_failed_insert_count", + "failed region server insert requests count", + &[REGION_REQUEST_TYPE] + ) + .unwrap(); } diff --git a/src/datanode/src/region_server.rs b/src/datanode/src/region_server.rs index 03f90bd0dc..ff80c8b10a 100644 --- a/src/datanode/src/region_server.rs +++ b/src/datanode/src/region_server.rs @@ -600,6 +600,8 @@ impl RegionServer { #[async_trait] impl RegionServerHandler for RegionServer { async fn handle(&self, request: region_request::Body) -> ServerResult { + let failed_requests_cnt = crate::metrics::REGION_SERVER_REQUEST_FAILURE_COUNT + .with_label_values(&[request.as_ref()]); let response = match &request { region_request::Body::Creates(_) | region_request::Body::Drops(_) @@ -617,6 +619,9 @@ impl RegionServerHandler for RegionServer { _ => self.handle_requests_in_serial(request).await, } .map_err(BoxedError::new) + .inspect_err(|_| { + failed_requests_cnt.inc(); + }) .context(ExecuteGrpcRequestSnafu)?; Ok(RegionResponseV1 { @@ -1230,6 +1235,11 @@ impl RegionServerInner { }) } Err(err) => { + if matches!(region_change, RegionChange::Ingest) { + crate::metrics::REGION_SERVER_INSERT_FAIL_COUNT + .with_label_values(&[request_type]) + .inc(); + } // Removes the region status if the operation fails. self.unset_region_status(region_id, &engine, region_change); Err(err)