From 83048a4adcd601960e96da84d5a064267a1df02d Mon Sep 17 00:00:00 2001
From: Anastasia Lubennikova
Date: Fri, 3 Feb 2023 12:37:34 +0200
Subject: [PATCH] Handle errors during metric collection. (#3521)

Don't exit the loop if one of the tenants failed to scrape its metrics.

fixes #3490
---
 pageserver/src/consumption_metrics.rs | 99 +++++++++++++++++----------
 1 file changed, 61 insertions(+), 38 deletions(-)

diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs
index d848ec5ee5..6204ab92ab 100644
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -83,10 +83,7 @@ pub async fn collect_metrics(
                 return Ok(());
             },
             _ = ticker.tick() => {
-                if let Err(err) = collect_metrics_iteration(&client, &mut cached_metrics, metric_collection_endpoint, node_id, &ctx).await
-                {
-                    error!("metrics collection failed: {err:?}");
-                }
+                collect_metrics_iteration(&client, &mut cached_metrics, metric_collection_endpoint, node_id, &ctx).await;
             }
         }
     }
@@ -97,17 +94,18 @@ pub async fn collect_metrics(
 /// Gather per-tenant and per-timeline metrics and send them to the `metric_collection_endpoint`.
 /// Cache metrics to avoid sending the same metrics multiple times.
 ///
+/// This function handles all errors internally
+/// and doesn't break iteration if just one tenant fails.
+///
 /// TODO
 /// - refactor this function (chunking+sending part) to reuse it in proxy module;
-/// - improve error handling. Now if one tenant fails to collect metrics,
-///   the whole iteration fails and metrics for other tenants are not collected.
 pub async fn collect_metrics_iteration(
     client: &reqwest::Client,
     cached_metrics: &mut HashMap<PageserverConsumptionMetricsKey, u64>,
     metric_collection_endpoint: &reqwest::Url,
     node_id: NodeId,
     ctx: &RequestContext,
-) -> anyhow::Result<()> {
+) {
     let mut current_metrics: Vec<(PageserverConsumptionMetricsKey, u64)> = Vec::new();
     trace!(
         "starting collect_metrics_iteration. metric_collection_endpoint: {}",
@@ -115,7 +113,13 @@ pub async fn collect_metrics_iteration(
     );
 
     // get list of tenants
-    let tenants = mgr::list_tenants().await?;
+    let tenants = match mgr::list_tenants().await {
+        Ok(tenants) => tenants,
+        Err(err) => {
+            error!("failed to list tenants: {:?}", err);
+            return;
+        }
+    };
 
     // iterate through list of Active tenants and collect metrics
     for (tenant_id, tenant_state) in tenants {
@@ -123,7 +127,15 @@ pub async fn collect_metrics_iteration(
             continue;
         }
 
-        let tenant = mgr::get_tenant(tenant_id, true).await?;
+        let tenant = match mgr::get_tenant(tenant_id, true).await {
+            Ok(tenant) => tenant,
+            Err(err) => {
+                // It is possible that tenant was deleted between
+                // `list_tenants` and `get_tenant`, so just warn about it.
+                warn!("failed to get tenant {tenant_id:?}: {err:?}");
+                continue;
+            }
+        };
 
         let mut tenant_resident_size = 0;
 
@@ -142,29 +154,51 @@ pub async fn collect_metrics_iteration(
                     timeline_written_size,
                 ));
 
-                let (timeline_logical_size, is_exact) = timeline.get_current_logical_size(ctx)?;
-                // Only send timeline logical size when it is fully calculated.
-                if is_exact {
-                    current_metrics.push((
-                        PageserverConsumptionMetricsKey {
-                            tenant_id,
-                            timeline_id: Some(timeline.timeline_id),
-                            metric: TIMELINE_LOGICAL_SIZE,
-                        },
-                        timeline_logical_size,
-                    ));
-                }
+                match timeline.get_current_logical_size(ctx) {
+                    // Only send timeline logical size when it is fully calculated.
+                    Ok((size, is_exact)) if is_exact => {
+                        current_metrics.push((
+                            PageserverConsumptionMetricsKey {
+                                tenant_id,
+                                timeline_id: Some(timeline.timeline_id),
+                                metric: TIMELINE_LOGICAL_SIZE,
+                            },
+                            size,
+                        ));
+                    }
+                    Ok((_, _)) => {}
+                    Err(err) => {
+                        error!(
+                            "failed to get current logical size for timeline {}: {err:?}",
+                            timeline.timeline_id
+                        );
+                        continue;
+                    }
+                };
             }
 
             let timeline_resident_size = timeline.get_resident_physical_size();
             tenant_resident_size += timeline_resident_size;
         }
 
-        let tenant_remote_size = tenant.get_remote_size().await?;
-        debug!(
-            "collected current metrics for tenant: {}: state={:?} resident_size={} remote_size={}",
-            tenant_id, tenant_state, tenant_resident_size, tenant_remote_size
-        );
+        match tenant.get_remote_size().await {
+            Ok(tenant_remote_size) => {
+                current_metrics.push((
+                    PageserverConsumptionMetricsKey {
+                        tenant_id,
+                        timeline_id: None,
+                        metric: REMOTE_STORAGE_SIZE,
+                    },
+                    tenant_remote_size,
+                ));
+            }
+            Err(err) => {
+                error!(
+                    "failed to get remote size for tenant {}: {err:?}",
+                    tenant_id
+                );
+            }
+        }
 
         current_metrics.push((
             PageserverConsumptionMetricsKey {
@@ -175,15 +209,6 @@ pub async fn collect_metrics_iteration(
             tenant_resident_size,
         ));
 
-        current_metrics.push((
-            PageserverConsumptionMetricsKey {
-                tenant_id,
-                timeline_id: None,
-                metric: REMOTE_STORAGE_SIZE,
-            },
-            tenant_remote_size,
-        ));
-
         // Note that this metric is calculated in a separate bgworker
         // Here we only use cached value, which may lag behind the real latest one
         let tenant_synthetic_size = tenant.get_cached_synthetic_size();
@@ -205,7 +230,7 @@ pub async fn collect_metrics_iteration(
 
     if current_metrics.is_empty() {
         trace!("no new metrics to send");
-        return Ok(());
+        return;
     }
 
     // Send metrics.
@@ -256,8 +281,6 @@ pub async fn collect_metrics_iteration(
             }
         }
     }
-
-    Ok(())
 }
 
 /// Caclculate synthetic size for each active tenant
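
For reference, the error-handling shape this patch applies at each fallible step (mgr::list_tenants, mgr::get_tenant, timeline.get_current_logical_size, tenant.get_remote_size) boils down to: match on the Result, log the failure, and continue with the next item instead of propagating with `?`. Below is a minimal, self-contained Rust sketch of that pattern; every name in it (CollectError, collect_for_tenant, collect_all) is a hypothetical stand-in, not a pageserver API.

// Illustrative sketch only (not pageserver code): the "log and continue"
// pattern the patch applies per tenant. All names below are hypothetical.

#[derive(Debug)]
struct CollectError(String);

// Stand-in for a per-tenant metrics scrape that may fail.
fn collect_for_tenant(tenant_id: u32) -> Result<u64, CollectError> {
    if tenant_id == 2 {
        Err(CollectError(format!("tenant {tenant_id} is not reachable")))
    } else {
        Ok(u64::from(tenant_id) * 100)
    }
}

// Collects what it can: a failure for one tenant is logged and skipped
// instead of aborting the whole iteration the way `?` would.
fn collect_all(tenant_ids: &[u32]) -> Vec<(u32, u64)> {
    let mut metrics = Vec::new();
    for &tenant_id in tenant_ids {
        match collect_for_tenant(tenant_id) {
            Ok(value) => metrics.push((tenant_id, value)),
            Err(err) => {
                eprintln!("failed to collect metrics for tenant {tenant_id}: {err:?}");
                continue;
            }
        }
    }
    metrics
}

fn main() {
    // Tenant 2 fails, but tenants 1 and 3 are still reported.
    println!("collected: {:?}", collect_all(&[1, 2, 3]));
}

Tenant 2's failure is only reported on stderr while tenants 1 and 3 still contribute their values, mirroring how a single broken tenant no longer aborts collect_metrics_iteration after this change.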