Handle errors during metric collection. (#3521)

Don't exit the collection loop if one of the tenants fails to scrape its metrics.
Fixes #3490
Author: Anastasia Lubennikova
Date: 2023-02-03 12:37:34 +02:00
Committed by: GitHub
Parent: f71b1b174d
Commit: 83048a4adc
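In short, per-tenant failures during an iteration are now logged and skipped rather than propagated with `?`, which previously aborted the whole pass. Below is a minimal, std-only sketch of that pattern; `Tenant`, `scrape_tenant`, and `collect_all` are illustrative names, not the actual pageserver API.

struct Tenant {
    id: u64,
}

// Stand-in for the per-tenant metric collection that may fail.
fn scrape_tenant(tenant: &Tenant) -> Result<u64, String> {
    if tenant.id == 2 {
        return Err(format!("tenant {} is not ready", tenant.id));
    }
    Ok(tenant.id * 1024)
}

// Before this change, the equivalent of `scrape_tenant(tenant)?` returned early on the
// first failure; now the error is logged and the loop moves on to the next tenant.
fn collect_all(tenants: &[Tenant]) -> Vec<(u64, u64)> {
    let mut metrics = Vec::new();
    for tenant in tenants {
        match scrape_tenant(tenant) {
            Ok(size) => metrics.push((tenant.id, size)),
            Err(err) => {
                eprintln!("metrics collection failed for tenant {}: {err}", tenant.id);
                continue; // skip this tenant, keep collecting the rest
            }
        }
    }
    metrics
}

fn main() {
    let tenants = [Tenant { id: 1 }, Tenant { id: 2 }, Tenant { id: 3 }];
    // Tenant 2 fails, but metrics for tenants 1 and 3 are still collected.
    assert_eq!(collect_all(&tenants), vec![(1, 1024), (3, 3072)]);
}

The diff below applies this same match-log-continue shape to `list_tenants`, `get_tenant`, `get_current_logical_size`, and `get_remote_size`.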

@@ -83,10 +83,7 @@ pub async fn collect_metrics(
                 return Ok(());
             },
             _ = ticker.tick() => {
-                if let Err(err) = collect_metrics_iteration(&client, &mut cached_metrics, metric_collection_endpoint, node_id, &ctx).await
-                {
-                    error!("metrics collection failed: {err:?}");
-                }
+                collect_metrics_iteration(&client, &mut cached_metrics, metric_collection_endpoint, node_id, &ctx).await;
             }
         }
     }
@@ -97,17 +94,18 @@ pub async fn collect_metrics(
 /// Gather per-tenant and per-timeline metrics and send them to the `metric_collection_endpoint`.
 /// Cache metrics to avoid sending the same metrics multiple times.
 ///
+/// This function handles all errors internally
+/// and doesn't break iteration if just one tenant fails.
+///
 /// TODO
 /// - refactor this function (chunking+sending part) to reuse it in proxy module;
-/// - improve error handling. Now if one tenant fails to collect metrics,
-/// the whole iteration fails and metrics for other tenants are not collected.
 pub async fn collect_metrics_iteration(
     client: &reqwest::Client,
     cached_metrics: &mut HashMap<PageserverConsumptionMetricsKey, u64>,
     metric_collection_endpoint: &reqwest::Url,
     node_id: NodeId,
     ctx: &RequestContext,
-) -> anyhow::Result<()> {
+) {
     let mut current_metrics: Vec<(PageserverConsumptionMetricsKey, u64)> = Vec::new();
     trace!(
         "starting collect_metrics_iteration. metric_collection_endpoint: {}",
@@ -115,7 +113,13 @@ pub async fn collect_metrics_iteration(
     );

     // get list of tenants
-    let tenants = mgr::list_tenants().await?;
+    let tenants = match mgr::list_tenants().await {
+        Ok(tenants) => tenants,
+        Err(err) => {
+            error!("failed to list tenants: {:?}", err);
+            return;
+        }
+    };

     // iterate through list of Active tenants and collect metrics
     for (tenant_id, tenant_state) in tenants {
@@ -123,7 +127,15 @@ pub async fn collect_metrics_iteration(
             continue;
         }

-        let tenant = mgr::get_tenant(tenant_id, true).await?;
+        let tenant = match mgr::get_tenant(tenant_id, true).await {
+            Ok(tenant) => tenant,
+            Err(err) => {
+                // It is possible that tenant was deleted between
+                // `list_tenants` and `get_tenant`, so just warn about it.
+                warn!("failed to get tenant {tenant_id:?}: {err:?}");
+                continue;
+            }
+        };

         let mut tenant_resident_size = 0;
@@ -142,29 +154,51 @@ pub async fn collect_metrics_iteration(
                     timeline_written_size,
                 ));

-                let (timeline_logical_size, is_exact) = timeline.get_current_logical_size(ctx)?;
-                // Only send timeline logical size when it is fully calculated.
-                if is_exact {
-                    current_metrics.push((
-                        PageserverConsumptionMetricsKey {
-                            tenant_id,
-                            timeline_id: Some(timeline.timeline_id),
-                            metric: TIMELINE_LOGICAL_SIZE,
-                        },
-                        timeline_logical_size,
-                    ));
-                }
+                match timeline.get_current_logical_size(ctx) {
+                    // Only send timeline logical size when it is fully calculated.
+                    Ok((size, is_exact)) if is_exact => {
+                        current_metrics.push((
+                            PageserverConsumptionMetricsKey {
+                                tenant_id,
+                                timeline_id: Some(timeline.timeline_id),
+                                metric: TIMELINE_LOGICAL_SIZE,
+                            },
+                            size,
+                        ));
+                    }
+                    Ok((_, _)) => {}
+                    Err(err) => {
+                        error!(
+                            "failed to get current logical size for timeline {}: {err:?}",
+                            timeline.timeline_id
+                        );
+                        continue;
+                    }
+                };
             }

             let timeline_resident_size = timeline.get_resident_physical_size();
             tenant_resident_size += timeline_resident_size;
         }

-        let tenant_remote_size = tenant.get_remote_size().await?;
-        debug!(
-            "collected current metrics for tenant: {}: state={:?} resident_size={} remote_size={}",
-            tenant_id, tenant_state, tenant_resident_size, tenant_remote_size
-        );
+        match tenant.get_remote_size().await {
+            Ok(tenant_remote_size) => {
+                current_metrics.push((
+                    PageserverConsumptionMetricsKey {
+                        tenant_id,
+                        timeline_id: None,
+                        metric: REMOTE_STORAGE_SIZE,
+                    },
+                    tenant_remote_size,
+                ));
+            }
+            Err(err) => {
+                error!(
+                    "failed to get remote size for tenant {}: {err:?}",
+                    tenant_id
+                );
+            }
+        }

         current_metrics.push((
             PageserverConsumptionMetricsKey {
@@ -175,15 +209,6 @@ pub async fn collect_metrics_iteration(
             tenant_resident_size,
         ));

-        current_metrics.push((
-            PageserverConsumptionMetricsKey {
-                tenant_id,
-                timeline_id: None,
-                metric: REMOTE_STORAGE_SIZE,
-            },
-            tenant_remote_size,
-        ));
-
         // Note that this metric is calculated in a separate bgworker
         // Here we only use cached value, which may lag behind the real latest one
         let tenant_synthetic_size = tenant.get_cached_synthetic_size();
@@ -205,7 +230,7 @@ pub async fn collect_metrics_iteration(

     if current_metrics.is_empty() {
         trace!("no new metrics to send");
-        return Ok(());
+        return;
     }

     // Send metrics.
@@ -256,8 +281,6 @@ pub async fn collect_metrics_iteration(
             }
         }
     }
-
-    Ok(())
 }

 /// Caclculate synthetic size for each active tenant