diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index 8f2b88d191..bde2cedca7 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -3,7 +3,7 @@ use crate::context::{DownloadBehavior, RequestContext}; use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}; use crate::tenant::tasks::BackgroundLoopKind; -use crate::tenant::{mgr, LogicalSizeCalculationCause, PageReconstructError}; +use crate::tenant::{mgr, LogicalSizeCalculationCause, PageReconstructError, Tenant}; use camino::Utf8PathBuf; use consumption_metrics::EventType; use pageserver_api::models::TenantState; @@ -256,8 +256,6 @@ async fn calculate_synthetic_size_worker( info!("calculate_synthetic_size_worker stopped"); }; - let cause = LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize; - loop { let started_at = Instant::now(); @@ -280,29 +278,14 @@ async fn calculate_synthetic_size_worker( continue; } - if let Ok(tenant) = mgr::get_tenant(tenant_shard_id, true) { - // TODO should we use concurrent_background_tasks_rate_limit() here, like the other background tasks? - // We can put in some prioritization for consumption metrics. - // Same for the loop that fetches computed metrics. - // By using the same limiter, we centralize metrics collection for "start" and "finished" counters, - // which turns out is really handy to understand the system. - if let Err(e) = tenant.calculate_synthetic_size(cause, cancel, ctx).await { - // this error can be returned if timeline is shutting down, but it does not - // mean the synthetic size worker should terminate. we do not need any checks - // in this function because `mgr::get_tenant` will error out after shutdown has - // progressed to shutting down tenants. - let is_cancelled = matches!( - e.downcast_ref::(), - Some(PageReconstructError::Cancelled) - ); + let Ok(tenant) = mgr::get_tenant(tenant_shard_id, true) else { + continue; + }; - if !is_cancelled { - error!( - "failed to calculate synthetic size for tenant {tenant_shard_id}: {e:#}" - ); - } - } - } + // there is never any reason to exit calculate_synthetic_size_worker following any + // return value -- we don't need to care about shutdown because no tenant is found when + // pageserver is shut down. + calculate_and_log(&tenant, cancel, ctx).await; } crate::tenant::tasks::warn_when_period_overrun( @@ -321,3 +304,31 @@ async fn calculate_synthetic_size_worker( } } } + +async fn calculate_and_log(tenant: &Tenant, cancel: &CancellationToken, ctx: &RequestContext) { + const CAUSE: LogicalSizeCalculationCause = + LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize; + + // TODO should we use concurrent_background_tasks_rate_limit() here, like the other background tasks? + // We can put in some prioritization for consumption metrics. + // Same for the loop that fetches computed metrics. + // By using the same limiter, we centralize metrics collection for "start" and "finished" counters, + // which turns out is really handy to understand the system. + let Err(e) = tenant.calculate_synthetic_size(CAUSE, cancel, ctx).await else { + return; + }; + + // this error can be returned if timeline is shutting down, but it does not + // mean the synthetic size worker should terminate. we do not need any checks + // in this function because `mgr::get_tenant` will error out after shutdown has + // progressed to shutting down tenants. + let shutting_down = matches!( + e.downcast_ref::(), + Some(PageReconstructError::Cancelled | PageReconstructError::AncestorStopping(_)) + ); + + if !shutting_down { + let tenant_shard_id = tenant.tenant_shard_id(); + error!("failed to calculate synthetic size for tenant {tenant_shard_id}: {e:#}"); + } +} diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index fee50460a5..9faacaef89 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -136,11 +136,6 @@ impl From for ApiError { fn from(pre: PageReconstructError) -> ApiError { match pre { PageReconstructError::Other(pre) => ApiError::InternalServerError(pre), - PageReconstructError::NeedsDownload(_, _) => { - // This shouldn't happen, because we use a RequestContext that requests to - // download any missing layer files on-demand. - ApiError::InternalServerError(anyhow::anyhow!("need to download remote layer file")) - } PageReconstructError::Cancelled => { ApiError::InternalServerError(anyhow::anyhow!("request was cancelled")) } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 81dbc04793..a2a31f395e 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -29,7 +29,7 @@ use tokio::{ }; use tokio_util::sync::CancellationToken; use tracing::*; -use utils::{id::TenantTimelineId, sync::gate::Gate}; +use utils::sync::gate::Gate; use std::collections::{BinaryHeap, HashMap, HashSet}; use std::ops::{Deref, Range}; @@ -377,9 +377,6 @@ pub enum PageReconstructError { #[error(transparent)] Other(#[from] anyhow::Error), - /// The operation would require downloading a layer that is missing locally. - NeedsDownload(TenantTimelineId, LayerFileName), - /// The operation was cancelled Cancelled, @@ -408,14 +405,6 @@ impl std::fmt::Debug for PageReconstructError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { match self { Self::Other(err) => err.fmt(f), - Self::NeedsDownload(tenant_timeline_id, layer_file_name) => { - write!( - f, - "layer {}/{} needs download", - tenant_timeline_id, - layer_file_name.file_name() - ) - } Self::Cancelled => write!(f, "cancelled"), Self::AncestorStopping(timeline_id) => { write!(f, "ancestor timeline {timeline_id} is being stopped") @@ -429,14 +418,6 @@ impl std::fmt::Display for PageReconstructError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { match self { Self::Other(err) => err.fmt(f), - Self::NeedsDownload(tenant_timeline_id, layer_file_name) => { - write!( - f, - "layer {}/{} needs download", - tenant_timeline_id, - layer_file_name.file_name() - ) - } Self::Cancelled => write!(f, "cancelled"), Self::AncestorStopping(timeline_id) => { write!(f, "ancestor timeline {timeline_id} is being stopped")