pageserver: improved synthetic size & find_gc_cutoff error handling (#8051)

## Problem

This PR refactors some error handling to avoid log spam on tenant/timeline shutdown:

- "ignoring failure to find gc cutoffs: timeline shutting down." logs (https://github.com/neondatabase/neon/issues/8012)
- "synthetic_size_worker: failed to calculate synthetic size for tenant ...: Failed to refresh gc_info before gathering inputs: tenant shutting down", for example here: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-8049/9502988669/index.html#suites/3fc871d9ee8127d8501d607e03205abb/1a074a66548bbcea

Closes: https://github.com/neondatabase/neon/issues/8012

## Summary of changes

- Refactor: Add a PageReconstructError variant to GcError: this is the only kind of error that find_gc_cutoffs can emit.
- Functional change: only ignore the shutdown (cancelled) PageReconstructError variant; any other variant is treated as a real error.
- Refactor: add a structured CalculateSyntheticSizeError type and use it instead of anyhow::Error in synthetic size calculations.
- Functional change: while iterating through timelines gathering logical sizes, only drop out if the whole tenant is cancelled. An individual timeline cancellation indicates a deletion in progress, so we can just ignore it.
2026-01-08 05:52:55 +00:00 · 2024-06-14 11:08:11 +01:00
parent 6843fd8f89
commit eb0ca9b648
7 changed files with 115 additions and 71 deletions
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -2,10 +2,9 @@
 //! and push them to a HTTP endpoint.
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
+use crate::tenant::size::CalculateSyntheticSizeError;
 use crate::tenant::tasks::BackgroundLoopKind;
-use crate::tenant::{
-    mgr::TenantManager, LogicalSizeCalculationCause, PageReconstructError, Tenant,
-};
+use crate::tenant::{mgr::TenantManager, LogicalSizeCalculationCause, Tenant};
 use camino::Utf8PathBuf;
 use consumption_metrics::EventType;
 use pageserver_api::models::TenantState;
@@ -350,19 +349,12 @@ async fn calculate_and_log(tenant: &Tenant, cancel: &CancellationToken, ctx: &Re
    // Same for the loop that fetches computed metrics.
    // By using the same limiter, we centralize metrics collection for "start" and "finished" counters,
    // which turns out is really handy to understand the system.
-    let Err(e) = tenant.calculate_synthetic_size(CAUSE, cancel, ctx).await else {
-        return;
-    };
-
-    // this error can be returned if timeline is shutting down, but it does not
-    // mean the synthetic size worker should terminate.
-    let shutting_down = matches!(
-        e.downcast_ref::<PageReconstructError>(),
-        Some(PageReconstructError::Cancelled)
-    );
-
-    if !shutting_down {
-        let tenant_shard_id = tenant.tenant_shard_id();
-        error!("failed to calculate synthetic size for tenant {tenant_shard_id}: {e:#}");
+    match tenant.calculate_synthetic_size(CAUSE, cancel, ctx).await {
+        Ok(_) => {}
+        Err(CalculateSyntheticSizeError::Cancelled) => {}
+        Err(e) => {
+            let tenant_shard_id = tenant.tenant_shard_id();
+            error!("failed to calculate synthetic size for tenant {tenant_shard_id}: {e:#}");
+        }
    }
 }
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -1135,7 +1135,10 @@ async fn tenant_size_handler(
            &ctx,
        )
        .await
-        .map_err(ApiError::InternalServerError)?;
+        .map_err(|e| match e {
+            crate::tenant::size::CalculateSyntheticSizeError::Cancelled => ApiError::ShuttingDown,
+            other => ApiError::InternalServerError(anyhow::anyhow!(other)),
+        })?;

    let mut sizes = None;
    let accepts_html = headers
@@ -1143,9 +1146,7 @@ async fn tenant_size_handler(
        .map(|v| v == "text/html")
        .unwrap_or_default();
    if !inputs_only.unwrap_or(false) {
-        let storage_model = inputs
-            .calculate_model()
-            .map_err(ApiError::InternalServerError)?;
+        let storage_model = inputs.calculate_model();
        let size = storage_model.calculate();

        // If request header expects html, return html
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -509,11 +509,24 @@ pub(crate) enum GcError {
    #[error(transparent)]
    Remote(anyhow::Error),

+    // An error reading while calculating GC cutoffs
+    #[error(transparent)]
+    GcCutoffs(PageReconstructError),
+
    // If GC was invoked for a particular timeline, this error means it didn't exist
    #[error("timeline not found")]
    TimelineNotFound,
 }

+impl From<PageReconstructError> for GcError {
+    fn from(value: PageReconstructError) -> Self {
+        match value {
+            PageReconstructError::Cancelled => Self::TimelineCancelled,
+            other => Self::GcCutoffs(other),
+        }
+    }
+}
+
 impl Tenant {
    /// Yet another helper for timeline initialization.
    ///
@@ -2921,17 +2934,9 @@ impl Tenant {
                .checked_sub(horizon)
                .unwrap_or(Lsn(0));

-            let res = timeline.find_gc_cutoffs(cutoff, pitr, cancel, ctx).await;
-
-            match res {
-                Ok(cutoffs) => {
-                    let old = gc_cutoffs.insert(timeline.timeline_id, cutoffs);
-                    assert!(old.is_none());
-                }
-                Err(e) => {
-                    tracing::warn!(timeline_id = %timeline.timeline_id, "ignoring failure to find gc cutoffs: {e:#}");
-                }
-            }
+            let cutoffs = timeline.find_gc_cutoffs(cutoff, pitr, cancel, ctx).await?;
+            let old = gc_cutoffs.insert(timeline.timeline_id, cutoffs);
+            assert!(old.is_none());
        }

        if !self.is_active() || self.cancel.is_cancelled() {
@@ -3553,7 +3558,7 @@ impl Tenant {
        cause: LogicalSizeCalculationCause,
        cancel: &CancellationToken,
        ctx: &RequestContext,
-    ) -> anyhow::Result<size::ModelInputs> {
+    ) -> Result<size::ModelInputs, size::CalculateSyntheticSizeError> {
        let logical_sizes_at_once = self
            .conf
            .concurrent_tenant_size_logical_size_queries
@@ -3568,8 +3573,8 @@ impl Tenant {
        // See more for on the issue #2748 condenced out of the initial PR review.
        let mut shared_cache = tokio::select! {
            locked = self.cached_logical_sizes.lock() => locked,
-            _ = cancel.cancelled() => anyhow::bail!("cancelled"),
-            _ = self.cancel.cancelled() => anyhow::bail!("tenant is shutting down"),
+            _ = cancel.cancelled() => return Err(size::CalculateSyntheticSizeError::Cancelled),
+            _ = self.cancel.cancelled() => return Err(size::CalculateSyntheticSizeError::Cancelled),
        };

        size::gather_inputs(
@@ -3593,10 +3598,10 @@ impl Tenant {
        cause: LogicalSizeCalculationCause,
        cancel: &CancellationToken,
        ctx: &RequestContext,
-    ) -> anyhow::Result<u64> {
+    ) -> Result<u64, size::CalculateSyntheticSizeError> {
        let inputs = self.gather_size_inputs(None, cause, cancel, ctx).await?;

-        let size = inputs.calculate()?;
+        let size = inputs.calculate();

        self.set_cached_synthetic_size(size);

--- a/pageserver/src/tenant/size.rs
+++ b/pageserver/src/tenant/size.rs
@@ -3,7 +3,6 @@ use std::collections::hash_map::Entry;
 use std::collections::{HashMap, HashSet};
 use std::sync::Arc;

-use anyhow::{bail, Context};
 use tokio::sync::oneshot::error::RecvError;
 use tokio::sync::Semaphore;
 use tokio_util::sync::CancellationToken;
@@ -11,7 +10,7 @@ use tokio_util::sync::CancellationToken;
 use crate::context::RequestContext;
 use crate::pgdatadir_mapping::CalculateLogicalSizeError;

-use super::{LogicalSizeCalculationCause, Tenant};
+use super::{GcError, LogicalSizeCalculationCause, Tenant};
 use crate::tenant::Timeline;
 use utils::id::TimelineId;
 use utils::lsn::Lsn;
@@ -43,6 +42,44 @@ pub struct SegmentMeta {
    pub kind: LsnKind,
 }

+#[derive(thiserror::Error, Debug)]
+pub(crate) enum CalculateSyntheticSizeError {
+    /// Something went wrong internally to the calculation of logical size at a particular branch point
+    #[error("Failed to calculated logical size on timeline {timeline_id} at {lsn}: {error}")]
+    LogicalSize {
+        timeline_id: TimelineId,
+        lsn: Lsn,
+        error: CalculateLogicalSizeError,
+    },
+
+    /// Something went wrong internally when calculating GC parameters at start of size calculation
+    #[error(transparent)]
+    GcInfo(GcError),
+
+    /// Totally unexpected errors, like panics joining a task
+    #[error(transparent)]
+    Fatal(anyhow::Error),
+
+    /// The LSN we are trying to calculate a size at no longer exists at the point we query it
+    #[error("Could not find size at {lsn} in timeline {timeline_id}")]
+    LsnNotFound { timeline_id: TimelineId, lsn: Lsn },
+
+    /// Tenant shut down while calculating size
+    #[error("Cancelled")]
+    Cancelled,
+}
+
+impl From<GcError> for CalculateSyntheticSizeError {
+    fn from(value: GcError) -> Self {
+        match value {
+            GcError::TenantCancelled | GcError::TimelineCancelled => {
+                CalculateSyntheticSizeError::Cancelled
+            }
+            other => CalculateSyntheticSizeError::GcInfo(other),
+        }
+    }
+}
+
 impl SegmentMeta {
    fn size_needed(&self) -> bool {
        match self.kind {
@@ -116,12 +153,9 @@ pub(super) async fn gather_inputs(
    cause: LogicalSizeCalculationCause,
    cancel: &CancellationToken,
    ctx: &RequestContext,
-) -> anyhow::Result<ModelInputs> {
+) -> Result<ModelInputs, CalculateSyntheticSizeError> {
    // refresh is needed to update gc related pitr_cutoff and horizon_cutoff
-    tenant
-        .refresh_gc_info(cancel, ctx)
-        .await
-        .context("Failed to refresh gc_info before gathering inputs")?;
+    tenant.refresh_gc_info(cancel, ctx).await?;

    // Collect information about all the timelines
    let mut timelines = tenant.list_timelines();
@@ -327,6 +361,12 @@ pub(super) async fn gather_inputs(
    )
    .await?;

+    if tenant.cancel.is_cancelled() {
+        // If we're shutting down, return an error rather than a sparse result that might include some
+        // timelines from before we started shutting down
+        return Err(CalculateSyntheticSizeError::Cancelled);
+    }
+
    Ok(ModelInputs {
        segments,
        timeline_inputs,
@@ -345,7 +385,7 @@ async fn fill_logical_sizes(
    logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>,
    cause: LogicalSizeCalculationCause,
    ctx: &RequestContext,
-) -> anyhow::Result<()> {
+) -> Result<(), CalculateSyntheticSizeError> {
    let timeline_hash: HashMap<TimelineId, Arc<Timeline>> = HashMap::from_iter(
        timelines
            .iter()
@@ -387,7 +427,7 @@ async fn fill_logical_sizes(
    }

    // Perform the size lookups
-    let mut have_any_error = false;
+    let mut have_any_error = None;
    while let Some(res) = joinset.join_next().await {
        // each of these come with Result<anyhow::Result<_>, JoinError>
        // because of spawn + spawn_blocking
@@ -398,21 +438,36 @@ async fn fill_logical_sizes(
            Err(join_error) => {
                // cannot really do anything, as this panic is likely a bug
                error!("task that calls spawn_ondemand_logical_size_calculation panicked: {join_error:#}");
-                have_any_error = true;
+
+                have_any_error = Some(CalculateSyntheticSizeError::Fatal(
+                    anyhow::anyhow!(join_error)
+                        .context("task that calls spawn_ondemand_logical_size_calculation"),
+                ));
            }
            Ok(Err(recv_result_error)) => {
                // cannot really do anything, as this panic is likely a bug
                error!("failed to receive logical size query result: {recv_result_error:#}");
-                have_any_error = true;
+                have_any_error = Some(CalculateSyntheticSizeError::Fatal(
+                    anyhow::anyhow!(recv_result_error)
+                        .context("Receiving logical size query result"),
+                ));
            }
            Ok(Ok(TimelineAtLsnSizeResult(timeline, lsn, Err(error)))) => {
-                if !matches!(error, CalculateLogicalSizeError::Cancelled) {
+                if matches!(error, CalculateLogicalSizeError::Cancelled) {
+                    // Skip this: it's okay if one timeline among many is shutting down while we
+                    // calculate inputs for the overall tenant.
+                    continue;
+                } else {
                    warn!(
                        timeline_id=%timeline.timeline_id,
                        "failed to calculate logical size at {lsn}: {error:#}"
                    );
+                    have_any_error = Some(CalculateSyntheticSizeError::LogicalSize {
+                        timeline_id: timeline.timeline_id,
+                        lsn,
+                        error,
+                    });
                }
-                have_any_error = true;
            }
            Ok(Ok(TimelineAtLsnSizeResult(timeline, lsn, Ok(size)))) => {
                debug!(timeline_id=%timeline.timeline_id, %lsn, size, "size calculated");
@@ -426,10 +481,10 @@ async fn fill_logical_sizes(
    // prune any keys not needed anymore; we record every used key and added key.
    logical_size_cache.retain(|key, _| sizes_needed.contains_key(key));

-    if have_any_error {
+    if let Some(error) = have_any_error {
        // we cannot complete this round, because we are missing data.
        // we have however cached all we were able to request calculation on.
-        anyhow::bail!("failed to calculate some logical_sizes");
+        return Err(error);
    }

    // Insert the looked up sizes to the Segments
@@ -444,32 +499,29 @@ async fn fill_logical_sizes(
        if let Some(Some(size)) = sizes_needed.get(&(timeline_id, lsn)) {
            seg.segment.size = Some(*size);
        } else {
-            bail!("could not find size at {} in timeline {}", lsn, timeline_id);
+            return Err(CalculateSyntheticSizeError::LsnNotFound { timeline_id, lsn });
        }
    }
    Ok(())
 }

 impl ModelInputs {
-    pub fn calculate_model(&self) -> anyhow::Result<tenant_size_model::StorageModel> {
+    pub fn calculate_model(&self) -> tenant_size_model::StorageModel {
        // Convert SegmentMetas into plain Segments
-        let storage = StorageModel {
+        StorageModel {
            segments: self
                .segments
                .iter()
                .map(|seg| seg.segment.clone())
                .collect(),
-        };
-
-        Ok(storage)
+        }
    }

    // calculate total project size
-    pub fn calculate(&self) -> anyhow::Result<u64> {
-        let storage = self.calculate_model()?;
+    pub fn calculate(&self) -> u64 {
+        let storage = self.calculate_model();
        let sizes = storage.calculate();
-
-        Ok(sizes.total_size)
+        sizes.total_size
    }
 }

@@ -656,7 +708,7 @@ fn verify_size_for_multiple_branches() {
 "#;
    let inputs: ModelInputs = serde_json::from_str(doc).unwrap();

-    assert_eq!(inputs.calculate().unwrap(), 37_851_408);
+    assert_eq!(inputs.calculate(), 37_851_408);
 }

 #[test]
@@ -711,7 +763,7 @@ fn verify_size_for_one_branch() {

    let model: ModelInputs = serde_json::from_str(doc).unwrap();

-    let res = model.calculate_model().unwrap().calculate();
+    let res = model.calculate_model().calculate();

    println!("calculated synthetic size: {}", res.total_size);
    println!("result: {:?}", serde_json::to_string(&res.segments));
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -4823,7 +4823,7 @@ impl Timeline {
        pitr: Duration,
        cancel: &CancellationToken,
        ctx: &RequestContext,
-    ) -> anyhow::Result<GcCutoffs> {
+    ) -> Result<GcCutoffs, PageReconstructError> {
        let _timer = self
            .metrics
            .find_gc_cutoffs_histo
--- a/test_runner/fixtures/pageserver/allowed_errors.py
+++ b/test_runner/fixtures/pageserver/allowed_errors.py
@@ -94,8 +94,6 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = (
    ".*WARN.*path=/v1/utilization .*request was dropped before completing",
    # Can happen during shutdown
    ".*scheduling deletion on drop failed: queue is in state Stopped.*",
-    # Can happen during shutdown
-    ".*ignoring failure to find gc cutoffs: timeline shutting down.*",
 )


--- a/test_runner/regress/test_tenant_size.py
+++ b/test_runner/regress/test_tenant_size.py
@@ -678,10 +678,6 @@ def test_synthetic_size_while_deleting(neon_env_builder: NeonEnvBuilder):
        with pytest.raises(PageserverApiException, match=matcher):
            completion.result()

-    # this happens on both cases
-    env.pageserver.allowed_errors.append(
-        ".*ignoring failure to find gc cutoffs: timeline shutting down.*"
-    )
    # this happens only in the case of deletion (http response logging)
    env.pageserver.allowed_errors.append(".*Failed to refresh gc_info before gathering inputs.*")