Compare commits


1 Commit

Author               SHA1         Message                          Date
Heikki Linnakangas   791f181034   WIP: Add pg_tracing extension    2025-04-07 19:53:52 +03:00
14 changed files with 209 additions and 364 deletions

View File

@@ -1388,6 +1388,38 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_partman.control
#########################################################################################
# Layer "pg_tracing"
# compile pg_tracing extension
#
#########################################################################################
FROM build-deps AS pg_tracing-src
ARG PG_VERSION
WORKDIR /ext-src
RUN case "${PG_VERSION:?}" in \
"v14" | "v15") \
echo "pg_tracing not supported on this PostgreSQL version." && exit 0 \
;; \
*) \
;; \
esac && \
wget https://github.com/DataDog/pg_tracing/archive/refs/tags/v0.1.3.tar.gz -O pg_tracing.tar.gz && \
echo "d0a7cca7279bb29601ba6c4c1aaeb3a44d71e6afa3b78aae1e3b7269e688f907 pg_tracing.tar.gz" | sha256sum --check && \
mkdir pg_tracing-src && cd pg_tracing-src && tar xzf ../pg_tracing.tar.gz --strip-components=1 -C .
FROM pg-build AS pg_tracing-build
COPY --from=pg_tracing-src /ext-src/ /ext-src/
WORKDIR /ext-src/pg_tracing-src
RUN case "${PG_VERSION:?}" in \
"v14" | "v15") \
echo "pg_tracing not supported on this PostgreSQL version." && exit 0 \
;; \
*) \
;; \
esac && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install
#########################################################################################
#
# Layer "pg_mooncake"
@@ -1617,6 +1649,7 @@ COPY --from=pg_semver-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=wal2json-build /usr/local/pgsql /usr/local/pgsql
COPY --from=pg_ivm-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg_partman-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg_tracing-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg_mooncake-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg_duckdb-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg_repack-build /usr/local/pgsql/ /usr/local/pgsql/
@@ -1793,6 +1826,7 @@ COPY --from=pg_semver-src /ext-src/ /ext-src/
#COPY --from=wal2json-src /ext-src/ /ext-src/
COPY --from=pg_ivm-src /ext-src/ /ext-src/
COPY --from=pg_partman-src /ext-src/ /ext-src/
COPY --from=pg_tracing-src /ext-src/ /ext-src/
#COPY --from=pg_mooncake-src /ext-src/ /ext-src/
COPY --from=pg_repack-src /ext-src/ /ext-src/
COPY --from=pg_repack-build /usr/local/pgsql/ /usr/local/pgsql/

View File

@@ -80,22 +80,10 @@ pub enum TenantState {
///
/// Transitions out of this state are possible through `set_broken()`.
Stopping {
/// The barrier can be used to wait for shutdown to complete. The first caller to set
/// Some(Barrier) is responsible for driving shutdown to completion. Subsequent callers
/// will wait for the first caller's existing barrier.
///
/// None is set when an attach is cancelled, to signal to shutdown that the attach has in
/// fact cancelled:
///
/// 1. `shutdown` sees `TenantState::Attaching`, and cancels the tenant.
/// 2. `attach` sets `TenantState::Stopping(None)` and exits.
/// 3. `set_stopping` waits for `TenantState::Stopping(None)` and sets
/// `TenantState::Stopping(Some)` to claim the barrier as the shutdown owner.
//
// Because of https://github.com/serde-rs/serde/issues/2105 this has to be a named field,
// otherwise it will not be skipped during deserialization
#[serde(skip)]
progress: Option<completion::Barrier>,
progress: completion::Barrier,
},
/// The tenant is recognized by the pageserver, but can no longer be used for
/// any operations.
@@ -2731,15 +2719,10 @@ mod tests {
"Activating",
),
(line!(), TenantState::Active, "Active"),
(
line!(),
TenantState::Stopping { progress: None },
"Stopping",
),
(
line!(),
TenantState::Stopping {
progress: Some(completion::Barrier::default()),
progress: utils::completion::Barrier::default(),
},
"Stopping",
),
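
For context, the Option<Barrier> handshake documented on the Stopping variant above can be illustrated with a minimal, self-contained sketch. The Barrier and TenantState types below are simplified stand-ins rather than the real pageserver types, and a tokio watch channel plays the role of Tenant::state; this only illustrates the three-step protocol from the comment and is not code from this compare.

use tokio::sync::watch;

// Simplified stand-in for utils::completion::Barrier.
#[derive(Clone, Debug, Default)]
struct Barrier;

#[derive(Clone, Debug)]
enum TenantState {
    Attaching,
    // None = a cancelled attach has exited; Some = a shutdown owner has
    // claimed responsibility for driving shutdown to completion.
    Stopping { progress: Option<Barrier> },
}

#[tokio::main]
async fn main() {
    // Step 1: shutdown sees TenantState::Attaching and cancels the tenant
    // (the cancellation token itself is elided here).
    let (tx, mut rx) = watch::channel(TenantState::Attaching);

    // Step 2: the cancelled attach task signals that it has exited by
    // setting Stopping(None).
    tx.send(TenantState::Stopping { progress: None }).unwrap();

    // Step 3: set_stopping waits for Stopping(None), then claims the barrier
    // and becomes the owner that drives shutdown to completion.
    rx.wait_for(|s| matches!(s, TenantState::Stopping { progress: None }))
        .await
        .unwrap();
    tx.send_modify(|s| {
        *s = TenantState::Stopping { progress: Some(Barrier::default()) }
    });

    assert!(matches!(
        *rx.borrow(),
        TenantState::Stopping { progress: Some(_) }
    ));
}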

View File

@@ -45,7 +45,6 @@ use remote_timeline_client::manifest::{
};
use remote_timeline_client::{
FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD, UploadQueueNotReadyError,
download_tenant_manifest,
};
use secondary::heatmap::{HeatMapTenant, HeatMapTimeline};
use storage_broker::BrokerClientChannel;
@@ -227,8 +226,7 @@ struct TimelinePreload {
}
pub(crate) struct TenantPreload {
/// The tenant manifest from remote storage, or None if no manifest was found.
tenant_manifest: Option<TenantManifest>,
tenant_manifest: TenantManifest,
/// Map from timeline ID to a possible timeline preload. It is None iff the timeline is offloaded according to the manifest.
timelines: HashMap<TimelineId, Option<TimelinePreload>>,
}
@@ -284,15 +282,12 @@ pub struct Tenant {
/// **Lock order**: if acquiring all (or a subset), acquire them in order `timelines`, `timelines_offloaded`, `timelines_creating`
timelines_offloaded: Mutex<HashMap<TimelineId, Arc<OffloadedTimeline>>>,
/// The last tenant manifest known to be in remote storage. None if the manifest has not yet
/// been either downloaded or uploaded. Always Some after tenant attach.
/// Serialize writes of the tenant manifest to remote storage. If there are concurrent operations
/// affecting the manifest, such as timeline deletion and timeline offload, they must wait for
/// each other (this could be optimized to coalesce writes if necessary).
///
/// Initially populated during tenant attach, updated via `maybe_upload_tenant_manifest`.
///
/// Do not modify this directly. It is used to check whether a new manifest needs to be
/// uploaded. The manifest is constructed in `build_tenant_manifest`, and uploaded via
/// `maybe_upload_tenant_manifest`.
remote_tenant_manifest: tokio::sync::Mutex<Option<TenantManifest>>,
/// The contents of the Mutex are the last manifest we successfully uploaded
tenant_manifest_upload: tokio::sync::Mutex<Option<TenantManifest>>,
// This mutex prevents creation of new timelines during GC.
// Adding yet another mutex (in addition to `timelines`) is needed because holding
@@ -1359,41 +1354,36 @@ impl Tenant {
}
}
fn make_broken_or_stopping(t: &Tenant, err: anyhow::Error) {
t.state.send_modify(|state| match state {
// TODO: the old code alluded to DeleteTenantFlow sometimes setting
// TenantState::Stopping before we get here, but this may be outdated.
// Let's find out with a testing assertion. If this doesn't fire, and the
// logs don't show this happening in production, remove the Stopping cases.
TenantState::Stopping{..} if cfg!(any(test, feature = "testing")) => {
panic!("unexpected TenantState::Stopping during attach")
}
// If the tenant is cancelled, assume the error was caused by cancellation.
TenantState::Attaching if t.cancel.is_cancelled() => {
info!("attach cancelled, setting tenant state to Stopping: {err}");
// NB: progress None tells `set_stopping` that attach has cancelled.
*state = TenantState::Stopping { progress: None };
}
// According to the old code, DeleteTenantFlow may already have set this to
// Stopping. Retain its progress.
// TODO: there is no DeleteTenantFlow. Is this still needed? See above.
TenantState::Stopping { progress } if t.cancel.is_cancelled() => {
assert!(progress.is_some(), "concurrent attach cancellation");
info!("attach cancelled, already Stopping: {err}");
}
// Mark the tenant as broken.
TenantState::Attaching | TenantState::Stopping { .. } => {
error!("attach failed, setting tenant state to Broken (was {state}): {err:?}");
*state = TenantState::broken_from_reason(err.to_string())
}
// The attach task owns the tenant state until activated.
state => panic!("invalid tenant state {state} during attach: {err:?}"),
});
// Ideally we should use Tenant::set_broken_no_wait, but it is not supposed to be used when tenant is in loading state.
enum BrokenVerbosity {
Error,
Info
}
let make_broken =
|t: &Tenant, err: anyhow::Error, verbosity: BrokenVerbosity| {
match verbosity {
BrokenVerbosity::Info => {
info!("attach cancelled, setting tenant state to Broken: {err}");
},
BrokenVerbosity::Error => {
error!("attach failed, setting tenant state to Broken: {err:?}");
}
}
t.state.send_modify(|state| {
// The Stopping case is for when we have passed control on to DeleteTenantFlow:
// if it errors, we will call make_broken when tenant is already in Stopping.
assert!(
matches!(*state, TenantState::Attaching | TenantState::Stopping { .. }),
"the attach task owns the tenant state until activation is complete"
);
*state = TenantState::broken_from_reason(err.to_string());
});
};
// TODO: should also be rejecting tenant conf changes that violate this check.
if let Err(e) = crate::tenant::storage_layer::inmemory_layer::IndexEntry::validate_checkpoint_distance(tenant_clone.get_checkpoint_distance()) {
make_broken_or_stopping(&tenant_clone, anyhow::anyhow!(e));
make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error);
return Ok(());
}
@@ -1445,8 +1435,10 @@ impl Tenant {
// stayed in Activating for such a long time that shutdown found it in
// that state.
tracing::info!(state=%tenant_clone.current_state(), "Tenant shut down before activation");
// Set the tenant to Stopping to signal `set_stopping` that we're done.
make_broken_or_stopping(&tenant_clone, anyhow::anyhow!("Shut down while Attaching"));
// Make the tenant broken so that set_stopping will not hang waiting for it to leave
// the Attaching state. This is an over-reaction (nothing really broke, the tenant is
// just shutting down), but ensures progress.
make_broken(&tenant_clone, anyhow::anyhow!("Shut down while Attaching"), BrokenVerbosity::Info);
return Ok(());
},
)
@@ -1465,7 +1457,7 @@ impl Tenant {
match res {
Ok(p) => Some(p),
Err(e) => {
make_broken_or_stopping(&tenant_clone, anyhow::anyhow!(e));
make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error);
return Ok(());
}
}
@@ -1491,7 +1483,9 @@ impl Tenant {
info!("attach finished, activating");
tenant_clone.activate(broker_client, None, &ctx);
}
Err(e) => make_broken_or_stopping(&tenant_clone, anyhow::anyhow!(e)),
Err(e) => {
make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error);
}
}
// If we are doing an opportunistic warmup attachment at startup, initialize
@@ -1531,27 +1525,28 @@ impl Tenant {
cancel.clone(),
)
.await?;
let tenant_manifest = match download_tenant_manifest(
remote_storage,
&self.tenant_shard_id,
self.generation,
&cancel,
)
.await
{
Ok((tenant_manifest, _, _)) => Some(tenant_manifest),
Err(DownloadError::NotFound) => None,
Err(err) => return Err(err.into()),
};
let (offloaded_add, tenant_manifest) =
match remote_timeline_client::download_tenant_manifest(
remote_storage,
&self.tenant_shard_id,
self.generation,
&cancel,
)
.await
{
Ok((tenant_manifest, _generation, _manifest_mtime)) => (
format!("{} offloaded", tenant_manifest.offloaded_timelines.len()),
tenant_manifest,
),
Err(DownloadError::NotFound) => {
("no manifest".to_string(), TenantManifest::empty())
}
Err(e) => Err(e)?,
};
info!(
"found {} timelines ({} offloaded timelines)",
remote_timeline_ids.len(),
tenant_manifest
.as_ref()
.map(|m| m.offloaded_timelines.len())
.unwrap_or(0)
"found {} timelines, and {offloaded_add}",
remote_timeline_ids.len()
);
for k in other_keys {
@@ -1560,13 +1555,11 @@ impl Tenant {
// Avoid downloading IndexPart of offloaded timelines.
let mut offloaded_with_prefix = HashSet::new();
if let Some(tenant_manifest) = &tenant_manifest {
for offloaded in tenant_manifest.offloaded_timelines.iter() {
if remote_timeline_ids.remove(&offloaded.timeline_id) {
offloaded_with_prefix.insert(offloaded.timeline_id);
} else {
// We'll take care later of timelines in the manifest without a prefix
}
for offloaded in tenant_manifest.offloaded_timelines.iter() {
if remote_timeline_ids.remove(&offloaded.timeline_id) {
offloaded_with_prefix.insert(offloaded.timeline_id);
} else {
// We'll take care later of timelines in the manifest without a prefix
}
}
@@ -1640,14 +1633,12 @@ impl Tenant {
let mut offloaded_timeline_ids = HashSet::new();
let mut offloaded_timelines_list = Vec::new();
if let Some(tenant_manifest) = &preload.tenant_manifest {
for timeline_manifest in tenant_manifest.offloaded_timelines.iter() {
let timeline_id = timeline_manifest.timeline_id;
let offloaded_timeline =
OffloadedTimeline::from_manifest(self.tenant_shard_id, timeline_manifest);
offloaded_timelines_list.push((timeline_id, Arc::new(offloaded_timeline)));
offloaded_timeline_ids.insert(timeline_id);
}
for timeline_manifest in preload.tenant_manifest.offloaded_timelines.iter() {
let timeline_id = timeline_manifest.timeline_id;
let offloaded_timeline =
OffloadedTimeline::from_manifest(self.tenant_shard_id, timeline_manifest);
offloaded_timelines_list.push((timeline_id, Arc::new(offloaded_timeline)));
offloaded_timeline_ids.insert(timeline_id);
}
// Complete deletions for offloaded timeline id's from manifest.
// The manifest will be uploaded later in this function.
@@ -1805,21 +1796,15 @@ impl Tenant {
.context("resume_deletion")
.map_err(LoadLocalTimelineError::ResumeDeletion)?;
}
let needs_manifest_upload =
offloaded_timelines_list.len() != preload.tenant_manifest.offloaded_timelines.len();
{
let mut offloaded_timelines_accessor = self.timelines_offloaded.lock().unwrap();
offloaded_timelines_accessor.extend(offloaded_timelines_list.into_iter());
}
// Stash the preloaded tenant manifest, and upload a new manifest if changed.
//
// NB: this must happen after the tenant is fully populated above. In particular the
// offloaded timelines, which are included in the manifest.
{
let mut guard = self.remote_tenant_manifest.lock().await;
assert!(guard.is_none(), "tenant manifest set before preload"); // first populated here
*guard = preload.tenant_manifest;
if needs_manifest_upload {
self.store_tenant_manifest().await?;
}
self.maybe_upload_tenant_manifest().await?;
// The local filesystem contents are a cache of what's in the remote IndexPart;
// IndexPart is the source of truth.
@@ -2233,7 +2218,7 @@ impl Tenant {
};
// Upload new list of offloaded timelines to S3
self.maybe_upload_tenant_manifest().await?;
self.store_tenant_manifest().await?;
// Activate the timeline (if it makes sense)
if !(timeline.is_broken() || timeline.is_stopping()) {
@@ -3444,7 +3429,7 @@ impl Tenant {
shutdown_mode
};
match self.set_stopping(shutdown_progress).await {
match self.set_stopping(shutdown_progress, false, false).await {
Ok(()) => {}
Err(SetStoppingError::Broken) => {
// assume that this is acceptable
@@ -3524,13 +3509,25 @@ impl Tenant {
/// This function waits for the tenant to become active if it isn't already, before transitioning it into Stopping state.
///
/// This function is not cancel-safe!
async fn set_stopping(&self, progress: completion::Barrier) -> Result<(), SetStoppingError> {
///
/// `allow_transition_from_loading` is needed for the special case of loading task deleting the tenant.
/// `allow_transition_from_attaching` is needed for the special case of attaching deleted tenant.
async fn set_stopping(
&self,
progress: completion::Barrier,
_allow_transition_from_loading: bool,
allow_transition_from_attaching: bool,
) -> Result<(), SetStoppingError> {
let mut rx = self.state.subscribe();
// cannot stop before we're done activating, so wait out until we're done activating
rx.wait_for(|state| match state {
TenantState::Attaching if allow_transition_from_attaching => true,
TenantState::Activating(_) | TenantState::Attaching => {
info!("waiting for {state} to turn Active|Broken|Stopping");
info!(
"waiting for {} to turn Active|Broken|Stopping",
<&'static str>::from(state)
);
false
}
TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping { .. } => true,
@@ -3541,24 +3538,25 @@ impl Tenant {
// we now know we're done activating, let's see whether this task is the winner to transition into Stopping
let mut err = None;
let stopping = self.state.send_if_modified(|current_state| match current_state {
TenantState::Activating(_) | TenantState::Attaching => {
unreachable!("we ensured above that we're done with activation, and, there is no re-activation")
TenantState::Activating(_) => {
unreachable!("1we ensured above that we're done with activation, and, there is no re-activation")
}
TenantState::Attaching => {
if !allow_transition_from_attaching {
unreachable!("2we ensured above that we're done with activation, and, there is no re-activation")
};
*current_state = TenantState::Stopping { progress };
true
}
TenantState::Active => {
// FIXME: due to time-of-check vs time-of-use issues, it can happen that new timelines
// are created after the transition to Stopping. That's harmless, as the Timelines
// won't be accessible to anyone afterwards, because the Tenant is in Stopping state.
*current_state = TenantState::Stopping { progress: Some(progress) };
*current_state = TenantState::Stopping { progress };
// Continue stopping outside the closure. We need to grab timelines.lock()
// and we plan to turn it into a tokio::sync::Mutex in a future patch.
true
}
TenantState::Stopping { progress: None } => {
// An attach was cancelled, and the attach transitioned the tenant from Attaching to
// Stopping(None) to let us know it exited. Register our progress and continue.
*current_state = TenantState::Stopping { progress: Some(progress) };
true
}
TenantState::Broken { reason, .. } => {
info!(
"Cannot set tenant to Stopping state, it is in Broken state due to: {reason}"
@@ -3566,7 +3564,7 @@ impl Tenant {
err = Some(SetStoppingError::Broken);
false
}
TenantState::Stopping { progress: Some(progress) } => {
TenantState::Stopping { progress } => {
info!("Tenant is already in Stopping state");
err = Some(SetStoppingError::AlreadyStopping(progress.clone()));
false
@@ -4067,19 +4065,18 @@ impl Tenant {
/// Generate an up-to-date TenantManifest based on the state of this Tenant.
fn build_tenant_manifest(&self) -> TenantManifest {
// Collect the offloaded timelines, and sort them for deterministic output.
let offloaded_timelines = self
.timelines_offloaded
.lock()
.unwrap()
.values()
.map(|tli| tli.manifest())
.sorted_by_key(|m| m.timeline_id)
.collect_vec();
let timelines_offloaded = self.timelines_offloaded.lock().unwrap();
let mut timeline_manifests = timelines_offloaded
.iter()
.map(|(_timeline_id, offloaded)| offloaded.manifest())
.collect::<Vec<_>>();
// Sort the manifests so that our output is deterministic
timeline_manifests.sort_by_key(|timeline_manifest| timeline_manifest.timeline_id);
TenantManifest {
version: LATEST_TENANT_MANIFEST_VERSION,
offloaded_timelines,
offloaded_timelines: timeline_manifests,
}
}
@@ -4302,7 +4299,7 @@ impl Tenant {
timelines: Mutex::new(HashMap::new()),
timelines_creating: Mutex::new(HashSet::new()),
timelines_offloaded: Mutex::new(HashMap::new()),
remote_tenant_manifest: Default::default(),
tenant_manifest_upload: Default::default(),
gc_cs: tokio::sync::Mutex::new(()),
walredo_mgr,
remote_storage,
@@ -5535,35 +5532,27 @@ impl Tenant {
.unwrap_or(0)
}
/// Builds a new tenant manifest, and uploads it if it differs from the last-known tenant
/// manifest in `Self::remote_tenant_manifest`.
///
/// TODO: instead of requiring callers to remember to call `maybe_upload_tenant_manifest` after
/// changing any `Tenant` state that's included in the manifest, consider making the manifest
/// the authoritative source of data with an API that automatically uploads on changes. Revisit
/// this when the manifest is more widely used and we have a better idea of the data model.
pub(crate) async fn maybe_upload_tenant_manifest(&self) -> Result<(), TenantManifestError> {
// Multiple tasks may call this function concurrently after mutating the Tenant runtime
// state, affecting the manifest generated by `build_tenant_manifest`. We use an async mutex
// to serialize these callers. `eq_ignoring_version` acts as a slightly inefficient but
// simple coalescing mechanism.
/// Serialize and write the latest TenantManifest to remote storage.
pub(crate) async fn store_tenant_manifest(&self) -> Result<(), TenantManifestError> {
// Only one manifest write may be done at at time, and the contents of the manifest
// must be loaded while holding this lock. This makes it safe to call this function
// from anywhere without worrying about colliding updates.
let mut guard = tokio::select! {
guard = self.remote_tenant_manifest.lock() => guard,
_ = self.cancel.cancelled() => return Err(TenantManifestError::Cancelled),
g = self.tenant_manifest_upload.lock() => {
g
},
_ = self.cancel.cancelled() => {
return Err(TenantManifestError::Cancelled);
}
};
// Build a new manifest.
let manifest = self.build_tenant_manifest();
// Check if the manifest has changed. We ignore the version number here, to avoid
// uploading every manifest on version number bumps.
if let Some(old) = guard.as_ref() {
if manifest.eq_ignoring_version(old) {
return Ok(());
}
if Some(&manifest) == (*guard).as_ref() {
// Optimisation: skip uploads that don't change anything.
return Ok(());
}
// Upload the manifest. Remote storage does no retries internally, so retry here.
// Remote storage does no retries internally, so wrap it
match backoff::retry(
|| async {
upload_tenant_manifest(
@@ -5575,7 +5564,7 @@ impl Tenant {
)
.await
},
|_| self.cancel.is_cancelled(),
|_e| self.cancel.is_cancelled(),
FAILED_UPLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
"uploading tenant manifest",

View File

@@ -3,15 +3,11 @@ use serde::{Deserialize, Serialize};
use utils::id::TimelineId;
use utils::lsn::Lsn;
/// Tenant shard manifest, stored in remote storage. Contains offloaded timelines and other tenant
/// shard-wide information that must be persisted in remote storage.
///
/// The manifest is always updated on tenant attach, and as needed.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
/// Tenant-shard scoped manifest
#[derive(Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct TenantManifest {
/// The manifest version. Incremented on manifest format changes, even non-breaking ones.
/// Manifests must generally always be backwards and forwards compatible for one release, to
/// allow release rollbacks.
/// Debugging aid describing the version of this manifest.
/// Can also be used for distinguishing breaking changes later on.
pub version: usize,
/// The list of offloaded timelines together with enough information
@@ -20,7 +16,6 @@ pub struct TenantManifest {
/// Note: the timelines mentioned in this list might be deleted, i.e.
/// we don't hold an invariant that the references aren't dangling.
/// Existence of index-part.json is the actual indicator of timeline existence.
#[serde(default)]
pub offloaded_timelines: Vec<OffloadedTimelineManifest>,
}
@@ -29,7 +24,7 @@ pub struct TenantManifest {
/// Very similar to [`pageserver_api::models::OffloadedTimelineInfo`],
/// but the two datastructures serve different needs, this is for a persistent disk format
/// that must be backwards compatible, while the other is only for informative purposes.
#[derive(Clone, Debug, Serialize, Deserialize, Copy, PartialEq, Eq)]
#[derive(Clone, Serialize, Deserialize, Copy, PartialEq, Eq)]
pub struct OffloadedTimelineManifest {
pub timeline_id: TimelineId,
/// Whether the timeline has a parent it has been branched off from or not
@@ -40,114 +35,20 @@ pub struct OffloadedTimelineManifest {
pub archived_at: NaiveDateTime,
}
/// The newest manifest version. This should be incremented on changes, even non-breaking ones. We
/// do not use deny_unknown_fields, so new fields are not breaking.
pub const LATEST_TENANT_MANIFEST_VERSION: usize = 1;
impl TenantManifest {
/// Returns true if the manifests are equal, ignoring the version number. This avoids
/// re-uploading all manifests just because the version number is bumped.
pub fn eq_ignoring_version(&self, other: &Self) -> bool {
// Fast path: if the version is equal, just compare directly.
if self.version == other.version {
return self == other;
pub(crate) fn empty() -> Self {
Self {
version: LATEST_TENANT_MANIFEST_VERSION,
offloaded_timelines: vec![],
}
// We could alternatively just clone and modify the version here.
let Self {
version: _, // ignore version
offloaded_timelines,
} = self;
offloaded_timelines == &other.offloaded_timelines
}
/// Decodes a manifest from JSON.
pub fn from_json_bytes(bytes: &[u8]) -> Result<Self, serde_json::Error> {
serde_json::from_slice(bytes)
serde_json::from_slice::<Self>(bytes)
}
/// Encodes a manifest as JSON.
pub fn to_json_bytes(&self) -> serde_json::Result<Vec<u8>> {
pub(crate) fn to_json_bytes(&self) -> serde_json::Result<Vec<u8>> {
serde_json::to_vec(self)
}
}
#[cfg(test)]
mod tests {
use std::str::FromStr;
use utils::id::TimelineId;
use super::*;
/// Empty manifests should be parsed. Version is required.
#[test]
fn parse_empty() -> anyhow::Result<()> {
let json = r#"{
"version": 0
}"#;
let expected = TenantManifest {
version: 0,
offloaded_timelines: Vec::new(),
};
assert_eq!(expected, TenantManifest::from_json_bytes(json.as_bytes())?);
Ok(())
}
/// Unknown fields should be ignored, for forwards compatibility.
#[test]
fn parse_unknown_fields() -> anyhow::Result<()> {
let json = r#"{
"version": 1,
"foo": "bar"
}"#;
let expected = TenantManifest {
version: 1,
offloaded_timelines: Vec::new(),
};
assert_eq!(expected, TenantManifest::from_json_bytes(json.as_bytes())?);
Ok(())
}
/// v1 manifests should be parsed, for backwards compatibility.
#[test]
fn parse_v1() -> anyhow::Result<()> {
let json = r#"{
"version": 1,
"offloaded_timelines": [
{
"timeline_id": "5c4df612fd159e63c1b7853fe94d97da",
"archived_at": "2025-03-07T11:07:11.373105434"
},
{
"timeline_id": "f3def5823ad7080d2ea538d8e12163fa",
"ancestor_timeline_id": "5c4df612fd159e63c1b7853fe94d97da",
"ancestor_retain_lsn": "0/1F79038",
"archived_at": "2025-03-05T11:10:22.257901390"
}
]
}"#;
let expected = TenantManifest {
version: 1,
offloaded_timelines: vec![
OffloadedTimelineManifest {
timeline_id: TimelineId::from_str("5c4df612fd159e63c1b7853fe94d97da")?,
ancestor_timeline_id: None,
ancestor_retain_lsn: None,
archived_at: NaiveDateTime::from_str("2025-03-07T11:07:11.373105434")?,
},
OffloadedTimelineManifest {
timeline_id: TimelineId::from_str("f3def5823ad7080d2ea538d8e12163fa")?,
ancestor_timeline_id: Some(TimelineId::from_str(
"5c4df612fd159e63c1b7853fe94d97da",
)?),
ancestor_retain_lsn: Some(Lsn::from_str("0/1F79038")?),
archived_at: NaiveDateTime::from_str("2025-03-05T11:10:22.257901390")?,
},
],
};
assert_eq!(expected, TenantManifest::from_json_bytes(json.as_bytes())?);
Ok(())
}
}
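
As a small aside on the eq_ignoring_version comparison that this compare touches (it is used at the maybe_upload_tenant_manifest call site to decide whether an upload can be skipped): the check deliberately ignores only the version field, so bumping LATEST_TENANT_MANIFEST_VERSION does not by itself force every tenant to re-upload its manifest. A minimal, self-contained sketch with simplified stand-in types, not the real manifest code:

// Stand-in for the real manifest; offloaded timelines reduced to plain ids.
#[derive(PartialEq, Eq)]
struct TenantManifest {
    version: usize,
    offloaded_timelines: Vec<u32>,
}

impl TenantManifest {
    // Same shape as the eq_ignoring_version shown above: compare every field
    // except the version number.
    fn eq_ignoring_version(&self, other: &Self) -> bool {
        if self.version == other.version {
            return self == other;
        }
        let Self { version: _, offloaded_timelines } = self;
        offloaded_timelines == &other.offloaded_timelines
    }
}

fn main() {
    let uploaded = TenantManifest { version: 1, offloaded_timelines: vec![7] };
    let rebuilt = TenantManifest { version: 2, offloaded_timelines: vec![7] };
    // Only the version differs: treated as unchanged, so no re-upload.
    assert!(uploaded.eq_ignoring_version(&rebuilt));

    let changed = TenantManifest { version: 2, offloaded_timelines: vec![7, 9] };
    // A real content change is still detected.
    assert!(!uploaded.eq_ignoring_version(&changed));
}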

View File

@@ -61,7 +61,6 @@ pub(crate) async fn upload_index_part(
.await
.with_context(|| format!("upload index part for '{tenant_shard_id} / {timeline_id}'"))
}
/// Serializes and uploads the given tenant manifest data to the remote storage.
pub(crate) async fn upload_tenant_manifest(
storage: &GenericRemoteStorage,
@@ -77,14 +76,16 @@ pub(crate) async fn upload_tenant_manifest(
});
pausable_failpoint!("before-upload-manifest-pausable");
let serialized = Bytes::from(tenant_manifest.to_json_bytes()?);
let tenant_manifest_size = serialized.len();
let remote_path = remote_tenant_manifest_path(tenant_shard_id, generation);
let serialized = tenant_manifest.to_json_bytes()?;
let serialized = Bytes::from(serialized);
let tenant_manifest_site = serialized.len();
let remote_path = remote_tenant_manifest_path(tenant_shard_id, generation);
storage
.upload_storage_object(
futures::stream::once(futures::future::ready(Ok(serialized))),
tenant_manifest_size,
tenant_manifest_site,
&remote_path,
cancel,
)

View File

@@ -268,12 +268,7 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
error_run += 1;
let backoff =
exponential_backoff_duration(error_run, BASE_BACKOFF_SECS, MAX_BACKOFF_SECS);
log_compaction_error(
&err,
Some((error_run, backoff)),
cancel.is_cancelled(),
false,
);
log_compaction_error(&err, Some((error_run, backoff)), cancel.is_cancelled());
continue;
}
}
@@ -290,7 +285,6 @@ pub(crate) fn log_compaction_error(
err: &CompactionError,
retry_info: Option<(u32, Duration)>,
task_cancelled: bool,
degrade_to_warning: bool,
) {
use CompactionError::*;
@@ -339,7 +333,6 @@ pub(crate) fn log_compaction_error(
}
} else {
match level {
Level::ERROR if degrade_to_warning => warn!("Compaction failed and discarded: {err:#}"),
Level::ERROR => error!("Compaction failed: {err:#}"),
Level::INFO => info!("Compaction failed: {err:#}"),
level => unimplemented!("unexpected level {level:?}"),

View File

@@ -1940,7 +1940,7 @@ impl Timeline {
)
.await;
if let Err(err) = &res {
log_compaction_error(err, None, cancel.is_cancelled(), false);
log_compaction_error(err, None, cancel.is_cancelled());
}
res
}
@@ -6353,33 +6353,10 @@ impl Timeline {
/// Reconstruct a value, using the given base image and WAL records in 'data'.
async fn reconstruct_value(
&self,
key: Key,
request_lsn: Lsn,
data: ValueReconstructState,
) -> Result<Bytes, PageReconstructError> {
self.reconstruct_value_inner(key, request_lsn, data, false)
.await
}
/// Reconstruct a value, using the given base image and WAL records in 'data'. It does not fire critical errors because
/// sometimes it is expected to fail due to unreplayable history described in <https://github.com/neondatabase/neon/issues/10395>.
async fn reconstruct_value_wo_critical_error(
&self,
key: Key,
request_lsn: Lsn,
data: ValueReconstructState,
) -> Result<Bytes, PageReconstructError> {
self.reconstruct_value_inner(key, request_lsn, data, true)
.await
}
async fn reconstruct_value_inner(
&self,
key: Key,
request_lsn: Lsn,
mut data: ValueReconstructState,
no_critical_error: bool,
) -> Result<Bytes, PageReconstructError> {
// Perform WAL redo if needed
data.records.reverse();
@@ -6436,9 +6413,7 @@ impl Timeline {
Ok(img) => img,
Err(walredo::Error::Cancelled) => return Err(PageReconstructError::Cancelled),
Err(walredo::Error::Other(err)) => {
if !no_critical_error {
critical!("walredo failure during page reconstruction: {err:?}");
}
critical!("walredo failure during page reconstruction: {err:?}");
return Err(PageReconstructError::WalRedo(
err.context("reconstruct a page image"),
));

View File

@@ -448,7 +448,7 @@ impl GcCompactionQueue {
) -> Result<CompactionOutcome, CompactionError> {
let res = self.iteration_inner(cancel, ctx, gc_block, timeline).await;
if let Err(err) = &res {
log_compaction_error(err, None, cancel.is_cancelled(), true);
log_compaction_error(err, None, cancel.is_cancelled());
}
match res {
Ok(res) => Ok(res),
@@ -2410,9 +2410,7 @@ impl Timeline {
} else {
lsn_split_points[i]
};
let img = self
.reconstruct_value_wo_critical_error(key, request_lsn, state)
.await?;
let img = self.reconstruct_value(key, request_lsn, state).await?;
Some((request_lsn, img))
} else {
None
@@ -3108,6 +3106,8 @@ impl Timeline {
// the key and LSN range are determined. However, to keep things simple here, we still
// create this writer, and discard the writer in the end.
let mut keys_processed = 0;
while let Some(((key, lsn, val), desc)) = merge_iter
.next_with_trace()
.await
@@ -3118,7 +3118,9 @@ impl Timeline {
return Err(CompactionError::ShuttingDown);
}
keys_processed += 1;
let should_yield = yield_for_l0
&& keys_processed % 1000 == 0
&& self
.l0_compaction_trigger
.notified()

View File

@@ -410,13 +410,10 @@ impl DeleteTimelineFlow {
// So indeed, the tenant manifest might refer to an offloaded timeline which has already been deleted.
// However, we handle this case in tenant loading code so the next time we attach, the issue is
// resolved.
tenant
.maybe_upload_tenant_manifest()
.await
.map_err(|err| match err {
TenantManifestError::Cancelled => DeleteTimelineError::Cancelled,
err => DeleteTimelineError::Other(err.into()),
})?;
tenant.store_tenant_manifest().await.map_err(|e| match e {
TenantManifestError::Cancelled => DeleteTimelineError::Cancelled,
_ => DeleteTimelineError::Other(e.into()),
})?;
*guard = Self::Finished;

View File

@@ -111,7 +111,7 @@ pub(crate) async fn offload_timeline(
// at the next restart attach it again.
// For that to happen, we'd need to make the manifest reflect our *intended* state,
// not our actual state of offloaded timelines.
tenant.maybe_upload_tenant_manifest().await?;
tenant.store_tenant_manifest().await?;
tracing::info!("Timeline offload complete (remaining arc refcount: {remaining_refcount})");

View File

@@ -1563,12 +1563,8 @@ local_cache_pages(PG_FUNCTION_ARGS)
hash_seq_init(&status, lfc_hash);
while ((entry = hash_seq_search(&status)) != NULL)
{
/* Skip hole tags */
if (NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key)) != 0)
{
for (int i = 0; i < BLOCKS_PER_CHUNK; i++)
n_pages += GET_STATE(entry, i) == AVAILABLE;
}
for (int i = 0; i < BLOCKS_PER_CHUNK; i++)
n_pages += GET_STATE(entry, i) == AVAILABLE;
}
}
}
@@ -1596,19 +1592,16 @@ local_cache_pages(PG_FUNCTION_ARGS)
{
for (int i = 0; i < BLOCKS_PER_CHUNK; i++)
{
if (NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key)) != 0)
if (GET_STATE(entry, i) == AVAILABLE)
{
if (GET_STATE(entry, i) == AVAILABLE)
{
fctx->record[n].pageoffs = entry->offset * BLOCKS_PER_CHUNK + i;
fctx->record[n].relfilenode = NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key));
fctx->record[n].reltablespace = NInfoGetSpcOid(BufTagGetNRelFileInfo(entry->key));
fctx->record[n].reldatabase = NInfoGetDbOid(BufTagGetNRelFileInfo(entry->key));
fctx->record[n].forknum = entry->key.forkNum;
fctx->record[n].blocknum = entry->key.blockNum + i;
fctx->record[n].accesscount = entry->access_count;
n += 1;
}
fctx->record[n].pageoffs = entry->offset * BLOCKS_PER_CHUNK + i;
fctx->record[n].relfilenode = NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key));
fctx->record[n].reltablespace = NInfoGetSpcOid(BufTagGetNRelFileInfo(entry->key));
fctx->record[n].reldatabase = NInfoGetDbOid(BufTagGetNRelFileInfo(entry->key));
fctx->record[n].forknum = entry->key.forkNum;
fctx->record[n].blocknum = entry->key.blockNum + i;
fctx->record[n].accesscount = entry->access_count;
n += 1;
}
}
}

View File

@@ -49,8 +49,6 @@ def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin):
conn = endpoint.connect()
cur = conn.cursor()
cur.execute("create extension neon")
def get_lfc_size() -> tuple[int, int]:
lfc_file_path = endpoint.lfc_path()
lfc_file_size = lfc_file_path.stat().st_size
@@ -105,23 +103,3 @@ def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin):
time.sleep(1)
assert int(lfc_file_blocks) <= 128 * 1024
# Now test that number of rows returned by local_cache is the same as file_cache_used_pages.
# Perform several iterations to make cache cache content stabilized.
nretries = 10
while True:
cur.execute("select count(*) from local_cache")
local_cache_size = cur.fetchall()[0][0]
cur.execute(
"select lfc_value::bigint FROM neon_lfc_stats where lfc_key='file_cache_used_pages'"
)
used_pages = cur.fetchall()[0][0]
if local_cache_size == used_pages or nretries == 0:
break
nretries = nretries - 1
time.sleep(1)
assert local_cache_size == used_pages

View File

@@ -4109,7 +4109,6 @@ def test_storcon_create_delete_sk_down(neon_env_builder: NeonEnvBuilder, restart
env.storage_controller.allowed_errors.extend(
[
".*Call to safekeeper.* management API still failed after.*",
".*Call to safekeeper.* management API failed, will retry.*",
".*reconcile_one.*tenant_id={tenant_id}.*Call to safekeeper.* management API still failed after.*",
]
)

View File

@@ -318,7 +318,7 @@ def test_timeline_offload_persist(neon_env_builder: NeonEnvBuilder, delete_timel
neon_env_builder.pageserver_remote_storage,
prefix=f"tenants/{str(tenant_id)}/",
)
assert_prefix_not_empty(
assert_prefix_empty(
neon_env_builder.pageserver_remote_storage,
prefix=f"tenants/{str(tenant_id)}/tenant-manifest",
)
@@ -387,7 +387,7 @@ def test_timeline_offload_persist(neon_env_builder: NeonEnvBuilder, delete_timel
sum_again = endpoint.safe_psql("SELECT sum(key) from foo where key < 500")
assert sum == sum_again
assert_prefix_not_empty(
assert_prefix_empty(
neon_env_builder.pageserver_remote_storage,
prefix=f"tenants/{str(env.initial_tenant)}/tenant-manifest",
)
@@ -924,7 +924,7 @@ def test_timeline_offload_generations(neon_env_builder: NeonEnvBuilder):
neon_env_builder.pageserver_remote_storage,
prefix=f"tenants/{str(tenant_id)}/",
)
assert_prefix_not_empty(
assert_prefix_empty(
neon_env_builder.pageserver_remote_storage,
prefix=f"tenants/{str(tenant_id)}/tenant-manifest",
)