fix(pageserver): clean up aux files before detaching (#11299)

## Problem Related to https://github.com/neondatabase/cloud/issues/26091 and https://github.com/neondatabase/cloud/issues/25840 Close https://github.com/neondatabase/neon/issues/11297 Discussion on Slack: https://neondb.slack.com/archives/C033RQ5SPDH/p1742320666313969 ## Summary of changes * When detaching, scan all aux files within `sparse_non_inherited_keyspace` in the ancestor timeline and create an image layer exactly at the ancestor LSN. All scanned keys will map to an empty value, which is a delete tombstone. - Note that end_lsn for rewritten delta layers = ancestor_lsn + 1, so the image layer will have image_end_lsn=end_lsn. With the current `select_layer` logic, the read path will always first read the image layer. * Add a test case. --------- Signed-off-by: Alex Chi Z <chi@neon.tech> Co-authored-by: Christian Schwarz <christian@neon.tech>
2026-05-25 17:10:38 +00:00 · 2025-04-03 11:55:22 -04:00
parent 581bb5d7d5
commit 131b32ef48
4 changed files with 210 additions and 4 deletions
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -3188,7 +3188,8 @@ async fn list_aux_files(
        timeline.gate.enter().map_err(|_| ApiError::Cancelled)?,
    );

-    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
+    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download)
+        .with_scope_timeline(&timeline);
    let files = timeline
        .list_aux_files(body.lsn, &ctx, io_concurrency)
        .await?;
--- a/pageserver/src/tenant/timeline/detach_ancestor.rs
+++ b/pageserver/src/tenant/timeline/detach_ancestor.rs
@@ -2,10 +2,14 @@ use std::collections::HashSet;
 use std::sync::Arc;

 use anyhow::Context;
+use bytes::Bytes;
 use http_utils::error::ApiError;
+use pageserver_api::key::Key;
+use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::DetachBehavior;
 use pageserver_api::models::detach_ancestor::AncestorDetached;
 use pageserver_api::shard::ShardIdentity;
+use pageserver_compaction::helpers::overlaps_with;
 use tokio::sync::Semaphore;
 use tokio_util::sync::CancellationToken;
 use tracing::Instrument;
@@ -22,7 +26,10 @@ use crate::task_mgr::TaskKind;
 use crate::tenant::Tenant;
 use crate::tenant::remote_timeline_client::index::GcBlockingReason::DetachAncestor;
 use crate::tenant::storage_layer::layer::local_layer_path;
-use crate::tenant::storage_layer::{AsLayerDesc as _, DeltaLayerWriter, Layer, ResidentLayer};
+use crate::tenant::storage_layer::{
+    AsLayerDesc as _, DeltaLayerWriter, ImageLayerWriter, IoConcurrency, Layer, ResidentLayer,
+    ValuesReconstructState,
+};
 use crate::virtual_file::{MaybeFatalIo, VirtualFile};

 #[derive(Debug, thiserror::Error)]
@@ -170,6 +177,92 @@ impl Attempt {
    }
 }

+async fn generate_tombstone_image_layer(
+    detached: &Arc<Timeline>,
+    ancestor: &Arc<Timeline>,
+    ancestor_lsn: Lsn,
+    ctx: &RequestContext,
+) -> Result<Option<ResidentLayer>, Error> {
+    tracing::info!(
+        "removing non-inherited keys by writing an image layer with tombstones at the detach LSN"
+    );
+    let io_concurrency = IoConcurrency::spawn_from_conf(
+        detached.conf,
+        detached.gate.enter().map_err(|_| Error::ShuttingDown)?,
+    );
+    let mut reconstruct_state = ValuesReconstructState::new(io_concurrency);
+    // Directly use `get_vectored_impl` to skip the max_vectored_read_key limit check. Note that the keyspace should
+    // not contain too many keys, otherwise this takes a lot of memory. Currently we limit it to 10k keys in the compute.
+    let key_range = Key::sparse_non_inherited_keyspace();
+    // avoid generating a "future layer" which will then be removed
+    let image_lsn = ancestor_lsn;
+
+    {
+        let layers = detached.layers.read().await;
+        for layer in layers.all_persistent_layers() {
+            if !layer.is_delta
+                && layer.lsn_range.start == image_lsn
+                && overlaps_with(&key_range, &layer.key_range)
+            {
+                tracing::warn!(
+                    layer=%layer, "image layer at the detach LSN already exists, skipping removing aux files"
+                );
+                return Ok(None);
+            }
+        }
+    }
+
+    let data = ancestor
+        .get_vectored_impl(
+            KeySpace::single(key_range.clone()),
+            image_lsn,
+            &mut reconstruct_state,
+            ctx,
+        )
+        .await
+        .context("failed to retrieve aux keys")
+        .map_err(|e| Error::launder(e, Error::Prepare))?;
+    if !data.is_empty() {
+        // TODO: is it possible that we can have an image at `image_lsn`? Unlikely because image layers are only generated
+        // upon compaction but theoretically possible.
+        let mut image_layer_writer = ImageLayerWriter::new(
+            detached.conf,
+            detached.timeline_id,
+            detached.tenant_shard_id,
+            &key_range,
+            image_lsn,
+            ctx,
+        )
+        .await
+        .context("failed to create image layer writer")
+        .map_err(Error::Prepare)?;
+        for key in data.keys() {
+            image_layer_writer
+                .put_image(*key, Bytes::new(), ctx)
+                .await
+                .context("failed to write key")
+                .map_err(|e| Error::launder(e, Error::Prepare))?;
+        }
+        let (desc, path) = image_layer_writer
+            .finish(ctx)
+            .await
+            .context("failed to finish image layer writer for removing the metadata keys")
+            .map_err(|e| Error::launder(e, Error::Prepare))?;
+        let generated = Layer::finish_creating(detached.conf, detached, desc, &path)
+            .map_err(|e| Error::launder(e, Error::Prepare))?;
+        detached
+            .remote_client
+            .upload_layer_file(&generated, &detached.cancel)
+            .await
+            .map_err(|e| Error::launder(e, Error::Prepare))?;
+        tracing::info!(layer=%generated, "wrote image layer");
+        Ok(Some(generated))
+    } else {
+        tracing::info!("no aux keys found in ancestor");
+        Ok(None)
+    }
+}
+
 /// See [`Timeline::prepare_to_detach_from_ancestor`]
 pub(super) async fn prepare(
    detached: &Arc<Timeline>,
@@ -352,10 +445,16 @@ pub(super) async fn prepare(

    // TODO: copying and lsn prefix copying could be done at the same time with a single fsync after
    let mut new_layers: Vec<Layer> =
-        Vec::with_capacity(straddling_branchpoint.len() + rest_of_historic.len());
+        Vec::with_capacity(straddling_branchpoint.len() + rest_of_historic.len() + 1);
+
+    if let Some(tombstone_layer) =
+        generate_tombstone_image_layer(detached, &ancestor, ancestor_lsn, ctx).await?
+    {
+        new_layers.push(tombstone_layer.into());
+    }

    {
-        tracing::debug!(to_rewrite = %straddling_branchpoint.len(), "copying prefix of delta layers");
+        tracing::info!(to_rewrite = %straddling_branchpoint.len(), "copying prefix of delta layers");

        let mut tasks = tokio::task::JoinSet::new();

--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -1192,3 +1192,28 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
        log.info(f"Got perf info response code: {res.status_code}")
        self.verbose_error(res)
        return res.json()
+
+    def ingest_aux_files(
+        self,
+        tenant_id: TenantId | TenantShardId,
+        timeline_id: TimelineId,
+        aux_files: dict[str, bytes],
+    ):
+        res = self.post(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/ingest_aux_files",
+            json={
+                "aux_files": aux_files,
+            },
+        )
+        self.verbose_error(res)
+        return res.json()
+
+    def list_aux_files(
+        self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, lsn: Lsn
+    ) -> Any:
+        res = self.post(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/list_aux_files",
+            json={"lsn": str(lsn)},
+        )
+        self.verbose_error(res)
+        return res.json()
--- a/test_runner/regress/test_timeline_detach_ancestor.py
+++ b/test_runner/regress/test_timeline_detach_ancestor.py
@@ -1768,6 +1768,87 @@ def test_pageserver_compaction_detach_ancestor_smoke(neon_env_builder: NeonEnvBu
    workload_child.validate(env.pageserver.id)


+def test_timeline_detach_with_aux_files_with_detach_v1(
+    neon_env_builder: NeonEnvBuilder,
+):
+    """
+    Validate that "branches do not inherit their parent" is invariant over detach_ancestor.
+
+    Branches hide parent branch aux files etc by stopping lookup of non-inherited keyspace at the parent-child boundary.
+    We had a bug where detach_ancestor running on a child branch would copy aux files key range from child to parent,
+    thereby making parent aux files reappear.
+    """
+    env = neon_env_builder.init_start(
+        initial_tenant_conf={
+            "gc_period": "1s",
+            "lsn_lease_length": "0s",
+        }
+    )
+
+    env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS)
+
+    http = env.pageserver.http_client()
+
+    endpoint = env.endpoints.create_start("main", tenant_id=env.initial_tenant)
+    lsn0 = wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline)
+    endpoint.safe_psql(
+        "SELECT pg_create_logical_replication_slot('test_slot_parent_1', 'pgoutput')"
+    )
+    lsn1 = wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline)
+    endpoint.safe_psql(
+        "SELECT pg_create_logical_replication_slot('test_slot_parent_2', 'pgoutput')"
+    )
+    lsn2 = wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline)
+    assert set(http.list_aux_files(env.initial_tenant, env.initial_timeline, lsn0).keys()) == set(
+        []
+    )
+    assert set(http.list_aux_files(env.initial_tenant, env.initial_timeline, lsn1).keys()) == set(
+        ["pg_replslot/test_slot_parent_1/state"]
+    )
+    assert set(http.list_aux_files(env.initial_tenant, env.initial_timeline, lsn2).keys()) == set(
+        ["pg_replslot/test_slot_parent_1/state", "pg_replslot/test_slot_parent_2/state"]
+    )
+
+    # Restore at LSN1
+    branch_timeline_id = env.create_branch("restore", env.initial_tenant, "main", lsn1)
+    endpoint2 = env.endpoints.create_start("restore", tenant_id=env.initial_tenant)
+    assert set(http.list_aux_files(env.initial_tenant, branch_timeline_id, lsn1).keys()) == set([])
+
+    # Add a new slot file to the restore branch (This won't happen in reality because cplane immediately detaches the branch on restore,
+    # but we want to ensure that aux files on the detached branch are NOT inherited during ancestor detach. We could change the behavior
+    # in the future.
+    # TL;DR we should NEVER automatically detach a branch as a background optimization for those tenants that already used the restore
+    # feature before branch detach was introduced because it will clean up the aux files and stop logical replication.
+    endpoint2.safe_psql(
+        "SELECT pg_create_logical_replication_slot('test_slot_restore', 'pgoutput')"
+    )
+    lsn3 = wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, branch_timeline_id)
+    assert set(http.list_aux_files(env.initial_tenant, branch_timeline_id, lsn1).keys()) == set([])
+    assert set(http.list_aux_files(env.initial_tenant, branch_timeline_id, lsn3).keys()) == set(
+        ["pg_replslot/test_slot_restore/state"]
+    )
+
+    print("lsn0=", lsn0)
+    print("lsn1=", lsn1)
+    print("lsn2=", lsn2)
+    print("lsn3=", lsn3)
+    # Detach the restore branch so that main doesn't have any child branches.
+    all_reparented = http.detach_ancestor(
+        env.initial_tenant, branch_timeline_id, detach_behavior="v1"
+    )
+    assert all_reparented == set([])
+
+    # We need to ensure all safekeeper data are ingested before checking aux files: the API does not wait for LSN.
+    wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, branch_timeline_id)
+    assert set(http.list_aux_files(env.initial_tenant, env.initial_timeline, lsn2).keys()) == set(
+        ["pg_replslot/test_slot_parent_1/state", "pg_replslot/test_slot_parent_2/state"]
+    ), "main branch unaffected"
+    assert set(http.list_aux_files(env.initial_tenant, branch_timeline_id, lsn3).keys()) == set(
+        ["pg_replslot/test_slot_restore/state"]
+    )
+    assert set(http.list_aux_files(env.initial_tenant, branch_timeline_id, lsn1).keys()) == set([])
+
+
 # TODO:
 # - branch near existing L1 boundary, image layers?
 # - investigate: why are layers started at uneven lsn? not just after branching, but in general.