From 131b32ef48ae4eb1a64b9a066fa0e29346c84f42 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Thu, 3 Apr 2025 11:55:22 -0400 Subject: [PATCH] fix(pageserver): clean up aux files before detaching (#11299) ## Problem Related to https://github.com/neondatabase/cloud/issues/26091 and https://github.com/neondatabase/cloud/issues/25840 Close https://github.com/neondatabase/neon/issues/11297 Discussion on Slack: https://neondb.slack.com/archives/C033RQ5SPDH/p1742320666313969 ## Summary of changes * When detaching, scan all aux files within `sparse_non_inherited_keyspace` in the ancestor timeline and create an image layer exactly at the ancestor LSN. All scanned keys will map to an empty value, which is a delete tombstone. - Note that end_lsn for rewritten delta layers = ancestor_lsn + 1, so the image layer will have image_end_lsn=end_lsn. With the current `select_layer` logic, the read path will always first read the image layer. * Add a test case. --------- Signed-off-by: Alex Chi Z Co-authored-by: Christian Schwarz --- pageserver/src/http/routes.rs | 3 +- .../src/tenant/timeline/detach_ancestor.rs | 105 +++++++++++++++++- test_runner/fixtures/pageserver/http.py | 25 +++++ .../regress/test_timeline_detach_ancestor.py | 81 ++++++++++++++ 4 files changed, 210 insertions(+), 4 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index adc38e32e8..8dcb654e59 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -3188,7 +3188,8 @@ async fn list_aux_files( timeline.gate.enter().map_err(|_| ApiError::Cancelled)?, ); - let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download) + .with_scope_timeline(&timeline); let files = timeline .list_aux_files(body.lsn, &ctx, io_concurrency) .await?; diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index ca1d81c691..1b0d22dc82 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -2,10 +2,14 @@ use std::collections::HashSet; use std::sync::Arc; use anyhow::Context; +use bytes::Bytes; use http_utils::error::ApiError; +use pageserver_api::key::Key; +use pageserver_api::keyspace::KeySpace; use pageserver_api::models::DetachBehavior; use pageserver_api::models::detach_ancestor::AncestorDetached; use pageserver_api::shard::ShardIdentity; +use pageserver_compaction::helpers::overlaps_with; use tokio::sync::Semaphore; use tokio_util::sync::CancellationToken; use tracing::Instrument; @@ -22,7 +26,10 @@ use crate::task_mgr::TaskKind; use crate::tenant::Tenant; use crate::tenant::remote_timeline_client::index::GcBlockingReason::DetachAncestor; use crate::tenant::storage_layer::layer::local_layer_path; -use crate::tenant::storage_layer::{AsLayerDesc as _, DeltaLayerWriter, Layer, ResidentLayer}; +use crate::tenant::storage_layer::{ + AsLayerDesc as _, DeltaLayerWriter, ImageLayerWriter, IoConcurrency, Layer, ResidentLayer, + ValuesReconstructState, +}; use crate::virtual_file::{MaybeFatalIo, VirtualFile}; #[derive(Debug, thiserror::Error)] @@ -170,6 +177,92 @@ impl Attempt { } } +async fn generate_tombstone_image_layer( + detached: &Arc, + ancestor: &Arc, + ancestor_lsn: Lsn, + ctx: &RequestContext, +) -> Result, Error> { + tracing::info!( + "removing non-inherited keys by writing an image layer with tombstones at the detach LSN" + ); + let io_concurrency = IoConcurrency::spawn_from_conf( + detached.conf, + detached.gate.enter().map_err(|_| Error::ShuttingDown)?, + ); + let mut reconstruct_state = ValuesReconstructState::new(io_concurrency); + // Directly use `get_vectored_impl` to skip the max_vectored_read_key limit check. Note that the keyspace should + // not contain too many keys, otherwise this takes a lot of memory. Currently we limit it to 10k keys in the compute. + let key_range = Key::sparse_non_inherited_keyspace(); + // avoid generating a "future layer" which will then be removed + let image_lsn = ancestor_lsn; + + { + let layers = detached.layers.read().await; + for layer in layers.all_persistent_layers() { + if !layer.is_delta + && layer.lsn_range.start == image_lsn + && overlaps_with(&key_range, &layer.key_range) + { + tracing::warn!( + layer=%layer, "image layer at the detach LSN already exists, skipping removing aux files" + ); + return Ok(None); + } + } + } + + let data = ancestor + .get_vectored_impl( + KeySpace::single(key_range.clone()), + image_lsn, + &mut reconstruct_state, + ctx, + ) + .await + .context("failed to retrieve aux keys") + .map_err(|e| Error::launder(e, Error::Prepare))?; + if !data.is_empty() { + // TODO: is it possible that we can have an image at `image_lsn`? Unlikely because image layers are only generated + // upon compaction but theoretically possible. + let mut image_layer_writer = ImageLayerWriter::new( + detached.conf, + detached.timeline_id, + detached.tenant_shard_id, + &key_range, + image_lsn, + ctx, + ) + .await + .context("failed to create image layer writer") + .map_err(Error::Prepare)?; + for key in data.keys() { + image_layer_writer + .put_image(*key, Bytes::new(), ctx) + .await + .context("failed to write key") + .map_err(|e| Error::launder(e, Error::Prepare))?; + } + let (desc, path) = image_layer_writer + .finish(ctx) + .await + .context("failed to finish image layer writer for removing the metadata keys") + .map_err(|e| Error::launder(e, Error::Prepare))?; + let generated = Layer::finish_creating(detached.conf, detached, desc, &path) + .map_err(|e| Error::launder(e, Error::Prepare))?; + detached + .remote_client + .upload_layer_file(&generated, &detached.cancel) + .await + .map_err(|e| Error::launder(e, Error::Prepare))?; + tracing::info!(layer=%generated, "wrote image layer"); + Ok(Some(generated)) + } else { + tracing::info!("no aux keys found in ancestor"); + Ok(None) + } +} + /// See [`Timeline::prepare_to_detach_from_ancestor`] pub(super) async fn prepare( detached: &Arc, @@ -352,10 +445,16 @@ pub(super) async fn prepare( // TODO: copying and lsn prefix copying could be done at the same time with a single fsync after let mut new_layers: Vec = - Vec::with_capacity(straddling_branchpoint.len() + rest_of_historic.len()); + Vec::with_capacity(straddling_branchpoint.len() + rest_of_historic.len() + 1); + + if let Some(tombstone_layer) = + generate_tombstone_image_layer(detached, &ancestor, ancestor_lsn, ctx).await? + { + new_layers.push(tombstone_layer.into()); + } { - tracing::debug!(to_rewrite = %straddling_branchpoint.len(), "copying prefix of delta layers"); + tracing::info!(to_rewrite = %straddling_branchpoint.len(), "copying prefix of delta layers"); let mut tasks = tokio::task::JoinSet::new(); diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 8211da32fe..c2d176bf5a 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -1192,3 +1192,28 @@ class PageserverHttpClient(requests.Session, MetricsGetter): log.info(f"Got perf info response code: {res.status_code}") self.verbose_error(res) return res.json() + + def ingest_aux_files( + self, + tenant_id: TenantId | TenantShardId, + timeline_id: TimelineId, + aux_files: dict[str, bytes], + ): + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/ingest_aux_files", + json={ + "aux_files": aux_files, + }, + ) + self.verbose_error(res) + return res.json() + + def list_aux_files( + self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, lsn: Lsn + ) -> Any: + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/list_aux_files", + json={"lsn": str(lsn)}, + ) + self.verbose_error(res) + return res.json() diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index 34c251285f..a71652af8a 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -1768,6 +1768,87 @@ def test_pageserver_compaction_detach_ancestor_smoke(neon_env_builder: NeonEnvBu workload_child.validate(env.pageserver.id) +def test_timeline_detach_with_aux_files_with_detach_v1( + neon_env_builder: NeonEnvBuilder, +): + """ + Validate that "branches do not inherit their parent" is invariant over detach_ancestor. + + Branches hide parent branch aux files etc by stopping lookup of non-inherited keyspace at the parent-child boundary. + We had a bug where detach_ancestor running on a child branch would copy aux files key range from child to parent, + thereby making parent aux files reappear. + """ + env = neon_env_builder.init_start( + initial_tenant_conf={ + "gc_period": "1s", + "lsn_lease_length": "0s", + } + ) + + env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + + http = env.pageserver.http_client() + + endpoint = env.endpoints.create_start("main", tenant_id=env.initial_tenant) + lsn0 = wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline) + endpoint.safe_psql( + "SELECT pg_create_logical_replication_slot('test_slot_parent_1', 'pgoutput')" + ) + lsn1 = wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline) + endpoint.safe_psql( + "SELECT pg_create_logical_replication_slot('test_slot_parent_2', 'pgoutput')" + ) + lsn2 = wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline) + assert set(http.list_aux_files(env.initial_tenant, env.initial_timeline, lsn0).keys()) == set( + [] + ) + assert set(http.list_aux_files(env.initial_tenant, env.initial_timeline, lsn1).keys()) == set( + ["pg_replslot/test_slot_parent_1/state"] + ) + assert set(http.list_aux_files(env.initial_tenant, env.initial_timeline, lsn2).keys()) == set( + ["pg_replslot/test_slot_parent_1/state", "pg_replslot/test_slot_parent_2/state"] + ) + + # Restore at LSN1 + branch_timeline_id = env.create_branch("restore", env.initial_tenant, "main", lsn1) + endpoint2 = env.endpoints.create_start("restore", tenant_id=env.initial_tenant) + assert set(http.list_aux_files(env.initial_tenant, branch_timeline_id, lsn1).keys()) == set([]) + + # Add a new slot file to the restore branch (This won't happen in reality because cplane immediately detaches the branch on restore, + # but we want to ensure that aux files on the detached branch are NOT inherited during ancestor detach. We could change the behavior + # in the future. + # TL;DR we should NEVER automatically detach a branch as a background optimization for those tenants that already used the restore + # feature before branch detach was introduced because it will clean up the aux files and stop logical replication. + endpoint2.safe_psql( + "SELECT pg_create_logical_replication_slot('test_slot_restore', 'pgoutput')" + ) + lsn3 = wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, branch_timeline_id) + assert set(http.list_aux_files(env.initial_tenant, branch_timeline_id, lsn1).keys()) == set([]) + assert set(http.list_aux_files(env.initial_tenant, branch_timeline_id, lsn3).keys()) == set( + ["pg_replslot/test_slot_restore/state"] + ) + + print("lsn0=", lsn0) + print("lsn1=", lsn1) + print("lsn2=", lsn2) + print("lsn3=", lsn3) + # Detach the restore branch so that main doesn't have any child branches. + all_reparented = http.detach_ancestor( + env.initial_tenant, branch_timeline_id, detach_behavior="v1" + ) + assert all_reparented == set([]) + + # We need to ensure all safekeeper data are ingested before checking aux files: the API does not wait for LSN. + wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, branch_timeline_id) + assert set(http.list_aux_files(env.initial_tenant, env.initial_timeline, lsn2).keys()) == set( + ["pg_replslot/test_slot_parent_1/state", "pg_replslot/test_slot_parent_2/state"] + ), "main branch unaffected" + assert set(http.list_aux_files(env.initial_tenant, branch_timeline_id, lsn3).keys()) == set( + ["pg_replslot/test_slot_restore/state"] + ) + assert set(http.list_aux_files(env.initial_tenant, branch_timeline_id, lsn1).keys()) == set([]) + + # TODO: # - branch near existing L1 boundary, image layers? # - investigate: why are layers started at uneven lsn? not just after branching, but in general.