fix(pageserver): clean up aux files before detaching (#11299)

## Problem

Related to https://github.com/neondatabase/cloud/issues/26091 and
https://github.com/neondatabase/cloud/issues/25840

Close https://github.com/neondatabase/neon/issues/11297

Discussion on Slack:
https://neondb.slack.com/archives/C033RQ5SPDH/p1742320666313969

## Summary of changes

* When detaching, scan all aux files within
`sparse_non_inherited_keyspace` in the ancestor timeline and create an
image layer exactly at the ancestor LSN. All scanned keys will map to an
empty value, which is a delete tombstone.
- Note that end_lsn for rewritten delta layers = ancestor_lsn + 1, so
the image layer will have image_end_lsn=end_lsn. With the current
`select_layer` logic, the read path will always first read the image
layer.
* Add a test case.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
Co-authored-by: Christian Schwarz <christian@neon.tech>
This commit is contained in:
Alex Chi Z.
2025-04-03 11:55:22 -04:00
committed by GitHub
parent 581bb5d7d5
commit 131b32ef48
4 changed files with 210 additions and 4 deletions

View File

@@ -3188,7 +3188,8 @@ async fn list_aux_files(
timeline.gate.enter().map_err(|_| ApiError::Cancelled)?,
);
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download)
.with_scope_timeline(&timeline);
let files = timeline
.list_aux_files(body.lsn, &ctx, io_concurrency)
.await?;

View File

@@ -2,10 +2,14 @@ use std::collections::HashSet;
use std::sync::Arc;
use anyhow::Context;
use bytes::Bytes;
use http_utils::error::ApiError;
use pageserver_api::key::Key;
use pageserver_api::keyspace::KeySpace;
use pageserver_api::models::DetachBehavior;
use pageserver_api::models::detach_ancestor::AncestorDetached;
use pageserver_api::shard::ShardIdentity;
use pageserver_compaction::helpers::overlaps_with;
use tokio::sync::Semaphore;
use tokio_util::sync::CancellationToken;
use tracing::Instrument;
@@ -22,7 +26,10 @@ use crate::task_mgr::TaskKind;
use crate::tenant::Tenant;
use crate::tenant::remote_timeline_client::index::GcBlockingReason::DetachAncestor;
use crate::tenant::storage_layer::layer::local_layer_path;
use crate::tenant::storage_layer::{AsLayerDesc as _, DeltaLayerWriter, Layer, ResidentLayer};
use crate::tenant::storage_layer::{
AsLayerDesc as _, DeltaLayerWriter, ImageLayerWriter, IoConcurrency, Layer, ResidentLayer,
ValuesReconstructState,
};
use crate::virtual_file::{MaybeFatalIo, VirtualFile};
#[derive(Debug, thiserror::Error)]
@@ -170,6 +177,92 @@ impl Attempt {
}
}
async fn generate_tombstone_image_layer(
detached: &Arc<Timeline>,
ancestor: &Arc<Timeline>,
ancestor_lsn: Lsn,
ctx: &RequestContext,
) -> Result<Option<ResidentLayer>, Error> {
tracing::info!(
"removing non-inherited keys by writing an image layer with tombstones at the detach LSN"
);
let io_concurrency = IoConcurrency::spawn_from_conf(
detached.conf,
detached.gate.enter().map_err(|_| Error::ShuttingDown)?,
);
let mut reconstruct_state = ValuesReconstructState::new(io_concurrency);
// Directly use `get_vectored_impl` to skip the max_vectored_read_key limit check. Note that the keyspace should
// not contain too many keys, otherwise this takes a lot of memory. Currently we limit it to 10k keys in the compute.
let key_range = Key::sparse_non_inherited_keyspace();
// avoid generating a "future layer" which will then be removed
let image_lsn = ancestor_lsn;
{
let layers = detached.layers.read().await;
for layer in layers.all_persistent_layers() {
if !layer.is_delta
&& layer.lsn_range.start == image_lsn
&& overlaps_with(&key_range, &layer.key_range)
{
tracing::warn!(
layer=%layer, "image layer at the detach LSN already exists, skipping removing aux files"
);
return Ok(None);
}
}
}
let data = ancestor
.get_vectored_impl(
KeySpace::single(key_range.clone()),
image_lsn,
&mut reconstruct_state,
ctx,
)
.await
.context("failed to retrieve aux keys")
.map_err(|e| Error::launder(e, Error::Prepare))?;
if !data.is_empty() {
// TODO: is it possible that we can have an image at `image_lsn`? Unlikely because image layers are only generated
// upon compaction but theoretically possible.
let mut image_layer_writer = ImageLayerWriter::new(
detached.conf,
detached.timeline_id,
detached.tenant_shard_id,
&key_range,
image_lsn,
ctx,
)
.await
.context("failed to create image layer writer")
.map_err(Error::Prepare)?;
for key in data.keys() {
image_layer_writer
.put_image(*key, Bytes::new(), ctx)
.await
.context("failed to write key")
.map_err(|e| Error::launder(e, Error::Prepare))?;
}
let (desc, path) = image_layer_writer
.finish(ctx)
.await
.context("failed to finish image layer writer for removing the metadata keys")
.map_err(|e| Error::launder(e, Error::Prepare))?;
let generated = Layer::finish_creating(detached.conf, detached, desc, &path)
.map_err(|e| Error::launder(e, Error::Prepare))?;
detached
.remote_client
.upload_layer_file(&generated, &detached.cancel)
.await
.map_err(|e| Error::launder(e, Error::Prepare))?;
tracing::info!(layer=%generated, "wrote image layer");
Ok(Some(generated))
} else {
tracing::info!("no aux keys found in ancestor");
Ok(None)
}
}
/// See [`Timeline::prepare_to_detach_from_ancestor`]
pub(super) async fn prepare(
detached: &Arc<Timeline>,
@@ -352,10 +445,16 @@ pub(super) async fn prepare(
// TODO: copying and lsn prefix copying could be done at the same time with a single fsync after
let mut new_layers: Vec<Layer> =
Vec::with_capacity(straddling_branchpoint.len() + rest_of_historic.len());
Vec::with_capacity(straddling_branchpoint.len() + rest_of_historic.len() + 1);
if let Some(tombstone_layer) =
generate_tombstone_image_layer(detached, &ancestor, ancestor_lsn, ctx).await?
{
new_layers.push(tombstone_layer.into());
}
{
tracing::debug!(to_rewrite = %straddling_branchpoint.len(), "copying prefix of delta layers");
tracing::info!(to_rewrite = %straddling_branchpoint.len(), "copying prefix of delta layers");
let mut tasks = tokio::task::JoinSet::new();

View File

@@ -1192,3 +1192,28 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
log.info(f"Got perf info response code: {res.status_code}")
self.verbose_error(res)
return res.json()
def ingest_aux_files(
self,
tenant_id: TenantId | TenantShardId,
timeline_id: TimelineId,
aux_files: dict[str, bytes],
):
res = self.post(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/ingest_aux_files",
json={
"aux_files": aux_files,
},
)
self.verbose_error(res)
return res.json()
def list_aux_files(
self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, lsn: Lsn
) -> Any:
res = self.post(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/list_aux_files",
json={"lsn": str(lsn)},
)
self.verbose_error(res)
return res.json()

View File

@@ -1768,6 +1768,87 @@ def test_pageserver_compaction_detach_ancestor_smoke(neon_env_builder: NeonEnvBu
workload_child.validate(env.pageserver.id)
def test_timeline_detach_with_aux_files_with_detach_v1(
neon_env_builder: NeonEnvBuilder,
):
"""
Validate that "branches do not inherit their parent" is invariant over detach_ancestor.
Branches hide parent branch aux files etc by stopping lookup of non-inherited keyspace at the parent-child boundary.
We had a bug where detach_ancestor running on a child branch would copy aux files key range from child to parent,
thereby making parent aux files reappear.
"""
env = neon_env_builder.init_start(
initial_tenant_conf={
"gc_period": "1s",
"lsn_lease_length": "0s",
}
)
env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS)
http = env.pageserver.http_client()
endpoint = env.endpoints.create_start("main", tenant_id=env.initial_tenant)
lsn0 = wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline)
endpoint.safe_psql(
"SELECT pg_create_logical_replication_slot('test_slot_parent_1', 'pgoutput')"
)
lsn1 = wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline)
endpoint.safe_psql(
"SELECT pg_create_logical_replication_slot('test_slot_parent_2', 'pgoutput')"
)
lsn2 = wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline)
assert set(http.list_aux_files(env.initial_tenant, env.initial_timeline, lsn0).keys()) == set(
[]
)
assert set(http.list_aux_files(env.initial_tenant, env.initial_timeline, lsn1).keys()) == set(
["pg_replslot/test_slot_parent_1/state"]
)
assert set(http.list_aux_files(env.initial_tenant, env.initial_timeline, lsn2).keys()) == set(
["pg_replslot/test_slot_parent_1/state", "pg_replslot/test_slot_parent_2/state"]
)
# Restore at LSN1
branch_timeline_id = env.create_branch("restore", env.initial_tenant, "main", lsn1)
endpoint2 = env.endpoints.create_start("restore", tenant_id=env.initial_tenant)
assert set(http.list_aux_files(env.initial_tenant, branch_timeline_id, lsn1).keys()) == set([])
# Add a new slot file to the restore branch (This won't happen in reality because cplane immediately detaches the branch on restore,
# but we want to ensure that aux files on the detached branch are NOT inherited during ancestor detach. We could change the behavior
# in the future.
# TL;DR we should NEVER automatically detach a branch as a background optimization for those tenants that already used the restore
# feature before branch detach was introduced because it will clean up the aux files and stop logical replication.
endpoint2.safe_psql(
"SELECT pg_create_logical_replication_slot('test_slot_restore', 'pgoutput')"
)
lsn3 = wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, branch_timeline_id)
assert set(http.list_aux_files(env.initial_tenant, branch_timeline_id, lsn1).keys()) == set([])
assert set(http.list_aux_files(env.initial_tenant, branch_timeline_id, lsn3).keys()) == set(
["pg_replslot/test_slot_restore/state"]
)
print("lsn0=", lsn0)
print("lsn1=", lsn1)
print("lsn2=", lsn2)
print("lsn3=", lsn3)
# Detach the restore branch so that main doesn't have any child branches.
all_reparented = http.detach_ancestor(
env.initial_tenant, branch_timeline_id, detach_behavior="v1"
)
assert all_reparented == set([])
# We need to ensure all safekeeper data are ingested before checking aux files: the API does not wait for LSN.
wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, branch_timeline_id)
assert set(http.list_aux_files(env.initial_tenant, env.initial_timeline, lsn2).keys()) == set(
["pg_replslot/test_slot_parent_1/state", "pg_replslot/test_slot_parent_2/state"]
), "main branch unaffected"
assert set(http.list_aux_files(env.initial_tenant, branch_timeline_id, lsn3).keys()) == set(
["pg_replslot/test_slot_restore/state"]
)
assert set(http.list_aux_files(env.initial_tenant, branch_timeline_id, lsn1).keys()) == set([])
# TODO:
# - branch near existing L1 boundary, image layers?
# - investigate: why are layers started at uneven lsn? not just after branching, but in general.