mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-25 17:10:38 +00:00
fix(pageserver): clean up aux files before detaching (#11299)
## Problem Related to https://github.com/neondatabase/cloud/issues/26091 and https://github.com/neondatabase/cloud/issues/25840 Close https://github.com/neondatabase/neon/issues/11297 Discussion on Slack: https://neondb.slack.com/archives/C033RQ5SPDH/p1742320666313969 ## Summary of changes * When detaching, scan all aux files within `sparse_non_inherited_keyspace` in the ancestor timeline and create an image layer exactly at the ancestor LSN. All scanned keys will map to an empty value, which is a delete tombstone. - Note that end_lsn for rewritten delta layers = ancestor_lsn + 1, so the image layer will have image_end_lsn=end_lsn. With the current `select_layer` logic, the read path will always first read the image layer. * Add a test case. --------- Signed-off-by: Alex Chi Z <chi@neon.tech> Co-authored-by: Christian Schwarz <christian@neon.tech>
This commit is contained in:
@@ -3188,7 +3188,8 @@ async fn list_aux_files(
|
||||
timeline.gate.enter().map_err(|_| ApiError::Cancelled)?,
|
||||
);
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download)
|
||||
.with_scope_timeline(&timeline);
|
||||
let files = timeline
|
||||
.list_aux_files(body.lsn, &ctx, io_concurrency)
|
||||
.await?;
|
||||
|
||||
@@ -2,10 +2,14 @@ use std::collections::HashSet;
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::Context;
|
||||
use bytes::Bytes;
|
||||
use http_utils::error::ApiError;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::models::DetachBehavior;
|
||||
use pageserver_api::models::detach_ancestor::AncestorDetached;
|
||||
use pageserver_api::shard::ShardIdentity;
|
||||
use pageserver_compaction::helpers::overlaps_with;
|
||||
use tokio::sync::Semaphore;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::Instrument;
|
||||
@@ -22,7 +26,10 @@ use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::Tenant;
|
||||
use crate::tenant::remote_timeline_client::index::GcBlockingReason::DetachAncestor;
|
||||
use crate::tenant::storage_layer::layer::local_layer_path;
|
||||
use crate::tenant::storage_layer::{AsLayerDesc as _, DeltaLayerWriter, Layer, ResidentLayer};
|
||||
use crate::tenant::storage_layer::{
|
||||
AsLayerDesc as _, DeltaLayerWriter, ImageLayerWriter, IoConcurrency, Layer, ResidentLayer,
|
||||
ValuesReconstructState,
|
||||
};
|
||||
use crate::virtual_file::{MaybeFatalIo, VirtualFile};
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
@@ -170,6 +177,92 @@ impl Attempt {
|
||||
}
|
||||
}
|
||||
|
||||
async fn generate_tombstone_image_layer(
|
||||
detached: &Arc<Timeline>,
|
||||
ancestor: &Arc<Timeline>,
|
||||
ancestor_lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Option<ResidentLayer>, Error> {
|
||||
tracing::info!(
|
||||
"removing non-inherited keys by writing an image layer with tombstones at the detach LSN"
|
||||
);
|
||||
let io_concurrency = IoConcurrency::spawn_from_conf(
|
||||
detached.conf,
|
||||
detached.gate.enter().map_err(|_| Error::ShuttingDown)?,
|
||||
);
|
||||
let mut reconstruct_state = ValuesReconstructState::new(io_concurrency);
|
||||
// Directly use `get_vectored_impl` to skip the max_vectored_read_key limit check. Note that the keyspace should
|
||||
// not contain too many keys, otherwise this takes a lot of memory. Currently we limit it to 10k keys in the compute.
|
||||
let key_range = Key::sparse_non_inherited_keyspace();
|
||||
// avoid generating a "future layer" which will then be removed
|
||||
let image_lsn = ancestor_lsn;
|
||||
|
||||
{
|
||||
let layers = detached.layers.read().await;
|
||||
for layer in layers.all_persistent_layers() {
|
||||
if !layer.is_delta
|
||||
&& layer.lsn_range.start == image_lsn
|
||||
&& overlaps_with(&key_range, &layer.key_range)
|
||||
{
|
||||
tracing::warn!(
|
||||
layer=%layer, "image layer at the detach LSN already exists, skipping removing aux files"
|
||||
);
|
||||
return Ok(None);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let data = ancestor
|
||||
.get_vectored_impl(
|
||||
KeySpace::single(key_range.clone()),
|
||||
image_lsn,
|
||||
&mut reconstruct_state,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
.context("failed to retrieve aux keys")
|
||||
.map_err(|e| Error::launder(e, Error::Prepare))?;
|
||||
if !data.is_empty() {
|
||||
// TODO: is it possible that we can have an image at `image_lsn`? Unlikely because image layers are only generated
|
||||
// upon compaction but theoretically possible.
|
||||
let mut image_layer_writer = ImageLayerWriter::new(
|
||||
detached.conf,
|
||||
detached.timeline_id,
|
||||
detached.tenant_shard_id,
|
||||
&key_range,
|
||||
image_lsn,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
.context("failed to create image layer writer")
|
||||
.map_err(Error::Prepare)?;
|
||||
for key in data.keys() {
|
||||
image_layer_writer
|
||||
.put_image(*key, Bytes::new(), ctx)
|
||||
.await
|
||||
.context("failed to write key")
|
||||
.map_err(|e| Error::launder(e, Error::Prepare))?;
|
||||
}
|
||||
let (desc, path) = image_layer_writer
|
||||
.finish(ctx)
|
||||
.await
|
||||
.context("failed to finish image layer writer for removing the metadata keys")
|
||||
.map_err(|e| Error::launder(e, Error::Prepare))?;
|
||||
let generated = Layer::finish_creating(detached.conf, detached, desc, &path)
|
||||
.map_err(|e| Error::launder(e, Error::Prepare))?;
|
||||
detached
|
||||
.remote_client
|
||||
.upload_layer_file(&generated, &detached.cancel)
|
||||
.await
|
||||
.map_err(|e| Error::launder(e, Error::Prepare))?;
|
||||
tracing::info!(layer=%generated, "wrote image layer");
|
||||
Ok(Some(generated))
|
||||
} else {
|
||||
tracing::info!("no aux keys found in ancestor");
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
/// See [`Timeline::prepare_to_detach_from_ancestor`]
|
||||
pub(super) async fn prepare(
|
||||
detached: &Arc<Timeline>,
|
||||
@@ -352,10 +445,16 @@ pub(super) async fn prepare(
|
||||
|
||||
// TODO: copying and lsn prefix copying could be done at the same time with a single fsync after
|
||||
let mut new_layers: Vec<Layer> =
|
||||
Vec::with_capacity(straddling_branchpoint.len() + rest_of_historic.len());
|
||||
Vec::with_capacity(straddling_branchpoint.len() + rest_of_historic.len() + 1);
|
||||
|
||||
if let Some(tombstone_layer) =
|
||||
generate_tombstone_image_layer(detached, &ancestor, ancestor_lsn, ctx).await?
|
||||
{
|
||||
new_layers.push(tombstone_layer.into());
|
||||
}
|
||||
|
||||
{
|
||||
tracing::debug!(to_rewrite = %straddling_branchpoint.len(), "copying prefix of delta layers");
|
||||
tracing::info!(to_rewrite = %straddling_branchpoint.len(), "copying prefix of delta layers");
|
||||
|
||||
let mut tasks = tokio::task::JoinSet::new();
|
||||
|
||||
|
||||
@@ -1192,3 +1192,28 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
|
||||
log.info(f"Got perf info response code: {res.status_code}")
|
||||
self.verbose_error(res)
|
||||
return res.json()
|
||||
|
||||
def ingest_aux_files(
|
||||
self,
|
||||
tenant_id: TenantId | TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
aux_files: dict[str, bytes],
|
||||
):
|
||||
res = self.post(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/ingest_aux_files",
|
||||
json={
|
||||
"aux_files": aux_files,
|
||||
},
|
||||
)
|
||||
self.verbose_error(res)
|
||||
return res.json()
|
||||
|
||||
def list_aux_files(
|
||||
self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, lsn: Lsn
|
||||
) -> Any:
|
||||
res = self.post(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/list_aux_files",
|
||||
json={"lsn": str(lsn)},
|
||||
)
|
||||
self.verbose_error(res)
|
||||
return res.json()
|
||||
|
||||
@@ -1768,6 +1768,87 @@ def test_pageserver_compaction_detach_ancestor_smoke(neon_env_builder: NeonEnvBu
|
||||
workload_child.validate(env.pageserver.id)
|
||||
|
||||
|
||||
def test_timeline_detach_with_aux_files_with_detach_v1(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
):
|
||||
"""
|
||||
Validate that "branches do not inherit their parent" is invariant over detach_ancestor.
|
||||
|
||||
Branches hide parent branch aux files etc by stopping lookup of non-inherited keyspace at the parent-child boundary.
|
||||
We had a bug where detach_ancestor running on a child branch would copy aux files key range from child to parent,
|
||||
thereby making parent aux files reappear.
|
||||
"""
|
||||
env = neon_env_builder.init_start(
|
||||
initial_tenant_conf={
|
||||
"gc_period": "1s",
|
||||
"lsn_lease_length": "0s",
|
||||
}
|
||||
)
|
||||
|
||||
env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS)
|
||||
|
||||
http = env.pageserver.http_client()
|
||||
|
||||
endpoint = env.endpoints.create_start("main", tenant_id=env.initial_tenant)
|
||||
lsn0 = wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline)
|
||||
endpoint.safe_psql(
|
||||
"SELECT pg_create_logical_replication_slot('test_slot_parent_1', 'pgoutput')"
|
||||
)
|
||||
lsn1 = wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline)
|
||||
endpoint.safe_psql(
|
||||
"SELECT pg_create_logical_replication_slot('test_slot_parent_2', 'pgoutput')"
|
||||
)
|
||||
lsn2 = wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline)
|
||||
assert set(http.list_aux_files(env.initial_tenant, env.initial_timeline, lsn0).keys()) == set(
|
||||
[]
|
||||
)
|
||||
assert set(http.list_aux_files(env.initial_tenant, env.initial_timeline, lsn1).keys()) == set(
|
||||
["pg_replslot/test_slot_parent_1/state"]
|
||||
)
|
||||
assert set(http.list_aux_files(env.initial_tenant, env.initial_timeline, lsn2).keys()) == set(
|
||||
["pg_replslot/test_slot_parent_1/state", "pg_replslot/test_slot_parent_2/state"]
|
||||
)
|
||||
|
||||
# Restore at LSN1
|
||||
branch_timeline_id = env.create_branch("restore", env.initial_tenant, "main", lsn1)
|
||||
endpoint2 = env.endpoints.create_start("restore", tenant_id=env.initial_tenant)
|
||||
assert set(http.list_aux_files(env.initial_tenant, branch_timeline_id, lsn1).keys()) == set([])
|
||||
|
||||
# Add a new slot file to the restore branch (This won't happen in reality because cplane immediately detaches the branch on restore,
|
||||
# but we want to ensure that aux files on the detached branch are NOT inherited during ancestor detach. We could change the behavior
|
||||
# in the future.
|
||||
# TL;DR we should NEVER automatically detach a branch as a background optimization for those tenants that already used the restore
|
||||
# feature before branch detach was introduced because it will clean up the aux files and stop logical replication.
|
||||
endpoint2.safe_psql(
|
||||
"SELECT pg_create_logical_replication_slot('test_slot_restore', 'pgoutput')"
|
||||
)
|
||||
lsn3 = wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, branch_timeline_id)
|
||||
assert set(http.list_aux_files(env.initial_tenant, branch_timeline_id, lsn1).keys()) == set([])
|
||||
assert set(http.list_aux_files(env.initial_tenant, branch_timeline_id, lsn3).keys()) == set(
|
||||
["pg_replslot/test_slot_restore/state"]
|
||||
)
|
||||
|
||||
print("lsn0=", lsn0)
|
||||
print("lsn1=", lsn1)
|
||||
print("lsn2=", lsn2)
|
||||
print("lsn3=", lsn3)
|
||||
# Detach the restore branch so that main doesn't have any child branches.
|
||||
all_reparented = http.detach_ancestor(
|
||||
env.initial_tenant, branch_timeline_id, detach_behavior="v1"
|
||||
)
|
||||
assert all_reparented == set([])
|
||||
|
||||
# We need to ensure all safekeeper data are ingested before checking aux files: the API does not wait for LSN.
|
||||
wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, branch_timeline_id)
|
||||
assert set(http.list_aux_files(env.initial_tenant, env.initial_timeline, lsn2).keys()) == set(
|
||||
["pg_replslot/test_slot_parent_1/state", "pg_replslot/test_slot_parent_2/state"]
|
||||
), "main branch unaffected"
|
||||
assert set(http.list_aux_files(env.initial_tenant, branch_timeline_id, lsn3).keys()) == set(
|
||||
["pg_replslot/test_slot_restore/state"]
|
||||
)
|
||||
assert set(http.list_aux_files(env.initial_tenant, branch_timeline_id, lsn1).keys()) == set([])
|
||||
|
||||
|
||||
# TODO:
|
||||
# - branch near existing L1 boundary, image layers?
|
||||
# - investigate: why are layers started at uneven lsn? not just after branching, but in general.
|
||||
|
||||
Reference in New Issue
Block a user