From c1ff7db1874c6b5ff8ee134b944f1c54616ba37b Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Mon, 28 Apr 2025 14:54:26 -0400 Subject: [PATCH] fix(pageserver): consider tombstones in replorigin (#11752) ## Problem We didn't consider tombstones in replorigin read path in the past. This was fine because tombstones are stored as LSN::Invalid before we universally define what the tombstone is for sparse keyspaces. Now we remove non-inherited keys during detach ancestor and write the universal tombstone "empty image". So we need to consider it across all the read paths. related: https://github.com/neondatabase/neon/pull/11299 ## Summary of changes Empty value gets ignored for replorigin scans. --------- Signed-off-by: Alex Chi Z --- pageserver/src/pgdatadir_mapping.rs | 16 +++++- pageserver/src/tenant.rs | 52 ++++++++++++++++++- .../src/tenant/timeline/detach_ancestor.rs | 2 +- 3 files changed, 67 insertions(+), 3 deletions(-) diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 81e548a095..ccb48d8bc1 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -1084,8 +1084,17 @@ impl Timeline { let mut result = HashMap::new(); for (k, v) in kv { let v = v?; + if v.is_empty() { + // This is a tombstone -- we can skip it. + // Originally, the replorigin code uses `Lsn::INVALID` to represent a tombstone. However, as it part of + // the sparse keyspace and the sparse keyspace uses an empty image to universally represent a tombstone, + // we also need to consider that. Such tombstones might be written on the detach ancestor code path to + // avoid the value going into the child branch. (See [`crate::tenant::timeline::detach_ancestor::generate_tombstone_image_layer`] for more details.) + continue; + } let origin_id = k.field6 as RepOriginId; - let origin_lsn = Lsn::des(&v).unwrap(); + let origin_lsn = Lsn::des(&v) + .with_context(|| format!("decode replorigin value for {}: {v:?}", origin_id))?; if origin_lsn != Lsn::INVALID { result.insert(origin_id, origin_lsn); } @@ -2578,6 +2587,11 @@ impl DatadirModification<'_> { } } + #[cfg(test)] + pub fn put_for_unit_test(&mut self, key: Key, val: Value) { + self.put(key, val); + } + fn put(&mut self, key: Key, val: Value) { if Self::is_data_key(&key) { self.put_data(key.to_compact(), val) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index dcd043c4a1..e59db74479 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -5947,7 +5947,9 @@ mod tests { use itertools::Itertools; #[cfg(feature = "testing")] use models::CompactLsnRange; - use pageserver_api::key::{AUX_KEY_PREFIX, Key, NON_INHERITED_RANGE, RELATION_SIZE_PREFIX}; + use pageserver_api::key::{ + AUX_KEY_PREFIX, Key, NON_INHERITED_RANGE, RELATION_SIZE_PREFIX, repl_origin_key, + }; use pageserver_api::keyspace::KeySpace; #[cfg(feature = "testing")] use pageserver_api::keyspace::KeySpaceRandomAccum; @@ -8183,6 +8185,54 @@ mod tests { assert_eq!(files.get("pg_logical/mappings/test2"), None); } + #[tokio::test] + async fn test_repl_origin_tombstones() { + let harness = TenantHarness::create("test_repl_origin_tombstones") + .await + .unwrap(); + + let (tenant, ctx) = harness.load().await; + let io_concurrency = IoConcurrency::spawn_for_test(); + + let mut lsn = Lsn(0x08); + + let tline: Arc = tenant + .create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + let repl_lsn = Lsn(0x10); + { + lsn += 8; + let mut modification = tline.begin_modification(lsn); + modification.put_for_unit_test(repl_origin_key(2), Value::Image(Bytes::new())); + modification.set_replorigin(1, repl_lsn).await.unwrap(); + modification.commit(&ctx).await.unwrap(); + } + + // we can read everything from the storage + let repl_origins = tline + .get_replorigins(lsn, &ctx, io_concurrency.clone()) + .await + .unwrap(); + assert_eq!(repl_origins.len(), 1); + assert_eq!(repl_origins[&1], lsn); + + { + lsn += 8; + let mut modification = tline.begin_modification(lsn); + modification.put_for_unit_test( + repl_origin_key(3), + Value::Image(Bytes::copy_from_slice(b"cannot_decode_this")), + ); + modification.commit(&ctx).await.unwrap(); + } + let result = tline + .get_replorigins(lsn, &ctx, io_concurrency.clone()) + .await; + assert!(result.is_err()); + } + #[tokio::test] async fn test_metadata_image_creation() -> anyhow::Result<()> { let harness = TenantHarness::create("test_metadata_image_creation").await?; diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index 8e95c3a8ff..649b33e294 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -178,7 +178,7 @@ impl Attempt { } } -async fn generate_tombstone_image_layer( +pub(crate) async fn generate_tombstone_image_layer( detached: &Arc, ancestor: &Arc, ancestor_lsn: Lsn,