From df7e301a5401ac1da2792b00ace7323f913b4fc1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 6 Jun 2025 13:54:07 +0200 Subject: [PATCH] safekeeper: special error if a timeline has been deleted (#12155) We might delete timelines on safekeepers before we are deleting them on pageservers. This should be an exceptional situation, but can occur. As the first step to improve behaviour here, emit a special error that is less scary/obscure than "was not found in global map". It is for example emitted when the pageserver tries to run `IDENTIFY_SYSTEM` on a timeline that has been deleted on the safekeeper. Found when analyzing the failure of `test_scrubber_physical_gc_timeline_deletion` when enabling `--timelines-onto-safekeepers` on the pytests. Due to safekeeper restarts, there is no hard guarantee that we will keep issuing this error, so we need to think of something better if we start encountering this in staging/prod. But I would say that the introduction of `--timelines-onto-safekeepers` in the pytests and into staging won't change much about this: we are already deleting timelines from there. In `test_scrubber_physical_gc_timeline_deletion`, we'd just be leaking the timeline before on the safekeepers. Part of #11712 --- safekeeper/src/timeline.rs | 2 ++ safekeeper/src/timelines_global_map.rs | 8 +++++++- test_runner/regress/test_safekeeper_deletion.py | 2 ++ test_runner/regress/test_wal_acceptor.py | 2 ++ 4 files changed, 13 insertions(+), 1 deletion(-) diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 588bd4f2c9..2bee41537f 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -395,6 +395,8 @@ pub enum TimelineError { Cancelled(TenantTimelineId), #[error("Timeline {0} was not found in global map")] NotFound(TenantTimelineId), + #[error("Timeline {0} has been deleted")] + Deleted(TenantTimelineId), #[error("Timeline {0} creation is in progress")] CreationInProgress(TenantTimelineId), #[error("Timeline {0} exists on disk, but wasn't loaded on startup")] diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs index e3f7d88f7c..6e41ada1b3 100644 --- a/safekeeper/src/timelines_global_map.rs +++ b/safekeeper/src/timelines_global_map.rs @@ -78,7 +78,13 @@ impl GlobalTimelinesState { Some(GlobalMapTimeline::CreationInProgress) => { Err(TimelineError::CreationInProgress(*ttid)) } - None => Err(TimelineError::NotFound(*ttid)), + None => { + if self.has_tombstone(ttid) { + Err(TimelineError::Deleted(*ttid)) + } else { + Err(TimelineError::NotFound(*ttid)) + } + } } } diff --git a/test_runner/regress/test_safekeeper_deletion.py b/test_runner/regress/test_safekeeper_deletion.py index b681a86103..bc79969e9a 100644 --- a/test_runner/regress/test_safekeeper_deletion.py +++ b/test_runner/regress/test_safekeeper_deletion.py @@ -30,6 +30,7 @@ def test_safekeeper_delete_timeline(neon_env_builder: NeonEnvBuilder, auth_enabl env.pageserver.allowed_errors.extend( [ ".*Timeline .* was not found in global map.*", + ".*Timeline .* has been deleted.*", ".*Timeline .* was cancelled and cannot be used anymore.*", ] ) @@ -198,6 +199,7 @@ def test_safekeeper_delete_timeline_under_load(neon_env_builder: NeonEnvBuilder) env.pageserver.allowed_errors.extend( [ ".*Timeline.*was cancelled.*", + ".*Timeline.*has been deleted.*", ".*Timeline.*was not found.*", ] ) diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 6a7c7a8bef..b9183286af 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -433,6 +433,7 @@ def test_wal_backup(neon_env_builder: NeonEnvBuilder): env.pageserver.allowed_errors.extend( [ ".*Timeline .* was not found in global map.*", + ".*Timeline .* has been deleted.*", ".*Timeline .* was cancelled and cannot be used anymore.*", ] ) @@ -1934,6 +1935,7 @@ def test_membership_api(neon_env_builder: NeonEnvBuilder): env.pageserver.allowed_errors.extend( [ ".*Timeline .* was not found in global map.*", + ".*Timeline .* has been deleted.*", ".*Timeline .* was cancelled and cannot be used anymore.*", ] )