From 5e3fbef7210a84870cb012837db6830aeab3d38d Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Tue, 19 Nov 2024 14:10:09 -0500 Subject: [PATCH] fix(pageserver): queue stopped error should be ignored during create timeline (#9767) close https://github.com/neondatabase/neon/issues/9730 The test case tests if anything goes wrong during pageserver restart + *during timeline creation not complete*. Therefore, queue is stopped error is normal in this case, except that it should be categorized as a shutdown error instead of a real error. ## Summary of changes * More comments for the test case. * Queue stopped error will now be forwarded as CreateTimelineError::ShuttingDown. --------- Signed-off-by: Alex Chi Z --- pageserver/src/tenant.rs | 6 ++++++ test_runner/regress/test_tenants.py | 4 ++++ 2 files changed, 10 insertions(+) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index e88dee7c6c..46317e93ee 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -2446,6 +2446,12 @@ impl Tenant { .remote_client .wait_completion() .await + .map_err(|e| match e { + WaitCompletionError::NotInitialized( + e, // If the queue is already stopped, it's a shutdown error. + ) if e.is_stopping() => CreateTimelineError::ShuttingDown, + e => CreateTimelineError::Other(e.into()), + }) .context("wait for timeline initial uploads to complete")?; // The creating task is responsible for activating the timeline. diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 5a499ea98b..158c3fddb0 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -369,12 +369,16 @@ def test_create_churn_during_restart(neon_env_builder: NeonEnvBuilder): - Bad response codes during shutdown (e.g. returning 500 instead of 503) - Issues where a tenant is still starting up while we receive a request for it - Issues with interrupting/resuming tenant/timeline creation in shutdown + - Issues with a timeline is not created successfully because of restart. """ env = neon_env_builder.init_configs() env.start() tenant_id: TenantId = env.initial_tenant timeline_id = env.initial_timeline + # At this point, the initial tenant/timeline might not have been created successfully, + # and this is the case we want to test. + # Multiple creation requests which race will generate this error on the pageserver # and storage controller respectively env.pageserver.allowed_errors.append(".*Conflict: Tenant is already being modified.*")