From 6b1c4cc9833b24e4bd7e9ca0fc1ee12c6912ed4f Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 24 Nov 2023 18:17:56 +0200 Subject: [PATCH] fix: long timeline create cancelled by tenant delete (#5917) Fix the fallible vs. infallible check order with `UninitTimeline::finish_creation` so that the incomplete timeline can be removed. Currently the order of drop guard unwrapping causes uninit files to be left on pageserver, blocking the tenant deletion. Cc: #5914 Cc: #investigation-2023-11-23-stuck-tenant-deletion --- pageserver/src/tenant.rs | 1 + pageserver/src/tenant/timeline.rs | 2 + pageserver/src/tenant/timeline/uninit.rs | 27 ++++++-- test_runner/fixtures/neon_fixtures.py | 2 +- test_runner/regress/test_tenant_delete.py | 77 ++++++++++++++++++++++- 5 files changed, 101 insertions(+), 8 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index ef14f75a47..249a9a80c5 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1858,6 +1858,7 @@ impl Tenant { }); }) }; + // test_long_timeline_create_then_tenant_delete is leaning on this message tracing::info!("Waiting for timelines..."); while let Some(res) = js.join_next().await { match res { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 38b8832281..9493ed1c9a 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2600,6 +2600,8 @@ impl Timeline { ) }; + pausable_failpoint!("flush-layer-cancel-after-writing-layer-out-pausable"); + if self.cancel.is_cancelled() { return Err(FlushLayerError::Cancelled); } diff --git a/pageserver/src/tenant/timeline/uninit.rs b/pageserver/src/tenant/timeline/uninit.rs index 6b68fdeb84..f9bb6ca419 100644 --- a/pageserver/src/tenant/timeline/uninit.rs +++ b/pageserver/src/tenant/timeline/uninit.rs @@ -45,12 +45,20 @@ impl<'t> UninitializedTimeline<'t> { let timeline_id = self.timeline_id; let tenant_id = self.owning_tenant.tenant_id; - let (new_timeline, uninit_mark) = self.raw_timeline.take().with_context(|| { - format!("No timeline for initalization found for {tenant_id}/{timeline_id}") - })?; + if self.raw_timeline.is_none() { + return Err(anyhow::anyhow!( + "No timeline for initialization found for {tenant_id}/{timeline_id}" + )); + } // Check that the caller initialized disk_consistent_lsn - let new_disk_consistent_lsn = new_timeline.get_disk_consistent_lsn(); + let new_disk_consistent_lsn = self + .raw_timeline + .as_ref() + .expect("checked above") + .0 + .get_disk_consistent_lsn(); + anyhow::ensure!( new_disk_consistent_lsn.is_valid(), "new timeline {tenant_id}/{timeline_id} has invalid disk_consistent_lsn" @@ -62,6 +70,13 @@ impl<'t> UninitializedTimeline<'t> { "Found freshly initialized timeline {tenant_id}/{timeline_id} in the tenant map" ), Entry::Vacant(v) => { + // after taking here should be no fallible operations, because the drop guard will not + // cleanup after and would block for example the tenant deletion + let (new_timeline, uninit_mark) = + self.raw_timeline.take().expect("already checked"); + + // this is the mutual exclusion between different retries to create the timeline; + // this should be an assertion. uninit_mark.remove_uninit_mark().with_context(|| { format!( "Failed to remove uninit mark file for timeline {tenant_id}/{timeline_id}" @@ -70,10 +85,10 @@ impl<'t> UninitializedTimeline<'t> { v.insert(Arc::clone(&new_timeline)); new_timeline.maybe_spawn_flush_loop(); + + Ok(new_timeline) } } - - Ok(new_timeline) } /// Prepares timeline data by loading it from the basebackup archive. diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 6b5fa631a4..fc7e834bd2 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1646,7 +1646,7 @@ class NeonPageserver(PgProtocol): # env.pageserver.allowed_errors.append(".*could not open garage door.*") # # The entries in the list are regular experessions. - self.allowed_errors = list(DEFAULT_PAGESERVER_ALLOWED_ERRORS) + self.allowed_errors: List[str] = list(DEFAULT_PAGESERVER_ALLOWED_ERRORS) def timeline_dir(self, tenant_id: TenantId, timeline_id: Optional[TimelineId] = None) -> Path: """Get a timeline directory's path based on the repo directory of the test environment""" diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index 6e35890922..0dd1f9a295 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -1,6 +1,7 @@ import enum import os import shutil +from threading import Thread import pytest from fixtures.log_helper import log @@ -27,7 +28,7 @@ from fixtures.remote_storage import ( available_s3_storages, ) from fixtures.types import TenantId -from fixtures.utils import run_pg_bench_small +from fixtures.utils import run_pg_bench_small, wait_until @pytest.mark.parametrize("remote_storage_kind", available_remote_storages()) @@ -399,4 +400,78 @@ def test_tenant_delete_is_resumed_on_attach( ) +def test_long_timeline_create_cancelled_by_tenant_delete(neon_env_builder: NeonEnvBuilder): + """Reproduction of 2023-11-23 stuck tenants investigation""" + + # do not use default tenant/timeline creation because it would output the failpoint log message too early + env = neon_env_builder.init_configs() + env.start() + pageserver_http = env.pageserver.http_client() + + # happens with the cancellation bailing flushing loop earlier, leaving disk_consistent_lsn at zero + env.pageserver.allowed_errors.append( + ".*Timeline got dropped without initializing, cleaning its files" + ) + # the response hit_pausable_failpoint_and_later_fail + env.pageserver.allowed_errors.append( + f".*Error processing HTTP request: InternalServerError\\(new timeline {env.initial_tenant}/{env.initial_timeline} has invalid disk_consistent_lsn" + ) + + pageserver_http.tenant_create(env.initial_tenant) + + failpoint = "flush-layer-cancel-after-writing-layer-out-pausable" + pageserver_http.configure_failpoints((failpoint, "pause")) + + def hit_pausable_failpoint_and_later_fail(): + with pytest.raises( + PageserverApiException, match="new timeline \\S+ has invalid disk_consistent_lsn" + ): + pageserver_http.timeline_create( + env.pg_version, env.initial_tenant, env.initial_timeline + ) + + def start_deletion(): + pageserver_http.tenant_delete(env.initial_tenant) + + def has_hit_failpoint(): + assert env.pageserver.log_contains(f"at failpoint {failpoint}") is not None + + def deletion_has_started_waiting_for_timelines(): + assert env.pageserver.log_contains("Waiting for timelines...") is not None + + def tenant_is_deleted(): + try: + pageserver_http.tenant_status(env.initial_tenant) + except PageserverApiException as e: + assert e.status_code == 404 + else: + raise RuntimeError("tenant was still accessible") + + creation = Thread(target=hit_pausable_failpoint_and_later_fail) + creation.start() + + deletion = None + + try: + wait_until(10, 1, has_hit_failpoint) + + # it should start ok, sync up with the stuck creation, then fail because disk_consistent_lsn was not updated + # then deletion should fail and set the tenant broken + deletion = Thread(target=start_deletion) + deletion.start() + + wait_until(10, 1, deletion_has_started_waiting_for_timelines) + + pageserver_http.configure_failpoints((failpoint, "off")) + + creation.join() + deletion.join() + + wait_until(10, 1, tenant_is_deleted) + finally: + creation.join() + if deletion is not None: + deletion.join() + + # TODO test concurrent deletions with "hang" failpoint