From d820aa1d08c8a0e7f3fb2891640fa6ef6e357394 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 24 Jan 2024 13:06:05 +0100 Subject: [PATCH] Disable initdb cancellation (#6451) ## Problem The initdb cancellation added in #5921 is not sufficient to reliably abort the entire initdb process. Initdb also spawns children. The tests added by #6310 (#6385) and #6436 now do initdb cancellations on a more regular basis. In #6385, I attempted to issue `killpg` (after giving it a new process group ID) to kill not just the initdb but all its spawned subprocesses, but this didn't work. Initdb doesn't take *that* long in the end either, so we just wait until it concludes. ## Summary of changes * revert initdb cancellation support added in #5921 * still return `Err(Cancelled)` upon cancellation, but this is just to not have to remove the cancellation infrastructure * fixes to the `test_tenant_delete_races_timeline_creation` test to make it reliably pass Fixes #6385 --- pageserver/src/tenant.rs | 37 +++++++++++------------ test_runner/fixtures/pageserver/utils.py | 18 ++++++++++- test_runner/regress/test_tenant_delete.py | 17 ++++++++--- 3 files changed, 46 insertions(+), 26 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index d4d4208ac2..31af54d146 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -91,7 +91,6 @@ use std::fs; use std::fs::File; use std::io; use std::ops::Bound::Included; -use std::process::Stdio; use std::sync::atomic::AtomicU64; use std::sync::atomic::Ordering; use std::sync::Arc; @@ -3762,27 +3761,25 @@ async fn run_initdb( .env_clear() .env("LD_LIBRARY_PATH", &initdb_lib_dir) .env("DYLD_LIBRARY_PATH", &initdb_lib_dir) - .stdout(Stdio::piped()) - .stderr(Stdio::piped()) - // If the `select!` below doesn't finish the `wait_with_output`, - // let the task get `wait()`ed for asynchronously by tokio. - // This means there is a slim chance we can go over the INIT_DB_SEMAPHORE. - // TODO: fix for this is non-trivial, see - // https://github.com/neondatabase/neon/pull/5921#pullrequestreview-1750858021 - // - .kill_on_drop(true) .spawn()?; - tokio::select! { - initdb_output = initdb_command.wait_with_output() => { - let initdb_output = initdb_output?; - if !initdb_output.status.success() { - return Err(InitdbError::Failed(initdb_output.status, initdb_output.stderr)); - } - } - _ = cancel.cancelled() => { - return Err(InitdbError::Cancelled); - } + // Ideally we'd select here with the cancellation token, but the problem is that + // we can't safely terminate initdb: it launches processes of its own, and killing + // initdb doesn't kill them. After we return from this function, we want the target + // directory to be able to be cleaned up. + // See https://github.com/neondatabase/neon/issues/6385 + let initdb_output = initdb_command.wait_with_output().await?; + if !initdb_output.status.success() { + return Err(InitdbError::Failed( + initdb_output.status, + initdb_output.stderr, + )); + } + + // This isn't true cancellation support, see above. Still return an error to + // excercise the cancellation code path. + if cancel.is_cancelled() { + return Err(InitdbError::Cancelled); } Ok(()) diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index 972e0b714d..6b2651e447 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -371,8 +371,24 @@ def tenant_delete_wait_completed( pageserver_http: PageserverHttpClient, tenant_id: TenantId, iterations: int, + ignore_errors: bool = False, ): - pageserver_http.tenant_delete(tenant_id=tenant_id) + if not ignore_errors: + pageserver_http.tenant_delete(tenant_id=tenant_id) + else: + interval = 0.5 + + def delete_request_sent(): + try: + pageserver_http.tenant_delete(tenant_id=tenant_id) + except PageserverApiException as e: + log.debug(e) + if e.status_code == 404: + return + except Exception as e: + log.debug(e) + + wait_until(iterations, interval=interval, func=delete_request_sent) wait_tenant_status_404(pageserver_http, tenant_id=tenant_id, iterations=iterations) diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index 7a5b1c0fc2..bafc5711a1 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -578,6 +578,9 @@ def test_tenant_delete_races_timeline_creation( ".*POST.*Cancelled request finished with an error: InternalServerError\\(.*ancelled" ) + # This can occur sometimes. + CONFLICT_MESSAGE = ".*Precondition failed: Invalid state Stopping. Expected Active or Broken.*" + env.pageserver.allowed_errors.extend( [ # lucky race with stopping from flushing a layer we fail to schedule any uploads @@ -586,6 +589,9 @@ def test_tenant_delete_races_timeline_creation( ".*POST.*/timeline.* request was dropped before completing", # Timeline creation runs into this error CANCELLED_ERROR, + # Timeline deletion can run into this error during deletion + CONFLICT_MESSAGE, + ".*tenant_delete_handler.*still waiting, taking longer than expected.*", ] ) @@ -621,7 +627,10 @@ def test_tenant_delete_races_timeline_creation( ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "pause")) def tenant_delete(): - ps_http.tenant_delete(tenant_id) + def tenant_delete_inner(): + ps_http.tenant_delete(tenant_id) + + wait_until(100, 0.5, tenant_delete_inner) Thread(target=tenant_delete).start() @@ -638,10 +647,8 @@ def test_tenant_delete_races_timeline_creation( ps_http.configure_failpoints((BEFORE_INITDB_UPLOAD_FAILPOINT, "off")) iterations = poll_for_remote_storage_iterations(remote_storage_kind) - try: - tenant_delete_wait_completed(ps_http, tenant_id, iterations) - except PageserverApiException: - pass + + tenant_delete_wait_completed(ps_http, tenant_id, iterations, ignore_errors=True) # Physical deletion should have happened assert_prefix_empty(