mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-14 11:40:38 +00:00
Disable initdb cancellation (#6451)
## Problem The initdb cancellation added in #5921 is not sufficient to reliably abort the entire initdb process. Initdb also spawns children. The tests added by #6310 (#6385) and #6436 now do initdb cancellations on a more regular basis. In #6385, I attempted to issue `killpg` (after giving it a new process group ID) to kill not just the initdb but all its spawned subprocesses, but this didn't work. Initdb doesn't take *that* long in the end either, so we just wait until it concludes. ## Summary of changes * revert initdb cancellation support added in #5921 * still return `Err(Cancelled)` upon cancellation, but this is just to not have to remove the cancellation infrastructure * fixes to the `test_tenant_delete_races_timeline_creation` test to make it reliably pass Fixes #6385
This commit is contained in:
@@ -91,7 +91,6 @@ use std::fs;
|
||||
use std::fs::File;
|
||||
use std::io;
|
||||
use std::ops::Bound::Included;
|
||||
use std::process::Stdio;
|
||||
use std::sync::atomic::AtomicU64;
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::sync::Arc;
|
||||
@@ -3762,27 +3761,25 @@ async fn run_initdb(
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", &initdb_lib_dir)
|
||||
.env("DYLD_LIBRARY_PATH", &initdb_lib_dir)
|
||||
.stdout(Stdio::piped())
|
||||
.stderr(Stdio::piped())
|
||||
// If the `select!` below doesn't finish the `wait_with_output`,
|
||||
// let the task get `wait()`ed for asynchronously by tokio.
|
||||
// This means there is a slim chance we can go over the INIT_DB_SEMAPHORE.
|
||||
// TODO: fix for this is non-trivial, see
|
||||
// https://github.com/neondatabase/neon/pull/5921#pullrequestreview-1750858021
|
||||
//
|
||||
.kill_on_drop(true)
|
||||
.spawn()?;
|
||||
|
||||
tokio::select! {
|
||||
initdb_output = initdb_command.wait_with_output() => {
|
||||
let initdb_output = initdb_output?;
|
||||
if !initdb_output.status.success() {
|
||||
return Err(InitdbError::Failed(initdb_output.status, initdb_output.stderr));
|
||||
}
|
||||
}
|
||||
_ = cancel.cancelled() => {
|
||||
return Err(InitdbError::Cancelled);
|
||||
}
|
||||
// Ideally we'd select here with the cancellation token, but the problem is that
|
||||
// we can't safely terminate initdb: it launches processes of its own, and killing
|
||||
// initdb doesn't kill them. After we return from this function, we want the target
|
||||
// directory to be able to be cleaned up.
|
||||
// See https://github.com/neondatabase/neon/issues/6385
|
||||
let initdb_output = initdb_command.wait_with_output().await?;
|
||||
if !initdb_output.status.success() {
|
||||
return Err(InitdbError::Failed(
|
||||
initdb_output.status,
|
||||
initdb_output.stderr,
|
||||
));
|
||||
}
|
||||
|
||||
// This isn't true cancellation support, see above. Still return an error to
|
||||
// excercise the cancellation code path.
|
||||
if cancel.is_cancelled() {
|
||||
return Err(InitdbError::Cancelled);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
|
||||
@@ -371,8 +371,24 @@ def tenant_delete_wait_completed(
|
||||
pageserver_http: PageserverHttpClient,
|
||||
tenant_id: TenantId,
|
||||
iterations: int,
|
||||
ignore_errors: bool = False,
|
||||
):
|
||||
pageserver_http.tenant_delete(tenant_id=tenant_id)
|
||||
if not ignore_errors:
|
||||
pageserver_http.tenant_delete(tenant_id=tenant_id)
|
||||
else:
|
||||
interval = 0.5
|
||||
|
||||
def delete_request_sent():
|
||||
try:
|
||||
pageserver_http.tenant_delete(tenant_id=tenant_id)
|
||||
except PageserverApiException as e:
|
||||
log.debug(e)
|
||||
if e.status_code == 404:
|
||||
return
|
||||
except Exception as e:
|
||||
log.debug(e)
|
||||
|
||||
wait_until(iterations, interval=interval, func=delete_request_sent)
|
||||
wait_tenant_status_404(pageserver_http, tenant_id=tenant_id, iterations=iterations)
|
||||
|
||||
|
||||
|
||||
@@ -578,6 +578,9 @@ def test_tenant_delete_races_timeline_creation(
|
||||
".*POST.*Cancelled request finished with an error: InternalServerError\\(.*ancelled"
|
||||
)
|
||||
|
||||
# This can occur sometimes.
|
||||
CONFLICT_MESSAGE = ".*Precondition failed: Invalid state Stopping. Expected Active or Broken.*"
|
||||
|
||||
env.pageserver.allowed_errors.extend(
|
||||
[
|
||||
# lucky race with stopping from flushing a layer we fail to schedule any uploads
|
||||
@@ -586,6 +589,9 @@ def test_tenant_delete_races_timeline_creation(
|
||||
".*POST.*/timeline.* request was dropped before completing",
|
||||
# Timeline creation runs into this error
|
||||
CANCELLED_ERROR,
|
||||
# Timeline deletion can run into this error during deletion
|
||||
CONFLICT_MESSAGE,
|
||||
".*tenant_delete_handler.*still waiting, taking longer than expected.*",
|
||||
]
|
||||
)
|
||||
|
||||
@@ -621,7 +627,10 @@ def test_tenant_delete_races_timeline_creation(
|
||||
ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "pause"))
|
||||
|
||||
def tenant_delete():
|
||||
ps_http.tenant_delete(tenant_id)
|
||||
def tenant_delete_inner():
|
||||
ps_http.tenant_delete(tenant_id)
|
||||
|
||||
wait_until(100, 0.5, tenant_delete_inner)
|
||||
|
||||
Thread(target=tenant_delete).start()
|
||||
|
||||
@@ -638,10 +647,8 @@ def test_tenant_delete_races_timeline_creation(
|
||||
ps_http.configure_failpoints((BEFORE_INITDB_UPLOAD_FAILPOINT, "off"))
|
||||
|
||||
iterations = poll_for_remote_storage_iterations(remote_storage_kind)
|
||||
try:
|
||||
tenant_delete_wait_completed(ps_http, tenant_id, iterations)
|
||||
except PageserverApiException:
|
||||
pass
|
||||
|
||||
tenant_delete_wait_completed(ps_http, tenant_id, iterations, ignore_errors=True)
|
||||
|
||||
# Physical deletion should have happened
|
||||
assert_prefix_empty(
|
||||
|
||||
Reference in New Issue
Block a user