Disable initdb cancellation (#6451)

## Problem

The initdb cancellation added in #5921 is not sufficient to reliably
abort the entire initdb process. Initdb also spawns children. The tests
added by #6310 (#6385) and #6436 now do initdb cancellations on a more
regular basis.

In #6385, I attempted to issue `killpg` (after giving it a new process
group ID) to kill not just the initdb but all its spawned subprocesses,
but this didn't work. Initdb doesn't take *that* long in the end either,
so we just wait until it concludes.

## Summary of changes

* revert initdb cancellation support added in #5921
* still return `Err(Cancelled)` upon cancellation, but this is just to
not have to remove the cancellation infrastructure
* fixes to the `test_tenant_delete_races_timeline_creation` test to make
it reliably pass

Fixes #6385
This commit is contained in:
Arpad Müller
2024-01-24 13:06:05 +01:00
committed by GitHub
parent 996abc9563
commit d820aa1d08
3 changed files with 46 additions and 26 deletions

View File

@@ -91,7 +91,6 @@ use std::fs;
use std::fs::File;
use std::io;
use std::ops::Bound::Included;
use std::process::Stdio;
use std::sync::atomic::AtomicU64;
use std::sync::atomic::Ordering;
use std::sync::Arc;
@@ -3762,27 +3761,25 @@ async fn run_initdb(
.env_clear()
.env("LD_LIBRARY_PATH", &initdb_lib_dir)
.env("DYLD_LIBRARY_PATH", &initdb_lib_dir)
.stdout(Stdio::piped())
.stderr(Stdio::piped())
// If the `select!` below doesn't finish the `wait_with_output`,
// let the task get `wait()`ed for asynchronously by tokio.
// This means there is a slim chance we can go over the INIT_DB_SEMAPHORE.
// TODO: fix for this is non-trivial, see
// https://github.com/neondatabase/neon/pull/5921#pullrequestreview-1750858021
//
.kill_on_drop(true)
.spawn()?;
tokio::select! {
initdb_output = initdb_command.wait_with_output() => {
let initdb_output = initdb_output?;
if !initdb_output.status.success() {
return Err(InitdbError::Failed(initdb_output.status, initdb_output.stderr));
}
}
_ = cancel.cancelled() => {
return Err(InitdbError::Cancelled);
}
// Ideally we'd select here with the cancellation token, but the problem is that
// we can't safely terminate initdb: it launches processes of its own, and killing
// initdb doesn't kill them. After we return from this function, we want the target
// directory to be able to be cleaned up.
// See https://github.com/neondatabase/neon/issues/6385
let initdb_output = initdb_command.wait_with_output().await?;
if !initdb_output.status.success() {
return Err(InitdbError::Failed(
initdb_output.status,
initdb_output.stderr,
));
}
// This isn't true cancellation support, see above. Still return an error to
// excercise the cancellation code path.
if cancel.is_cancelled() {
return Err(InitdbError::Cancelled);
}
Ok(())

View File

@@ -371,8 +371,24 @@ def tenant_delete_wait_completed(
pageserver_http: PageserverHttpClient,
tenant_id: TenantId,
iterations: int,
ignore_errors: bool = False,
):
pageserver_http.tenant_delete(tenant_id=tenant_id)
if not ignore_errors:
pageserver_http.tenant_delete(tenant_id=tenant_id)
else:
interval = 0.5
def delete_request_sent():
try:
pageserver_http.tenant_delete(tenant_id=tenant_id)
except PageserverApiException as e:
log.debug(e)
if e.status_code == 404:
return
except Exception as e:
log.debug(e)
wait_until(iterations, interval=interval, func=delete_request_sent)
wait_tenant_status_404(pageserver_http, tenant_id=tenant_id, iterations=iterations)

View File

@@ -578,6 +578,9 @@ def test_tenant_delete_races_timeline_creation(
".*POST.*Cancelled request finished with an error: InternalServerError\\(.*ancelled"
)
# This can occur sometimes.
CONFLICT_MESSAGE = ".*Precondition failed: Invalid state Stopping. Expected Active or Broken.*"
env.pageserver.allowed_errors.extend(
[
# lucky race with stopping from flushing a layer we fail to schedule any uploads
@@ -586,6 +589,9 @@ def test_tenant_delete_races_timeline_creation(
".*POST.*/timeline.* request was dropped before completing",
# Timeline creation runs into this error
CANCELLED_ERROR,
# Timeline deletion can run into this error during deletion
CONFLICT_MESSAGE,
".*tenant_delete_handler.*still waiting, taking longer than expected.*",
]
)
@@ -621,7 +627,10 @@ def test_tenant_delete_races_timeline_creation(
ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "pause"))
def tenant_delete():
ps_http.tenant_delete(tenant_id)
def tenant_delete_inner():
ps_http.tenant_delete(tenant_id)
wait_until(100, 0.5, tenant_delete_inner)
Thread(target=tenant_delete).start()
@@ -638,10 +647,8 @@ def test_tenant_delete_races_timeline_creation(
ps_http.configure_failpoints((BEFORE_INITDB_UPLOAD_FAILPOINT, "off"))
iterations = poll_for_remote_storage_iterations(remote_storage_kind)
try:
tenant_delete_wait_completed(ps_http, tenant_id, iterations)
except PageserverApiException:
pass
tenant_delete_wait_completed(ps_http, tenant_id, iterations, ignore_errors=True)
# Physical deletion should have happened
assert_prefix_empty(