Disable initdb cancellation (#6451)

## Problem

The initdb cancellation added in #5921 is not sufficient to reliably
abort the entire initdb process. Initdb also spawns children. The tests
added by #6310 (#6385) and #6436 now do initdb cancellations on a more
regular basis.

In #6385, I attempted to issue `killpg` (after giving it a new process
group ID) to kill not just the initdb but all its spawned subprocesses,
but this didn't work. Initdb doesn't take *that* long in the end either,
so we just wait until it concludes.

## Summary of changes

* revert initdb cancellation support added in #5921
* still return `Err(Cancelled)` upon cancellation, but this is just to
not have to remove the cancellation infrastructure
* fixes to the `test_tenant_delete_races_timeline_creation` test to make
it reliably pass

Fixes #6385
This commit is contained in:
Arpad Müller
2024-01-24 13:06:05 +01:00
committed by GitHub
parent 996abc9563
commit d820aa1d08
3 changed files with 46 additions and 26 deletions

View File

@@ -371,8 +371,24 @@ def tenant_delete_wait_completed(
pageserver_http: PageserverHttpClient,
tenant_id: TenantId,
iterations: int,
ignore_errors: bool = False,
):
pageserver_http.tenant_delete(tenant_id=tenant_id)
if not ignore_errors:
pageserver_http.tenant_delete(tenant_id=tenant_id)
else:
interval = 0.5
def delete_request_sent():
try:
pageserver_http.tenant_delete(tenant_id=tenant_id)
except PageserverApiException as e:
log.debug(e)
if e.status_code == 404:
return
except Exception as e:
log.debug(e)
wait_until(iterations, interval=interval, func=delete_request_sent)
wait_tenant_status_404(pageserver_http, tenant_id=tenant_id, iterations=iterations)

View File

@@ -578,6 +578,9 @@ def test_tenant_delete_races_timeline_creation(
".*POST.*Cancelled request finished with an error: InternalServerError\\(.*ancelled"
)
# This can occur sometimes.
CONFLICT_MESSAGE = ".*Precondition failed: Invalid state Stopping. Expected Active or Broken.*"
env.pageserver.allowed_errors.extend(
[
# lucky race with stopping from flushing a layer we fail to schedule any uploads
@@ -586,6 +589,9 @@ def test_tenant_delete_races_timeline_creation(
".*POST.*/timeline.* request was dropped before completing",
# Timeline creation runs into this error
CANCELLED_ERROR,
# Timeline deletion can run into this error during deletion
CONFLICT_MESSAGE,
".*tenant_delete_handler.*still waiting, taking longer than expected.*",
]
)
@@ -621,7 +627,10 @@ def test_tenant_delete_races_timeline_creation(
ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "pause"))
def tenant_delete():
ps_http.tenant_delete(tenant_id)
def tenant_delete_inner():
ps_http.tenant_delete(tenant_id)
wait_until(100, 0.5, tenant_delete_inner)
Thread(target=tenant_delete).start()
@@ -638,10 +647,8 @@ def test_tenant_delete_races_timeline_creation(
ps_http.configure_failpoints((BEFORE_INITDB_UPLOAD_FAILPOINT, "off"))
iterations = poll_for_remote_storage_iterations(remote_storage_kind)
try:
tenant_delete_wait_completed(ps_http, tenant_id, iterations)
except PageserverApiException:
pass
tenant_delete_wait_completed(ps_http, tenant_id, iterations, ignore_errors=True)
# Physical deletion should have happened
assert_prefix_empty(