mirror of
https://github.com/neondatabase/neon.git
synced 2026-06-04 22:10:39 +00:00
wait for tenant to be active before polling for timeline absence (#4856)
## Problem https://neon-github-public-dev.s3.amazonaws.com/reports/main/5692829577/index.html#suites/f588e0a787c49e67b29490359c589fae/4c50937643d68a66 ## Summary of changes wait for tenant to be active after restart before polling for timeline absence
This commit is contained in:
@@ -197,10 +197,9 @@ def wait_timeline_detail_404(
|
||||
pageserver_http: PageserverHttpClient,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
wait_longer: bool = False,
|
||||
iterations: int,
|
||||
):
|
||||
last_exc = None
|
||||
iterations = 10 if wait_longer else 2
|
||||
for _ in range(iterations):
|
||||
time.sleep(0.250)
|
||||
try:
|
||||
@@ -220,8 +219,8 @@ def timeline_delete_wait_completed(
|
||||
pageserver_http: PageserverHttpClient,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
wait_longer: bool = False, # Use when running with RemoteStorageKind.REAL_S3
|
||||
iterations: int = 20,
|
||||
**delete_args,
|
||||
):
|
||||
pageserver_http.timeline_delete(tenant_id=tenant_id, timeline_id=timeline_id, **delete_args)
|
||||
wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id, wait_longer)
|
||||
wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id, iterations)
|
||||
|
||||
@@ -229,6 +229,8 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
|
||||
|
||||
ps_http.configure_failpoints((failpoint, "return"))
|
||||
|
||||
iterations = 20 if remote_storage_kind is RemoteStorageKind.REAL_S3 else 4
|
||||
|
||||
# These failpoints are earlier than background task is spawned.
|
||||
# so they result in api request failure.
|
||||
if failpoint in (
|
||||
@@ -245,7 +247,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
|
||||
tenant_id=env.initial_tenant,
|
||||
timeline_id=timeline_id,
|
||||
expected_state="Broken",
|
||||
iterations=2, # effectively try immediately and retry once in one second
|
||||
iterations=iterations,
|
||||
)
|
||||
|
||||
reason = timeline_info["state"]["Broken"]["reason"]
|
||||
@@ -254,21 +256,19 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
|
||||
# failpoint may not be the only error in the stack
|
||||
assert reason.endswith(f"failpoint: {failpoint}"), reason
|
||||
|
||||
wait_longer = remote_storage_kind is RemoteStorageKind.REAL_S3
|
||||
if check is Check.RETRY_WITH_RESTART:
|
||||
env.pageserver.stop()
|
||||
env.pageserver.start()
|
||||
|
||||
wait_until_tenant_active(ps_http, env.initial_tenant, iterations=iterations)
|
||||
|
||||
if failpoint == "timeline-delete-before-index-deleted-at":
|
||||
# We crashed before persisting this to remote storage, need to retry delete request
|
||||
|
||||
# Wait till tenant is loaded. Shouldnt take longer than 2 seconds (we shouldnt block tenant loading)
|
||||
wait_until_tenant_active(ps_http, env.initial_tenant, iterations=2)
|
||||
|
||||
timeline_delete_wait_completed(ps_http, env.initial_tenant, timeline_id)
|
||||
else:
|
||||
# Pageserver should've resumed deletion after restart.
|
||||
wait_timeline_detail_404(
|
||||
ps_http, env.initial_tenant, timeline_id, wait_longer=wait_longer
|
||||
ps_http, env.initial_tenant, timeline_id, iterations=iterations
|
||||
)
|
||||
elif check is Check.RETRY_WITHOUT_RESTART:
|
||||
# this should succeed
|
||||
@@ -276,7 +276,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
|
||||
ps_http.configure_failpoints((failpoint, "off"))
|
||||
|
||||
timeline_delete_wait_completed(
|
||||
ps_http, env.initial_tenant, timeline_id, wait_longer=wait_longer
|
||||
ps_http, env.initial_tenant, timeline_id, iterations=iterations
|
||||
)
|
||||
|
||||
# Check remote is impty
|
||||
@@ -569,7 +569,7 @@ def test_concurrent_timeline_delete_stuck_on(
|
||||
try:
|
||||
log.info("first call start")
|
||||
timeline_delete_wait_completed(
|
||||
ps_http, env.initial_tenant, child_timeline_id, timeout=10
|
||||
ps_http, env.initial_tenant, child_timeline_id, timeout=20
|
||||
)
|
||||
log.info("first call success")
|
||||
result_queue.put("success")
|
||||
@@ -683,7 +683,7 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder):
|
||||
wait_until(50, 0.1, first_request_finished)
|
||||
|
||||
# check that the timeline is gone
|
||||
wait_timeline_detail_404(ps_http, env.initial_tenant, child_timeline_id)
|
||||
wait_timeline_detail_404(ps_http, env.initial_tenant, child_timeline_id, iterations=2)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
|
||||
Reference in New Issue
Block a user