From c3fe335eaf45c833e52353234699a86d653735b5 Mon Sep 17 00:00:00 2001
From: Dmitry Rodionov
Date: Tue, 1 Aug 2023 18:28:18 +0300
Subject: [PATCH] wait for tenant to be active before polling for timeline absence (#4856)

## Problem

https://neon-github-public-dev.s3.amazonaws.com/reports/main/5692829577/index.html#suites/f588e0a787c49e67b29490359c589fae/4c50937643d68a66

## Summary of changes

wait for tenant to be active after restart before polling for timeline absence
---
 test_runner/fixtures/pageserver/utils.py    |  7 +++----
 test_runner/regress/test_timeline_delete.py | 20 ++++++++++----------
 2 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py
index f8a4423ffa..119c99bb96 100644
--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -197,10 +197,9 @@ def wait_timeline_detail_404(
     pageserver_http: PageserverHttpClient,
     tenant_id: TenantId,
     timeline_id: TimelineId,
-    wait_longer: bool = False,
+    iterations: int,
 ):
     last_exc = None
-    iterations = 10 if wait_longer else 2
     for _ in range(iterations):
         time.sleep(0.250)
         try:
@@ -220,8 +219,8 @@ def timeline_delete_wait_completed(
     pageserver_http: PageserverHttpClient,
     tenant_id: TenantId,
     timeline_id: TimelineId,
-    wait_longer: bool = False,  # Use when running with RemoteStorageKind.REAL_S3
+    iterations: int = 20,
     **delete_args,
 ):
     pageserver_http.timeline_delete(tenant_id=tenant_id, timeline_id=timeline_id, **delete_args)
-    wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id, wait_longer)
+    wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id, iterations)
diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py
index 9226ca21d2..764bfe62f9 100644
--- a/test_runner/regress/test_timeline_delete.py
+++ b/test_runner/regress/test_timeline_delete.py
@@ -229,6 +229,8 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
 
     ps_http.configure_failpoints((failpoint, "return"))
 
+    iterations = 20 if remote_storage_kind is RemoteStorageKind.REAL_S3 else 4
+
     # These failpoints are earlier than background task is spawned.
     # so they result in api request failure.
     if failpoint in (
@@ -245,7 +247,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
             tenant_id=env.initial_tenant,
             timeline_id=timeline_id,
             expected_state="Broken",
-            iterations=2,  # effectively try immediately and retry once in one second
+            iterations=iterations,
         )
 
         reason = timeline_info["state"]["Broken"]["reason"]
@@ -254,21 +256,19 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
         # failpoint may not be the only error in the stack
         assert reason.endswith(f"failpoint: {failpoint}"), reason
 
-    wait_longer = remote_storage_kind is RemoteStorageKind.REAL_S3
     if check is Check.RETRY_WITH_RESTART:
         env.pageserver.stop()
         env.pageserver.start()
+
+        wait_until_tenant_active(ps_http, env.initial_tenant, iterations=iterations)
+
         if failpoint == "timeline-delete-before-index-deleted-at":
             # We crashed before persisting this to remote storage, need to retry delete request
-
-            # Wait till tenant is loaded. Shouldnt take longer than 2 seconds (we shouldnt block tenant loading)
-            wait_until_tenant_active(ps_http, env.initial_tenant, iterations=2)
-
             timeline_delete_wait_completed(ps_http, env.initial_tenant, timeline_id)
         else:
             # Pageserver should've resumed deletion after restart.
             wait_timeline_detail_404(
-                ps_http, env.initial_tenant, timeline_id, wait_longer=wait_longer
+                ps_http, env.initial_tenant, timeline_id, iterations=iterations
             )
     elif check is Check.RETRY_WITHOUT_RESTART:
         # this should succeed
@@ -276,7 +276,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
         ps_http.configure_failpoints((failpoint, "off"))
 
         timeline_delete_wait_completed(
-            ps_http, env.initial_tenant, timeline_id, wait_longer=wait_longer
+            ps_http, env.initial_tenant, timeline_id, iterations=iterations
         )
 
     # Check remote is impty
@@ -569,7 +569,7 @@ def test_concurrent_timeline_delete_stuck_on(
         try:
             log.info("first call start")
             timeline_delete_wait_completed(
-                ps_http, env.initial_tenant, child_timeline_id, timeout=10
+                ps_http, env.initial_tenant, child_timeline_id, timeout=20
             )
             log.info("first call success")
             result_queue.put("success")
@@ -683,7 +683,7 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder):
     wait_until(50, 0.1, first_request_finished)
 
     # check that the timeline is gone
-    wait_timeline_detail_404(ps_http, env.initial_tenant, child_timeline_id)
+    wait_timeline_detail_404(ps_http, env.initial_tenant, child_timeline_id, iterations=2)
 
 
 @pytest.mark.parametrize(