From 700a36ee6bc83b2a41c856f22e7e8afd1e6e2c21 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Sat, 10 Dec 2022 10:18:55 +0200 Subject: [PATCH] Wait for certain tenant status in the remote storage test (#3055) Closes https://github.com/neondatabase/neon/issues/3052 From what I could understand from the PR, we did not wait enough before the attach failed. Extended the wait period a bit and put a check for a status instead of plain `sleep` to fail if we don't get the expected status. --- test_runner/fixtures/neon_fixtures.py | 21 +++++++++++++++ test_runner/regress/test_remote_storage.py | 3 ++- test_runner/regress/test_tenant_detach.py | 30 ++++------------------ 3 files changed, 28 insertions(+), 26 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index d20e591e9b..5fbde5e03b 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2938,6 +2938,27 @@ def wait_for_upload( ) +# Does not use `wait_until` for debugging purposes +def wait_until_tenant_state( + pageserver_http: PageserverHttpClient, + tenant_id: TenantId, + expected_state: str, + iterations: int, +) -> bool: + for _ in range(iterations): + try: + tenant = pageserver_http.tenant_status(tenant_id=tenant_id) + log.debug(f"Tenant {tenant_id} data: {tenant}") + if tenant["state"] == expected_state: + return True + except Exception as e: + log.debug(f"Tenant {tenant_id} state retrieval failure: {e}") + + time.sleep(1) + + raise Exception(f"Tenant {tenant_id} did not become {expected_state} in {iterations} seconds") + + def last_record_lsn( pageserver_http_client: PageserverHttpClient, tenant: TenantId, timeline: TimelineId ) -> Lsn: diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 506955b1df..7152bc8b6a 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -19,6 +19,7 @@ from fixtures.neon_fixtures import ( wait_for_last_flush_lsn, wait_for_last_record_lsn, wait_for_upload, + wait_until_tenant_state, ) from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import print_gc_result, query_scalar, wait_until @@ -134,7 +135,7 @@ def test_remote_storage_backup_and_restore( client.tenant_attach(tenant_id) # is there a better way to assert that failpoint triggered? - time.sleep(10) + wait_until_tenant_state(pageserver_http, tenant_id, "Broken", 15) # assert cannot attach timeline that is scheduled for download # FIXME implement layer download retries diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index 0d3465cc01..59811c565c 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -12,6 +12,7 @@ from fixtures.neon_fixtures import ( available_remote_storages, wait_for_last_record_lsn, wait_for_upload, + wait_until_tenant_state, ) from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import query_scalar @@ -230,7 +231,7 @@ def test_ignored_tenant_reattach( # now, load it from the local files and expect it works pageserver_http.tenant_load(tenant_id=ignored_tenant_id) - wait_until_tenant_status(pageserver_http, ignored_tenant_id, "Active", 5) + wait_until_tenant_state(pageserver_http, ignored_tenant_id, "Active", 5) tenants_after_attach = [tenant["id"] for tenant in pageserver_http.tenant_list()] tenants_after_attach.sort() @@ -289,7 +290,7 @@ def test_ignored_tenant_download_missing_layers( # now, load it from the local files and expect it to work due to remote storage restoration pageserver_http.tenant_load(tenant_id=tenant_id) - wait_until_tenant_status(pageserver_http, tenant_id, "Active", 5) + wait_until_tenant_state(pageserver_http, tenant_id, "Active", 5) tenants_after_attach = [tenant["id"] for tenant in pageserver_http.tenant_list()] tenants_after_attach.sort() @@ -340,7 +341,7 @@ def test_ignored_tenant_stays_broken_without_metadata( # now, load it from the local files and expect it to be broken due to inability to load tenant files into memory pageserver_http.tenant_load(tenant_id=tenant_id) - wait_until_tenant_status(pageserver_http, tenant_id, "Broken", 5) + wait_until_tenant_state(pageserver_http, tenant_id, "Broken", 5) # Tests that attach is never working on a tenant, ignored or not, as long as it's not absent locally @@ -441,7 +442,7 @@ def test_ignore_while_attaching( # But can load it from local files, that will restore attach. pageserver_http.tenant_load(tenant_id) - wait_until_tenant_status(pageserver_http, tenant_id, "Active", 5) + wait_until_tenant_state(pageserver_http, tenant_id, "Active", 5) pg.stop() pg.start() @@ -481,24 +482,3 @@ def ensure_test_data(data_id: int, data: str, pg: Postgres): assert ( query_scalar(cur, f"SELECT secret FROM test WHERE id = {data_id};") == data ), "Should have timeline data back" - - -# Does not use `wait_until` for debugging purposes -def wait_until_tenant_status( - pageserver_http: PageserverHttpClient, - tenant_id: TenantId, - expected_status: str, - iterations: int, -) -> bool: - for _ in range(iterations): - try: - tenant = pageserver_http.tenant_status(tenant_id=tenant_id) - log.debug(f"Tenant {tenant_id} status: {tenant}") - if tenant["state"] == expected_status: - return True - except Exception as e: - log.debug(f"Tenant {tenant_id} status retrieval failure: {e}") - - time.sleep(1) - - raise Exception(f"Tenant {tenant_id} did not become {expected_status} in {iterations} seconds")