Wait for certain tenant status in the remote storage test (#3055)

Closes https://github.com/neondatabase/neon/issues/3052

From what I could understand from the PR, we did not wait long enough for
the attach to fail.
Extended the wait period a bit and replaced the plain `sleep` with a check
for the tenant status, so that the test fails if the expected status is not reached.
This commit is contained in:
Kirill Bulatov
2022-12-10 10:18:55 +02:00
committed by GitHub
parent b8a5664fb9
commit 700a36ee6b
3 changed files with 28 additions and 26 deletions

View File

@@ -2938,6 +2938,27 @@ def wait_for_upload(
)
# Does not use `wait_until` for debugging purposes
def wait_until_tenant_state(
    pageserver_http: PageserverHttpClient,
    tenant_id: TenantId,
    expected_state: str,
    iterations: int,
) -> bool:
    """
    Poll the pageserver until the tenant reports `expected_state`.

    Makes up to `iterations` calls to `pageserver_http.tenant_status`, pausing
    one second between consecutive attempts. Any exception raised while fetching
    or inspecting the status is logged at debug level and treated as
    "not there yet".

    Returns True as soon as the reported "state" equals `expected_state`;
    this function never returns False.

    Raises Exception if none of the attempts observed the expected state. When
    the last attempt itself failed with an error, that error is attached as the
    cause so it shows up in the test failure, not only in debug logs.
    """
    last_error = None

    for attempt in range(iterations):
        if attempt > 0:
            # Pause only between attempts: sleeping after the final attempt
            # would just delay the failure without checking anything again.
            time.sleep(1)

        try:
            tenant = pageserver_http.tenant_status(tenant_id=tenant_id)
            log.debug(f"Tenant {tenant_id} data: {tenant}")
            if tenant["state"] == expected_state:
                return True
            # The request worked but the state differs: no error to report for this attempt.
            last_error = None
        except Exception as e:
            last_error = e
            log.debug(f"Tenant {tenant_id} state retrieval failure: {e}")

    raise Exception(
        f"Tenant {tenant_id} did not become {expected_state} in {iterations} seconds"
    ) from last_error
def last_record_lsn(
pageserver_http_client: PageserverHttpClient, tenant: TenantId, timeline: TimelineId
) -> Lsn:

View File

@@ -19,6 +19,7 @@ from fixtures.neon_fixtures import (
wait_for_last_flush_lsn,
wait_for_last_record_lsn,
wait_for_upload,
wait_until_tenant_state,
)
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import print_gc_result, query_scalar, wait_until
@@ -134,7 +135,7 @@ def test_remote_storage_backup_and_restore(
client.tenant_attach(tenant_id)
# is there a better way to assert that failpoint triggered?
time.sleep(10)
wait_until_tenant_state(pageserver_http, tenant_id, "Broken", 15)
# assert cannot attach timeline that is scheduled for download
# FIXME implement layer download retries

View File

@@ -12,6 +12,7 @@ from fixtures.neon_fixtures import (
available_remote_storages,
wait_for_last_record_lsn,
wait_for_upload,
wait_until_tenant_state,
)
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import query_scalar
@@ -230,7 +231,7 @@ def test_ignored_tenant_reattach(
# now, load it from the local files and expect it works
pageserver_http.tenant_load(tenant_id=ignored_tenant_id)
wait_until_tenant_status(pageserver_http, ignored_tenant_id, "Active", 5)
wait_until_tenant_state(pageserver_http, ignored_tenant_id, "Active", 5)
tenants_after_attach = [tenant["id"] for tenant in pageserver_http.tenant_list()]
tenants_after_attach.sort()
@@ -289,7 +290,7 @@ def test_ignored_tenant_download_missing_layers(
# now, load it from the local files and expect it to work due to remote storage restoration
pageserver_http.tenant_load(tenant_id=tenant_id)
wait_until_tenant_status(pageserver_http, tenant_id, "Active", 5)
wait_until_tenant_state(pageserver_http, tenant_id, "Active", 5)
tenants_after_attach = [tenant["id"] for tenant in pageserver_http.tenant_list()]
tenants_after_attach.sort()
@@ -340,7 +341,7 @@ def test_ignored_tenant_stays_broken_without_metadata(
# now, load it from the local files and expect it to be broken due to inability to load tenant files into memory
pageserver_http.tenant_load(tenant_id=tenant_id)
wait_until_tenant_status(pageserver_http, tenant_id, "Broken", 5)
wait_until_tenant_state(pageserver_http, tenant_id, "Broken", 5)
# Tests that attach is never working on a tenant, ignored or not, as long as it's not absent locally
@@ -441,7 +442,7 @@ def test_ignore_while_attaching(
# But can load it from local files, that will restore attach.
pageserver_http.tenant_load(tenant_id)
wait_until_tenant_status(pageserver_http, tenant_id, "Active", 5)
wait_until_tenant_state(pageserver_http, tenant_id, "Active", 5)
pg.stop()
pg.start()
@@ -481,24 +482,3 @@ def ensure_test_data(data_id: int, data: str, pg: Postgres):
assert (
query_scalar(cur, f"SELECT secret FROM test WHERE id = {data_id};") == data
), "Should have timeline data back"
# Does not use `wait_until` for debugging purposes
def wait_until_tenant_status(
    pageserver_http: PageserverHttpClient,
    tenant_id: TenantId,
    expected_status: str,
    iterations: int,
) -> bool:
    """
    Wait for the tenant's reported "state" to equal `expected_status`.

    Calls `pageserver_http.tenant_status` up to `iterations` times, sleeping one
    second after every attempt that did not match. Exceptions raised while
    fetching or inspecting the status are logged at debug level and count as a
    non-matching attempt.

    Returns True on the first match; raises Exception once all attempts are used up.
    """
    for _attempt in range(iterations):
        matched = False
        try:
            status_info = pageserver_http.tenant_status(tenant_id=tenant_id)
            log.debug(f"Tenant {tenant_id} status: {status_info}")
            matched = status_info["state"] == expected_status
        except Exception as err:
            log.debug(f"Tenant {tenant_id} status retrieval failure: {err}")

        if matched:
            return True
        time.sleep(1)

    failure = f"Tenant {tenant_id} did not become {expected_status} in {iterations} seconds"
    raise Exception(failure)