diff --git a/test_runner/batch_others/test_remote_storage.py b/test_runner/batch_others/test_remote_storage.py index 3c7bd08996..afbe3c55c7 100644 --- a/test_runner/batch_others/test_remote_storage.py +++ b/test_runner/batch_others/test_remote_storage.py @@ -6,7 +6,7 @@ from contextlib import closing from pathlib import Path import time from uuid import UUID -from fixtures.zenith_fixtures import ZenithEnvBuilder, assert_local, wait_for, wait_for_last_record_lsn, wait_for_upload +from fixtures.zenith_fixtures import ZenithEnvBuilder, assert_local, wait_until, wait_for_last_record_lsn, wait_for_upload from fixtures.log_helper import log from fixtures.utils import lsn_from_hex, lsn_to_hex import pytest @@ -109,9 +109,9 @@ def test_remote_storage_backup_and_restore(zenith_env_builder: ZenithEnvBuilder, client.timeline_attach(UUID(tenant_id), UUID(timeline_id)) log.info("waiting for timeline redownload") - wait_for(number_of_iterations=10, - interval=1, - func=lambda: assert_local(client, UUID(tenant_id), UUID(timeline_id))) + wait_until(number_of_iterations=10, + interval=1, + func=lambda: assert_local(client, UUID(tenant_id), UUID(timeline_id))) detail = client.timeline_detail(UUID(tenant_id), UUID(timeline_id)) assert detail['local'] is not None diff --git a/test_runner/batch_others/test_tenant_relocation.py b/test_runner/batch_others/test_tenant_relocation.py index 0e5dd6eadf..91506e120d 100644 --- a/test_runner/batch_others/test_tenant_relocation.py +++ b/test_runner/batch_others/test_tenant_relocation.py @@ -10,7 +10,7 @@ from typing import Optional import signal import pytest -from fixtures.zenith_fixtures import PgProtocol, PortDistributor, Postgres, ZenithEnvBuilder, Etcd, ZenithPageserverHttpClient, assert_local, wait_for, wait_for_last_record_lsn, wait_for_upload, zenith_binpath, pg_distrib_dir +from fixtures.zenith_fixtures import PgProtocol, PortDistributor, Postgres, ZenithEnvBuilder, Etcd, ZenithPageserverHttpClient, assert_local, wait_until, wait_for_last_record_lsn, wait_for_upload, zenith_binpath, pg_distrib_dir from fixtures.utils import lsn_from_hex @@ -191,7 +191,7 @@ def test_tenant_relocation(zenith_env_builder: ZenithEnvBuilder, # call to attach timeline to new pageserver new_pageserver_http.timeline_attach(tenant, timeline) # new pageserver should be in sync (modulo wal tail or vacuum activity) with the old one because there was no new writes since checkpoint - new_timeline_detail = wait_for( + new_timeline_detail = wait_until( number_of_iterations=5, interval=1, func=lambda: assert_local(new_pageserver_http, tenant, timeline)) diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index 8fca56143e..203e73037f 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -34,7 +34,12 @@ from typing_extensions import Literal import requests import backoff # type: ignore -from .utils import (etcd_path, get_self_dir, mkdir_if_needed, subprocess_capture, lsn_from_hex) +from .utils import (etcd_path, + get_self_dir, + mkdir_if_needed, + subprocess_capture, + lsn_from_hex, + lsn_to_hex) from fixtures.log_helper import log """ This file contains pytest fixtures. A fixture is a test resource that can be @@ -2065,7 +2070,11 @@ def check_restored_datadir_content(test_output_dir: str, env: ZenithEnv, pg: Pos assert (mismatch, error) == ([], []) -def wait_for(number_of_iterations: int, interval: int, func): +def wait_until(number_of_iterations: int, interval: int, func): + """ + Wait until 'func' returns successfully, without exception. Returns the last return value + from the the function. + """ last_exception = None for i in range(number_of_iterations): try: @@ -2092,9 +2101,15 @@ def remote_consistent_lsn(pageserver_http_client: ZenithPageserverHttpClient, timeline: uuid.UUID) -> int: detail = pageserver_http_client.timeline_detail(tenant, timeline) - lsn_str = detail['remote']['remote_consistent_lsn'] - assert isinstance(lsn_str, str) - return lsn_from_hex(lsn_str) + if detail['remote'] is None: + # No remote information at all. This happens right after creating + # a timeline, before any part of it it has been uploaded to remote + # storage yet. + return 0 + else: + lsn_str = detail['remote']['remote_consistent_lsn'] + assert isinstance(lsn_str, str) + return lsn_from_hex(lsn_str) def wait_for_upload(pageserver_http_client: ZenithPageserverHttpClient, @@ -2102,8 +2117,15 @@ def wait_for_upload(pageserver_http_client: ZenithPageserverHttpClient, timeline: uuid.UUID, lsn: int): """waits for local timeline upload up to specified lsn""" - - wait_for(10, 1, lambda: remote_consistent_lsn(pageserver_http_client, tenant, timeline) >= lsn) + for i in range(10): + current_lsn = remote_consistent_lsn(pageserver_http_client, tenant, timeline) + if current_lsn >= lsn: + return + log.info("waiting for remote_consistent_lsn to reach {}, now {}, iteration {}".format( + lsn_to_hex(lsn), lsn_to_hex(current_lsn), i + 1)) + time.sleep(1) + raise Exception("timed out while waiting for remote_consistent_lsn to reach {}, was {}".format( + lsn_to_hex(lsn), lsn_to_hex(current_lsn))) def last_record_lsn(pageserver_http_client: ZenithPageserverHttpClient, @@ -2121,5 +2143,12 @@ def wait_for_last_record_lsn(pageserver_http_client: ZenithPageserverHttpClient, timeline: uuid.UUID, lsn: int): """waits for pageserver to catch up to a certain lsn""" - - wait_for(10, 1, lambda: last_record_lsn(pageserver_http_client, tenant, timeline) >= lsn) + for i in range(10): + current_lsn = last_record_lsn(pageserver_http_client, tenant, timeline) + if current_lsn >= lsn: + return + log.info("waiting for last_record_lsn to reach {}, now {}, iteration {}".format( + lsn_to_hex(lsn), lsn_to_hex(current_lsn), i + 1)) + time.sleep(1) + raise Exception("timed out while waiting for last_record_lsn to reach {}, was {}".format( + lsn_to_hex(lsn), lsn_to_hex(current_lsn)))