diff --git a/test_runner/fixtures/pageserver/many_tenants.py b/test_runner/fixtures/pageserver/many_tenants.py index 5b44b96069..28e890b39d 100644 --- a/test_runner/fixtures/pageserver/many_tenants.py +++ b/test_runner/fixtures/pageserver/many_tenants.py @@ -10,7 +10,11 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, SnapshotDir, ) -from fixtures.pageserver.utils import wait_until_tenant_active, wait_until_tenant_state +from fixtures.pageserver.utils import ( + wait_until_all_tenants_state, + wait_until_tenant_active, + wait_until_tenant_state, +) from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind from fixtures.types import TenantId, TimelineId import fixtures.pageserver.remote_storage @@ -20,8 +24,6 @@ from fixtures import work_queue @dataclass class SingleTimeline: env: NeonEnv - timeline_id: TimelineId - tenants: List[TenantId] def single_timeline( @@ -43,10 +45,6 @@ def single_timeline( save_snapshot = False env = neon_env_builder.from_repo_dir(snapshot_dir.path) ps_http = env.pageserver.http_client() - tenants = list( - {TenantId(t.name) for t in (snapshot_dir.path.glob("pageserver_*/tenants/*"))} - ) - template_timeline = env.initial_timeline else: if snapshot_dir.path.exists(): shutil.rmtree(snapshot_dir.path) @@ -137,11 +135,13 @@ def single_timeline( env.start() - log.info(f"wait for tenants to become active") - for tenant in tenants: - wait_until_tenant_active(ps_http, tenant, iterations=ncopies, period=1) + log.info(f"wait for all tenants to become active") + wait_until_all_tenants_state( + ps_http, "Active", iterations=ncopies, period=1, http_error_ok=False + ) # ensure all layers are resident for predictiable performance + tenants = [info["id"] for info in ps_http.tenant_list()] for tenant in tenants: for timeline in ps_http.tenant_status(tenant)["timelines"]: info = ps_http.layer_map_info(tenant, timeline) @@ -149,4 +149,4 @@ def single_timeline( assert not layer.remote log.info("ready") - return SingleTimeline(env, 
template_timeline, tenants) + return SingleTimeline(env) diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index e7b78cfb9a..6f286cb7d5 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -61,6 +61,14 @@ def wait_for_upload( ) +def _tenant_in_expected_state(tenant_info: Dict[str, Any], expected_state: str): + if tenant_info["state"]["slug"] == expected_state: + return True + if tenant_info["state"]["slug"] == "Broken": + raise RuntimeError(f"tenant became Broken, not {expected_state}") + return False + + def wait_until_tenant_state( pageserver_http: PageserverHttpClient, tenant_id: TenantId, @@ -78,10 +86,8 @@ def wait_until_tenant_state( log.debug(f"Tenant {tenant_id} state retrieval failure: {e}") else: log.debug(f"Tenant {tenant_id} data: {tenant}") - if tenant["state"]["slug"] == expected_state: + if _tenant_in_expected_state(tenant, expected_state): return tenant - if tenant["state"]["slug"] == "Broken": - raise RuntimeError(f"tenant became Broken, not {expected_state}") time.sleep(period) @@ -90,6 +96,34 @@ def wait_until_tenant_state( ) +def wait_until_all_tenants_state( + pageserver_http: PageserverHttpClient, + expected_state: str, + iterations: int, + period: float = 1.0, + http_error_ok: bool = True, +): + """ + Like wait_until_tenant_state, but checks all tenants. 
+ """ + for _ in range(iterations): + try: + tenants = pageserver_http.tenant_list() + except Exception as e: + if http_error_ok: + log.debug(f"Failed to list tenants: {e}") + else: + raise + else: + if all(map(lambda tenant: _tenant_in_expected_state(tenant, expected_state), tenants)): + return + time.sleep(period) + + raise Exception( + f"Not all tenants became {expected_state} within {iterations * period} seconds" + ) + + def wait_until_timeline_state( pageserver_http: PageserverHttpClient, tenant_id: TenantId, diff --git a/test_runner/performance/test_pageserver_pagebench.py b/test_runner/performance/test_pageserver_pagebench.py index 55409090ff..8fe35f5f88 100644 --- a/test_runner/performance/test_pageserver_pagebench.py +++ b/test_runner/performance/test_pageserver_pagebench.py @@ -61,11 +61,7 @@ def test_getpage_throughput( zenbenchmark: NeonBenchmarker, pg_bin: PgBin, ): - env, template_timeline, tenants = ( - getpage_throughput_fixture.env, - getpage_throughput_fixture.timeline_id, - getpage_throughput_fixture.tenants, - ) + env = getpage_throughput_fixture.env ps_http = env.pageserver.http_client() # run the benchmark with one client per timeline, each doing 10k requests to random keys. @@ -80,7 +76,8 @@ def test_getpage_throughput( "--runtime", duration, # "--per-target-rate-limit", "50", - *[f"{tenant}/{template_timeline}" for tenant in tenants], + # don't specify the targets, our fixture prepares us exactly 20k tenants, + # and pagebench will auto-discover them ] log.info(f"command: {' '.join(cmd)}") basepath = pg_bin.run_capture(cmd, with_command_header=False)