many_tenants: fix: waiting for all tenants to become active doesn't work with 20k tenants

While at it, also stop returning template_timeline and tenants from the many_tenants fixture's return type. They weren't correctly rehydrated anyway.
2026-05-27 18:10:37 +00:00 · 2024-01-09 16:15:14 +00:00
parent 8c855f4e1f
commit 77efb8b58b
3 changed files with 51 additions and 20 deletions
--- a/test_runner/fixtures/pageserver/many_tenants.py
+++ b/test_runner/fixtures/pageserver/many_tenants.py
@@ -10,7 +10,11 @@ from fixtures.neon_fixtures import (
    NeonEnvBuilder,
    SnapshotDir,
 )
-from fixtures.pageserver.utils import wait_until_tenant_active, wait_until_tenant_state
+from fixtures.pageserver.utils import (
+    wait_until_all_tenants_state,
+    wait_until_tenant_active,
+    wait_until_tenant_state,
+)
 from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
 from fixtures.types import TenantId, TimelineId
 import fixtures.pageserver.remote_storage
@@ -20,8 +24,6 @@ from fixtures import work_queue
@dataclass
 class SingleTimeline:
    env: NeonEnv
-    timeline_id: TimelineId
-    tenants: List[TenantId]


 def single_timeline(
@@ -43,10 +45,6 @@ def single_timeline(
        save_snapshot = False
        env = neon_env_builder.from_repo_dir(snapshot_dir.path)
        ps_http = env.pageserver.http_client()
-        tenants = list(
-            {TenantId(t.name) for t in (snapshot_dir.path.glob("pageserver_*/tenants/*"))}
-        )
-        template_timeline = env.initial_timeline
    else:
        if snapshot_dir.path.exists():
            shutil.rmtree(snapshot_dir.path)
@@ -137,11 +135,13 @@ def single_timeline(

    env.start()

-    log.info(f"wait for tenants to become active")
-    for tenant in tenants:
-        wait_until_tenant_active(ps_http, tenant, iterations=ncopies, period=1)
+    log.info(f"wait for all tenants to become active")
+    wait_until_all_tenants_state(
+        ps_http, "Active", iterations=ncopies, period=1, http_error_ok=False
+    )

    # ensure all layers are resident for predictiable performance
+    tenants = [info["id"] for info in ps_http.tenant_list()]
    for tenant in tenants:
        for timeline in ps_http.tenant_status(tenant)["timelines"]:
            info = ps_http.layer_map_info(tenant, timeline)
@@ -149,4 +149,4 @@ def single_timeline(
                assert not layer.remote

    log.info("ready")
-    return SingleTimeline(env, template_timeline, tenants)
+    return SingleTimeline(env)
--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -61,6 +61,14 @@ def wait_for_upload(
    )


+def _tenant_in_expected_state(tenant_info: Dict[str, Any], expected_state: str):
+    """True if the tenant's state slug is expected_state; raises if it went Broken instead."""
+    slug = tenant_info["state"]["slug"]
+    if slug != expected_state and slug == "Broken":
+        raise RuntimeError(f"tenant became Broken, not {expected_state}")
+    return slug == expected_state
+
+
 def wait_until_tenant_state(
    pageserver_http: PageserverHttpClient,
    tenant_id: TenantId,
@@ -78,10 +86,8 @@ def wait_until_tenant_state(
            log.debug(f"Tenant {tenant_id} state retrieval failure: {e}")
        else:
            log.debug(f"Tenant {tenant_id} data: {tenant}")
-            if tenant["state"]["slug"] == expected_state:
+            if _tenant_in_expected_state(tenant, expected_state):
                return tenant
-            if tenant["state"]["slug"] == "Broken":
-                raise RuntimeError(f"tenant became Broken, not {expected_state}")

        time.sleep(period)

@@ -90,6 +96,34 @@ def wait_until_tenant_state(
    )


+def wait_until_all_tenants_state(
+    pageserver_http: PageserverHttpClient,
+    expected_state: str,
+    iterations: int,
+    period: float = 1.0,
+    http_error_ok: bool = True,
+):
+    """
+    Like wait_until_tenant_state, but polls tenant_list() until every tenant is in expected_state.
+    Makes up to `iterations` attempts, sleeping `period` seconds after each unsuccessful one.
+    A failure to list tenants is logged and retried if http_error_ok, otherwise re-raised.
+    Raises RuntimeError if a tenant is found Broken instead, Exception if the attempts run out.
+    """
+    for _ in range(iterations):
+        try:
+            tenants = pageserver_http.tenant_list()
+        except Exception as e:
+            if not http_error_ok:
+                raise
+            log.debug(f"Failed to list tenants: {e}")
+        else:
+            if all(_tenant_in_expected_state(t, expected_state) for t in tenants):
+                return
+        time.sleep(period)
+    msg = f"Not all tenants became {expected_state} within {iterations * period} seconds"
+    raise Exception(msg)
+
+
 def wait_until_timeline_state(
    pageserver_http: PageserverHttpClient,
    tenant_id: TenantId,
--- a/test_runner/performance/test_pageserver_pagebench.py
+++ b/test_runner/performance/test_pageserver_pagebench.py
@@ -61,11 +61,7 @@ def test_getpage_throughput(
    zenbenchmark: NeonBenchmarker,
    pg_bin: PgBin,
 ):
-    env, template_timeline, tenants = (
-        getpage_throughput_fixture.env,
-        getpage_throughput_fixture.timeline_id,
-        getpage_throughput_fixture.tenants,
-    )
+    env = getpage_throughput_fixture.env
    ps_http = env.pageserver.http_client()

    # run the benchmark with one client per timeline, each doing 10k requests to random keys.
@@ -80,7 +76,8 @@ def test_getpage_throughput(
        "--runtime",
        duration,
        # "--per-target-rate-limit", "50",
-        *[f"{tenant}/{template_timeline}" for tenant in tenants],
+        # don't specify the targets, our fixture prepares us exactly 20k tenants,
+        # and pagebench will auto-discover them
    ]
    log.info(f"command: {' '.join(cmd)}")
    basepath = pg_bin.run_capture(cmd, with_command_header=False)