many_tenants: fix: waiting for all tenants to become active doesn't work with 20k tenants

While at it, also remove the returning of template_timeline and tenants
from the many_tenants return type. It wasn't correctly rehydrated
anyway.
This commit is contained in:
Christian Schwarz
2024-01-09 16:15:14 +00:00
parent 8c855f4e1f
commit 77efb8b58b
3 changed files with 51 additions and 20 deletions

View File

@@ -10,7 +10,11 @@ from fixtures.neon_fixtures import (
NeonEnvBuilder,
SnapshotDir,
)
from fixtures.pageserver.utils import wait_until_tenant_active, wait_until_tenant_state
from fixtures.pageserver.utils import (
wait_until_all_tenants_state,
wait_until_tenant_active,
wait_until_tenant_state,
)
from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
from fixtures.types import TenantId, TimelineId
import fixtures.pageserver.remote_storage
@@ -20,8 +24,6 @@ from fixtures import work_queue
@dataclass
class SingleTimeline:
env: NeonEnv
timeline_id: TimelineId
tenants: List[TenantId]
def single_timeline(
@@ -43,10 +45,6 @@ def single_timeline(
save_snapshot = False
env = neon_env_builder.from_repo_dir(snapshot_dir.path)
ps_http = env.pageserver.http_client()
tenants = list(
{TenantId(t.name) for t in (snapshot_dir.path.glob("pageserver_*/tenants/*"))}
)
template_timeline = env.initial_timeline
else:
if snapshot_dir.path.exists():
shutil.rmtree(snapshot_dir.path)
@@ -137,11 +135,13 @@ def single_timeline(
env.start()
log.info(f"wait for tenants to become active")
for tenant in tenants:
wait_until_tenant_active(ps_http, tenant, iterations=ncopies, period=1)
log.info(f"wait for all tenants to become active")
wait_until_all_tenants_state(
ps_http, "Active", iterations=ncopies, period=1, http_error_ok=False
)
# ensure all layers are resident for predictable performance
tenants = [info["id"] for info in ps_http.tenant_list()]
for tenant in tenants:
for timeline in ps_http.tenant_status(tenant)["timelines"]:
info = ps_http.layer_map_info(tenant, timeline)
@@ -149,4 +149,4 @@ def single_timeline(
assert not layer.remote
log.info("ready")
return SingleTimeline(env, template_timeline, tenants)
return SingleTimeline(env)

View File

@@ -61,6 +61,14 @@ def wait_for_upload(
)
def _tenant_in_expected_state(tenant_info: Dict[str, Any], expected_state: str):
    """
    Tell whether a tenant has reached `expected_state`.

    `tenant_info` is a tenant description whose current state name is read
    from `tenant_info["state"]["slug"]`.

    Returns True when the slug equals `expected_state`, False otherwise.

    Raises RuntimeError when the tenant is "Broken" and that is not the state
    being waited for, so that pollers fail fast instead of waiting on it.
    """
    slug = tenant_info["state"]["slug"]
    reached = slug == expected_state
    # Only treat "Broken" as fatal when it is not the state the caller asked for.
    if not reached and slug == "Broken":
        raise RuntimeError(f"tenant became Broken, not {expected_state}")
    return reached
def wait_until_tenant_state(
pageserver_http: PageserverHttpClient,
tenant_id: TenantId,
@@ -78,10 +86,8 @@ def wait_until_tenant_state(
log.debug(f"Tenant {tenant_id} state retrieval failure: {e}")
else:
log.debug(f"Tenant {tenant_id} data: {tenant}")
if tenant["state"]["slug"] == expected_state:
if _tenant_in_expected_state(tenant, expected_state):
return tenant
if tenant["state"]["slug"] == "Broken":
raise RuntimeError(f"tenant became Broken, not {expected_state}")
time.sleep(period)
@@ -90,6 +96,34 @@ def wait_until_tenant_state(
)
def wait_until_all_tenants_state(
    pageserver_http: PageserverHttpClient,
    expected_state: str,
    iterations: int,
    period: float = 1.0,
    http_error_ok: bool = True,
):
    """
    Like wait_until_tenant_state, but checks all tenants.

    Calls `pageserver_http.tenant_list()` up to `iterations` times, sleeping
    `period` seconds after every attempt that did not succeed, and returns
    (None) as soon as every listed tenant is in `expected_state`. An empty
    tenant list counts as success.

    If listing the tenants raises, the error is logged at debug level and the
    attempt is retried when `http_error_ok` is true; otherwise the error is
    re-raised immediately.

    Raises:
        RuntimeError: propagated from the per-tenant check when it encounters
            a tenant that is "Broken" while waiting for a different state.
        Exception: if not all tenants reached `expected_state` within
            `iterations` attempts.
    """
    for _ in range(iterations):
        try:
            tenants = pageserver_http.tenant_list()
        except Exception as e:
            if not http_error_ok:
                raise
            log.debug(f"Failed to list tenants: {e}")
        else:
            # all() short-circuits on the first tenant that is not there yet.
            if all(_tenant_in_expected_state(tenant, expected_state) for tenant in tenants):
                return
        time.sleep(period)
    raise Exception(
        f"Not all tenants became {expected_state} within {iterations * period} seconds"
    )
def wait_until_timeline_state(
pageserver_http: PageserverHttpClient,
tenant_id: TenantId,

View File

@@ -61,11 +61,7 @@ def test_getpage_throughput(
zenbenchmark: NeonBenchmarker,
pg_bin: PgBin,
):
env, template_timeline, tenants = (
getpage_throughput_fixture.env,
getpage_throughput_fixture.timeline_id,
getpage_throughput_fixture.tenants,
)
env = getpage_throughput_fixture.env
ps_http = env.pageserver.http_client()
# run the benchmark with one client per timeline, each doing 10k requests to random keys.
@@ -80,7 +76,8 @@ def test_getpage_throughput(
"--runtime",
duration,
# "--per-target-rate-limit", "50",
*[f"{tenant}/{template_timeline}" for tenant in tenants],
# don't specify the targets, our fixture prepares us exactly 20k tenants,
# and pagebench will auto-discover them
]
log.info(f"command: {' '.join(cmd)}")
basepath = pg_bin.run_capture(cmd, with_command_header=False)