mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-27 18:10:37 +00:00
many_tenants: fix: waiting for all tenants to become active doesn't work with 20k tenants
While at it, also remove template_timeline and tenants from the many_tenants return type. They weren't correctly rehydrated anyway.
This commit is contained in:
@@ -10,7 +10,11 @@ from fixtures.neon_fixtures import (
|
||||
NeonEnvBuilder,
|
||||
SnapshotDir,
|
||||
)
|
||||
from fixtures.pageserver.utils import wait_until_tenant_active, wait_until_tenant_state
|
||||
from fixtures.pageserver.utils import (
|
||||
wait_until_all_tenants_state,
|
||||
wait_until_tenant_active,
|
||||
wait_until_tenant_state,
|
||||
)
|
||||
from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
|
||||
from fixtures.types import TenantId, TimelineId
|
||||
import fixtures.pageserver.remote_storage
|
||||
@@ -20,8 +24,6 @@ from fixtures import work_queue
|
||||
@dataclass
|
||||
class SingleTimeline:
|
||||
env: NeonEnv
|
||||
timeline_id: TimelineId
|
||||
tenants: List[TenantId]
|
||||
|
||||
|
||||
def single_timeline(
|
||||
@@ -43,10 +45,6 @@ def single_timeline(
|
||||
save_snapshot = False
|
||||
env = neon_env_builder.from_repo_dir(snapshot_dir.path)
|
||||
ps_http = env.pageserver.http_client()
|
||||
tenants = list(
|
||||
{TenantId(t.name) for t in (snapshot_dir.path.glob("pageserver_*/tenants/*"))}
|
||||
)
|
||||
template_timeline = env.initial_timeline
|
||||
else:
|
||||
if snapshot_dir.path.exists():
|
||||
shutil.rmtree(snapshot_dir.path)
|
||||
@@ -137,11 +135,13 @@ def single_timeline(
|
||||
|
||||
env.start()
|
||||
|
||||
log.info(f"wait for tenants to become active")
|
||||
for tenant in tenants:
|
||||
wait_until_tenant_active(ps_http, tenant, iterations=ncopies, period=1)
|
||||
log.info(f"wait for all tenants to become active")
|
||||
wait_until_all_tenants_state(
|
||||
ps_http, "Active", iterations=ncopies, period=1, http_error_ok=False
|
||||
)
|
||||
|
||||
# ensure all layers are resident for predictable performance
|
||||
tenants = [info["id"] for info in ps_http.tenant_list()]
|
||||
for tenant in tenants:
|
||||
for timeline in ps_http.tenant_status(tenant)["timelines"]:
|
||||
info = ps_http.layer_map_info(tenant, timeline)
|
||||
@@ -149,4 +149,4 @@ def single_timeline(
|
||||
assert not layer.remote
|
||||
|
||||
log.info("ready")
|
||||
return SingleTimeline(env, template_timeline, tenants)
|
||||
return SingleTimeline(env)
|
||||
|
||||
@@ -61,6 +61,14 @@ def wait_for_upload(
|
||||
)
|
||||
|
||||
|
||||
def _tenant_in_expected_state(tenant_info: Dict[str, Any], expected_state: str):
|
||||
if tenant_info["state"]["slug"] == expected_state:
|
||||
return True
|
||||
if tenant_info["state"]["slug"] == "Broken":
|
||||
raise RuntimeError(f"tenant became Broken, not {expected_state}")
|
||||
return False
|
||||
|
||||
|
||||
def wait_until_tenant_state(
|
||||
pageserver_http: PageserverHttpClient,
|
||||
tenant_id: TenantId,
|
||||
@@ -78,10 +86,8 @@ def wait_until_tenant_state(
|
||||
log.debug(f"Tenant {tenant_id} state retrieval failure: {e}")
|
||||
else:
|
||||
log.debug(f"Tenant {tenant_id} data: {tenant}")
|
||||
if tenant["state"]["slug"] == expected_state:
|
||||
if _tenant_in_expected_state(tenant, expected_state):
|
||||
return tenant
|
||||
if tenant["state"]["slug"] == "Broken":
|
||||
raise RuntimeError(f"tenant became Broken, not {expected_state}")
|
||||
|
||||
time.sleep(period)
|
||||
|
||||
@@ -90,6 +96,34 @@ def wait_until_tenant_state(
|
||||
)
|
||||
|
||||
|
||||
def wait_until_all_tenants_state(
    pageserver_http: PageserverHttpClient,
    expected_state: str,
    iterations: int,
    period: float = 1.0,
    http_error_ok: bool = True,
):
    """
    Like wait_until_tenant_state, but checks all tenants.

    Calls `pageserver_http.tenant_list()` up to `iterations` times, sleeping
    `period` seconds after each unsuccessful attempt, and returns as soon as
    every listed tenant is in `expected_state`. An empty tenant list satisfies
    the condition immediately.

    If listing the tenants raises, the error is logged and the attempt is
    retried when `http_error_ok` is true; otherwise it is re-raised.

    Raises RuntimeError (from _tenant_in_expected_state) if a tenant is
    "Broken" while a different state is expected, and Exception if the
    tenants did not all reach `expected_state` within the allotted attempts.
    """
    for _ in range(iterations):
        try:
            tenants = pageserver_http.tenant_list()
        except Exception as e:
            if http_error_ok:
                log.debug(f"Failed to list tenants: {e}")
            else:
                raise
        else:
            # Evaluated in the `else` branch so that a "Broken" tenant's
            # RuntimeError is not swallowed by the http_error_ok handling above.
            if all(_tenant_in_expected_state(tenant, expected_state) for tenant in tenants):
                return
        time.sleep(period)

    # The message names the state that was actually awaited; it previously
    # hard-coded "active" in front of expected_state.
    raise Exception(
        f"Not all tenants became {expected_state} within {iterations * period} seconds"
    )
|
||||
|
||||
|
||||
def wait_until_timeline_state(
|
||||
pageserver_http: PageserverHttpClient,
|
||||
tenant_id: TenantId,
|
||||
|
||||
@@ -61,11 +61,7 @@ def test_getpage_throughput(
|
||||
zenbenchmark: NeonBenchmarker,
|
||||
pg_bin: PgBin,
|
||||
):
|
||||
env, template_timeline, tenants = (
|
||||
getpage_throughput_fixture.env,
|
||||
getpage_throughput_fixture.timeline_id,
|
||||
getpage_throughput_fixture.tenants,
|
||||
)
|
||||
env = getpage_throughput_fixture.env
|
||||
ps_http = env.pageserver.http_client()
|
||||
|
||||
# run the benchmark with one client per timeline, each doing 10k requests to random keys.
|
||||
@@ -80,7 +76,8 @@ def test_getpage_throughput(
|
||||
"--runtime",
|
||||
duration,
|
||||
# "--per-target-rate-limit", "50",
|
||||
*[f"{tenant}/{template_timeline}" for tenant in tenants],
|
||||
# don't specify the targets, our fixture prepares us exactly 20k tenants,
|
||||
# and pagebench will auto-discover them
|
||||
]
|
||||
log.info(f"command: {' '.join(cmd)}")
|
||||
basepath = pg_bin.run_capture(cmd, with_command_header=False)
|
||||
|
||||
Reference in New Issue
Block a user