From 5f4fe8f72ac2c0279d7fc85c01d5cd4c38e4d1af Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Tue, 9 Jan 2024 09:45:37 +0000
Subject: [PATCH] fight various timeouts at high tenant count

---
 test_runner/fixtures/pageserver/many_tenants.py      | 4 ++--
 test_runner/performance/test_pageserver_pagebench.py | 2 ++
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/test_runner/fixtures/pageserver/many_tenants.py b/test_runner/fixtures/pageserver/many_tenants.py
index a0c14f0b60..60d0d17c46 100644
--- a/test_runner/fixtures/pageserver/many_tenants.py
+++ b/test_runner/fixtures/pageserver/many_tenants.py
@@ -90,7 +90,7 @@ def single_timeline(
 
         work_queue.do(22, tenants, attach_broken)
 
-        env.pageserver.stop()  # clears the failpoint as a side-effect
+        env.pageserver.stop(immediate=True)  # clears the failpoint as a side-effect; immediate=True avoids hitting neon_local's shutdown timeout
         tenant_timelines = list(map(lambda tenant: (tenant, template_timeline), tenants))
         log.info(f"python-side on-demand download the layer files into local tenant dir")
         fixtures.pageserver.remote_storage.copy_all_remote_layer_files_to_local_tenant_dir(
@@ -100,7 +100,7 @@ def single_timeline(
 
     log.info(f"wait for tenants to become active")
     for tenant in tenants:
-        wait_until_tenant_active(ps_http, tenant)
+        wait_until_tenant_active(ps_http, tenant, iterations=ncopies, period=1)
 
     # ensure all layers are resident for predictiable performance
     for tenant in tenants:
diff --git a/test_runner/performance/test_pageserver_pagebench.py b/test_runner/performance/test_pageserver_pagebench.py
index e56e7e31bc..8a0d36bc90 100644
--- a/test_runner/performance/test_pageserver_pagebench.py
+++ b/test_runner/performance/test_pageserver_pagebench.py
@@ -92,4 +92,6 @@ def test_getpage_throughput(
 
     log.info(f"Results:\n{json.dumps(results, sort_keys=True, indent=2)}")
 
+    env.pageserver.stop(immediate=True)  # with 20k tenants, we otherwise hit neon_local's shutdown timeout of 10 seconds
+
     zenbenchmark.record_pagebench_results("get-page-latest-lsn", results, duration)