From 4c2c8d67081ee7856246e92df68dc13b1009c1a6 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Fri, 1 Nov 2024 12:25:04 +0100 Subject: [PATCH] test_runner: fix `tenant_get_shards` with one pageserver (#9603) ## Problem `tenant_get_shards()` does not work with a sharded tenant on 1 pageserver, as it assumes an unsharded tenant in this case. This special case appears to have been added to handle e.g. `test_emergency_mode`, where the storage controller is stopped. This breaks e.g. the sharded ingest benchmark in #9591 when run with a single shard. ## Summary of changes Correctly look up shards even with a single pageserver, but add a special case that assumes an unsharded tenant if the storage controller is stopped and the caller provides an explicit pageserver, in order to accommodate `test_emergency_mode`. --- test_runner/fixtures/neon_fixtures.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 1b9bc873f4..e4d6e6da5d 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1397,7 +1397,7 @@ def neon_simple_env( pageserver_virtual_file_io_mode: Optional[str], ) -> Iterator[NeonEnv]: """ - Simple Neon environment, with no authentication and no safekeepers. + Simple Neon environment, with 1 safekeeper and 1 pageserver. No authentication, no fsync. This fixture will use RemoteStorageKind.LOCAL_FS with pageserver. """ @@ -4701,6 +4701,7 @@ def tenant_get_shards( If the caller provides `pageserver_id`, it will be used for all shards, even if the shard is indicated by storage controller to be on some other pageserver. + If the storage controller is not running, assume an unsharded tenant.
Caller should over the response to apply their per-pageserver action to each shard @@ -4710,17 +4711,17 @@ def tenant_get_shards( else: override_pageserver = None - if len(env.pageservers) > 1: - return [ - ( - TenantShardId.parse(s["shard_id"]), - override_pageserver or env.get_pageserver(s["node_id"]), - ) - for s in env.storage_controller.locate(tenant_id) - ] - else: - # Assume an unsharded tenant - return [(TenantShardId(tenant_id, 0, 0), override_pageserver or env.pageserver)] + if not env.storage_controller.running and override_pageserver is not None: + log.warning(f"storage controller not running, assuming unsharded tenant {tenant_id}") + return [(TenantShardId(tenant_id, 0, 0), override_pageserver)] + + return [ + ( + TenantShardId.parse(s["shard_id"]), + override_pageserver or env.get_pageserver(s["node_id"]), + ) + for s in env.storage_controller.locate(tenant_id) + ] def wait_replica_caughtup(primary: Endpoint, secondary: Endpoint):