From e7d62a257d8c56e2289733a9890557f9dbff93cb Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 19 Jun 2024 11:55:59 +0100 Subject: [PATCH] test: fix tenant duplication utility generation numbers (#8096) ## Problem We have this set of test utilities which duplicate a tenant by copying everything that's in remote storage and then attaching a tenant to the pageserver and storage controller. When the "copied tenants" are created on the storage controller, they start off from generation number 0. This means that they can't see anything past that generation. This issue has existed ever since generation numbers were introduced, but we've largely been lucky for the generation to stay stable during the template tenant creation. ## Summary of Changes Extend the storage controller debug attach hook to accept a generation override. Use that in the tenant duplication logic to set the generation number to something greater than the naturally reached generation. This allows the tenants to see all layer files. 
--- control_plane/src/storage_controller.rs | 2 ++ storage_controller/src/service.rs | 3 ++- test_runner/fixtures/neon_fixtures.py | 16 ++++++++++++++-- test_runner/fixtures/pageserver/many_tenants.py | 2 ++ 4 files changed, 20 insertions(+), 3 deletions(-) diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index b6b7ea7762..72948e203f 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -46,6 +46,7 @@ const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16; pub struct AttachHookRequest { pub tenant_shard_id: TenantShardId, pub node_id: Option, + pub generation_override: Option, } #[derive(Serialize, Deserialize)] @@ -440,6 +441,7 @@ impl StorageController { let request = AttachHookRequest { tenant_shard_id, node_id: Some(pageserver_id), + generation_override: None, }; let response = self diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index c94af113db..181e262638 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -1234,13 +1234,14 @@ impl Service { let locked = self.inner.write().unwrap(); !locked.tenants.contains_key(&attach_req.tenant_shard_id) }; + if insert { let tsp = TenantShardPersistence { tenant_id: attach_req.tenant_shard_id.tenant_id.to_string(), shard_number: attach_req.tenant_shard_id.shard_number.0 as i32, shard_count: attach_req.tenant_shard_id.shard_count.literal() as i32, shard_stripe_size: 0, - generation: Some(0), + generation: attach_req.generation_override.or(Some(0)), generation_pageserver: None, placement_policy: serde_json::to_string(&PlacementPolicy::Attached(0)).unwrap(), config: serde_json::to_string(&TenantConfig::default()).unwrap(), diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index bad93ff39a..8994db8cf2 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2159,12 +2159,19 @@ class 
NeonStorageController(MetricsGetter, LogUtils): return time.time() - t1 def attach_hook_issue( - self, tenant_shard_id: Union[TenantId, TenantShardId], pageserver_id: int + self, + tenant_shard_id: Union[TenantId, TenantShardId], + pageserver_id: int, + generation_override: Optional[int] = None, ) -> int: + body = {"tenant_shard_id": str(tenant_shard_id), "node_id": pageserver_id} + if generation_override is not None: + body["generation_override"] = generation_override + response = self.request( "POST", f"{self.env.storage_controller_api}/debug/v1/attach-hook", - json={"tenant_shard_id": str(tenant_shard_id), "node_id": pageserver_id}, + json=body, headers=self.headers(TokenScope.ADMIN), ) gen = response.json()["gen"] @@ -2635,6 +2642,7 @@ class NeonPageserver(PgProtocol, LogUtils): config: None | Dict[str, Any] = None, config_null: bool = False, generation: Optional[int] = None, + override_storage_controller_generation: bool = False, ): """ Tenant attachment passes through here to acquire a generation number before proceeding @@ -2643,6 +2651,10 @@ class NeonPageserver(PgProtocol, LogUtils): client = self.http_client() if generation is None: generation = self.env.storage_controller.attach_hook_issue(tenant_id, self.id) + elif override_storage_controller_generation: + generation = self.env.storage_controller.attach_hook_issue( + tenant_id, self.id, generation + ) return client.tenant_attach( tenant_id, config, diff --git a/test_runner/fixtures/pageserver/many_tenants.py b/test_runner/fixtures/pageserver/many_tenants.py index def80a1c3e..8730d8ef75 100644 --- a/test_runner/fixtures/pageserver/many_tenants.py +++ b/test_runner/fixtures/pageserver/many_tenants.py @@ -66,6 +66,8 @@ def single_timeline( env.pageserver.tenant_attach( tenant, config=template_config.copy(), + generation=100, + override_storage_controller_generation=True, ) time.sleep(0.1) wait_until_tenant_state(ps_http, tenant, "Broken", 10)