From 3c8565a1941b3ad4cfb0c1de068d7e4bec7baf0a Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Fri, 11 Apr 2025 13:31:12 +0200 Subject: [PATCH] test_runner: propagate config via `attach_hook` for test fix (#11529) ## Problem The `pagebench` benchmarks set up an initial dataset by creating a template tenant, copying the remote storage to a bunch of new tenants, and attaching them to Pageservers. In #11420, we found that `test_pageserver_characterize_throughput_with_n_tenants` had degraded performance because it set a custom tenant config in Pageservers that was then replaced with the default tenant config by the storage controller. The initial fix was to register the tenants directly in the storage controller, but this created the tenants with generation 1. This broke `test_basebackup_with_high_slru_count`, where the template tenant was at generation 2, leading to all layer files at generation 2 being ignored. Resolves #11485. Touches #11381. ## Summary of changes This patch addresses both test issues by modifying `attach_hook` to also take a custom tenant config. This allows attaching tenants to Pageservers from pre-existing remote storage, specifying both the generation and tenant config when registering them in the storage controller. --- control_plane/src/storage_controller.rs | 8 +++++-- storage_controller/src/service.rs | 24 ++++++++++--------- test_runner/fixtures/neon_fixtures.py | 12 ++++++---- .../fixtures/pageserver/many_tenants.py | 12 ++++------ .../pagebench/test_large_slru_basebackup.py | 12 ++++------ 5 files changed, 36 insertions(+), 32 deletions(-) diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index 8000576e87..a4b56ae5c0 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -13,7 +13,9 @@ use pageserver_api::controller_api::{ NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest, TenantCreateResponse, TenantLocateResponse, }; -use pageserver_api::models::{TenantConfigRequest, TimelineCreateRequest, TimelineInfo}; +use pageserver_api::models::{ + TenantConfig, TenantConfigRequest, TimelineCreateRequest, TimelineInfo, +}; use pageserver_api::shard::TenantShardId; use pageserver_client::mgmt_api::ResponseErrorMessageExt; use postgres_backend::AuthType; @@ -82,7 +84,8 @@ impl NeonStorageControllerStopArgs { pub struct AttachHookRequest { pub tenant_shard_id: TenantShardId, pub node_id: Option, - pub generation_override: Option, + pub generation_override: Option, // only new tenants + pub config: Option, // only new tenants } #[derive(Serialize, Deserialize)] @@ -805,6 +808,7 @@ impl StorageController { tenant_shard_id, node_id: Some(pageserver_id), generation_override: None, + config: None, }; let response = self diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 4790f80162..0982e56155 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -1852,6 +1852,7 @@ impl Service { }; if insert { + let config = attach_req.config.clone().unwrap_or_default(); let tsp = TenantShardPersistence { tenant_id: attach_req.tenant_shard_id.tenant_id.to_string(), shard_number: attach_req.tenant_shard_id.shard_number.0 as i32, @@ -1860,7 +1861,7 @@ impl Service { generation: attach_req.generation_override.or(Some(0)), generation_pageserver: None, placement_policy: serde_json::to_string(&PlacementPolicy::Attached(0)).unwrap(), - config: serde_json::to_string(&TenantConfig::default()).unwrap(), + config: serde_json::to_string(&config).unwrap(), splitting: SplitState::default(), scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default()) .unwrap(), @@ -1883,16 +1884,16 @@ impl Service { Ok(()) => { tracing::info!("Inserted shard {} in database", attach_req.tenant_shard_id); - let mut locked = self.inner.write().unwrap(); - locked.tenants.insert( + let mut shard = TenantShard::new( attach_req.tenant_shard_id, - TenantShard::new( - attach_req.tenant_shard_id, - ShardIdentity::unsharded(), - PlacementPolicy::Attached(0), - None, - ), + ShardIdentity::unsharded(), + PlacementPolicy::Attached(0), + None, ); + shard.config = config; + + let mut locked = self.inner.write().unwrap(); + locked.tenants.insert(attach_req.tenant_shard_id, shard); tracing::info!("Inserted shard {} in memory", attach_req.tenant_shard_id); } } @@ -1977,11 +1978,12 @@ impl Service { .set_attached(scheduler, attach_req.node_id); tracing::info!( - "attach_hook: tenant {} set generation {:?}, pageserver {}", + "attach_hook: tenant {} set generation {:?}, pageserver {}, config {:?}", attach_req.tenant_shard_id, tenant_shard.generation, // TODO: this is an odd number of 0xf's - attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff)) + attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff)), + attach_req.config, ); // Trick the reconciler into not doing anything for this tenant: this helps diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 75a0596f58..9d4068b583 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1986,10 +1986,13 @@ class NeonStorageController(MetricsGetter, LogUtils): tenant_shard_id: TenantId | TenantShardId, pageserver_id: int, generation_override: int | None = None, + config: None | dict[str, Any] = None, ) -> int: body = {"tenant_shard_id": str(tenant_shard_id), "node_id": pageserver_id} if generation_override is not None: body["generation_override"] = generation_override + if config is not None: + body["config"] = config response = self.request( "POST", @@ -2980,11 +2983,12 @@ class NeonPageserver(PgProtocol, LogUtils): to call into the pageserver HTTP client. """ client = self.http_client() - if generation is None: - generation = self.env.storage_controller.attach_hook_issue(tenant_id, self.id) - elif override_storage_controller_generation: + if generation is None or override_storage_controller_generation: generation = self.env.storage_controller.attach_hook_issue( - tenant_id, self.id, generation + tenant_id, + self.id, + generation_override=generation if override_storage_controller_generation else None, + config=config, ) return client.tenant_attach( tenant_id, diff --git a/test_runner/fixtures/pageserver/many_tenants.py b/test_runner/fixtures/pageserver/many_tenants.py index eedb693e3d..71c750b9eb 100644 --- a/test_runner/fixtures/pageserver/many_tenants.py +++ b/test_runner/fixtures/pageserver/many_tenants.py @@ -65,13 +65,11 @@ def single_timeline( assert ps_http.tenant_list() == [] def attach(tenant): - # NB: create the new tenant in the storage controller with the correct tenant config. This - # will pick up the existing tenant data from remote storage. If we just attach it to the - # Pageserver, the storage controller will reset the tenant config to the default. - env.create_tenant( - tenant_id=tenant, - timeline_id=template_timeline, - conf=template_config, + env.pageserver.tenant_attach( + tenant, + config=template_config, + generation=100, + override_storage_controller_generation=True, ) with concurrent.futures.ThreadPoolExecutor(max_workers=22) as executor: diff --git a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py index efd423104d..8af52dcbd0 100644 --- a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py +++ b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py @@ -66,11 +66,11 @@ def test_basebackup_with_high_slru_count( n_txns = 500000 - def setup_wrapper(env: NeonEnv): - return setup_tenant_template(env, n_txns) - env = setup_pageserver_with_tenants( - neon_env_builder, f"large_slru_count-{n_tenants}-{n_txns}", n_tenants, setup_wrapper + neon_env_builder, + f"large_slru_count-{n_tenants}-{n_txns}", + n_tenants, + lambda env: setup_tenant_template(env, n_txns), ) run_benchmark(env, pg_bin, record, duration) @@ -80,10 +80,6 @@ def setup_tenant_template(env: NeonEnv, n_txns: int): "gc_period": "0s", # disable periodic gc "checkpoint_timeout": "10 years", "compaction_period": "0s", # disable periodic compaction - "compaction_threshold": 10, - "compaction_target_size": 134217728, - "checkpoint_distance": 268435456, - "image_creation_threshold": 3, } template_tenant, template_timeline = env.create_tenant(set_default=True)