diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index ba1411b615..72ebbafd3b 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -36,7 +36,9 @@ use pageserver_api::config::{
 use pageserver_api::controller_api::{
     NodeAvailabilityWrapper, PlacementPolicy, TenantCreateRequest,
 };
-use pageserver_api::models::{ShardParameters, TimelineCreateRequest, TimelineInfo};
+use pageserver_api::models::{
+    ShardParameters, TenantConfigRequest, TimelineCreateRequest, TimelineInfo,
+};
 use pageserver_api::shard::{ShardCount, ShardStripeSize, TenantShardId};
 use postgres_backend::AuthType;
 use postgres_connection::parse_host_port;
@@ -1129,12 +1131,16 @@ async fn handle_tenant(subcmd: &TenantCmd, env: &mut local_env::LocalEnv) -> any
             let tenant_id = get_tenant_id(args.tenant_id, env)?;
             let tenant_conf: HashMap<_, _> =
                 args.config.iter().flat_map(|c| c.split_once(':')).collect();
+            let config = PageServerNode::parse_config(tenant_conf)?;
 
-            pageserver
-                .tenant_config(tenant_id, tenant_conf)
+            let req = TenantConfigRequest { tenant_id, config };
+
+            let storage_controller = StorageController::from_env(env);
+            storage_controller
+                .set_tenant_config(&req)
                 .await
                 .with_context(|| format!("Tenant config failed for tenant with id {tenant_id}"))?;
-            println!("tenant {tenant_id} successfully configured on the pageserver");
+            println!("tenant {tenant_id} successfully configured via storcon");
         }
     }
     Ok(())
diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs
index 439d7936a7..bbd7f67720 100644
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -14,7 +14,7 @@ use pageserver_api::controller_api::{
     NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest,
     TenantCreateResponse, TenantLocateResponse,
 };
-use pageserver_api::models::{TimelineCreateRequest, TimelineInfo};
+use pageserver_api::models::{TenantConfigRequest, TimelineCreateRequest, TimelineInfo};
 use pageserver_api::shard::TenantShardId;
 use pageserver_client::mgmt_api::ResponseErrorMessageExt;
 use postgres_backend::AuthType;
@@ -878,4 +878,9 @@ impl StorageController {
         )
         .await
     }
+
+    pub async fn set_tenant_config(&self, req: &TenantConfigRequest) -> anyhow::Result<()> {
+        self.dispatch(Method::PUT, "v1/tenant/config".to_string(), Some(req))
+            .await
+    }
 }
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 96b67fa81e..667b53b725 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -2004,21 +2004,41 @@ impl Service {
         tracing::info!("Loaded {} LocationConfigs", configs.tenant_shards.len());
 
         let mut cleanup = Vec::new();
+        let mut mismatched_locations = 0;
 
         {
            let mut locked = self.inner.write().unwrap();
-            for (tenant_shard_id, observed_loc) in configs.tenant_shards {
+            for (tenant_shard_id, reported) in configs.tenant_shards {
                 let Some(tenant_shard) = locked.tenants.get_mut(&tenant_shard_id) else {
                     cleanup.push(tenant_shard_id);
                     continue;
                 };
-                tenant_shard
+
+                let on_record = &mut tenant_shard
                     .observed
                     .locations
-                    .insert(node.get_id(), ObservedStateLocation { conf: observed_loc });
+                    .entry(node.get_id())
+                    .or_insert_with(|| ObservedStateLocation { conf: None })
+                    .conf;
+
+                // If the location reported by the node does not match our observed state,
+                // then we mark it as uncertain and let the background reconciliation loop
+                // deal with it.
+                //
+                // Note that this also covers net new locations reported by the node.
+                if *on_record != reported {
+                    mismatched_locations += 1;
+                    *on_record = None;
+                }
             }
         }
 
+        if mismatched_locations > 0 {
+            tracing::info!(
+                "Set observed state to None for {mismatched_locations} mismatched locations"
+            );
+        }
+
         for tenant_shard_id in cleanup {
             tracing::info!("Detaching {tenant_shard_id}");
             match node
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index 29919f2fe7..5eaf69cfa1 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -1749,18 +1749,23 @@ def test_storage_controller_re_attach(neon_env_builder: NeonEnvBuilder):
     # Restart the failed pageserver
     victim_ps.start()
 
+    env.storage_controller.reconcile_until_idle()
+
     # We expect that the re-attach call correctly tipped off the pageserver that its locations
     # are all secondaries now.
     locations = victim_ps.http_client().tenant_list_locations()["tenant_shards"]
     assert len(locations) == 2
     assert all(loc[1]["mode"] == "Secondary" for loc in locations)
 
-    # We expect that this situation resulted from the re_attach call, and not any explicit
-    # Reconciler runs: assert that the reconciliation count has not gone up since we restarted.
+    # We expect that this situation resulted from background Reconciler runs:
+    # assert that the reconciliation count has gone up by exactly one
+    # for each shard.
     reconciles_after_restart = env.storage_controller.get_metric_value(
         "storage_controller_reconcile_complete_total", filter={"status": "ok"}
     )
-    assert reconciles_after_restart == reconciles_before_restart
+
+    assert reconciles_before_restart is not None
+    assert reconciles_after_restart == reconciles_before_restart + 2
 
 
 def test_storage_controller_shard_scheduling_policy(neon_env_builder: NeonEnvBuilder):
diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py
index 713f89c60f..81e727a3aa 100644
--- a/test_runner/regress/test_tenant_size.py
+++ b/test_runner/regress/test_tenant_size.py
@@ -436,7 +436,7 @@ def test_single_branch_get_tenant_size_grows(
     # when our tenant is configured with a tiny pitr interval, dropping a table should
     # cause synthetic size to go down immediately
     tenant_config["pitr_interval"] = "0s"
-    env.pageserver.http_client().set_tenant_config(tenant_id, tenant_config)
+    env.storage_controller.pageserver_api().set_tenant_config(tenant_id, tenant_config)
     (current_lsn, size) = get_current_consistent_size(
         env, endpoint, size_debug_file, http_client, tenant_id, timeline_id
     )
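For reference, the key idea in the storage_controller/src/service.rs hunk is that a location reported by a re-attaching node is not trusted blindly: if it differs from the controller's recorded observed state, the record is demoted to `conf: None` ("uncertain") so the background reconciliation loop re-evaluates it, which is also why the test now expects one extra reconcile per shard. Below is a minimal standalone sketch of that pattern; NodeId, LocationConf, ObservedStateLocation and apply_reported_location here are simplified stand-ins for illustration, not the real storage controller types.

    // Sketch only: simplified stand-in types, not the real storage controller structs.
    use std::collections::HashMap;

    type NodeId = u64;
    type LocationConf = String; // stand-in for the real location config payload

    #[derive(Default)]
    struct ObservedStateLocation {
        conf: Option<LocationConf>,
    }

    /// Returns true when the reported config did not match the record,
    /// in which case the record is left as `None` ("uncertain").
    fn apply_reported_location(
        observed: &mut HashMap<NodeId, ObservedStateLocation>,
        node_id: NodeId,
        reported: Option<LocationConf>,
    ) -> bool {
        // Net-new locations start with `conf: None`, i.e. already uncertain.
        let on_record = &mut observed
            .entry(node_id)
            .or_insert_with(ObservedStateLocation::default)
            .conf;

        if *on_record != reported {
            // Mismatch: forget the stale record and let background reconciliation decide.
            *on_record = None;
            true
        } else {
            false
        }
    }

    fn main() {
        let mut observed = HashMap::new();
        observed.insert(
            1,
            ObservedStateLocation {
                conf: Some("AttachedSingle".into()),
            },
        );

        // The node now reports a different mode: the record becomes uncertain.
        assert!(apply_reported_location(&mut observed, 1, Some("Secondary".into())));
        assert!(observed[&1].conf.is_none());
    }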