diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 631fdb4923..52c9c4710d 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -5158,34 +5158,38 @@ impl Service { *nodes = Arc::new(nodes_mut); } - for (tenant_shard_id, shard) in tenants { - if shard.deref_node(node_id) { - // FIXME: we need to build a ScheduleContext that reflects this shard's peers, otherwise - // it won't properly do anti-affinity. - let mut schedule_context = ScheduleContext::default(); + for (_tenant_id, mut schedule_context, shards) in + TenantShardContextIterator::new(tenants, ScheduleMode::Normal) + { + for shard in shards { + if shard.deref_node(node_id) { + if let Err(e) = shard.schedule(scheduler, &mut schedule_context) { + // TODO: implement force flag to remove a node even if we can't reschedule + // a tenant + tracing::error!( + "Refusing to delete node, shard {} can't be rescheduled: {e}", + shard.tenant_shard_id + ); + return Err(e.into()); + } else { + tracing::info!( + "Rescheduled shard {} away from node during deletion", + shard.tenant_shard_id + ) + } - if let Err(e) = shard.schedule(scheduler, &mut schedule_context) { - // TODO: implement force flag to remove a node even if we can't reschedule - // a tenant - tracing::error!("Refusing to delete node, shard {tenant_shard_id} can't be rescheduled: {e}"); - return Err(e.into()); - } else { - tracing::info!( - "Rescheduled shard {tenant_shard_id} away from node during deletion" - ) + self.maybe_reconcile_shard(shard, nodes); } - self.maybe_reconcile_shard(shard, nodes); + // Here we remove an existing observed location for the node we're removing, and it will + // not be re-added by a reconciler's completion because we filter out removed nodes in + // process_result. + // + // Note that we update the shard's observed state _after_ calling maybe_reconcile_shard: that + // means any reconciles we spawned will know about the node we're deleting, enabling them + // to do live migrations if it's still online. + shard.observed.locations.remove(&node_id); } - - // Here we remove an existing observed location for the node we're removing, and it will - // not be re-added by a reconciler's completion because we filter out removed nodes in - // process_result. - // - // Note that we update the shard's observed state _after_ calling maybe_reconcile_shard: that - // means any reconciles we spawned will know about the node we're deleting, enabling them - // to do live migrations if it's still online. - shard.observed.locations.remove(&node_id); } scheduler.node_remove(node_id); @@ -6279,6 +6283,14 @@ impl Service { > DOWNLOAD_FRESHNESS_THRESHOLD { tracing::info!("Skipping migration of {tenant_shard_id} to {node} because secondary isn't ready: {progress:?}"); + + #[cfg(feature = "testing")] + if progress.heatmap_mtime.is_none() { + // No heatmap might mean the attached location has never uploaded one, or that + // the secondary download hasn't happened yet. This is relatively unusual in the field, + // but fairly common in tests. + self.kick_secondary_download(tenant_shard_id).await; + } } else { // Location looks ready: proceed tracing::info!( @@ -6293,6 +6305,58 @@ impl Service { validated_work } + /// Some aspects of scheduling optimisation wait for secondary locations to be warm. This + /// happens on multi-minute timescales in the field, which is fine because optimisation is meant + /// to be a lazy background thing. However, when testing, it is not practical to wait around, so + /// we have this helper to move things along faster. + #[cfg(feature = "testing")] + async fn kick_secondary_download(&self, tenant_shard_id: TenantShardId) { + let (attached_node, secondary_node) = { + let locked = self.inner.read().unwrap(); + let Some(shard) = locked.tenants.get(&tenant_shard_id) else { + return; + }; + let (Some(attached), Some(secondary)) = ( + shard.intent.get_attached(), + shard.intent.get_secondary().first(), + ) else { + return; + }; + ( + locked.nodes.get(attached).unwrap().clone(), + locked.nodes.get(secondary).unwrap().clone(), + ) + }; + + // Make remote API calls to upload + download heatmaps: we ignore errors because this is just + // a 'kick' to let scheduling optimisation run more promptly. + attached_node + .with_client_retries( + |client| async move { client.tenant_heatmap_upload(tenant_shard_id).await }, + &self.config.jwt_token, + 3, + 10, + SHORT_RECONCILE_TIMEOUT, + &self.cancel, + ) + .await; + + secondary_node + .with_client_retries( + |client| async move { + client + .tenant_secondary_download(tenant_shard_id, Some(Duration::from_secs(1))) + .await + }, + &self.config.jwt_token, + 3, + 10, + SHORT_RECONCILE_TIMEOUT, + &self.cancel, + ) + .await; + } + /// Look for shards which are oversized and in need of splitting async fn autosplit_tenants(self: &Arc) { let Some(split_threshold) = self.config.split_threshold else { diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index c86ba0d4ea..30abf91d3a 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -519,6 +519,13 @@ def test_sharding_split_smoke( # We will have 2 shards per pageserver once done (including secondaries) neon_env_builder.num_pageservers = split_shard_count + # Two AZs + def assign_az(ps_cfg): + az = f"az-{(ps_cfg['id'] - 1) % 2}" + ps_cfg["availability_zone"] = az + + neon_env_builder.pageserver_config_override = assign_az + # 1MiB stripes: enable getting some meaningful data distribution without # writing large quantities of data in this test. The stripe size is given # in number of 8KiB pages. diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index e93e251b4f..685af5caaf 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -2253,12 +2253,7 @@ def test_storage_controller_node_deletion( assert victim.id not in shard["node_secondary"] # Reconciles running during deletion should all complete - # FIXME: this currently doesn't work because the deletion schedules shards without a proper ScheduleContext, resulting - # in states that background_reconcile wants to optimize, but can't proceed with migrations yet because this is a short3 - # test that hasn't uploaded any heatmaps for secondaries. - # In the interim, just do a reconcile_all to enable the consistency check. - # env.storage_controller.reconcile_until_idle() - env.storage_controller.reconcile_all() + env.storage_controller.reconcile_until_idle() # Controller should pass its own consistency checks env.storage_controller.consistency_check() @@ -2267,7 +2262,6 @@ def test_storage_controller_node_deletion( env.storage_controller.stop() env.storage_controller.start() assert victim.id not in [n["id"] for n in env.storage_controller.node_list()] - env.storage_controller.reconcile_all() # FIXME: workaround for optimizations happening on startup, see FIXME above. env.storage_controller.consistency_check()