storcon: use proper schedule context during node delete (#9958)

## Problem

I was touching `test_storage_controller_node_deletion` because for AZ
scheduling work I was adding a change to the storage controller (kick
secondaries during optimisation) that made a FIXME in this test defunct.
While looking at it I also realized that we can easily fix the way node
deletion currently doesn't use a proper ScheduleContext, using the
iterator type recently added for that purpose.

## Summary of changes

- A testing-only behavior in storage controller where if a secondary
location isn't yet ready during optimisation, it will be actively
polled.
- Remove workaround in `test_storage_controller_node_deletion` that
previously was needed because optimisation would get stuck on cold
secondaries.
- Update node deletion code to use a `TenantShardContextIterator` and
thereby a proper ScheduleContext
This commit is contained in:
John Spray
2024-12-03 08:59:38 +00:00
committed by Ivan Efremov
parent c4987b0b13
commit 4a488b3e24
3 changed files with 96 additions and 31 deletions

View File

@@ -5158,34 +5158,38 @@ impl Service {
*nodes = Arc::new(nodes_mut);
}
for (tenant_shard_id, shard) in tenants {
if shard.deref_node(node_id) {
// FIXME: we need to build a ScheduleContext that reflects this shard's peers, otherwise
// it won't properly do anti-affinity.
let mut schedule_context = ScheduleContext::default();
for (_tenant_id, mut schedule_context, shards) in
TenantShardContextIterator::new(tenants, ScheduleMode::Normal)
{
for shard in shards {
if shard.deref_node(node_id) {
if let Err(e) = shard.schedule(scheduler, &mut schedule_context) {
// TODO: implement force flag to remove a node even if we can't reschedule
// a tenant
tracing::error!(
"Refusing to delete node, shard {} can't be rescheduled: {e}",
shard.tenant_shard_id
);
return Err(e.into());
} else {
tracing::info!(
"Rescheduled shard {} away from node during deletion",
shard.tenant_shard_id
)
}
if let Err(e) = shard.schedule(scheduler, &mut schedule_context) {
// TODO: implement force flag to remove a node even if we can't reschedule
// a tenant
tracing::error!("Refusing to delete node, shard {tenant_shard_id} can't be rescheduled: {e}");
return Err(e.into());
} else {
tracing::info!(
"Rescheduled shard {tenant_shard_id} away from node during deletion"
)
self.maybe_reconcile_shard(shard, nodes);
}
self.maybe_reconcile_shard(shard, nodes);
// Here we remove an existing observed location for the node we're removing, and it will
// not be re-added by a reconciler's completion because we filter out removed nodes in
// process_result.
//
// Note that we update the shard's observed state _after_ calling maybe_reconcile_shard: that
// means any reconciles we spawned will know about the node we're deleting, enabling them
// to do live migrations if it's still online.
shard.observed.locations.remove(&node_id);
}
// Here we remove an existing observed location for the node we're removing, and it will
// not be re-added by a reconciler's completion because we filter out removed nodes in
// process_result.
//
// Note that we update the shard's observed state _after_ calling maybe_reconcile_shard: that
// means any reconciles we spawned will know about the node we're deleting, enabling them
// to do live migrations if it's still online.
shard.observed.locations.remove(&node_id);
}
scheduler.node_remove(node_id);
@@ -6279,6 +6283,14 @@ impl Service {
> DOWNLOAD_FRESHNESS_THRESHOLD
{
tracing::info!("Skipping migration of {tenant_shard_id} to {node} because secondary isn't ready: {progress:?}");
#[cfg(feature = "testing")]
if progress.heatmap_mtime.is_none() {
// No heatmap might mean the attached location has never uploaded one, or that
// the secondary download hasn't happened yet. This is relatively unusual in the field,
// but fairly common in tests.
self.kick_secondary_download(tenant_shard_id).await;
}
} else {
// Location looks ready: proceed
tracing::info!(
@@ -6293,6 +6305,58 @@ impl Service {
validated_work
}
/// Some aspects of scheduling optimisation wait for secondary locations to be warm. This
/// happens on multi-minute timescales in the field, which is fine because optimisation is meant
/// to be a lazy background thing. However, when testing, it is not practical to wait around, so
/// we have this helper to move things along faster.
#[cfg(feature = "testing")]
async fn kick_secondary_download(&self, tenant_shard_id: TenantShardId) {
let (attached_node, secondary_node) = {
let locked = self.inner.read().unwrap();
let Some(shard) = locked.tenants.get(&tenant_shard_id) else {
return;
};
let (Some(attached), Some(secondary)) = (
shard.intent.get_attached(),
shard.intent.get_secondary().first(),
) else {
return;
};
(
locked.nodes.get(attached).unwrap().clone(),
locked.nodes.get(secondary).unwrap().clone(),
)
};
// Make remote API calls to upload + download heatmaps: we ignore errors because this is just
// a 'kick' to let scheduling optimisation run more promptly.
attached_node
.with_client_retries(
|client| async move { client.tenant_heatmap_upload(tenant_shard_id).await },
&self.config.jwt_token,
3,
10,
SHORT_RECONCILE_TIMEOUT,
&self.cancel,
)
.await;
secondary_node
.with_client_retries(
|client| async move {
client
.tenant_secondary_download(tenant_shard_id, Some(Duration::from_secs(1)))
.await
},
&self.config.jwt_token,
3,
10,
SHORT_RECONCILE_TIMEOUT,
&self.cancel,
)
.await;
}
/// Look for shards which are oversized and in need of splitting
async fn autosplit_tenants(self: &Arc<Self>) {
let Some(split_threshold) = self.config.split_threshold else {

View File

@@ -519,6 +519,13 @@ def test_sharding_split_smoke(
# We will have 2 shards per pageserver once done (including secondaries)
neon_env_builder.num_pageservers = split_shard_count
# Two AZs
def assign_az(ps_cfg):
az = f"az-{(ps_cfg['id'] - 1) % 2}"
ps_cfg["availability_zone"] = az
neon_env_builder.pageserver_config_override = assign_az
# 1MiB stripes: enable getting some meaningful data distribution without
# writing large quantities of data in this test. The stripe size is given
# in number of 8KiB pages.

View File

@@ -2253,12 +2253,7 @@ def test_storage_controller_node_deletion(
assert victim.id not in shard["node_secondary"]
# Reconciles running during deletion should all complete
# FIXME: this currently doesn't work because the deletion schedules shards without a proper ScheduleContext, resulting
# in states that background_reconcile wants to optimize, but can't proceed with migrations yet because this is a short3
# test that hasn't uploaded any heatmaps for secondaries.
# In the interim, just do a reconcile_all to enable the consistency check.
# env.storage_controller.reconcile_until_idle()
env.storage_controller.reconcile_all()
env.storage_controller.reconcile_until_idle()
# Controller should pass its own consistency checks
env.storage_controller.consistency_check()
@@ -2267,7 +2262,6 @@ def test_storage_controller_node_deletion(
env.storage_controller.stop()
env.storage_controller.start()
assert victim.id not in [n["id"] for n in env.storage_controller.node_list()]
env.storage_controller.reconcile_all() # FIXME: workaround for optimizations happening on startup, see FIXME above.
env.storage_controller.consistency_check()