mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-31 12:00:42 +00:00
storcon: use proper schedule context during node delete (#9958)
## Problem I was touching `test_storage_controller_node_deletion` because for AZ scheduling work I was adding a change to the storage controller (kick secondaries during optimisation) that made a FIXME in this test defunct. While looking at it I also realized that we can easily fix the way node deletion currently doesn't use a proper ScheduleContext, using the iterator type recently added for that purpose. ## Summary of changes - A testing-only behavior in storage controller where if a secondary location isn't yet ready during optimisation, it will be actively polled. - Remove workaround in `test_storage_controller_node_deletion` that previously was needed because optimisation would get stuck on cold secondaries. - Update node deletion code to use a `TenantShardContextIterator` and thereby a proper ScheduleContext
This commit is contained in:
@@ -5158,34 +5158,38 @@ impl Service {
|
||||
*nodes = Arc::new(nodes_mut);
|
||||
}
|
||||
|
||||
for (tenant_shard_id, shard) in tenants {
|
||||
if shard.deref_node(node_id) {
|
||||
// FIXME: we need to build a ScheduleContext that reflects this shard's peers, otherwise
|
||||
// it won't properly do anti-affinity.
|
||||
let mut schedule_context = ScheduleContext::default();
|
||||
for (_tenant_id, mut schedule_context, shards) in
|
||||
TenantShardContextIterator::new(tenants, ScheduleMode::Normal)
|
||||
{
|
||||
for shard in shards {
|
||||
if shard.deref_node(node_id) {
|
||||
if let Err(e) = shard.schedule(scheduler, &mut schedule_context) {
|
||||
// TODO: implement force flag to remove a node even if we can't reschedule
|
||||
// a tenant
|
||||
tracing::error!(
|
||||
"Refusing to delete node, shard {} can't be rescheduled: {e}",
|
||||
shard.tenant_shard_id
|
||||
);
|
||||
return Err(e.into());
|
||||
} else {
|
||||
tracing::info!(
|
||||
"Rescheduled shard {} away from node during deletion",
|
||||
shard.tenant_shard_id
|
||||
)
|
||||
}
|
||||
|
||||
if let Err(e) = shard.schedule(scheduler, &mut schedule_context) {
|
||||
// TODO: implement force flag to remove a node even if we can't reschedule
|
||||
// a tenant
|
||||
tracing::error!("Refusing to delete node, shard {tenant_shard_id} can't be rescheduled: {e}");
|
||||
return Err(e.into());
|
||||
} else {
|
||||
tracing::info!(
|
||||
"Rescheduled shard {tenant_shard_id} away from node during deletion"
|
||||
)
|
||||
self.maybe_reconcile_shard(shard, nodes);
|
||||
}
|
||||
|
||||
self.maybe_reconcile_shard(shard, nodes);
|
||||
// Here we remove an existing observed location for the node we're removing, and it will
|
||||
// not be re-added by a reconciler's completion because we filter out removed nodes in
|
||||
// process_result.
|
||||
//
|
||||
// Note that we update the shard's observed state _after_ calling maybe_reconcile_shard: that
|
||||
// means any reconciles we spawned will know about the node we're deleting, enabling them
|
||||
// to do live migrations if it's still online.
|
||||
shard.observed.locations.remove(&node_id);
|
||||
}
|
||||
|
||||
// Here we remove an existing observed location for the node we're removing, and it will
|
||||
// not be re-added by a reconciler's completion because we filter out removed nodes in
|
||||
// process_result.
|
||||
//
|
||||
// Note that we update the shard's observed state _after_ calling maybe_reconcile_shard: that
|
||||
// means any reconciles we spawned will know about the node we're deleting, enabling them
|
||||
// to do live migrations if it's still online.
|
||||
shard.observed.locations.remove(&node_id);
|
||||
}
|
||||
|
||||
scheduler.node_remove(node_id);
|
||||
@@ -6279,6 +6283,14 @@ impl Service {
|
||||
> DOWNLOAD_FRESHNESS_THRESHOLD
|
||||
{
|
||||
tracing::info!("Skipping migration of {tenant_shard_id} to {node} because secondary isn't ready: {progress:?}");
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
if progress.heatmap_mtime.is_none() {
|
||||
// No heatmap might mean the attached location has never uploaded one, or that
|
||||
// the secondary download hasn't happened yet. This is relatively unusual in the field,
|
||||
// but fairly common in tests.
|
||||
self.kick_secondary_download(tenant_shard_id).await;
|
||||
}
|
||||
} else {
|
||||
// Location looks ready: proceed
|
||||
tracing::info!(
|
||||
@@ -6293,6 +6305,58 @@ impl Service {
|
||||
validated_work
|
||||
}
|
||||
|
||||
/// Some aspects of scheduling optimisation wait for secondary locations to be warm. This
|
||||
/// happens on multi-minute timescales in the field, which is fine because optimisation is meant
|
||||
/// to be a lazy background thing. However, when testing, it is not practical to wait around, so
|
||||
/// we have this helper to move things along faster.
|
||||
#[cfg(feature = "testing")]
|
||||
async fn kick_secondary_download(&self, tenant_shard_id: TenantShardId) {
|
||||
let (attached_node, secondary_node) = {
|
||||
let locked = self.inner.read().unwrap();
|
||||
let Some(shard) = locked.tenants.get(&tenant_shard_id) else {
|
||||
return;
|
||||
};
|
||||
let (Some(attached), Some(secondary)) = (
|
||||
shard.intent.get_attached(),
|
||||
shard.intent.get_secondary().first(),
|
||||
) else {
|
||||
return;
|
||||
};
|
||||
(
|
||||
locked.nodes.get(attached).unwrap().clone(),
|
||||
locked.nodes.get(secondary).unwrap().clone(),
|
||||
)
|
||||
};
|
||||
|
||||
// Make remote API calls to upload + download heatmaps: we ignore errors because this is just
|
||||
// a 'kick' to let scheduling optimisation run more promptly.
|
||||
attached_node
|
||||
.with_client_retries(
|
||||
|client| async move { client.tenant_heatmap_upload(tenant_shard_id).await },
|
||||
&self.config.jwt_token,
|
||||
3,
|
||||
10,
|
||||
SHORT_RECONCILE_TIMEOUT,
|
||||
&self.cancel,
|
||||
)
|
||||
.await;
|
||||
|
||||
secondary_node
|
||||
.with_client_retries(
|
||||
|client| async move {
|
||||
client
|
||||
.tenant_secondary_download(tenant_shard_id, Some(Duration::from_secs(1)))
|
||||
.await
|
||||
},
|
||||
&self.config.jwt_token,
|
||||
3,
|
||||
10,
|
||||
SHORT_RECONCILE_TIMEOUT,
|
||||
&self.cancel,
|
||||
)
|
||||
.await;
|
||||
}
|
||||
|
||||
/// Look for shards which are oversized and in need of splitting
|
||||
async fn autosplit_tenants(self: &Arc<Self>) {
|
||||
let Some(split_threshold) = self.config.split_threshold else {
|
||||
|
||||
@@ -519,6 +519,13 @@ def test_sharding_split_smoke(
|
||||
# We will have 2 shards per pageserver once done (including secondaries)
|
||||
neon_env_builder.num_pageservers = split_shard_count
|
||||
|
||||
# Two AZs
|
||||
def assign_az(ps_cfg):
|
||||
az = f"az-{(ps_cfg['id'] - 1) % 2}"
|
||||
ps_cfg["availability_zone"] = az
|
||||
|
||||
neon_env_builder.pageserver_config_override = assign_az
|
||||
|
||||
# 1MiB stripes: enable getting some meaningful data distribution without
|
||||
# writing large quantities of data in this test. The stripe size is given
|
||||
# in number of 8KiB pages.
|
||||
|
||||
@@ -2253,12 +2253,7 @@ def test_storage_controller_node_deletion(
|
||||
assert victim.id not in shard["node_secondary"]
|
||||
|
||||
# Reconciles running during deletion should all complete
|
||||
# FIXME: this currently doesn't work because the deletion schedules shards without a proper ScheduleContext, resulting
|
||||
# in states that background_reconcile wants to optimize, but can't proceed with migrations yet because this is a short3
|
||||
# test that hasn't uploaded any heatmaps for secondaries.
|
||||
# In the interim, just do a reconcile_all to enable the consistency check.
|
||||
# env.storage_controller.reconcile_until_idle()
|
||||
env.storage_controller.reconcile_all()
|
||||
env.storage_controller.reconcile_until_idle()
|
||||
|
||||
# Controller should pass its own consistency checks
|
||||
env.storage_controller.consistency_check()
|
||||
@@ -2267,7 +2262,6 @@ def test_storage_controller_node_deletion(
|
||||
env.storage_controller.stop()
|
||||
env.storage_controller.start()
|
||||
assert victim.id not in [n["id"] for n in env.storage_controller.node_list()]
|
||||
env.storage_controller.reconcile_all() # FIXME: workaround for optimizations happening on startup, see FIXME above.
|
||||
env.storage_controller.consistency_check()
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user