storcon: in shard splits, inherit parent's AZ (#9946)

## Problem

Sharded tenants should run within a single AZ for best performance, so
that computes have AZ-local latency to all of their shards.

Part of https://github.com/neondatabase/neon/issues/8264

## Summary of changes

- When we split a tenant, instead of updating each shard's preferred AZ
to wherever it happens to be scheduled, propagate the preferred AZ from
the parent (see the sketch below).
- Drop the check in `test_shard_preferred_azs` that asserts shards end
up in their preferred AZ: this will not hold again until the
`optimize_attachment` logic is updated to make it so. The existing check
wasn't testing anything about scheduling; it only asserted that we set
the preferred AZ to match wherever shards happened to be scheduled at
the time of the split.
Author: John Spray (committed by GitHub)
Date: 2024-12-03 16:55:00 +00:00
Commit: 71d004289c (parent: 4d422b937c)
2 changed files with 24 additions and 50 deletions

@@ -44,12 +44,12 @@ use futures::{stream::FuturesUnordered, StreamExt};
use itertools::Itertools;
use pageserver_api::{
controller_api::{
-MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability, NodeRegisterRequest,
-NodeSchedulingPolicy, NodeShard, NodeShardResponse, PlacementPolicy, ShardSchedulingPolicy,
-ShardsPreferredAzsRequest, ShardsPreferredAzsResponse, TenantCreateRequest,
-TenantCreateResponse, TenantCreateResponseShard, TenantDescribeResponse,
-TenantDescribeResponseShard, TenantLocateResponse, TenantPolicyRequest,
-TenantShardMigrateRequest, TenantShardMigrateResponse,
+AvailabilityZone, MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability,
+NodeRegisterRequest, NodeSchedulingPolicy, NodeShard, NodeShardResponse, PlacementPolicy,
+ShardSchedulingPolicy, ShardsPreferredAzsRequest, ShardsPreferredAzsResponse,
+TenantCreateRequest, TenantCreateResponse, TenantCreateResponseShard,
+TenantDescribeResponse, TenantDescribeResponseShard, TenantLocateResponse,
+TenantPolicyRequest, TenantShardMigrateRequest, TenantShardMigrateResponse,
},
models::{
SecondaryProgress, TenantConfigRequest, TimelineArchivalConfigRequest,
@@ -468,6 +468,7 @@ struct ShardSplitParams {
policy: PlacementPolicy,
config: TenantConfig,
shard_ident: ShardIdentity,
+preferred_az_id: Option<AvailabilityZone>,
}
// When preparing for a shard split, we may either choose to proceed with the split,
@@ -4103,7 +4104,7 @@ impl Service {
for parent_id in parent_ids {
let child_ids = parent_id.split(new_shard_count);
-let (pageserver, generation, policy, parent_ident, config) = {
+let (pageserver, generation, policy, parent_ident, config, preferred_az) = {
let mut old_state = tenants
.remove(&parent_id)
.expect("It was present, we just split it");
@@ -4122,6 +4123,7 @@ impl Service {
old_state.policy.clone(),
old_state.shard,
old_state.config.clone(),
+old_state.preferred_az().cloned(),
)
};
@@ -4154,6 +4156,9 @@ impl Service {
};
child_state.generation = Some(generation);
child_state.config = config.clone();
+if let Some(preferred_az) = &preferred_az {
+child_state.set_preferred_az(preferred_az.clone());
+}
// The child's TenantShard::splitting is intentionally left at the default value of Idle,
// as at this point in the split process we have succeeded and this part is infallible:
@@ -4346,6 +4351,7 @@ impl Service {
let mut policy = None;
let mut config = None;
let mut shard_ident = None;
+let mut preferred_az_id = None;
// Validate input, and calculate which shards we will create
let (old_shard_count, targets) =
{
@@ -4404,6 +4410,9 @@ impl Service {
if config.is_none() {
config = Some(shard.config.clone());
}
+if preferred_az_id.is_none() {
+preferred_az_id = shard.preferred_az().cloned();
+}
if tenant_shard_id.shard_count.count() == split_req.new_shard_count {
tracing::info!(
@@ -4474,6 +4483,7 @@ impl Service {
policy,
config,
shard_ident,
+preferred_az_id,
})))
}
@@ -4496,6 +4506,7 @@ impl Service {
policy,
config,
shard_ident,
+preferred_az_id,
} = *params;
// Drop any secondary locations: pageservers do not support splitting these, and in any case the
@@ -4569,7 +4580,7 @@ impl Service {
// Scheduling policies and preferred AZ do not carry through to children
scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default())
.unwrap(),
-preferred_az_id: None,
+preferred_az_id: preferred_az_id.as_ref().map(|az| az.0.clone()),
});
}
@@ -4689,47 +4700,6 @@ impl Service {
let (response, child_locations, waiters) =
self.tenant_shard_split_commit_inmem(tenant_id, new_shard_count, new_stripe_size);
-// Now that we have scheduled the child shards, attempt to set their preferred AZ
-// to that of the pageserver they've been attached on.
-let preferred_azs = {
-let locked = self.inner.read().unwrap();
-child_locations
-.iter()
-.filter_map(|(tid, node_id, _stripe_size)| {
-let az_id = locked
-.nodes
-.get(node_id)
-.map(|n| n.get_availability_zone_id().clone())?;
-Some((*tid, az_id))
-})
-.collect::<Vec<_>>()
-};
-let updated = self
-.persistence
-.set_tenant_shard_preferred_azs(preferred_azs)
-.await
-.map_err(|err| {
-ApiError::InternalServerError(anyhow::anyhow!(
-"Failed to persist preferred az ids: {err}"
-))
-});
-match updated {
-Ok(updated) => {
-let mut locked = self.inner.write().unwrap();
-for (tid, az_id) in updated {
-if let Some(shard) = locked.tenants.get_mut(&tid) {
-shard.set_preferred_az(az_id);
-}
-}
-}
-Err(err) => {
-tracing::warn!("Failed to persist preferred AZs after split: {err}");
-}
-}
// Send compute notifications for all the new shards
let mut failed_notifications = Vec::new();
for (child_id, child_ps, stripe_size) in child_locations {