Mirror of https://github.com/neondatabase/neon.git, synced 2026-01-07 21:42:56 +00:00
storcon: in shard splits, inherit parent's AZ (#9946)
## Problem

Sharded tenants should run within a single AZ for best performance, so that computes have AZ-local latency to all the shards.

Part of https://github.com/neondatabase/neon/issues/8264

## Summary of changes

- When we split a tenant, propagate the preferred AZ from the parent to each child shard, instead of updating each shard's preferred AZ to wherever it happens to be scheduled.
- Drop the check in `test_shard_preferred_azs` that asserts shards end up in their preferred AZ: this will not hold again until the optimize_attachment logic is updated to make it so. The existing check wasn't testing anything about scheduling; it only asserted that we set the preferred AZ to match wherever shards happened to be scheduled at the time of the split.
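For orientation, a minimal sketch of the intended behavior, using simplified stand-in types (a hypothetical `ShardState`, not the controller's real `TenantShard`/`ShardSplitParams`): every child produced by a split copies the parent's preferred AZ instead of adopting the AZ of whichever pageserver it happens to be scheduled on.

```rust
// Sketch only: simplified stand-ins for the controller's shard state.
// Child shard ID derivation (TenantShardId::split) is not modeled here.

#[derive(Clone, Debug, PartialEq)]
struct AvailabilityZone(String);

#[derive(Clone, Debug)]
struct ShardState {
    preferred_az: Option<AvailabilityZone>,
}

/// Build the child shards for a split: every child inherits the parent's
/// preferred AZ (which may still be `None` for tenants that never had one set).
fn split_children(parent: &ShardState, child_count: usize) -> Vec<ShardState> {
    (0..child_count)
        .map(|_| ShardState {
            preferred_az: parent.preferred_az.clone(),
        })
        .collect()
}

fn main() {
    let parent = ShardState {
        preferred_az: Some(AvailabilityZone("eu-west-1a".to_string())),
    };
    let children = split_children(&parent, 4);
    // All children share the parent's AZ, so a compute attached to this tenant
    // only needs AZ-local round trips to reach every shard.
    assert!(children.iter().all(|c| c.preferred_az == parent.preferred_az));
    println!("children inherit {:?}", parent.preferred_az);
}
```

The actual change below threads the parent's AZ through `ShardSplitParams` and applies it both to the in-memory child state and to the persisted child shard records.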
```diff
@@ -44,12 +44,12 @@ use futures::{stream::FuturesUnordered, StreamExt};
 use itertools::Itertools;
 use pageserver_api::{
     controller_api::{
-        MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability, NodeRegisterRequest,
-        NodeSchedulingPolicy, NodeShard, NodeShardResponse, PlacementPolicy, ShardSchedulingPolicy,
-        ShardsPreferredAzsRequest, ShardsPreferredAzsResponse, TenantCreateRequest,
-        TenantCreateResponse, TenantCreateResponseShard, TenantDescribeResponse,
-        TenantDescribeResponseShard, TenantLocateResponse, TenantPolicyRequest,
-        TenantShardMigrateRequest, TenantShardMigrateResponse,
+        AvailabilityZone, MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability,
+        NodeRegisterRequest, NodeSchedulingPolicy, NodeShard, NodeShardResponse, PlacementPolicy,
+        ShardSchedulingPolicy, ShardsPreferredAzsRequest, ShardsPreferredAzsResponse,
+        TenantCreateRequest, TenantCreateResponse, TenantCreateResponseShard,
+        TenantDescribeResponse, TenantDescribeResponseShard, TenantLocateResponse,
+        TenantPolicyRequest, TenantShardMigrateRequest, TenantShardMigrateResponse,
     },
     models::{
         SecondaryProgress, TenantConfigRequest, TimelineArchivalConfigRequest,
```
```diff
@@ -468,6 +468,7 @@ struct ShardSplitParams {
     policy: PlacementPolicy,
     config: TenantConfig,
     shard_ident: ShardIdentity,
+    preferred_az_id: Option<AvailabilityZone>,
 }
 
 // When preparing for a shard split, we may either choose to proceed with the split,
```
```diff
@@ -4103,7 +4104,7 @@ impl Service {
         for parent_id in parent_ids {
             let child_ids = parent_id.split(new_shard_count);
 
-            let (pageserver, generation, policy, parent_ident, config) = {
+            let (pageserver, generation, policy, parent_ident, config, preferred_az) = {
                 let mut old_state = tenants
                     .remove(&parent_id)
                     .expect("It was present, we just split it");
```
```diff
@@ -4122,6 +4123,7 @@ impl Service {
                     old_state.policy.clone(),
                     old_state.shard,
                     old_state.config.clone(),
+                    old_state.preferred_az().cloned(),
                 )
             };
 
```
```diff
@@ -4154,6 +4156,9 @@ impl Service {
             };
             child_state.generation = Some(generation);
             child_state.config = config.clone();
+            if let Some(preferred_az) = &preferred_az {
+                child_state.set_preferred_az(preferred_az.clone());
+            }
 
             // The child's TenantShard::splitting is intentionally left at the default value of Idle,
             // as at this point in the split process we have succeeded and this part is infallible:
```
```diff
@@ -4346,6 +4351,7 @@ impl Service {
         let mut policy = None;
         let mut config = None;
         let mut shard_ident = None;
+        let mut preferred_az_id = None;
         // Validate input, and calculate which shards we will create
         let (old_shard_count, targets) =
             {
```
```diff
@@ -4404,6 +4410,9 @@ impl Service {
                 if config.is_none() {
                     config = Some(shard.config.clone());
                 }
+                if preferred_az_id.is_none() {
+                    preferred_az_id = shard.preferred_az().cloned();
+                }
 
                 if tenant_shard_id.shard_count.count() == split_req.new_shard_count {
                     tracing::info!(
```
```diff
@@ -4474,6 +4483,7 @@ impl Service {
             policy,
             config,
             shard_ident,
+            preferred_az_id,
         })))
     }
 
```
```diff
@@ -4496,6 +4506,7 @@ impl Service {
             policy,
             config,
             shard_ident,
+            preferred_az_id,
         } = *params;
 
         // Drop any secondary locations: pageservers do not support splitting these, and in any case the
```
```diff
@@ -4569,7 +4580,7 @@ impl Service {
                 // Scheduling policies and preferred AZ do not carry through to children
                 scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default())
                     .unwrap(),
-                preferred_az_id: None,
+                preferred_az_id: preferred_az_id.as_ref().map(|az| az.0.clone()),
             });
         }
 
```
```diff
@@ -4689,47 +4700,6 @@ impl Service {
         let (response, child_locations, waiters) =
             self.tenant_shard_split_commit_inmem(tenant_id, new_shard_count, new_stripe_size);
-
-        // Now that we have scheduled the child shards, attempt to set their preferred AZ
-        // to that of the pageserver they've been attached on.
-        let preferred_azs = {
-            let locked = self.inner.read().unwrap();
-            child_locations
-                .iter()
-                .filter_map(|(tid, node_id, _stripe_size)| {
-                    let az_id = locked
-                        .nodes
-                        .get(node_id)
-                        .map(|n| n.get_availability_zone_id().clone())?;
-
-                    Some((*tid, az_id))
-                })
-                .collect::<Vec<_>>()
-        };
-
-        let updated = self
-            .persistence
-            .set_tenant_shard_preferred_azs(preferred_azs)
-            .await
-            .map_err(|err| {
-                ApiError::InternalServerError(anyhow::anyhow!(
-                    "Failed to persist preferred az ids: {err}"
-                ))
-            });
-
-        match updated {
-            Ok(updated) => {
-                let mut locked = self.inner.write().unwrap();
-                for (tid, az_id) in updated {
-                    if let Some(shard) = locked.tenants.get_mut(&tid) {
-                        shard.set_preferred_az(az_id);
-                    }
-                }
-            }
-            Err(err) => {
-                tracing::warn!("Failed to persist preferred AZs after split: {err}");
-            }
-        }
 
         // Send compute notifications for all the new shards
         let mut failed_notifications = Vec::new();
         for (child_id, child_ps, stripe_size) in child_locations {
```