mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-28 10:30:40 +00:00
storage controller: quality of life improvements for AZ handling (#10379)
## Problem

Since https://github.com/neondatabase/neon/pull/9916, the preferred AZ of a tenant is much more impactful, and we would like to make it more visible in tooling.

## Summary of changes

- Include AZ in node describe API
- Include AZ info in node & tenant outputs in CLI
- Add metrics for per-node shard counts, labelled by AZ
- Add a CLI for setting preferred AZ on a tenant
- Extend AZ-setting API+CLI to handle None for clearing preferred AZ
This commit is contained in:
@@ -53,6 +53,16 @@ pub(crate) struct StorageControllerMetricGroup {
|
||||
/// How many shards are not scheduled into their preferred AZ
|
||||
pub(crate) storage_controller_schedule_az_violation: measured::Gauge,
|
||||
|
||||
/// How many shard locations (secondary or attached) on each node
|
||||
pub(crate) storage_controller_node_shards: measured::GaugeVec<NodeLabelGroupSet>,
|
||||
|
||||
/// How many _attached_ shard locations on each node
|
||||
pub(crate) storage_controller_node_attached_shards: measured::GaugeVec<NodeLabelGroupSet>,
|
||||
|
||||
/// How many _home_ shard locations on each node (i.e. the node's AZ matches the shard's
|
||||
/// preferred AZ)
|
||||
pub(crate) storage_controller_node_home_shards: measured::GaugeVec<NodeLabelGroupSet>,
|
||||
|
||||
/// How many shards would like to reconcile but were blocked by concurrency limits
|
||||
pub(crate) storage_controller_pending_reconciles: measured::Gauge,
|
||||
|
||||
@@ -132,6 +142,15 @@ impl Default for StorageControllerMetrics {
|
||||
}
|
||||
}
|
||||
|
||||
/// Label group attached to the per-node shard-count gauges
/// (`storage_controller_node_shards`, `storage_controller_node_attached_shards`
/// and `storage_controller_node_home_shards`): every series is keyed by the
/// node's availability zone and its node ID.
///
/// Both labels are borrowed string slices, so the caller must keep the backing
/// strings alive for as long as the label group is in use.
#[derive(measured::LabelGroup, Clone)]
|
||||
#[label(set = NodeLabelGroupSet)]
|
||||
pub(crate) struct NodeLabelGroup<'a> {
|
||||
// Availability zone of the node the sample describes.
#[label(dynamic_with = lasso::ThreadedRodeo, default)]
|
||||
pub(crate) az: &'a str,
|
||||
// ID of the node, rendered as a string.
#[label(dynamic_with = lasso::ThreadedRodeo, default)]
|
||||
pub(crate) node_id: &'a str,
|
||||
}
|
||||
|
||||
#[derive(measured::LabelGroup)]
|
||||
#[label(set = ReconcileCompleteLabelGroupSet)]
|
||||
pub(crate) struct ReconcileCompleteLabelGroup {
|
||||
|
||||
@@ -299,6 +299,7 @@ impl Node {
|
||||
id: self.id,
|
||||
availability: self.availability.clone().into(),
|
||||
scheduling: self.scheduling,
|
||||
availability_zone_id: self.availability_zone_id.0.clone(),
|
||||
listen_http_addr: self.listen_http_addr.clone(),
|
||||
listen_http_port: self.listen_http_port,
|
||||
listen_pg_addr: self.listen_pg_addr.clone(),
|
||||
|
||||
@@ -708,10 +708,11 @@ impl Persistence {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Note that passing None for a shard clears the preferred AZ (rather than leaving it unmodified)
|
||||
pub(crate) async fn set_tenant_shard_preferred_azs(
|
||||
&self,
|
||||
preferred_azs: Vec<(TenantShardId, AvailabilityZone)>,
|
||||
) -> DatabaseResult<Vec<(TenantShardId, AvailabilityZone)>> {
|
||||
preferred_azs: Vec<(TenantShardId, Option<AvailabilityZone>)>,
|
||||
) -> DatabaseResult<Vec<(TenantShardId, Option<AvailabilityZone>)>> {
|
||||
use crate::schema::tenant_shards::dsl::*;
|
||||
|
||||
self.with_measured_conn(DatabaseOperation::SetPreferredAzs, move |conn| {
|
||||
@@ -722,7 +723,7 @@ impl Persistence {
|
||||
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
|
||||
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
|
||||
.filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32))
|
||||
.set(preferred_az_id.eq(preferred_az.0.clone()))
|
||||
.set(preferred_az_id.eq(preferred_az.as_ref().map(|az| az.0.clone())))
|
||||
.execute(conn)?;
|
||||
|
||||
if updated == 1 {
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use crate::{node::Node, tenant_shard::TenantShard};
|
||||
use crate::{metrics::NodeLabelGroup, node::Node, tenant_shard::TenantShard};
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::{controller_api::AvailabilityZone, models::PageserverUtilization};
|
||||
use serde::Serialize;
|
||||
@@ -872,6 +872,33 @@ impl Scheduler {
|
||||
/// Look up the number of attached shard locations the scheduler has recorded
/// for `node_id`.
///
/// # Panics
///
/// Panics if `node_id` is not present in the scheduler's node map.
pub(crate) fn get_node_attached_shard_count(&self, node_id: NodeId) -> usize {
    let scheduler_node = self.nodes.get(&node_id).unwrap();
    scheduler_node.attached_shard_count
}
|
||||
|
||||
/// Some metrics that we only calculate periodically: this is simpler than
|
||||
/// rigorously updating them on every change.
|
||||
pub(crate) fn update_metrics(&self) {
|
||||
for (node_id, node) in &self.nodes {
|
||||
let node_id_str = format!("{}", node_id);
|
||||
let label_group = NodeLabelGroup {
|
||||
az: &node.az.0,
|
||||
node_id: &node_id_str,
|
||||
};
|
||||
|
||||
crate::metrics::METRICS_REGISTRY
|
||||
.metrics_group
|
||||
.storage_controller_node_shards
|
||||
.set(label_group.clone(), node.shard_count as i64);
|
||||
|
||||
crate::metrics::METRICS_REGISTRY
|
||||
.metrics_group
|
||||
.storage_controller_node_attached_shards
|
||||
.set(label_group.clone(), node.attached_shard_count as i64);
|
||||
|
||||
crate::metrics::METRICS_REGISTRY
|
||||
.metrics_group
|
||||
.storage_controller_node_home_shards
|
||||
.set(label_group.clone(), node.home_shard_count as i64);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -2517,7 +2517,7 @@ impl Service {
|
||||
.map(|t| {
|
||||
(
|
||||
t.get_tenant_shard_id().expect("Corrupt shard in database"),
|
||||
load_in_az.clone(),
|
||||
Some(load_in_az.clone()),
|
||||
)
|
||||
})
|
||||
.collect(),
|
||||
@@ -6390,7 +6390,7 @@ impl Service {
|
||||
/// available. A return value of 0 indicates that everything is fully reconciled already.
|
||||
fn reconcile_all(&self) -> usize {
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
let (nodes, tenants, _scheduler) = locked.parts_mut();
|
||||
let (nodes, tenants, scheduler) = locked.parts_mut();
|
||||
let pageservers = nodes.clone();
|
||||
|
||||
// This function is an efficient place to update lazy statistics, since we are walking
|
||||
@@ -6451,6 +6451,9 @@ impl Service {
|
||||
}
|
||||
}
|
||||
|
||||
// Some metrics are calculated from SchedulerNode state, update these periodically
|
||||
scheduler.update_metrics();
|
||||
|
||||
// Process any deferred tenant drops
|
||||
for (tenant_id, guard) in drop_detached_tenants {
|
||||
self.maybe_drop_tenant(tenant_id, &mut locked, &guard);
|
||||
@@ -6509,7 +6512,7 @@ impl Service {
|
||||
// Shard was dropped between planning and execution;
|
||||
continue;
|
||||
};
|
||||
tracing::info!("Applying optimization: {optimization:?}");
|
||||
tracing::info!(tenant_shard_id=%tenant_shard_id, "Applying optimization: {optimization:?}");
|
||||
if shard.apply_optimization(scheduler, optimization) {
|
||||
optimizations_applied += 1;
|
||||
if self.maybe_reconcile_shard(shard, nodes).is_some() {
|
||||
|
||||
@@ -1708,8 +1708,8 @@ impl TenantShard {
|
||||
self.intent.preferred_az_id.as_ref()
|
||||
}
|
||||
|
||||
pub(crate) fn set_preferred_az(&mut self, preferred_az_id: AvailabilityZone) {
|
||||
self.intent.preferred_az_id = Some(preferred_az_id);
|
||||
/// Overwrite the preferred AZ stored in this shard's intent state.
///
/// The given value replaces whatever was stored before, so passing `None`
/// clears the preference rather than leaving it unmodified.
pub(crate) fn set_preferred_az(&mut self, preferred_az_id: Option<AvailabilityZone>) {
|
||||
self.intent.preferred_az_id = preferred_az_id;
|
||||
}
|
||||
|
||||
/// Returns all the nodes to which this tenant shard is attached according to the
|
||||
|
||||
Reference in New Issue
Block a user