diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs
index 1653b3c845..9d133e4af1 100644
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -1,12 +1,16 @@
 use futures::StreamExt;
-use std::{str::FromStr, time::Duration};
+use std::{
+    collections::{HashMap, HashSet},
+    str::FromStr,
+    time::Duration,
+};
 
 use clap::{Parser, Subcommand};
 use pageserver_api::{
     controller_api::{
         AvailabilityZone, NodeAvailabilityWrapper, NodeDescribeResponse, NodeShardResponse,
-        SafekeeperDescribeResponse, ShardSchedulingPolicy, TenantCreateRequest,
-        TenantDescribeResponse, TenantPolicyRequest,
+        SafekeeperDescribeResponse, ShardSchedulingPolicy, ShardsPreferredAzsRequest,
+        TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest,
     },
     models::{
         EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary,
@@ -153,6 +157,12 @@ enum Command {
         #[arg(long)]
         tenant_id: TenantId,
     },
+    TenantSetPreferredAz {
+        #[arg(long)]
+        tenant_id: TenantId,
+        #[arg(long)]
+        preferred_az: Option<String>,
+    },
     /// Uncleanly drop a tenant from the storage controller: this doesn't delete anything from pageservers. Appropriate
     /// if you e.g. used `tenant-warmup` by mistake on a tenant ID that doesn't really exist, or is in some other region.
     TenantDrop {
@@ -402,11 +412,12 @@ async fn main() -> anyhow::Result<()> {
 
             resp.sort_by(|a, b| a.listen_http_addr.cmp(&b.listen_http_addr));
             let mut table = comfy_table::Table::new();
-            table.set_header(["Id", "Hostname", "Scheduling", "Availability"]);
+            table.set_header(["Id", "Hostname", "AZ", "Scheduling", "Availability"]);
             for node in resp {
                 table.add_row([
                     format!("{}", node.id),
                     node.listen_http_addr,
+                    node.availability_zone_id,
                     format!("{:?}", node.scheduling),
                     format!("{:?}", node.availability),
                 ]);
@@ -479,6 +490,7 @@ async fn main() -> anyhow::Result<()> {
             let mut table = comfy_table::Table::new();
             table.set_header([
                 "TenantId",
+                "Preferred AZ",
                 "ShardCount",
                 "StripeSize",
                 "Placement",
@@ -488,6 +500,11 @@ async fn main() -> anyhow::Result<()> {
                 let shard_zero = tenant.shards.into_iter().next().unwrap();
                 table.add_row([
                     format!("{}", tenant.tenant_id),
+                    shard_zero
+                        .preferred_az_id
+                        .as_ref()
+                        .cloned()
+                        .unwrap_or("".to_string()),
                     format!("{}", shard_zero.tenant_shard_id.shard_count.literal()),
                     format!("{:?}", tenant.stripe_size),
                     format!("{:?}", tenant.policy),
@@ -614,6 +631,19 @@ async fn main() -> anyhow::Result<()> {
                     None,
                 )
                 .await?;
+
+            let nodes = storcon_client
+                .dispatch::<(), Vec<NodeDescribeResponse>>(
+                    Method::GET,
+                    "control/v1/node".to_string(),
+                    None,
+                )
+                .await?;
+            let nodes = nodes
+                .into_iter()
+                .map(|n| (n.id, n))
+                .collect::<HashMap<_, _>>();
+
             println!("Tenant {tenant_id}");
             let mut table = comfy_table::Table::new();
             table.add_row(["Policy", &format!("{:?}", policy)]);
@@ -622,7 +652,14 @@ async fn main() -> anyhow::Result<()> {
             println!("{table}");
             println!("Shards:");
             let mut table = comfy_table::Table::new();
-            table.set_header(["Shard", "Attached", "Secondary", "Last error", "status"]);
+            table.set_header([
+                "Shard",
+                "Attached",
+                "Attached AZ",
+                "Secondary",
+                "Last error",
+                "status",
+            ]);
             for shard in shards {
                 let secondary = shard
                     .node_secondary
@@ -645,11 +682,18 @@ async fn main() -> anyhow::Result<()> {
                 }
 
                 let status = status_parts.join(",");
+                let attached_node = shard
+                    .node_attached
+                    .as_ref()
+                    .map(|id| nodes.get(id).expect("Shard references nonexistent node"));
+
                 table.add_row([
                     format!("{}", shard.tenant_shard_id),
-                    shard
-                        .node_attached
-                        .map(|n| format!("{}", n))
-                        .unwrap_or(String::new()),
+                    attached_node
+                        .map(|n| format!("{} ({})", n.listen_http_addr, n.id))
+                        .unwrap_or(String::new()),
+                    attached_node
+                        .map(|n| n.availability_zone_id.clone())
+                        .unwrap_or(String::new()),
                     secondary,
                     shard.last_error,
@@ -658,6 +702,66 @@ async fn main() -> anyhow::Result<()> {
             }
             println!("{table}");
         }
+        Command::TenantSetPreferredAz {
+            tenant_id,
+            preferred_az,
+        } => {
+            // First learn about the tenant's shards
+            let describe_response = storcon_client
+                .dispatch::<(), TenantDescribeResponse>(
+                    Method::GET,
+                    format!("control/v1/tenant/{tenant_id}"),
+                    None,
+                )
+                .await?;
+
+            // Learn about nodes to validate the AZ ID
+            let nodes = storcon_client
+                .dispatch::<(), Vec<NodeDescribeResponse>>(
+                    Method::GET,
+                    "control/v1/node".to_string(),
+                    None,
+                )
+                .await?;
+
+            if let Some(preferred_az) = &preferred_az {
+                let azs = nodes
+                    .into_iter()
+                    .map(|n| (n.availability_zone_id))
+                    .collect::<HashSet<_>>();
+                if !azs.contains(preferred_az) {
+                    anyhow::bail!(
+                        "AZ {} not found on any node: known AZs are: {:?}",
+                        preferred_az,
+                        azs
+                    );
+                }
+            } else {
+                // Make it obvious to the user that since they've omitted an AZ, we're clearing it
+                eprintln!("Clearing preferred AZ for tenant {}", tenant_id);
+            }
+
+            // Construct a request that modifies all the tenant's shards
+            let req = ShardsPreferredAzsRequest {
+                preferred_az_ids: describe_response
+                    .shards
+                    .into_iter()
+                    .map(|s| {
+                        (
+                            s.tenant_shard_id,
+                            preferred_az.clone().map(AvailabilityZone),
+                        )
+                    })
+                    .collect(),
+            };
+            storcon_client
+                .dispatch::<ShardsPreferredAzsRequest, ()>(
+                    Method::PUT,
+                    "control/v1/preferred_azs".to_string(),
+                    Some(req),
+                )
+                .await?;
+        }
         Command::TenantWarmup { tenant_id } => {
             let describe_response = storcon_client
                 .dispatch::<(), TenantDescribeResponse>(
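Usage note (not part of the patch): a hedged sketch of driving the new subcommand from Rust via std::process::Command. The subcommand and flag names assume clap's default kebab-case mapping of TenantSetPreferredAz { tenant_id, preferred_az }; the binary name and the tenant/AZ values are placeholders, and any global connection flags the CLI needs are omitted.

// Illustrative only: invoking the new subcommand from a test or helper binary.
// Subcommand/flag names assume clap's default kebab-case mapping; the binary
// name is a placeholder and global connection flags are omitted here.
use std::process::Command;

fn main() -> std::io::Result<()> {
    // Pin every shard of the tenant to a preferred AZ...
    let status = Command::new("storcon_cli")
        .args([
            "tenant-set-preferred-az",
            "--tenant-id",
            "1f359dd625e519a1a4e8d7509690f6fc", // hypothetical tenant ID
            "--preferred-az",
            "us-east-2b", // must match some node's availability_zone_id
        ])
        .status()?;
    assert!(status.success());

    // ...and clear it again by omitting --preferred-az (the command prints a
    // "Clearing preferred AZ" notice in that case).
    let status = Command::new("storcon_cli")
        .args([
            "tenant-set-preferred-az",
            "--tenant-id",
            "1f359dd625e519a1a4e8d7509690f6fc",
        ])
        .status()?;
    assert!(status.success());
    Ok(())
}
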
diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs
index f3aefc6df9..f3880cb766 100644
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -87,7 +87,7 @@ impl Display for AvailabilityZone {
 #[derive(Serialize, Deserialize)]
 pub struct ShardsPreferredAzsRequest {
     #[serde(flatten)]
-    pub preferred_az_ids: HashMap<TenantShardId, AvailabilityZone>,
+    pub preferred_az_ids: HashMap<TenantShardId, Option<AvailabilityZone>>,
 }
 
 #[derive(Serialize, Deserialize)]
@@ -144,6 +144,8 @@ pub struct NodeDescribeResponse {
     pub availability: NodeAvailabilityWrapper,
     pub scheduling: NodeSchedulingPolicy,
 
+    pub availability_zone_id: String,
+
     pub listen_http_addr: String,
     pub listen_http_port: u16,
 
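Example (not part of the patch): because ShardsPreferredAzsRequest flattens a map whose values are now Option<AvailabilityZone>, an explicit JSON null clears a shard's preference while a bare string keeps its old meaning, so existing callers remain compatible. A minimal round-trip sketch using String stand-ins rather than the real TenantShardId/AvailabilityZone types:

// Simplified sketch of the widened request type; the struct and key/value
// types here are stand-ins, not the real pageserver_api definitions.
use std::collections::HashMap;

use serde::{Deserialize, Serialize};

#[derive(Serialize, Deserialize, Debug)]
struct ShardsPreferredAzsRequest {
    // `flatten` lifts the entries to the top level: {"<tenant_shard_id>": "<az>" | null, ...}
    #[serde(flatten)]
    preferred_az_ids: HashMap<String, Option<String>>,
}

fn main() {
    // Callers that still send a plain string deserialize as Some(az); only an
    // explicit null maps to None (i.e. "clear the preferred AZ").
    let body = r#"{"shard-a":"us-east-2b","shard-b":null}"#;
    let req: ShardsPreferredAzsRequest = serde_json::from_str(body).unwrap();
    assert_eq!(req.preferred_az_ids["shard-a"].as_deref(), Some("us-east-2b"));
    assert_eq!(req.preferred_az_ids["shard-b"], None);

    // Serializing produces the same flattened shape (entry order not guaranteed).
    println!("{}", serde_json::to_string(&req).unwrap());
}
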
diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs
index 6d5885eba6..4164e3dc2b 100644
--- a/storage_controller/src/metrics.rs
+++ b/storage_controller/src/metrics.rs
@@ -53,6 +53,16 @@ pub(crate) struct StorageControllerMetricGroup {
     /// How many shards are not scheduled into their preferred AZ
     pub(crate) storage_controller_schedule_az_violation: measured::Gauge,
 
+    /// How many shard locations (secondary or attached) on each node
+    pub(crate) storage_controller_node_shards: measured::GaugeVec<NodeLabelGroupSet>,
+
+    /// How many _attached_ shard locations on each node
+    pub(crate) storage_controller_node_attached_shards: measured::GaugeVec<NodeLabelGroupSet>,
+
+    /// How many _home_ shard locations on each node (i.e. the node's AZ matches the shard's
+    /// preferred AZ)
+    pub(crate) storage_controller_node_home_shards: measured::GaugeVec<NodeLabelGroupSet>,
+
     /// How many shards would like to reconcile but were blocked by concurrency limits
     pub(crate) storage_controller_pending_reconciles: measured::Gauge,
 
@@ -132,6 +142,15 @@ impl Default for StorageControllerMetrics {
     }
 }
 
+#[derive(measured::LabelGroup, Clone)]
+#[label(set = NodeLabelGroupSet)]
+pub(crate) struct NodeLabelGroup<'a> {
+    #[label(dynamic_with = lasso::ThreadedRodeo, default)]
+    pub(crate) az: &'a str,
+    #[label(dynamic_with = lasso::ThreadedRodeo, default)]
+    pub(crate) node_id: &'a str,
+}
+
 #[derive(measured::LabelGroup)]
 #[label(set = ReconcileCompleteLabelGroupSet)]
 pub(crate) struct ReconcileCompleteLabelGroup {
diff --git a/storage_controller/src/node.rs b/storage_controller/src/node.rs
index 4cc9b0070d..f5c2d329e0 100644
--- a/storage_controller/src/node.rs
+++ b/storage_controller/src/node.rs
@@ -299,6 +299,7 @@ impl Node {
             id: self.id,
             availability: self.availability.clone().into(),
             scheduling: self.scheduling,
+            availability_zone_id: self.availability_zone_id.0.clone(),
             listen_http_addr: self.listen_http_addr.clone(),
             listen_http_port: self.listen_http_port,
diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs
index beb014f0a8..eb0bfc879e 100644
--- a/storage_controller/src/persistence.rs
+++ b/storage_controller/src/persistence.rs
@@ -708,10 +708,11 @@ impl Persistence {
         Ok(())
     }
 
+    /// Note that passing None for a shard clears the preferred AZ (rather than leaving it unmodified)
     pub(crate) async fn set_tenant_shard_preferred_azs(
         &self,
-        preferred_azs: Vec<(TenantShardId, AvailabilityZone)>,
-    ) -> DatabaseResult<Vec<(TenantShardId, AvailabilityZone)>> {
+        preferred_azs: Vec<(TenantShardId, Option<AvailabilityZone>)>,
+    ) -> DatabaseResult<Vec<(TenantShardId, Option<AvailabilityZone>)>> {
         use crate::schema::tenant_shards::dsl::*;
 
         self.with_measured_conn(DatabaseOperation::SetPreferredAzs, move |conn| {
@@ -722,7 +723,7 @@ impl Persistence {
                     .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
                     .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
                     .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32))
-                    .set(preferred_az_id.eq(preferred_az.0.clone()))
+                    .set(preferred_az_id.eq(preferred_az.as_ref().map(|az| az.0.clone())))
                     .execute(conn)?;
 
                 if updated == 1 {
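Example (not part of the patch): the new doc comment on set_tenant_shard_preferred_azs captures the key behaviour, namely that a shard listed with None has its preferred AZ cleared, while a shard absent from the request is left untouched. A toy model of those semantics, with a plain HashMap standing in for the database:

// Toy model of the update semantics; the real implementation is a diesel
// UPDATE per shard, shown in the hunk above.
use std::collections::HashMap;

type TenantShardId = String; // stand-in for the real ID type
type AvailabilityZone = String; // stand-in for the real newtype

fn set_preferred_azs(
    stored: &mut HashMap<TenantShardId, Option<AvailabilityZone>>,
    updates: Vec<(TenantShardId, Option<AvailabilityZone>)>,
) {
    for (shard, az) in updates {
        // None overwrites (clears) the stored value; it does not mean "skip".
        stored.insert(shard, az);
    }
}

fn main() {
    let mut stored = HashMap::from([
        ("shard-a".to_string(), Some("az-1".to_string())),
        ("shard-b".to_string(), Some("az-2".to_string())),
    ]);
    set_preferred_azs(&mut stored, vec![("shard-a".to_string(), None)]);
    assert_eq!(stored["shard-a"], None); // explicitly cleared
    assert_eq!(stored["shard-b"].as_deref(), Some("az-2")); // absent from request: untouched
}
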
diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs
index 04a594dcac..f5cab9dd57 100644
--- a/storage_controller/src/scheduler.rs
+++ b/storage_controller/src/scheduler.rs
@@ -1,4 +1,4 @@
-use crate::{node::Node, tenant_shard::TenantShard};
+use crate::{metrics::NodeLabelGroup, node::Node, tenant_shard::TenantShard};
 use itertools::Itertools;
 use pageserver_api::{controller_api::AvailabilityZone, models::PageserverUtilization};
 use serde::Serialize;
@@ -872,6 +872,33 @@ impl Scheduler {
     pub(crate) fn get_node_attached_shard_count(&self, node_id: NodeId) -> usize {
         self.nodes.get(&node_id).unwrap().attached_shard_count
     }
+
+    /// Some metrics that we only calculate periodically: this is simpler than
+    /// rigorously updating them on every change.
+    pub(crate) fn update_metrics(&self) {
+        for (node_id, node) in &self.nodes {
+            let node_id_str = format!("{}", node_id);
+            let label_group = NodeLabelGroup {
+                az: &node.az.0,
+                node_id: &node_id_str,
+            };
+
+            crate::metrics::METRICS_REGISTRY
+                .metrics_group
+                .storage_controller_node_shards
+                .set(label_group.clone(), node.shard_count as i64);
+
+            crate::metrics::METRICS_REGISTRY
+                .metrics_group
+                .storage_controller_node_attached_shards
+                .set(label_group.clone(), node.attached_shard_count as i64);
+
+            crate::metrics::METRICS_REGISTRY
+                .metrics_group
+                .storage_controller_node_home_shards
+                .set(label_group.clone(), node.home_shard_count as i64);
+        }
+    }
 }
 
 #[cfg(test)]
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index dadcc44cfb..cbb9103880 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -2517,7 +2517,7 @@ impl Service {
                     .map(|t| {
                         (
                             t.get_tenant_shard_id().expect("Corrupt shard in database"),
-                            load_in_az.clone(),
+                            Some(load_in_az.clone()),
                         )
                     })
                     .collect(),
@@ -6390,7 +6390,7 @@ impl Service {
     /// available. A return value of 0 indicates that everything is fully reconciled already.
     fn reconcile_all(&self) -> usize {
         let mut locked = self.inner.write().unwrap();
-        let (nodes, tenants, _scheduler) = locked.parts_mut();
+        let (nodes, tenants, scheduler) = locked.parts_mut();
         let pageservers = nodes.clone();
 
         // This function is an efficient place to update lazy statistics, since we are walking
@@ -6451,6 +6451,9 @@ impl Service {
             }
         }
 
+        // Some metrics are calculated from SchedulerNode state, update these periodically
+        scheduler.update_metrics();
+
         // Process any deferred tenant drops
         for (tenant_id, guard) in drop_detached_tenants {
             self.maybe_drop_tenant(tenant_id, &mut locked, &guard);
@@ -6509,7 +6512,7 @@ impl Service {
                 // Shard was dropped between planning and execution;
                 continue;
             };
-            tracing::info!("Applying optimization: {optimization:?}");
+            tracing::info!(tenant_shard_id=%tenant_shard_id, "Applying optimization: {optimization:?}");
             if shard.apply_optimization(scheduler, optimization) {
                 optimizations_applied += 1;
                 if self.maybe_reconcile_shard(shard, nodes).is_some() {
diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs
index 2ba2a57eba..79ed628c25 100644
--- a/storage_controller/src/tenant_shard.rs
+++ b/storage_controller/src/tenant_shard.rs
@@ -1708,8 +1708,8 @@ impl TenantShard {
         self.intent.preferred_az_id.as_ref()
     }
 
-    pub(crate) fn set_preferred_az(&mut self, preferred_az_id: AvailabilityZone) {
-        self.intent.preferred_az_id = Some(preferred_az_id);
+    pub(crate) fn set_preferred_az(&mut self, preferred_az_id: Option<AvailabilityZone>) {
+        self.intent.preferred_az_id = preferred_az_id;
     }
 
     /// Returns all the nodes to which this tenant shard is attached according to the
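Example (not part of the patch): update_metrics() above exports node.home_shard_count, where "home" means the node's AZ equals the shard's preferred AZ. A self-contained sketch of that counting with invented types and data, mirroring the three per-node gauges (total, attached, home):

use std::collections::HashMap;

// Invented toy records; the real scheduler keeps shard_count,
// attached_shard_count and home_shard_count per node and just exports them.
struct NodeInfo {
    az: String,
}

struct ShardLocation {
    node_id: u64,
    attached: bool,
    preferred_az: Option<String>,
}

fn main() {
    let nodes = HashMap::from([
        (1_u64, NodeInfo { az: "az-1".into() }),
        (2, NodeInfo { az: "az-2".into() }),
    ]);
    let locations = [
        ShardLocation { node_id: 1, attached: true, preferred_az: Some("az-1".into()) },
        ShardLocation { node_id: 2, attached: false, preferred_az: Some("az-1".into()) },
        ShardLocation { node_id: 2, attached: true, preferred_az: None },
    ];

    // (total locations, attached locations, home locations) per node.
    let mut per_node: HashMap<u64, (i64, i64, i64)> = HashMap::new();
    for loc in &locations {
        let entry = per_node.entry(loc.node_id).or_default();
        entry.0 += 1;
        if loc.attached {
            entry.1 += 1;
        }
        // "Home" means the node's AZ matches the shard's preferred AZ.
        if loc.preferred_az.as_deref() == Some(nodes[&loc.node_id].az.as_str()) {
            entry.2 += 1;
        }
    }
    println!("{per_node:?}"); // e.g. {1: (1, 1, 1), 2: (2, 1, 0)}
}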