diff --git a/Cargo.lock b/Cargo.lock
index dbbf330cf9..66879fd743 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -5820,6 +5820,7 @@ dependencies = [
  "anyhow",
  "clap",
  "comfy-table",
+ "futures",
  "humantime",
  "hyper 0.14.26",
  "pageserver_api",
diff --git a/control_plane/storcon_cli/Cargo.toml b/control_plane/storcon_cli/Cargo.toml
index ed3462961f..f96f0084b2 100644
--- a/control_plane/storcon_cli/Cargo.toml
+++ b/control_plane/storcon_cli/Cargo.toml
@@ -9,6 +9,7 @@ license.workspace = true
 anyhow.workspace = true
 clap.workspace = true
 comfy-table.workspace = true
+futures.workspace = true
 humantime.workspace = true
 hyper.workspace = true
 pageserver_api.workspace = true
diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs
index 05c4acdf90..8c84911d33 100644
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -1,3 +1,4 @@
+use futures::StreamExt;
 use std::{collections::HashMap, str::FromStr, time::Duration};
 
 use clap::{Parser, Subcommand};
@@ -148,6 +149,22 @@ enum Command {
         #[arg(long)]
         threshold: humantime::Duration,
     },
+    // Drain a set of specified pageservers by moving the primary attachments to pageservers
+    // outside of the specified set.
+    Drain {
+        // Set of pageserver node ids to drain.
+        #[arg(long)]
+        nodes: Vec<NodeId>,
+        // Optional: migration concurrency (default is 8)
+        #[arg(long)]
+        concurrency: Option<usize>,
+        // Optional: maximum number of shards to migrate
+        #[arg(long)]
+        max_shards: Option<usize>,
+        // Optional: when set to true, nothing is migrated, but the plan is printed to stdout
+        #[arg(long)]
+        dry_run: Option<bool>,
+    },
 }
 
 #[derive(Parser)]
@@ -737,6 +754,197 @@ async fn main() -> anyhow::Result<()> {
                 })
                 .await?;
         }
+        Command::Drain {
+            nodes,
+            concurrency,
+            max_shards,
+            dry_run,
+        } => {
+            // Load the list of nodes, split them up into the drained and filled sets,
+            // and validate that draining is possible.
+            let node_descs = storcon_client
+                .dispatch::<(), Vec<NodeDescribeResponse>>(
+                    Method::GET,
+                    "control/v1/node".to_string(),
+                    None,
+                )
+                .await?;
+
+            let mut node_to_drain_descs = Vec::new();
+            let mut node_to_fill_descs = Vec::new();
+
+            for desc in node_descs {
+                let to_drain = nodes.iter().any(|id| *id == desc.id);
+                if to_drain {
+                    node_to_drain_descs.push(desc);
+                } else {
+                    node_to_fill_descs.push(desc);
+                }
+            }
+
+            if nodes.len() != node_to_drain_descs.len() {
+                anyhow::bail!("Drain requested for a node which doesn't exist.")
+            }
+
+            let can_fill = node_to_fill_descs
+                .iter()
+                .filter(|desc| {
+                    matches!(desc.availability, NodeAvailabilityWrapper::Active)
+                        && matches!(
+                            desc.scheduling,
+                            NodeSchedulingPolicy::Active | NodeSchedulingPolicy::Filling
+                        )
+                })
+                .any(|_| true);
+
+            if !can_fill {
+                anyhow::bail!("There are no nodes to drain to")
+            }
+
+            // Set the node scheduling policy to draining for the nodes which
+            // we plan to drain.
+            for node_desc in node_to_drain_descs.iter() {
+                let req = NodeConfigureRequest {
+                    node_id: node_desc.id,
+                    availability: None,
+                    scheduling: Some(NodeSchedulingPolicy::Draining),
+                };
+
+                storcon_client
+                    .dispatch::<_, ()>(
+                        Method::PUT,
+                        format!("control/v1/node/{}/config", node_desc.id),
+                        Some(req),
+                    )
+                    .await?;
+            }
+
+            // Perform the drain: move each tenant shard scheduled on a node to
+            // be drained to a node which is being filled. A simple round robin
+            // strategy is used to pick the new node.
+            let tenants = storcon_client
+                .dispatch::<(), Vec<TenantDescribeResponse>>(
+                    Method::GET,
+                    "control/v1/tenant".to_string(),
+                    None,
+                )
+                .await?;
+
+            let mut selected_node_idx = 0;
+
+            struct DrainMove {
+                tenant_shard_id: TenantShardId,
+                from: NodeId,
+                to: NodeId,
+            }
+
+            let mut moves: Vec<DrainMove> = Vec::new();
+
+            let shards = tenants
+                .into_iter()
+                .flat_map(|tenant| tenant.shards.into_iter());
+            for shard in shards {
+                if let Some(max_shards) = max_shards {
+                    if moves.len() >= max_shards {
+                        println!(
+                            "Stop planning shard moves since the requested maximum was reached"
+                        );
+                        break;
+                    }
+                }
+
+                let should_migrate = {
+                    if let Some(attached_to) = shard.node_attached {
+                        node_to_drain_descs
+                            .iter()
+                            .map(|desc| desc.id)
+                            .any(|id| id == attached_to)
+                    } else {
+                        false
+                    }
+                };
+
+                if !should_migrate {
+                    continue;
+                }
+
+                moves.push(DrainMove {
+                    tenant_shard_id: shard.tenant_shard_id,
+                    from: shard
+                        .node_attached
+                        .expect("We only migrate attached tenant shards"),
+                    to: node_to_fill_descs[selected_node_idx].id,
+                });
+                selected_node_idx = (selected_node_idx + 1) % node_to_fill_descs.len();
+            }
+
+            let total_moves = moves.len();
+
+            if dry_run == Some(true) {
+                println!("Dry run requested. Planned {total_moves} moves:");
+                for mv in &moves {
+                    println!("{}: {} -> {}", mv.tenant_shard_id, mv.from, mv.to)
+                }
+
+                return Ok(());
+            }
+
+            const DEFAULT_MIGRATE_CONCURRENCY: usize = 8;
+            let mut stream = futures::stream::iter(moves)
+                .map(|mv| {
+                    let client = Client::new(cli.api.clone(), cli.jwt.clone());
+                    async move {
+                        client
+                            .dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
+                                Method::PUT,
+                                format!("control/v1/tenant/{}/migrate", mv.tenant_shard_id),
+                                Some(TenantShardMigrateRequest {
+                                    tenant_shard_id: mv.tenant_shard_id,
+                                    node_id: mv.to,
+                                }),
+                            )
+                            .await
+                            .map_err(|e| (mv.tenant_shard_id, mv.from, mv.to, e))
+                    }
+                })
+                .buffered(concurrency.unwrap_or(DEFAULT_MIGRATE_CONCURRENCY));
+
+            let mut success = 0;
+            let mut failure = 0;
+
+            while let Some(res) = stream.next().await {
+                match res {
+                    Ok(_) => {
+                        success += 1;
+                    }
+                    Err((tenant_shard_id, from, to, error)) => {
+                        failure += 1;
+                        println!(
+                            "Failed to migrate {} from node {} to node {}: {}",
+                            tenant_shard_id, from, to, error
+                        );
+                    }
+                }
+
+                if (success + failure) % 20 == 0 {
+                    println!(
+                        "Processed {}/{} shards: {} succeeded, {} failed",
+                        success + failure,
+                        total_moves,
+                        success,
+                        failure
+                    );
+                }
+            }
+
+            println!(
+                "Processed {}/{} shards: {} succeeded, {} failed",
+                success + failure,
+                total_moves,
+                success,
+                failure
+            );
+        }
     }
 
     Ok(())
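
For reference, a hypothetical invocation of the new subcommand. The names here are assumptions rather than part of the diff: clap's derive maps the `Drain` variant and its fields to the `drain` subcommand and kebab-case flags, and the top-level `--api`/`--jwt` connection flags are inferred from the `cli.api`/`cli.jwt` usage above; the URL is a placeholder.

    storcon_cli --api http://127.0.0.1:1234 drain --nodes 1 --nodes 2 --max-shards 128 --dry-run true

Repeating `--nodes` selects multiple pageservers to drain. Dropping `--dry-run true` performs the planned migrations, with `--concurrency` defaulting to 8 as per DEFAULT_MIGRATE_CONCURRENCY.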