diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs
index 2edd09eac1..b3d1f0be05 100644
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -1,15 +1,15 @@
-use std::{collections::HashMap, str::FromStr};
+use std::{collections::HashMap, str::FromStr, time::Duration};
 
 use clap::{Parser, Subcommand};
-use hyper::Method;
+use hyper::{Method, StatusCode};
 use pageserver_api::{
     controller_api::{
         NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy,
         TenantDescribeResponse, TenantPolicyRequest,
     },
     models::{
-        ShardParameters, TenantConfig, TenantConfigRequest, TenantCreateRequest,
-        TenantShardSplitRequest, TenantShardSplitResponse,
+        LocationConfigSecondary, ShardParameters, TenantConfig, TenantConfigRequest,
+        TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse,
     },
     shard::{ShardStripeSize, TenantShardId},
 };
@@ -120,6 +120,12 @@ enum Command {
         #[arg(long)]
         tenant_id: TenantId,
     },
+    /// For a tenant which hasn't been onboarded to the storage controller yet, add it in secondary
+    /// mode so that it can warm up content on a pageserver.
+    TenantWarmup {
+        #[arg(long)]
+        tenant_id: TenantId,
+    },
 }
 
 #[derive(Parser)]
@@ -581,6 +587,94 @@ async fn main() -> anyhow::Result<()> {
             }
             println!("{table}");
         }
+        Command::TenantWarmup { tenant_id } => {
+            let describe_response = storcon_client
+                .dispatch::<(), TenantDescribeResponse>(
+                    Method::GET,
+                    format!("control/v1/tenant/{tenant_id}"),
+                    None,
+                )
+                .await;
+            match describe_response {
+                Ok(describe) => {
+                    if matches!(describe.policy, PlacementPolicy::Secondary) {
+                        // Fine: it's already known to the controller in secondary mode;
+                        // calling again to put it into secondary mode won't cause problems.
+                    } else {
+                        anyhow::bail!("Tenant already present with policy {:?}", describe.policy);
+                    }
+                }
+                Err(mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, _)) => {
+                    // Fine: this tenant isn't known to the storage controller yet.
+                }
+                Err(e) => {
+                    // Unexpected API error
+                    return Err(e.into());
+                }
+            }
+
+            vps_client
+                .location_config(
+                    TenantShardId::unsharded(tenant_id),
+                    pageserver_api::models::LocationConfig {
+                        mode: pageserver_api::models::LocationConfigMode::Secondary,
+                        generation: None,
+                        secondary_conf: Some(LocationConfigSecondary { warm: true }),
+                        shard_number: 0,
+                        shard_count: 0,
+                        shard_stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE.0,
+                        tenant_conf: TenantConfig::default(),
+                    },
+                    None,
+                    true,
+                )
+                .await?;
+
+            let describe_response = storcon_client
+                .dispatch::<(), TenantDescribeResponse>(
+                    Method::GET,
+                    format!("control/v1/tenant/{tenant_id}"),
+                    None,
+                )
+                .await?;
+
+            let secondary_ps_id = describe_response
+                .shards
+                .first()
+                .unwrap()
+                .node_secondary
+                .first()
+                .unwrap();
+
+            println!("Tenant {tenant_id} warming up on pageserver {secondary_ps_id}");
+            loop {
+                let (status, progress) = vps_client
+                    .tenant_secondary_download(
+                        TenantShardId::unsharded(tenant_id),
+                        Some(Duration::from_secs(10)),
+                    )
+                    .await?;
+                println!(
+                    "Progress: {}/{} layers, {}/{} bytes",
+                    progress.layers_downloaded,
+                    progress.layers_total,
+                    progress.bytes_downloaded,
+                    progress.bytes_total
+                );
+                match status {
+                    StatusCode::OK => {
+                        println!("Download complete");
+                        break;
+                    }
+                    StatusCode::ACCEPTED => {
+                        // Still in progress; poll again.
+                    }
+                    _ => {
+                        anyhow::bail!("Unexpected download status: {status}");
+                    }
+                }
+            }
+        }
     }
 
     Ok(())
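A note on the download loop in `TenantWarmup` above: `tenant_secondary_download` long-polls the controller's pageserver-compatible API, returning 200 OK once the secondary location has finished downloading and 202 Accepted while work remains after the `wait` duration elapses, so the CLI keeps polling. Below is a minimal, self-contained sketch of that status-driven loop; `poll_download` and `DownloadProgress` are hypothetical stand-ins for the real client call and response types, not names from this PR:

```rust
use std::{thread, time::Duration};

/// Hypothetical stand-in for the real progress response.
struct DownloadProgress {
    layers_downloaded: u64,
    layers_total: u64,
}

/// Hypothetical stub: in the real CLI this is an HTTP request that the
/// server may hold open for up to `wait` before answering 200 or 202.
fn poll_download(attempt: u32, _wait: Duration) -> (u16, DownloadProgress) {
    let status = if attempt >= 3 { 200 } else { 202 };
    (
        status,
        DownloadProgress {
            layers_downloaded: u64::from(attempt) * 10,
            layers_total: 30,
        },
    )
}

fn main() {
    let mut attempt = 0;
    loop {
        attempt += 1;
        let (status, progress) = poll_download(attempt, Duration::from_secs(10));
        println!(
            "Progress: {}/{} layers",
            progress.layers_downloaded, progress.layers_total
        );
        match status {
            // 200 OK: the secondary location is fully warmed up.
            200 => {
                println!("Download complete");
                break;
            }
            // 202 Accepted: still downloading; poll again.
            202 => thread::sleep(Duration::from_millis(100)),
            other => panic!("Unexpected download status: {other}"),
        }
    }
}
```

Passing a long `wait` to the server keeps this loop cheap: the client mostly blocks server-side instead of busy-polling.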
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index 840f354142..b4b23745f8 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -273,7 +273,8 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up
     but imports the generation number.
""" - neon_env_builder.num_pageservers = 2 + # One pageserver to simulate legacy environment, two to be managed by storage controller + neon_env_builder.num_pageservers = 3 # Start services by hand so that we can skip registration on one of the pageservers env = neon_env_builder.init_configs() @@ -288,10 +289,10 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up ) origin_ps = env.pageservers[0] - # This is the pageserver managed by the sharding service, where the tenant + # These are the pageservers managed by the sharding service, where the tenant # will be attached after onboarding env.pageservers[1].start() - dest_ps = env.pageservers[1] + env.pageservers[2].start() virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True) for sk in env.safekeepers: @@ -330,6 +331,9 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up ) virtual_ps_http.tenant_secondary_download(tenant_id) + warm_up_ps = env.storage_controller.tenant_describe(tenant_id)["shards"][0][ + "node_secondary" + ][0] # Call into storage controller to onboard the tenant generation += 1 @@ -344,6 +348,18 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up ) assert len(r["shards"]) == 1 + describe = env.storage_controller.tenant_describe(tenant_id)["shards"][0] + dest_ps_id = describe["node_attached"] + dest_ps = env.get_pageserver(dest_ps_id) + if warm_up: + # The storage controller should have attached the tenant to the same placce + # it had a secondary location, otherwise there was no point warming it up + assert dest_ps_id == warm_up_ps + + # It should have been given a new secondary location as well + assert len(describe["node_secondary"]) == 1 + assert describe["node_secondary"][0] != warm_up_ps + # As if doing a live migration, detach the original pageserver origin_ps.http_client().tenant_location_conf( tenant_id, @@ -415,6 +431,9 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up dest_tenant_after_conf_change["generation"] == dest_tenant_before_conf_change["generation"] ) dest_tenant_conf_after = dest_ps.http_client().tenant_config(tenant_id) + + # Storage controller auto-sets heatmap period, ignore it for the comparison + del dest_tenant_conf_after.tenant_specific_overrides["heatmap_period"] assert dest_tenant_conf_after.tenant_specific_overrides == modified_tenant_conf env.storage_controller.consistency_check()