storcon_cli: tenant-warmup command (#7432)

## Problem

When we migrate a large existing tenant, we would like to be able to
ensure it has pre-loaded layers onto a pageserver managed by the storage
controller.

## Summary of changes

- Add `storcon_cli tenant-warmup`, which configures the tenant into
PlacementPolicy::Secondary (unless it's already attached), and then
polls the secondary download API reporting progress.
- Extend a test case to check that when onboarding with a secondary
location pre-created, we properly use that location for our first
attachment.
This commit is contained in:
John Spray
2024-04-19 12:32:58 +01:00
committed by GitHub
parent 6eb946e2de
commit 98be8b9430
2 changed files with 120 additions and 7 deletions

View File

@@ -1,15 +1,15 @@
use std::{collections::HashMap, str::FromStr};
use std::{collections::HashMap, str::FromStr, time::Duration};
use clap::{Parser, Subcommand};
use hyper::Method;
use hyper::{Method, StatusCode};
use pageserver_api::{
controller_api::{
NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy,
TenantDescribeResponse, TenantPolicyRequest,
},
models::{
ShardParameters, TenantConfig, TenantConfigRequest, TenantCreateRequest,
TenantShardSplitRequest, TenantShardSplitResponse,
LocationConfigSecondary, ShardParameters, TenantConfig, TenantConfigRequest,
TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse,
},
shard::{ShardStripeSize, TenantShardId},
};
@@ -120,6 +120,12 @@ enum Command {
#[arg(long)]
tenant_id: TenantId,
},
/// For a tenant which hasn't been onboarded to the storage controller yet, add it in secondary
/// mode so that it can warm up content on a pageserver.
TenantWarmup {
#[arg(long)]
tenant_id: TenantId,
},
}
#[derive(Parser)]
@@ -581,6 +587,94 @@ async fn main() -> anyhow::Result<()> {
}
println!("{table}");
}
Command::TenantWarmup { tenant_id } => {
let describe_response = storcon_client
.dispatch::<(), TenantDescribeResponse>(
Method::GET,
format!("control/v1/tenant/{tenant_id}"),
None,
)
.await;
match describe_response {
Ok(describe) => {
if matches!(describe.policy, PlacementPolicy::Secondary) {
// Fine: it's already known to controller in secondary mode: calling
// again to put it into secondary mode won't cause problems.
} else {
anyhow::bail!("Tenant already present with policy {:?}", describe.policy);
}
}
Err(mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, _)) => {
// Fine: this tenant isn't know to the storage controller yet.
}
Err(e) => {
// Unexpected API error
return Err(e.into());
}
}
vps_client
.location_config(
TenantShardId::unsharded(tenant_id),
pageserver_api::models::LocationConfig {
mode: pageserver_api::models::LocationConfigMode::Secondary,
generation: None,
secondary_conf: Some(LocationConfigSecondary { warm: true }),
shard_number: 0,
shard_count: 0,
shard_stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE.0,
tenant_conf: TenantConfig::default(),
},
None,
true,
)
.await?;
let describe_response = storcon_client
.dispatch::<(), TenantDescribeResponse>(
Method::GET,
format!("control/v1/tenant/{tenant_id}"),
None,
)
.await?;
let secondary_ps_id = describe_response
.shards
.first()
.unwrap()
.node_secondary
.first()
.unwrap();
println!("Tenant {tenant_id} warming up on pageserver {secondary_ps_id}");
loop {
let (status, progress) = vps_client
.tenant_secondary_download(
TenantShardId::unsharded(tenant_id),
Some(Duration::from_secs(10)),
)
.await?;
println!(
"Progress: {}/{} layers, {}/{} bytes",
progress.layers_downloaded,
progress.layers_total,
progress.bytes_downloaded,
progress.bytes_total
);
match status {
StatusCode::OK => {
println!("Download complete");
break;
}
StatusCode::ACCEPTED => {
// Loop
}
_ => {
anyhow::bail!("Unexpected download status: {status}");
}
}
}
}
}
Ok(())

View File

@@ -273,7 +273,8 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up
but imports the generation number.
"""
neon_env_builder.num_pageservers = 2
# One pageserver to simulate legacy environment, two to be managed by storage controller
neon_env_builder.num_pageservers = 3
# Start services by hand so that we can skip registration on one of the pageservers
env = neon_env_builder.init_configs()
@@ -288,10 +289,10 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up
)
origin_ps = env.pageservers[0]
# This is the pageserver managed by the sharding service, where the tenant
# These are the pageservers managed by the sharding service, where the tenant
# will be attached after onboarding
env.pageservers[1].start()
dest_ps = env.pageservers[1]
env.pageservers[2].start()
virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True)
for sk in env.safekeepers:
@@ -330,6 +331,9 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up
)
virtual_ps_http.tenant_secondary_download(tenant_id)
warm_up_ps = env.storage_controller.tenant_describe(tenant_id)["shards"][0][
"node_secondary"
][0]
# Call into storage controller to onboard the tenant
generation += 1
@@ -344,6 +348,18 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up
)
assert len(r["shards"]) == 1
describe = env.storage_controller.tenant_describe(tenant_id)["shards"][0]
dest_ps_id = describe["node_attached"]
dest_ps = env.get_pageserver(dest_ps_id)
if warm_up:
# The storage controller should have attached the tenant to the same placce
# it had a secondary location, otherwise there was no point warming it up
assert dest_ps_id == warm_up_ps
# It should have been given a new secondary location as well
assert len(describe["node_secondary"]) == 1
assert describe["node_secondary"][0] != warm_up_ps
# As if doing a live migration, detach the original pageserver
origin_ps.http_client().tenant_location_conf(
tenant_id,
@@ -415,6 +431,9 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up
dest_tenant_after_conf_change["generation"] == dest_tenant_before_conf_change["generation"]
)
dest_tenant_conf_after = dest_ps.http_client().tenant_config(tenant_id)
# Storage controller auto-sets heatmap period, ignore it for the comparison
del dest_tenant_conf_after.tenant_specific_overrides["heatmap_period"]
assert dest_tenant_conf_after.tenant_specific_overrides == modified_tenant_conf
env.storage_controller.consistency_check()