mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-14 03:30:36 +00:00
storcon_cli: tenant-warmup command (#7432)
## Problem

When we migrate a large existing tenant, we would like to be able to ensure it has pre-loaded layers onto a pageserver managed by the storage controller.

## Summary of changes

- Add `storcon_cli tenant-warmup`, which configures the tenant into PlacementPolicy::Secondary (unless it's already attached), and then polls the secondary download API reporting progress.
- Extend a test case to check that when onboarding with a secondary location pre-created, we properly use that location for our first attachment.
This commit is contained in:
@@ -1,15 +1,15 @@
|
||||
use std::{collections::HashMap, str::FromStr};
|
||||
use std::{collections::HashMap, str::FromStr, time::Duration};
|
||||
|
||||
use clap::{Parser, Subcommand};
|
||||
use hyper::Method;
|
||||
use hyper::{Method, StatusCode};
|
||||
use pageserver_api::{
|
||||
controller_api::{
|
||||
NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy,
|
||||
TenantDescribeResponse, TenantPolicyRequest,
|
||||
},
|
||||
models::{
|
||||
ShardParameters, TenantConfig, TenantConfigRequest, TenantCreateRequest,
|
||||
TenantShardSplitRequest, TenantShardSplitResponse,
|
||||
LocationConfigSecondary, ShardParameters, TenantConfig, TenantConfigRequest,
|
||||
TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse,
|
||||
},
|
||||
shard::{ShardStripeSize, TenantShardId},
|
||||
};
|
||||
@@ -120,6 +120,12 @@ enum Command {
|
||||
#[arg(long)]
|
||||
tenant_id: TenantId,
|
||||
},
|
||||
/// For a tenant which hasn't been onboarded to the storage controller yet, add it in secondary
|
||||
/// mode so that it can warm up content on a pageserver.
|
||||
TenantWarmup {
|
||||
#[arg(long)]
|
||||
tenant_id: TenantId,
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Parser)]
|
||||
@@ -581,6 +587,94 @@ async fn main() -> anyhow::Result<()> {
|
||||
}
|
||||
println!("{table}");
|
||||
}
|
||||
Command::TenantWarmup { tenant_id } => {
|
||||
let describe_response = storcon_client
|
||||
.dispatch::<(), TenantDescribeResponse>(
|
||||
Method::GET,
|
||||
format!("control/v1/tenant/{tenant_id}"),
|
||||
None,
|
||||
)
|
||||
.await;
|
||||
match describe_response {
|
||||
Ok(describe) => {
|
||||
if matches!(describe.policy, PlacementPolicy::Secondary) {
|
||||
// Fine: it's already known to controller in secondary mode: calling
|
||||
// again to put it into secondary mode won't cause problems.
|
||||
} else {
|
||||
anyhow::bail!("Tenant already present with policy {:?}", describe.policy);
|
||||
}
|
||||
}
|
||||
Err(mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, _)) => {
|
||||
// Fine: this tenant isn't known to the storage controller yet.
|
||||
}
|
||||
Err(e) => {
|
||||
// Unexpected API error
|
||||
return Err(e.into());
|
||||
}
|
||||
}
|
||||
|
||||
vps_client
|
||||
.location_config(
|
||||
TenantShardId::unsharded(tenant_id),
|
||||
pageserver_api::models::LocationConfig {
|
||||
mode: pageserver_api::models::LocationConfigMode::Secondary,
|
||||
generation: None,
|
||||
secondary_conf: Some(LocationConfigSecondary { warm: true }),
|
||||
shard_number: 0,
|
||||
shard_count: 0,
|
||||
shard_stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE.0,
|
||||
tenant_conf: TenantConfig::default(),
|
||||
},
|
||||
None,
|
||||
true,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let describe_response = storcon_client
|
||||
.dispatch::<(), TenantDescribeResponse>(
|
||||
Method::GET,
|
||||
format!("control/v1/tenant/{tenant_id}"),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let secondary_ps_id = describe_response
|
||||
.shards
|
||||
.first()
|
||||
.unwrap()
|
||||
.node_secondary
|
||||
.first()
|
||||
.unwrap();
|
||||
|
||||
println!("Tenant {tenant_id} warming up on pageserver {secondary_ps_id}");
|
||||
loop {
|
||||
let (status, progress) = vps_client
|
||||
.tenant_secondary_download(
|
||||
TenantShardId::unsharded(tenant_id),
|
||||
Some(Duration::from_secs(10)),
|
||||
)
|
||||
.await?;
|
||||
println!(
|
||||
"Progress: {}/{} layers, {}/{} bytes",
|
||||
progress.layers_downloaded,
|
||||
progress.layers_total,
|
||||
progress.bytes_downloaded,
|
||||
progress.bytes_total
|
||||
);
|
||||
match status {
|
||||
StatusCode::OK => {
|
||||
println!("Download complete");
|
||||
break;
|
||||
}
|
||||
StatusCode::ACCEPTED => {
|
||||
// Loop
|
||||
}
|
||||
_ => {
|
||||
anyhow::bail!("Unexpected download status: {status}");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
|
||||
@@ -273,7 +273,8 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up
|
||||
but imports the generation number.
|
||||
"""
|
||||
|
||||
neon_env_builder.num_pageservers = 2
|
||||
# One pageserver to simulate legacy environment, two to be managed by storage controller
|
||||
neon_env_builder.num_pageservers = 3
|
||||
|
||||
# Start services by hand so that we can skip registration on one of the pageservers
|
||||
env = neon_env_builder.init_configs()
|
||||
@@ -288,10 +289,10 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up
|
||||
)
|
||||
origin_ps = env.pageservers[0]
|
||||
|
||||
# This is the pageserver managed by the sharding service, where the tenant
|
||||
# These are the pageservers managed by the sharding service, where the tenant
|
||||
# will be attached after onboarding
|
||||
env.pageservers[1].start()
|
||||
dest_ps = env.pageservers[1]
|
||||
env.pageservers[2].start()
|
||||
virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True)
|
||||
|
||||
for sk in env.safekeepers:
|
||||
@@ -330,6 +331,9 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up
|
||||
)
|
||||
|
||||
virtual_ps_http.tenant_secondary_download(tenant_id)
|
||||
warm_up_ps = env.storage_controller.tenant_describe(tenant_id)["shards"][0][
|
||||
"node_secondary"
|
||||
][0]
|
||||
|
||||
# Call into storage controller to onboard the tenant
|
||||
generation += 1
|
||||
@@ -344,6 +348,18 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up
|
||||
)
|
||||
assert len(r["shards"]) == 1
|
||||
|
||||
describe = env.storage_controller.tenant_describe(tenant_id)["shards"][0]
|
||||
dest_ps_id = describe["node_attached"]
|
||||
dest_ps = env.get_pageserver(dest_ps_id)
|
||||
if warm_up:
|
||||
# The storage controller should have attached the tenant to the same place
|
||||
# it had a secondary location, otherwise there was no point warming it up
|
||||
assert dest_ps_id == warm_up_ps
|
||||
|
||||
# It should have been given a new secondary location as well
|
||||
assert len(describe["node_secondary"]) == 1
|
||||
assert describe["node_secondary"][0] != warm_up_ps
|
||||
|
||||
# As if doing a live migration, detach the original pageserver
|
||||
origin_ps.http_client().tenant_location_conf(
|
||||
tenant_id,
|
||||
@@ -415,6 +431,9 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up
|
||||
dest_tenant_after_conf_change["generation"] == dest_tenant_before_conf_change["generation"]
|
||||
)
|
||||
dest_tenant_conf_after = dest_ps.http_client().tenant_config(tenant_id)
|
||||
|
||||
# Storage controller auto-sets heatmap period, ignore it for the comparison
|
||||
del dest_tenant_conf_after.tenant_specific_overrides["heatmap_period"]
|
||||
assert dest_tenant_conf_after.tenant_specific_overrides == modified_tenant_conf
|
||||
|
||||
env.storage_controller.consistency_check()
|
||||
|
||||
Reference in New Issue
Block a user