mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-22 15:41:15 +00:00
storage controller: API + CLI for migrating secondary locations (#10284)
## Problem

Currently, if we want to move a secondary, there isn't a neat way to do that: we only have a migration API for the attached location, and it is only clean to use if you've manually created a secondary via the pageserver API in the place you're going to move it to.

The secondary migration API enables:

- Moving the secondary somewhere because we would like to later move the attached location there.
- Moving the secondary location because we just want to reclaim some disk space from its current location.

## Summary of changes

- Add `/migrate_secondary` API
- Add `tenant-shard-migrate-secondary` CLI
- Add tests for above
This commit is contained in:
@@ -124,7 +124,10 @@ impl ComputeHookTenant {
|
||||
if let Some(shard_idx) = shard_idx {
|
||||
sharded.shards.remove(shard_idx);
|
||||
} else {
|
||||
tracing::warn!("Shard not found while handling detach")
|
||||
// This is a valid but niche case, where the tenant was previously attached
|
||||
// as a Secondary location and then detached, so has no previously notified
|
||||
// state.
|
||||
tracing::info!("Shard not found while handling detach")
|
||||
}
|
||||
}
|
||||
ComputeHookTenant::Unsharded(_) => {
|
||||
@@ -761,7 +764,10 @@ impl ComputeHook {
|
||||
let mut state_locked = self.state.lock().unwrap();
|
||||
match state_locked.entry(tenant_shard_id.tenant_id) {
|
||||
Entry::Vacant(_) => {
|
||||
tracing::warn!("Compute hook tenant not found for detach");
|
||||
// This is a valid but niche case, where the tenant was previously attached
|
||||
// as a Secondary location and then detached, so has no previously notified
|
||||
// state.
|
||||
tracing::info!("Compute hook tenant not found for detach");
|
||||
}
|
||||
Entry::Occupied(mut e) => {
|
||||
let sharded = e.get().is_sharded();
|
||||
|
||||
@@ -690,7 +690,8 @@ async fn handle_node_list(req: Request<Body>) -> Result<Response<Body>, ApiError
|
||||
};
|
||||
|
||||
let state = get_state(&req);
|
||||
let nodes = state.service.node_list().await?;
|
||||
let mut nodes = state.service.node_list().await?;
|
||||
nodes.sort_by_key(|n| n.get_id());
|
||||
let api_nodes = nodes.into_iter().map(|n| n.describe()).collect::<Vec<_>>();
|
||||
|
||||
json_response(StatusCode::OK, api_nodes)
|
||||
@@ -1005,6 +1006,29 @@ async fn handle_tenant_shard_migrate(
|
||||
)
|
||||
}
|
||||
|
||||
/// HTTP handler for migrating a tenant shard's secondary location.
///
/// Requires `Scope::Admin`. Reads the `tenant_shard_id` request parameter and a
/// JSON `TenantShardMigrateRequest` body, delegates to
/// `Service::tenant_shard_migrate_secondary`, and returns its result as a
/// `200 OK` JSON response. Any error from permission checking, parsing or the
/// service call is propagated to the caller via `?`.
async fn handle_tenant_shard_migrate_secondary(
|
||||
service: Arc<Service>,
|
||||
req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
// Reject callers that do not hold the admin scope before doing anything else.
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
// If `maybe_forward` reports that the request was forwarded, hand back that
// response as-is; otherwise take the request back and handle it locally.
// NOTE(review): presumably this forwards to another storage controller
// instance -- confirm against `maybe_forward`.
let mut req = match maybe_forward(req).await {
|
||||
ForwardOutcome::Forwarded(res) => {
|
||||
return res;
|
||||
}
|
||||
ForwardOutcome::NotForwarded(req) => req,
|
||||
};
|
||||
|
||||
// The shard comes from the request parameter, the destination node from the
// JSON body.
let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?;
|
||||
let migrate_req = json_request::<TenantShardMigrateRequest>(&mut req).await?;
|
||||
json_response(
|
||||
StatusCode::OK,
|
||||
service
|
||||
.tenant_shard_migrate_secondary(tenant_shard_id, migrate_req)
|
||||
.await?,
|
||||
)
|
||||
}
|
||||
|
||||
async fn handle_tenant_shard_cancel_reconcile(
|
||||
service: Arc<Service>,
|
||||
req: Request<Body>,
|
||||
@@ -1855,6 +1879,16 @@ pub fn make_router(
|
||||
RequestName("control_v1_tenant_migrate"),
|
||||
)
|
||||
})
|
||||
.put(
|
||||
"/control/v1/tenant/:tenant_shard_id/migrate_secondary",
|
||||
|r| {
|
||||
tenant_service_handler(
|
||||
r,
|
||||
handle_tenant_shard_migrate_secondary,
|
||||
RequestName("control_v1_tenant_migrate_secondary"),
|
||||
)
|
||||
},
|
||||
)
|
||||
.put(
|
||||
"/control/v1/tenant/:tenant_shard_id/cancel_reconcile",
|
||||
|r| {
|
||||
|
||||
@@ -5055,6 +5055,69 @@ impl Service {
|
||||
Ok(TenantShardMigrateResponse {})
|
||||
}
|
||||
|
||||
/// Move the secondary location of `tenant_shard_id` to the node named in
/// `migrate_req`, then wait for any resulting reconcile.
///
/// Outcomes:
/// - `ApiError::BadRequest` if `migrate_req.node_id` is not a known node.
/// - `ApiError::NotFound` if the tenant shard is not known.
/// - If the shard's only secondary is already the requested node, or the
///   shard is already attached to the requested node, the intent is left
///   untouched.
/// - Otherwise every existing secondary is removed from the intent and the
///   requested node becomes the sole secondary.
///
/// An unavailable destination node is logged as a warning but is not treated
/// as an error. If a reconcile waiter is returned, this waits on it for up to
/// `RECONCILE_TIMEOUT` and propagates any error from that wait.
pub(crate) async fn tenant_shard_migrate_secondary(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
migrate_req: TenantShardMigrateRequest,
|
||||
) -> Result<TenantShardMigrateResponse, ApiError> {
|
||||
// The write guard lives only inside this block, so it is dropped before the
// `.await` on the waiter below.
let waiter = {
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
let (nodes, tenants, scheduler) = locked.parts_mut();
|
||||
|
||||
// The destination node must be known to the controller.
let Some(node) = nodes.get(&migrate_req.node_id) else {
|
||||
return Err(ApiError::BadRequest(anyhow::anyhow!(
|
||||
"Node {} not found",
|
||||
migrate_req.node_id
|
||||
)));
|
||||
};
|
||||
|
||||
if !node.is_available() {
|
||||
// Warn but proceed: the caller may intend to manually adjust the placement of
|
||||
// a shard even if the node is down, e.g. if intervening during an incident.
|
||||
tracing::warn!("Migrating to unavailable node {node}");
|
||||
}
|
||||
|
||||
let Some(shard) = tenants.get_mut(&tenant_shard_id) else {
|
||||
return Err(ApiError::NotFound(
|
||||
anyhow::anyhow!("Tenant shard not found").into(),
|
||||
));
|
||||
};
|
||||
|
||||
// Case 1: the requested node is already the one and only secondary.
if shard.intent.get_secondary().len() == 1
|
||||
&& shard.intent.get_secondary()[0] == migrate_req.node_id
|
||||
{
|
||||
tracing::info!(
|
||||
"Migrating secondary to {node}: intent is unchanged {:?}",
|
||||
shard.intent
|
||||
);
|
||||
// Case 2: the shard is attached on the requested node, so no secondary is
// created there and the intent is left as it is.
} else if shard.intent.get_attached() == &Some(migrate_req.node_id) {
|
||||
tracing::info!("Migrating secondary to {node}: already attached where we were asked to create a secondary");
|
||||
// Case 3: replace all current secondaries with the requested node.
} else {
|
||||
// Clone the list first so the intent can be mutated while iterating.
let old_secondaries = shard.intent.get_secondary().clone();
|
||||
for secondary in old_secondaries {
|
||||
shard.intent.remove_secondary(scheduler, secondary);
|
||||
}
|
||||
|
||||
shard.intent.push_secondary(scheduler, migrate_req.node_id);
|
||||
// NOTE(review): the sequence bump presumably marks the intent as changed
// for the reconciler -- confirm against `maybe_reconcile_shard`.
shard.sequence = shard.sequence.next();
|
||||
tracing::info!(
|
||||
"Migrating secondary to {node}: new intent {:?}",
|
||||
shard.intent
|
||||
);
|
||||
}
|
||||
|
||||
// Called in all three cases; the result is `Some(waiter)` or `None`.
self.maybe_reconcile_shard(shard, nodes)
|
||||
};
|
||||
|
||||
// Wait (bounded by `RECONCILE_TIMEOUT`) only if a waiter was returned.
if let Some(waiter) = waiter {
|
||||
waiter.wait_timeout(RECONCILE_TIMEOUT).await?;
|
||||
} else {
|
||||
tracing::info!("Migration is a no-op");
|
||||
}
|
||||
|
||||
Ok(TenantShardMigrateResponse {})
|
||||
}
|
||||
|
||||
/// 'cancel' in this context means cancel any ongoing reconcile
|
||||
pub(crate) async fn tenant_shard_cancel_reconcile(
|
||||
&self,
|
||||
|
||||
Reference in New Issue
Block a user