storage_controller: delete stale shards when deleting tenant (#9333)

## Problem

Tenant deletion only removes the tenant's current shards from remote
storage. Any stale parent shards (left over from earlier shard splits)
are left behind. These parent shards are retained after a split because
child shards may reference the parent's data until new image layers are
generated.

## Summary of changes

* Document a special case for pageserver tenant deletion: when given an
unsharded tenant ID, it deletes all shards in remote storage as well as
any unsharded tenant data (see the sketch after this list).
* Pass an unsharded tenant ID to delete all remote storage under the
tenant ID prefix.
* Split out `RemoteStorage::delete_prefix()` to delete a bucket prefix,
with additional test coverage.
* Add a `delimiter` argument to `assert_prefix_empty()` to support
partial prefix matches (i.e. all shards starting with a given tenant
ID).
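
As a rough illustration of the special case above, here is a minimal,
self-contained sketch of how a deletion handler could pick the remote prefix
to wipe. The type and function names (`ShardId`, `deletion_prefix`) and the
`-XXYY` shard suffix scheme are assumptions for the example, not the
pageserver's actual code.

```rust
/// Hypothetical stand-in for a tenant shard ID: `shard` is None for an
/// unsharded ID, Some((number, count)) for a concrete shard.
struct ShardId {
    tenant_id: String,
    shard: Option<(u8, u8)>,
}

/// Pick the remote-storage prefix that a delete request should wipe.
fn deletion_prefix(id: &ShardId) -> String {
    match id.shard {
        // Unsharded request: use the bare tenant ID as the prefix. With a
        // partial (non-delimited) prefix match this also covers every
        // `<tenant_id>-XXYY` shard prefix, including stale parent shards.
        None => format!("tenants/{}", id.tenant_id),
        // Sharded request: delete only this shard's data.
        Some((number, count)) => {
            format!("tenants/{}-{:02x}{:02x}", id.tenant_id, number, count)
        }
    }
}

fn main() {
    // Abbreviated, made-up tenant ID for the example.
    let unsharded = ShardId { tenant_id: "3aa8fe6d".into(), shard: None };
    let shard_1_of_4 = ShardId { tenant_id: "3aa8fe6d".into(), shard: Some((1, 4)) };
    println!("{}", deletion_prefix(&unsharded));    // tenants/3aa8fe6d
    println!("{}", deletion_prefix(&shard_1_of_4)); // tenants/3aa8fe6d-0104
}
```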
Author: Erik Grinaker
Date: 2024-10-17 16:34:51 +02:00 (committed by GitHub)
Parent commit: f3a3eefd26
Commit: 4c9835f4a3
6 changed files with 376 additions and 97 deletions


```diff
@@ -2862,17 +2862,12 @@ impl Service {
         let _tenant_lock =
             trace_exclusive_lock(&self.tenant_op_locks, tenant_id, TenantOperations::Delete).await;
 
-        // Detach all shards
-        let (detach_waiters, shard_ids, node) = {
-            let mut shard_ids = Vec::new();
+        // Detach all shards. This also deletes local pageserver shard data.
+        let (detach_waiters, node) = {
             let mut detach_waiters = Vec::new();
             let mut locked = self.inner.write().unwrap();
             let (nodes, tenants, scheduler) = locked.parts_mut();
-            for (tenant_shard_id, shard) in
-                tenants.range_mut(TenantShardId::tenant_range(tenant_id))
-            {
-                shard_ids.push(*tenant_shard_id);
+            for (_, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) {
                 // Update the tenant's intent to remove all attachments
                 shard.policy = PlacementPolicy::Detached;
                 shard
@@ -2892,7 +2887,7 @@
             let node = nodes
                 .get(&node_id)
                 .expect("Pageservers may not be deleted while lock is active");
-            (detach_waiters, shard_ids, node.clone())
+            (detach_waiters, node.clone())
         };
 
         // This reconcile wait can fail in a few ways:
@@ -2907,38 +2902,34 @@
         self.await_waiters(detach_waiters, RECONCILE_TIMEOUT)
             .await?;
 
-        let locations = shard_ids
-            .into_iter()
-            .map(|s| (s, node.clone()))
-            .collect::<Vec<_>>();
-        let results = self.tenant_for_shards_api(
-            locations,
-            |tenant_shard_id, client| async move { client.tenant_delete(tenant_shard_id).await },
-            1,
-            3,
-            RECONCILE_TIMEOUT,
-            &self.cancel,
-        )
-        .await;
-        for result in results {
-            match result {
-                Ok(StatusCode::ACCEPTED) => {
-                    // This should never happen: we waited for detaches to finish above
-                    return Err(ApiError::InternalServerError(anyhow::anyhow!(
-                        "Unexpectedly still attached on {}",
-                        node
-                    )));
-                }
-                Ok(_) => {}
-                Err(mgmt_api::Error::Cancelled) => {
-                    return Err(ApiError::ShuttingDown);
-                }
-                Err(e) => {
-                    // This is unexpected: remote deletion should be infallible, unless the object store
-                    // at large is unavailable.
-                    tracing::error!("Error deleting via node {}: {e}", node);
-                    return Err(ApiError::InternalServerError(anyhow::anyhow!(e)));
-                }
+        // Delete the entire tenant (all shards) from remote storage via a random pageserver.
+        // Passing an unsharded tenant ID will cause the pageserver to remove all remote paths with
+        // the tenant ID prefix, including all shards (even possibly stale ones).
+        match node
+            .with_client_retries(
+                |client| async move {
+                    client
+                        .tenant_delete(TenantShardId::unsharded(tenant_id))
+                        .await
+                },
+                &self.config.jwt_token,
+                1,
+                3,
+                RECONCILE_TIMEOUT,
+                &self.cancel,
+            )
+            .await
+            .unwrap_or(Err(mgmt_api::Error::Cancelled))
+        {
+            Ok(_) => {}
+            Err(mgmt_api::Error::Cancelled) => {
+                return Err(ApiError::ShuttingDown);
+            }
+            Err(e) => {
+                // This is unexpected: remote deletion should be infallible, unless the object store
+                // at large is unavailable.
+                tracing::error!("Error deleting via node {node}: {e}");
+                return Err(ApiError::InternalServerError(anyhow::anyhow!(e)));
+            }
+        }
```
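
One detail of the new control flow worth calling out: judging from the
`.unwrap_or(Err(mgmt_api::Error::Cancelled))` above, `with_client_retries`
appears to return an `Option<Result<..>>` where `None` means the retry loop
was cancelled before completing. A minimal standalone sketch of that fold
(the types here are stand-ins, not the controller's real ones):

```rust
/// Stand-in for mgmt_api::Error in this illustration.
#[derive(Debug)]
enum MgmtError {
    Cancelled,
    Other(String),
}

/// `None` (cancelled retry loop) is folded into `Err(Cancelled)`, so both
/// cancellation paths map to the same "shutting down" outcome, while any
/// other error becomes an internal error.
fn handle(outcome: Option<Result<(), MgmtError>>) -> Result<(), String> {
    match outcome.unwrap_or(Err(MgmtError::Cancelled)) {
        Ok(()) => Ok(()),
        Err(MgmtError::Cancelled) => Err("shutting down".to_string()),
        Err(MgmtError::Other(e)) => Err(format!("internal error: {e}")),
    }
}

fn main() {
    // Cancelled before any attempt completed: same as an explicit Cancelled error.
    assert_eq!(handle(None), Err("shutting down".to_string()));
    // A completed, successful deletion.
    assert_eq!(handle(Some(Ok(()))), Ok(()));
    // Any other failure surfaces as an internal error.
    assert_eq!(
        handle(Some(Err(MgmtError::Other("503".into())))),
        Err("internal error: 503".to_string())
    );
}
```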