storage_controller: delete stale shards when deleting tenant (#9333)

## Problem

Tenant deletion only removes the current shards from remote storage. Any
stale parent shards (before splits) will be left behind. These shards
are kept since child shards may reference data from the parent until new
image layers are generated.

## Summary of changes

* Document a special case for pageserver tenant deletion that deletes
all shards in remote storage when given an unsharded tenant ID, as well
as any unsharded tenant data.
* Pass an unsharded tenant ID to delete all remote storage under the
tenant ID prefix.
* Split out `RemoteStorage::delete_prefix()` to delete a bucket prefix,
with additional test coverage.
* Add a `delimiter` argument to `asset_prefix_empty()` to support
partial prefix matches (i.e. all shards starting with a given tenant
ID).
This commit is contained in:
Erik Grinaker
2024-10-17 16:34:51 +02:00
committed by GitHub
parent f3a3eefd26
commit 4c9835f4a3
6 changed files with 376 additions and 97 deletions

View File

@@ -19,7 +19,12 @@ mod simulate_failures;
mod support;
use std::{
collections::HashMap, fmt::Debug, num::NonZeroU32, ops::Bound, pin::Pin, sync::Arc,
collections::HashMap,
fmt::Debug,
num::NonZeroU32,
ops::Bound,
pin::{pin, Pin},
sync::Arc,
time::SystemTime,
};
@@ -28,6 +33,7 @@ use camino::{Utf8Path, Utf8PathBuf};
use bytes::Bytes;
use futures::{stream::Stream, StreamExt};
use itertools::Itertools as _;
use serde::{Deserialize, Serialize};
use tokio::sync::Semaphore;
use tokio_util::sync::CancellationToken;
@@ -261,7 +267,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
max_keys: Option<NonZeroU32>,
cancel: &CancellationToken,
) -> Result<Listing, DownloadError> {
let mut stream = std::pin::pin!(self.list_streaming(prefix, mode, max_keys, cancel));
let mut stream = pin!(self.list_streaming(prefix, mode, max_keys, cancel));
let mut combined = stream.next().await.expect("At least one item required")?;
while let Some(list) = stream.next().await {
let list = list?;
@@ -324,6 +330,35 @@ pub trait RemoteStorage: Send + Sync + 'static {
cancel: &CancellationToken,
) -> anyhow::Result<()>;
/// Deletes all objects matching the given prefix.
///
/// NB: this uses NoDelimiter and will match partial prefixes. For example, the prefix /a/b will
/// delete /a/b, /a/b/*, /a/bc, /a/bc/*, etc.
///
/// If the operation fails because of timeout or cancellation, the root cause of the error will
/// be set to `TimeoutOrCancel`. In such situation it is unknown which deletions, if any, went
/// through.
async fn delete_prefix(
&self,
prefix: &RemotePath,
cancel: &CancellationToken,
) -> anyhow::Result<()> {
let mut stream =
pin!(self.list_streaming(Some(prefix), ListingMode::NoDelimiter, None, cancel));
while let Some(result) = stream.next().await {
let keys = match result {
Ok(listing) if listing.keys.is_empty() => continue,
Ok(listing) => listing.keys.into_iter().map(|o| o.key).collect_vec(),
Err(DownloadError::Cancelled) => return Err(TimeoutOrCancel::Cancel.into()),
Err(DownloadError::Timeout) => return Err(TimeoutOrCancel::Timeout.into()),
Err(err) => return Err(err.into()),
};
tracing::info!("Deleting {} keys from remote storage", keys.len());
self.delete_objects(&keys, cancel).await?;
}
Ok(())
}
/// Copy a remote object inside a bucket from one path to another.
async fn copy(
&self,
@@ -488,6 +523,20 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
}
}
/// See [`RemoteStorage::delete_prefix`]
pub async fn delete_prefix(
&self,
prefix: &RemotePath,
cancel: &CancellationToken,
) -> anyhow::Result<()> {
match self {
Self::LocalFs(s) => s.delete_prefix(prefix, cancel).await,
Self::AwsS3(s) => s.delete_prefix(prefix, cancel).await,
Self::AzureBlob(s) => s.delete_prefix(prefix, cancel).await,
Self::Unreliable(s) => s.delete_prefix(prefix, cancel).await,
}
}
/// See [`RemoteStorage::copy`]
pub async fn copy_object(
&self,

View File

@@ -199,6 +199,138 @@ async fn list_no_delimiter_works(
Ok(())
}
/// Tests that giving a partial prefix returns all matches (e.g. "/foo" yields "/foobar/baz"),
/// but only with NoDelimiter.
#[test_context(MaybeEnabledStorageWithSimpleTestBlobs)]
#[tokio::test]
async fn list_partial_prefix(
ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs,
) -> anyhow::Result<()> {
let ctx = match ctx {
MaybeEnabledStorageWithSimpleTestBlobs::Enabled(ctx) => ctx,
MaybeEnabledStorageWithSimpleTestBlobs::Disabled => return Ok(()),
MaybeEnabledStorageWithSimpleTestBlobs::UploadsFailed(e, _) => {
anyhow::bail!("S3 init failed: {e:?}")
}
};
let cancel = CancellationToken::new();
let test_client = Arc::clone(&ctx.enabled.client);
// Prefix "fold" should match all "folder{i}" directories with NoDelimiter.
let objects: HashSet<_> = test_client
.list(
Some(&RemotePath::from_string("fold")?),
ListingMode::NoDelimiter,
None,
&cancel,
)
.await?
.keys
.into_iter()
.map(|o| o.key)
.collect();
assert_eq!(&objects, &ctx.remote_blobs);
// Prefix "fold" matches nothing with WithDelimiter.
let objects: HashSet<_> = test_client
.list(
Some(&RemotePath::from_string("fold")?),
ListingMode::WithDelimiter,
None,
&cancel,
)
.await?
.keys
.into_iter()
.map(|o| o.key)
.collect();
assert!(objects.is_empty());
// Prefix "" matches everything.
let objects: HashSet<_> = test_client
.list(
Some(&RemotePath::from_string("")?),
ListingMode::NoDelimiter,
None,
&cancel,
)
.await?
.keys
.into_iter()
.map(|o| o.key)
.collect();
assert_eq!(&objects, &ctx.remote_blobs);
// Prefix "" matches nothing with WithDelimiter.
let objects: HashSet<_> = test_client
.list(
Some(&RemotePath::from_string("")?),
ListingMode::WithDelimiter,
None,
&cancel,
)
.await?
.keys
.into_iter()
.map(|o| o.key)
.collect();
assert!(objects.is_empty());
// Prefix "foo" matches nothing.
let objects: HashSet<_> = test_client
.list(
Some(&RemotePath::from_string("foo")?),
ListingMode::NoDelimiter,
None,
&cancel,
)
.await?
.keys
.into_iter()
.map(|o| o.key)
.collect();
assert!(objects.is_empty());
// Prefix "folder2/blob" matches.
let objects: HashSet<_> = test_client
.list(
Some(&RemotePath::from_string("folder2/blob")?),
ListingMode::NoDelimiter,
None,
&cancel,
)
.await?
.keys
.into_iter()
.map(|o| o.key)
.collect();
let expect: HashSet<_> = ctx
.remote_blobs
.iter()
.filter(|o| o.get_path().starts_with("folder2"))
.cloned()
.collect();
assert_eq!(&objects, &expect);
// Prefix "folder2/foo" matches nothing.
let objects: HashSet<_> = test_client
.list(
Some(&RemotePath::from_string("folder2/foo")?),
ListingMode::NoDelimiter,
None,
&cancel,
)
.await?
.keys
.into_iter()
.map(|o| o.key)
.collect();
assert!(objects.is_empty());
Ok(())
}
#[test_context(MaybeEnabledStorage)]
#[tokio::test]
async fn delete_non_exising_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {
@@ -265,6 +397,80 @@ async fn delete_objects_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<(
Ok(())
}
/// Tests that delete_prefix() will delete all objects matching a prefix, including
/// partial prefixes (i.e. "/foo" matches "/foobar").
#[test_context(MaybeEnabledStorageWithSimpleTestBlobs)]
#[tokio::test]
async fn delete_prefix(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> anyhow::Result<()> {
let ctx = match ctx {
MaybeEnabledStorageWithSimpleTestBlobs::Enabled(ctx) => ctx,
MaybeEnabledStorageWithSimpleTestBlobs::Disabled => return Ok(()),
MaybeEnabledStorageWithSimpleTestBlobs::UploadsFailed(e, _) => {
anyhow::bail!("S3 init failed: {e:?}")
}
};
let cancel = CancellationToken::new();
let test_client = Arc::clone(&ctx.enabled.client);
/// Asserts that the S3 listing matches the given paths.
macro_rules! assert_list {
($expect:expr) => {{
let listing = test_client
.list(None, ListingMode::NoDelimiter, None, &cancel)
.await?
.keys
.into_iter()
.map(|o| o.key)
.collect();
assert_eq!($expect, listing);
}};
}
// We start with the full set of uploaded files.
let mut expect = ctx.remote_blobs.clone();
// Deleting a non-existing prefix should do nothing.
test_client
.delete_prefix(&RemotePath::from_string("xyz")?, &cancel)
.await?;
assert_list!(expect);
// Prefixes are case-sensitive.
test_client
.delete_prefix(&RemotePath::from_string("Folder")?, &cancel)
.await?;
assert_list!(expect);
// Deleting a path which overlaps with an existing object should do nothing. We pick the first
// path in the set as our common prefix.
let path = expect.iter().next().expect("empty set").clone().join("xyz");
test_client.delete_prefix(&path, &cancel).await?;
assert_list!(expect);
// Deleting an exact path should work. We pick the first path in the set.
let path = expect.iter().next().expect("empty set").clone();
test_client.delete_prefix(&path, &cancel).await?;
expect.remove(&path);
assert_list!(expect);
// Deleting a prefix should delete all matching objects.
test_client
.delete_prefix(&RemotePath::from_string("folder0/blob_")?, &cancel)
.await?;
expect.retain(|p| !p.get_path().as_str().starts_with("folder0/"));
assert_list!(expect);
// Deleting a common prefix should delete all objects.
test_client
.delete_prefix(&RemotePath::from_string("fold")?, &cancel)
.await?;
expect.clear();
assert_list!(expect);
Ok(())
}
#[test_context(MaybeEnabledStorage)]
#[tokio::test]
async fn upload_download_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {

View File

@@ -11,6 +11,7 @@ use pageserver_api::shard::{
};
use pageserver_api::upcall_api::ReAttachResponseTenant;
use rand::{distributions::Alphanumeric, Rng};
use remote_storage::TimeoutOrCancel;
use std::borrow::Cow;
use std::cmp::Ordering;
use std::collections::{BTreeMap, HashMap, HashSet};
@@ -1350,47 +1351,17 @@ impl TenantManager {
}
}
async fn delete_tenant_remote(
&self,
tenant_shard_id: TenantShardId,
) -> Result<(), DeleteTenantError> {
let remote_path = remote_tenant_path(&tenant_shard_id);
let mut keys_stream = self.resources.remote_storage.list_streaming(
Some(&remote_path),
remote_storage::ListingMode::NoDelimiter,
None,
&self.cancel,
);
while let Some(chunk) = keys_stream.next().await {
let keys = match chunk {
Ok(listing) => listing.keys,
Err(remote_storage::DownloadError::Cancelled) => {
return Err(DeleteTenantError::Cancelled)
}
Err(remote_storage::DownloadError::NotFound) => return Ok(()),
Err(other) => return Err(DeleteTenantError::Other(anyhow::anyhow!(other))),
};
if keys.is_empty() {
tracing::info!("Remote storage already deleted");
} else {
tracing::info!("Deleting {} keys from remote storage", keys.len());
let keys = keys.into_iter().map(|o| o.key).collect::<Vec<_>>();
self.resources
.remote_storage
.delete_objects(&keys, &self.cancel)
.await?;
}
}
Ok(())
}
/// If a tenant is attached, detach it. Then remove its data from remote storage.
///
/// A tenant is considered deleted once it is gone from remote storage. It is the caller's
/// responsibility to avoid trying to attach the tenant again or use it any way once deletion
/// has started: this operation is not atomic, and must be retried until it succeeds.
///
/// As a special case, if an unsharded tenant ID is given for a sharded tenant, it will remove
/// all tenant shards in remote storage (removing all paths with the tenant prefix). The storage
/// controller uses this to purge all remote tenant data, including any stale parent shards that
/// may remain after splits. Ideally, this special case would be handled elsewhere. See:
/// <https://github.com/neondatabase/neon/pull/9394>.
pub(crate) async fn delete_tenant(
&self,
tenant_shard_id: TenantShardId,
@@ -1442,25 +1413,29 @@ impl TenantManager {
// in 500 responses to delete requests.
// - We keep the `SlotGuard` during this I/O, so that if a concurrent delete request comes in, it will
// 503/retry, rather than kicking off a wasteful concurrent deletion.
match backoff::retry(
|| async move { self.delete_tenant_remote(tenant_shard_id).await },
|e| match e {
DeleteTenantError::Cancelled => true,
DeleteTenantError::SlotError(_) => {
unreachable!("Remote deletion doesn't touch slots")
}
_ => false,
// NB: this also deletes partial prefixes, i.e. a <tenant_id> path will delete all
// <tenant_id>_<shard_id>/* objects. See method comment for why.
backoff::retry(
|| async move {
self.resources
.remote_storage
.delete_prefix(&remote_tenant_path(&tenant_shard_id), &self.cancel)
.await
},
|_| false, // backoff::retry handles cancellation
1,
3,
&format!("delete_tenant[tenant_shard_id={tenant_shard_id}]"),
&self.cancel,
)
.await
{
Some(r) => r,
None => Err(DeleteTenantError::Cancelled),
}
.unwrap_or(Err(TimeoutOrCancel::Cancel.into()))
.map_err(|err| {
if TimeoutOrCancel::caused_by_cancel(&err) {
return DeleteTenantError::Cancelled;
}
DeleteTenantError::Other(err)
})
}
#[instrument(skip_all, fields(tenant_id=%tenant.get_tenant_shard_id().tenant_id, shard_id=%tenant.get_tenant_shard_id().shard_slug(), new_shard_count=%new_shard_count.literal()))]

View File

@@ -2862,17 +2862,12 @@ impl Service {
let _tenant_lock =
trace_exclusive_lock(&self.tenant_op_locks, tenant_id, TenantOperations::Delete).await;
// Detach all shards
let (detach_waiters, shard_ids, node) = {
let mut shard_ids = Vec::new();
// Detach all shards. This also deletes local pageserver shard data.
let (detach_waiters, node) = {
let mut detach_waiters = Vec::new();
let mut locked = self.inner.write().unwrap();
let (nodes, tenants, scheduler) = locked.parts_mut();
for (tenant_shard_id, shard) in
tenants.range_mut(TenantShardId::tenant_range(tenant_id))
{
shard_ids.push(*tenant_shard_id);
for (_, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) {
// Update the tenant's intent to remove all attachments
shard.policy = PlacementPolicy::Detached;
shard
@@ -2892,7 +2887,7 @@ impl Service {
let node = nodes
.get(&node_id)
.expect("Pageservers may not be deleted while lock is active");
(detach_waiters, shard_ids, node.clone())
(detach_waiters, node.clone())
};
// This reconcile wait can fail in a few ways:
@@ -2907,38 +2902,34 @@ impl Service {
self.await_waiters(detach_waiters, RECONCILE_TIMEOUT)
.await?;
let locations = shard_ids
.into_iter()
.map(|s| (s, node.clone()))
.collect::<Vec<_>>();
let results = self.tenant_for_shards_api(
locations,
|tenant_shard_id, client| async move { client.tenant_delete(tenant_shard_id).await },
1,
3,
RECONCILE_TIMEOUT,
&self.cancel,
)
.await;
for result in results {
match result {
Ok(StatusCode::ACCEPTED) => {
// This should never happen: we waited for detaches to finish above
return Err(ApiError::InternalServerError(anyhow::anyhow!(
"Unexpectedly still attached on {}",
node
)));
}
Ok(_) => {}
Err(mgmt_api::Error::Cancelled) => {
return Err(ApiError::ShuttingDown);
}
Err(e) => {
// This is unexpected: remote deletion should be infallible, unless the object store
// at large is unavailable.
tracing::error!("Error deleting via node {}: {e}", node);
return Err(ApiError::InternalServerError(anyhow::anyhow!(e)));
}
// Delete the entire tenant (all shards) from remote storage via a random pageserver.
// Passing an unsharded tenant ID will cause the pageserver to remove all remote paths with
// the tenant ID prefix, including all shards (even possibly stale ones).
match node
.with_client_retries(
|client| async move {
client
.tenant_delete(TenantShardId::unsharded(tenant_id))
.await
},
&self.config.jwt_token,
1,
3,
RECONCILE_TIMEOUT,
&self.cancel,
)
.await
.unwrap_or(Err(mgmt_api::Error::Cancelled))
{
Ok(_) => {}
Err(mgmt_api::Error::Cancelled) => {
return Err(ApiError::ShuttingDown);
}
Err(e) => {
// This is unexpected: remote deletion should be infallible, unless the object store
// at large is unavailable.
tracing::error!("Error deleting via node {node}: {e}");
return Err(ApiError::InternalServerError(anyhow::anyhow!(e)));
}
}

View File

@@ -303,9 +303,10 @@ def assert_prefix_empty(
remote_storage: Optional[RemoteStorage],
prefix: Optional[str] = None,
allowed_postfix: Optional[str] = None,
delimiter: str = "/",
) -> None:
assert remote_storage is not None
response = list_prefix(remote_storage, prefix)
response = list_prefix(remote_storage, prefix, delimiter)
keys = response["KeyCount"]
objects: list[ObjectTypeDef] = response.get("Contents", [])
common_prefixes = response.get("CommonPrefixes", [])
@@ -338,16 +339,18 @@ def assert_prefix_empty(
if not (allowed_postfix.endswith(key)):
filtered_count += 1
assert (
filtered_count == 0
), f"remote dir with prefix {prefix} is not empty after deletion: {objects}"
assert filtered_count == 0, f"remote prefix {prefix} is not empty: {objects}"
# remote_storage must not be None, but that's easier for callers to make mypy happy
def assert_prefix_not_empty(remote_storage: Optional[RemoteStorage], prefix: Optional[str] = None):
def assert_prefix_not_empty(
remote_storage: Optional[RemoteStorage],
prefix: Optional[str] = None,
delimiter: str = "/",
):
assert remote_storage is not None
response = list_prefix(remote_storage, prefix)
assert response["KeyCount"] != 0, f"remote dir with prefix {prefix} is empty: {response}"
assert response["KeyCount"] != 0, f"remote prefix {prefix} is empty: {response}"
def list_prefix(

View File

@@ -20,6 +20,7 @@ from fixtures.pageserver.utils import (
)
from fixtures.remote_storage import RemoteStorageKind, s3_storage
from fixtures.utils import run_pg_bench_small, wait_until
from fixtures.workload import Workload
from requests.exceptions import ReadTimeout
from werkzeug.wrappers.request import Request
from werkzeug.wrappers.response import Response
@@ -404,3 +405,57 @@ def test_tenant_delete_scrubber(pg_bin: PgBin, make_httpserver, neon_env_builder
cloud_admin_api_token=cloud_admin_token,
)
assert healthy
def test_tenant_delete_stale_shards(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
"""
Deleting a tenant should also delete any stale (pre-split) shards from remote storage.
"""
remote_storage_kind = s3_storage()
neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
env = neon_env_builder.init_start()
# Create an unsharded tenant.
tenant_id, timeline_id = env.create_tenant()
# Write some data.
workload = Workload(env, tenant_id, timeline_id, branch_name="main")
workload.init()
workload.write_rows(256)
workload.validate()
assert_prefix_not_empty(
neon_env_builder.pageserver_remote_storage,
prefix="/".join(("tenants", str(tenant_id))),
)
# Upload a heatmap as well.
env.pageserver.http_client().tenant_heatmap_upload(tenant_id)
# Split off a few shards, in two rounds.
env.storage_controller.tenant_shard_split(tenant_id, shard_count=4)
env.storage_controller.tenant_shard_split(tenant_id, shard_count=16)
# Delete the tenant. This should also delete data for the unsharded and count=4 parents.
env.storage_controller.pageserver_api().tenant_delete(tenant_id=tenant_id)
assert_prefix_empty(
neon_env_builder.pageserver_remote_storage,
prefix="/".join(("tenants", str(tenant_id))),
delimiter="", # match partial prefixes, i.e. all shards
)
dirs = list(env.pageserver.tenant_dir(None).glob(f"{tenant_id}*"))
assert dirs == [], f"found tenant directories: {dirs}"
# The initial tenant created by the test harness should still be there.
# Only the tenant we deleted should be removed.
assert_prefix_not_empty(
neon_env_builder.pageserver_remote_storage,
prefix="/".join(("tenants", str(env.initial_tenant))),
)
dirs = list(env.pageserver.tenant_dir(None).glob(f"{env.initial_tenant}*"))
assert dirs != [], "missing initial tenant directory"
env.stop()