storcon: handle ongoing deletions gracefully (#9449)

## Problem

The pageserver returns 409 (Conflict) if any of the shards are already
deleting the timeline. Previously, this resulted in an error being
propagated out of the HTTP handler to the client. It's an expected
scenario, so we should handle it gracefully.

This caused failures in `test_storage_controller_smoke`
[here](https://neon-github-public-dev.s3.amazonaws.com/reports/pr-9435/11390431900/index.html#suites/8fc5d1648d2225380766afde7c428d81/86eee4b002d6572d).

## Summary of Changes

Instead of returning an error on 409s, we now bubble the status code up
and let the HTTP handler code retry until it gets a 404 or times out.
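
Conceptually, the handler now treats 202 (Accepted) and 409 (Conflict) the same way: keep polling until the timeline returns 404 or a deadline expires. Below is a minimal, self-contained sketch of that polling loop, assuming the `http` and `tokio` crates; the function name `wait_for_timeline_deletion` and the `poll` parameter are invented for illustration, and the actual change is in the diff further down.

```rust
use std::time::{Duration, Instant};

use http::StatusCode;

// Sketch only: `poll` stands in for one deletion request against the service
// and returns the status code it observed.
async fn wait_for_timeline_deletion<F, Fut>(poll: F) -> StatusCode
where
    F: Fn() -> Fut,
    Fut: std::future::Future<Output = StatusCode>,
{
    const MAX_WAIT: Duration = Duration::from_secs(25);
    const MAX_RETRY_PERIOD: Duration = Duration::from_secs(5);

    let started_at = Instant::now();
    let mut retry_period = Duration::from_secs(1);

    loop {
        match poll().await {
            // The timeline is gone: deletion finished.
            StatusCode::NOT_FOUND => return StatusCode::OK,
            // Deletion accepted on this attempt: back off to the longer retry period.
            StatusCode::ACCEPTED => {
                tokio::time::sleep(retry_period).await;
                retry_period = MAX_RETRY_PERIOD;
            }
            // Deletion already in progress (the previously fatal 409): just wait and poll again.
            StatusCode::CONFLICT => {
                tokio::time::sleep(retry_period).await;
            }
            other => return other,
        }

        // Give up before the caller's own request timeout expires.
        if Instant::now() + retry_period > started_at + MAX_WAIT {
            return StatusCode::CONFLICT;
        }
    }
}
```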
Author: Vlad Lazar
Date: 2024-10-18 15:33:04 +01:00 (committed by GitHub)
Commit: e162ab8b53 (parent: 5cbdec9c79)
Diffstat: 2 changed files with 33 additions and 14 deletions


```diff
@@ -381,14 +381,16 @@ async fn handle_tenant_timeline_delete(
     R: std::future::Future<Output = Result<StatusCode, ApiError>> + Send + 'static,
     F: Fn(Arc<Service>) -> R + Send + Sync + 'static,
 {
+    // On subsequent retries, wait longer.
+    // Enable callers with a 25 second request timeout to reliably get a response
+    const MAX_WAIT: Duration = Duration::from_secs(25);
+    const MAX_RETRY_PERIOD: Duration = Duration::from_secs(5);
+
     let started_at = Instant::now();
+
     // To keep deletion reasonably snappy for small tenants, initially check after 1 second if deletion
     // completed.
     let mut retry_period = Duration::from_secs(1);
-    // On subsequent retries, wait longer.
-    let max_retry_period = Duration::from_secs(5);
-    // Enable callers with a 30 second request timeout to reliably get a response
-    let max_wait = Duration::from_secs(25);
 
     loop {
         let status = f(service.clone()).await?;
@@ -396,7 +398,11 @@ async fn handle_tenant_timeline_delete(
             StatusCode::ACCEPTED => {
                 tracing::info!("Deletion accepted, waiting to try again...");
                 tokio::time::sleep(retry_period).await;
-                retry_period = max_retry_period;
+                retry_period = MAX_RETRY_PERIOD;
+            }
+            StatusCode::CONFLICT => {
+                tracing::info!("Deletion already in progress, waiting to try again...");
+                tokio::time::sleep(retry_period).await;
             }
             StatusCode::NOT_FOUND => {
                 tracing::info!("Deletion complete");
@@ -409,7 +415,7 @@ async fn handle_tenant_timeline_delete(
         }
 
         let now = Instant::now();
-        if now + retry_period > started_at + max_wait {
+        if now + retry_period > started_at + MAX_WAIT {
             tracing::info!("Deletion timed out waiting for 404");
             // REQUEST_TIMEOUT would be more appropriate, but CONFLICT is already part of
             // the pageserver's swagger definition for this endpoint, and has the same desired
```


```diff
@@ -3630,14 +3630,21 @@ impl Service {
             );
 
             let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref());
-            client
+            let res = client
                 .timeline_delete(tenant_shard_id, timeline_id)
-                .await
-                .map_err(|e| {
-                    ApiError::InternalServerError(anyhow::anyhow!(
-                        "Error deleting timeline {timeline_id} on {tenant_shard_id} on node {node}: {e}",
-                    ))
-                })
+                .await;
+
+            match res {
+                Ok(ok) => Ok(ok),
+                Err(mgmt_api::Error::ApiError(StatusCode::CONFLICT, _)) => Ok(StatusCode::CONFLICT),
+                Err(e) => {
+                    Err(
+                        ApiError::InternalServerError(anyhow::anyhow!(
+                            "Error deleting timeline {timeline_id} on {tenant_shard_id} on node {node}: {e}",
+                        ))
+                    )
+                }
+            }
         }
 
         let locations = targets.0.iter().map(|t| (*t.0, t.1.latest.node.clone())).collect();
@@ -3652,7 +3659,13 @@ impl Service {
             })
             .await?;
 
-        // If any shards >0 haven't finished deletion yet, don't start deletion on shard zero
+        // If any shards >0 haven't finished deletion yet, don't start deletion on shard zero.
+        // We return 409 (Conflict) if deletion was already in progress on any of the shards
+        // and 202 (Accepted) if deletion was not already in progress on any of the shards.
+        if statuses.iter().any(|s| s == &StatusCode::CONFLICT) {
+            return Ok(StatusCode::CONFLICT);
+        }
+
         if statuses.iter().any(|s| s != &StatusCode::NOT_FOUND) {
             return Ok(StatusCode::ACCEPTED);
         }
```
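
For readers skimming the diff, the new shard-zero gating boils down to a small aggregation rule over the per-shard status codes. The sketch below restates it as a standalone function; `aggregate_shard_statuses` is an illustrative name, not part of the change, and the `None` case corresponds to falling through to deletion on shard zero.

```rust
use http::StatusCode;

/// Illustrative aggregation of per-shard deletion statuses, mirroring the diff above:
/// any 409 means a deletion is already in progress on some shard, any non-404 means
/// deletion is still ongoing, and only "all 404" lets shard zero proceed.
fn aggregate_shard_statuses(statuses: &[StatusCode]) -> Option<StatusCode> {
    if statuses.iter().any(|s| *s == StatusCode::CONFLICT) {
        return Some(StatusCode::CONFLICT);
    }
    if statuses.iter().any(|s| *s != StatusCode::NOT_FOUND) {
        return Some(StatusCode::ACCEPTED);
    }
    None // every shard returned 404: safe to start deletion on shard zero
}
```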