diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 46b6f4f2bf..afefe8598c 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -381,14 +381,16 @@ async fn handle_tenant_timeline_delete( R: std::future::Future> + Send + 'static, F: Fn(Arc) -> R + Send + Sync + 'static, { + // On subsequent retries, wait longer. + // Enable callers with a 25 second request timeout to reliably get a response + const MAX_WAIT: Duration = Duration::from_secs(25); + const MAX_RETRY_PERIOD: Duration = Duration::from_secs(5); + let started_at = Instant::now(); + // To keep deletion reasonably snappy for small tenants, initially check after 1 second if deletion // completed. let mut retry_period = Duration::from_secs(1); - // On subsequent retries, wait longer. - let max_retry_period = Duration::from_secs(5); - // Enable callers with a 30 second request timeout to reliably get a response - let max_wait = Duration::from_secs(25); loop { let status = f(service.clone()).await?; @@ -396,7 +398,11 @@ async fn handle_tenant_timeline_delete( StatusCode::ACCEPTED => { tracing::info!("Deletion accepted, waiting to try again..."); tokio::time::sleep(retry_period).await; - retry_period = max_retry_period; + retry_period = MAX_RETRY_PERIOD; + } + StatusCode::CONFLICT => { + tracing::info!("Deletion already in progress, waiting to try again..."); + tokio::time::sleep(retry_period).await; } StatusCode::NOT_FOUND => { tracing::info!("Deletion complete"); @@ -409,7 +415,7 @@ async fn handle_tenant_timeline_delete( } let now = Instant::now(); - if now + retry_period > started_at + max_wait { + if now + retry_period > started_at + MAX_WAIT { tracing::info!("Deletion timed out waiting for 404"); // REQUEST_TIMEOUT would be more appropriate, but CONFLICT is already part of // the pageserver's swagger definition for this endpoint, and has the same desired diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index ab2c3b5e48..01aa8f1dab 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -3630,14 +3630,21 @@ impl Service { ); let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref()); - client + let res = client .timeline_delete(tenant_shard_id, timeline_id) - .await - .map_err(|e| { - ApiError::InternalServerError(anyhow::anyhow!( - "Error deleting timeline {timeline_id} on {tenant_shard_id} on node {node}: {e}", - )) - }) + .await; + + match res { + Ok(ok) => Ok(ok), + Err(mgmt_api::Error::ApiError(StatusCode::CONFLICT, _)) => Ok(StatusCode::CONFLICT), + Err(e) => { + Err( + ApiError::InternalServerError(anyhow::anyhow!( + "Error deleting timeline {timeline_id} on {tenant_shard_id} on node {node}: {e}", + )) + ) + } + } } let locations = targets.0.iter().map(|t| (*t.0, t.1.latest.node.clone())).collect(); @@ -3652,7 +3659,13 @@ impl Service { }) .await?; - // If any shards >0 haven't finished deletion yet, don't start deletion on shard zero + // If any shards >0 haven't finished deletion yet, don't start deletion on shard zero. + // We return 409 (Conflict) if deletion was already in progress on any of the shards + // and 202 (Accepted) if deletion was not already in progress on any of the shards. + if statuses.iter().any(|s| s == &StatusCode::CONFLICT) { + return Ok(StatusCode::CONFLICT); + } + if statuses.iter().any(|s| s != &StatusCode::NOT_FOUND) { return Ok(StatusCode::ACCEPTED); }