mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-27 18:10:37 +00:00
storcon: handle ongoing deletions gracefully (#9449)
## Problem

The pageserver returns 409 (Conflict) if any of the shards are already deleting the timeline. This resulted in an error being propagated out of the HTTP handler and on to the client. This is an expected scenario, so we should handle it gracefully.

This caused failures in `test_storage_controller_smoke` [here](https://neon-github-public-dev.s3.amazonaws.com/reports/pr-9435/11390431900/index.html#suites/8fc5d1648d2225380766afde7c428d81/86eee4b002d6572d).

## Summary of Changes

Instead of returning an error on 409s, we now bubble the status code up and let the HTTP handler code retry until it gets a 404 or times out.
This commit is contained in:
@@ -381,14 +381,16 @@ async fn handle_tenant_timeline_delete(
|
||||
R: std::future::Future<Output = Result<StatusCode, ApiError>> + Send + 'static,
|
||||
F: Fn(Arc<Service>) -> R + Send + Sync + 'static,
|
||||
{
|
||||
// On subsequent retries, wait longer.
|
||||
// Enable callers with a 30 second request timeout to reliably get a response
|
||||
const MAX_WAIT: Duration = Duration::from_secs(25);
|
||||
const MAX_RETRY_PERIOD: Duration = Duration::from_secs(5);
|
||||
|
||||
let started_at = Instant::now();
|
||||
|
||||
// To keep deletion reasonably snappy for small tenants, initially check after 1 second if deletion
|
||||
// completed.
|
||||
let mut retry_period = Duration::from_secs(1);
|
||||
// On subsequent retries, wait longer.
|
||||
let max_retry_period = Duration::from_secs(5);
|
||||
// Enable callers with a 30 second request timeout to reliably get a response
|
||||
let max_wait = Duration::from_secs(25);
|
||||
|
||||
loop {
|
||||
let status = f(service.clone()).await?;
|
||||
@@ -396,7 +398,11 @@ async fn handle_tenant_timeline_delete(
|
||||
StatusCode::ACCEPTED => {
|
||||
tracing::info!("Deletion accepted, waiting to try again...");
|
||||
tokio::time::sleep(retry_period).await;
|
||||
retry_period = max_retry_period;
|
||||
retry_period = MAX_RETRY_PERIOD;
|
||||
}
|
||||
StatusCode::CONFLICT => {
|
||||
tracing::info!("Deletion already in progress, waiting to try again...");
|
||||
tokio::time::sleep(retry_period).await;
|
||||
}
|
||||
StatusCode::NOT_FOUND => {
|
||||
tracing::info!("Deletion complete");
|
||||
@@ -409,7 +415,7 @@ async fn handle_tenant_timeline_delete(
|
||||
}
|
||||
|
||||
let now = Instant::now();
|
||||
if now + retry_period > started_at + max_wait {
|
||||
if now + retry_period > started_at + MAX_WAIT {
|
||||
tracing::info!("Deletion timed out waiting for 404");
|
||||
// REQUEST_TIMEOUT would be more appropriate, but CONFLICT is already part of
|
||||
// the pageserver's swagger definition for this endpoint, and has the same desired
|
||||
|
||||
Reference in New Issue
Block a user