storcon: handle ongoing deletions gracefully (#9449)

## Problem

The pageserver returns 409 (Conflict) if any of the shards are already
deleting the timeline. Previously, this resulted in an error being
propagated out of the HTTP handler to the client. It's an expected
scenario, so we should handle it gracefully.

This caused failures in `test_storage_controller_smoke`
[here](https://neon-github-public-dev.s3.amazonaws.com/reports/pr-9435/11390431900/index.html#suites/8fc5d1648d2225380766afde7c428d81/86eee4b002d6572d).

## Summary of Changes

Instead of returning an error on 409s, we now bubble the status code up
and let the HTTP handler code retry until it gets a 404 or times out.
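
Conceptually, the handler now treats 202 (Accepted) and 409 (Conflict) the same way: keep polling until the timeline returns 404 or a deadline expires. Below is a minimal, self-contained sketch of that polling loop, assuming the `http` and `tokio` crates; the function name `wait_for_timeline_deletion` and the `poll` parameter are invented for illustration, and the actual change is in the diff further down.

```rust
use std::time::{Duration, Instant};

use http::StatusCode;

// Sketch only: `poll` stands in for one deletion request against the service
// and returns the status code it observed.
async fn wait_for_timeline_deletion<F, Fut>(poll: F) -> StatusCode
where
    F: Fn() -> Fut,
    Fut: std::future::Future<Output = StatusCode>,
{
    const MAX_WAIT: Duration = Duration::from_secs(25);
    const MAX_RETRY_PERIOD: Duration = Duration::from_secs(5);

    let started_at = Instant::now();
    let mut retry_period = Duration::from_secs(1);

    loop {
        match poll().await {
            // The timeline is gone: deletion finished.
            StatusCode::NOT_FOUND => return StatusCode::OK,
            // Deletion accepted on this attempt: back off to the longer retry period.
            StatusCode::ACCEPTED => {
                tokio::time::sleep(retry_period).await;
                retry_period = MAX_RETRY_PERIOD;
            }
            // Deletion already in progress (the previously fatal 409): just wait and poll again.
            StatusCode::CONFLICT => {
                tokio::time::sleep(retry_period).await;
            }
            other => return other,
        }

        // Give up before the caller's own request timeout expires.
        if Instant::now() + retry_period > started_at + MAX_WAIT {
            return StatusCode::CONFLICT;
        }
    }
}
```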
Author: Vlad Lazar
Date: 2024-10-18 15:33:04 +01:00 (committed by GitHub)
Commit: e162ab8b53 (parent: 5cbdec9c79)
Diffstat: 2 changed files with 33 additions and 14 deletions


```diff
@@ -381,14 +381,16 @@ async fn handle_tenant_timeline_delete(
     R: std::future::Future<Output = Result<StatusCode, ApiError>> + Send + 'static,
     F: Fn(Arc<Service>) -> R + Send + Sync + 'static,
 {
+    // On subsequent retries, wait longer.
+    // Enable callers with a 25 second request timeout to reliably get a response
+    const MAX_WAIT: Duration = Duration::from_secs(25);
+    const MAX_RETRY_PERIOD: Duration = Duration::from_secs(5);
+
     let started_at = Instant::now();
+
     // To keep deletion reasonably snappy for small tenants, initially check after 1 second if deletion
     // completed.
     let mut retry_period = Duration::from_secs(1);
-    // On subsequent retries, wait longer.
-    let max_retry_period = Duration::from_secs(5);
-    // Enable callers with a 30 second request timeout to reliably get a response
-    let max_wait = Duration::from_secs(25);
 
     loop {
         let status = f(service.clone()).await?;
@@ -396,7 +398,11 @@ async fn handle_tenant_timeline_delete(
             StatusCode::ACCEPTED => {
                 tracing::info!("Deletion accepted, waiting to try again...");
                 tokio::time::sleep(retry_period).await;
-                retry_period = max_retry_period;
+                retry_period = MAX_RETRY_PERIOD;
+            }
+            StatusCode::CONFLICT => {
+                tracing::info!("Deletion already in progress, waiting to try again...");
+                tokio::time::sleep(retry_period).await;
             }
             StatusCode::NOT_FOUND => {
                 tracing::info!("Deletion complete");
@@ -409,7 +415,7 @@ async fn handle_tenant_timeline_delete(
         }
 
         let now = Instant::now();
-        if now + retry_period > started_at + max_wait {
+        if now + retry_period > started_at + MAX_WAIT {
             tracing::info!("Deletion timed out waiting for 404");
             // REQUEST_TIMEOUT would be more appropriate, but CONFLICT is already part of
             // the pageserver's swagger definition for this endpoint, and has the same desired
```


```diff
@@ -3630,14 +3630,21 @@ impl Service {
             );
 
             let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref());
-            client
+            let res = client
                 .timeline_delete(tenant_shard_id, timeline_id)
-                .await
-                .map_err(|e| {
-                    ApiError::InternalServerError(anyhow::anyhow!(
-                        "Error deleting timeline {timeline_id} on {tenant_shard_id} on node {node}: {e}",
-                    ))
-                })
+                .await;
+
+            match res {
+                Ok(ok) => Ok(ok),
+                Err(mgmt_api::Error::ApiError(StatusCode::CONFLICT, _)) => Ok(StatusCode::CONFLICT),
+                Err(e) => {
+                    Err(
+                        ApiError::InternalServerError(anyhow::anyhow!(
+                            "Error deleting timeline {timeline_id} on {tenant_shard_id} on node {node}: {e}",
+                        ))
+                    )
+                }
+            }
         }
 
         let locations = targets.0.iter().map(|t| (*t.0, t.1.latest.node.clone())).collect();
@@ -3652,7 +3659,13 @@ impl Service {
             })
             .await?;
 
-        // If any shards >0 haven't finished deletion yet, don't start deletion on shard zero
+        // If any shards >0 haven't finished deletion yet, don't start deletion on shard zero.
+        // We return 409 (Conflict) if deletion was already in progress on any of the shards
+        // and 202 (Accepted) if deletion was not already in progress on any of the shards.
+        if statuses.iter().any(|s| s == &StatusCode::CONFLICT) {
+            return Ok(StatusCode::CONFLICT);
+        }
+
         if statuses.iter().any(|s| s != &StatusCode::NOT_FOUND) {
             return Ok(StatusCode::ACCEPTED);
         }
```
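
For readers skimming the diff, the new shard-zero gating boils down to a small aggregation rule over the per-shard status codes. The sketch below restates it as a standalone function; `aggregate_shard_statuses` is an illustrative name, not part of the change, and the `None` case corresponds to falling through to deletion on shard zero.

```rust
use http::StatusCode;

/// Illustrative aggregation of per-shard deletion statuses, mirroring the diff above:
/// any 409 means a deletion is already in progress on some shard, any non-404 means
/// deletion is still ongoing, and only "all 404" lets shard zero proceed.
fn aggregate_shard_statuses(statuses: &[StatusCode]) -> Option<StatusCode> {
    if statuses.iter().any(|s| *s == StatusCode::CONFLICT) {
        return Some(StatusCode::CONFLICT);
    }
    if statuses.iter().any(|s| *s != StatusCode::NOT_FOUND) {
        return Some(StatusCode::ACCEPTED);
    }
    None // every shard returned 404: safe to start deletion on shard zero
}
```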