pageserver/controller: enable tenant deletion without attachment (#7957)

## Problem As described in #7952, the controller's attempt to reconcile a tenant before finally deleting it can get hung up waiting for the compute notification hook to accept updates. The fact that we try and reconcile a tenant at all during deletion is part of a more general design issue (#5080), where deletion was implemented as an operation on attached tenant, requiring the tenant to be attached in order to delete it, which is not in principle necessary. Closes: #7952 ## Summary of changes - In the pageserver deletion API, only do the traditional deletion path if the tenant is attached. If it's secondary, then tear down the secondary location, and then do a remote delete. If it's not attached at all, just do the remote delete. - In the storage controller, instead of ensuring a tenant is attached before deletion, do a best-effort detach of the tenant, and then call into some arbitrary pageserver to issue a deletion of remote content. The pageserver retains its existing delete behavior when invoked on attached locations. We can remove this later when all users of the API are updated to either do a detach-before-delete. This will enable removing the "weird" code paths during startup that sometimes load a tenant and then immediately delete it, and removing the deletion markers on tenants.
2026-05-29 11:00:38 +00:00 · 2024-06-05 21:22:54 +01:00
parent 83ab14e271
commit 91dd99038e
7 changed files with 312 additions and 117 deletions
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -142,52 +142,6 @@ async fn handle_tenant_create(
    )
 }

-// For tenant and timeline deletions, which both implement an "initially return 202, then 404 once
-// we're done" semantic, we wrap with a retry loop to expose a simpler API upstream.  This avoids
-// needing to track a "deleting" state for tenants.
-async fn deletion_wrapper<R, F>(service: Arc<Service>, f: F) -> Result<Response<Body>, ApiError>
-where
-    R: std::future::Future<Output = Result<StatusCode, ApiError>> + Send + 'static,
-    F: Fn(Arc<Service>) -> R + Send + Sync + 'static,
-{
-    let started_at = Instant::now();
-    // To keep deletion reasonably snappy for small tenants, initially check after 1 second if deletion
-    // completed.
-    let mut retry_period = Duration::from_secs(1);
-    // On subsequent retries, wait longer.
-    let max_retry_period = Duration::from_secs(5);
-    // Enable callers with a 30 second request timeout to reliably get a response
-    let max_wait = Duration::from_secs(25);
-
-    loop {
-        let status = f(service.clone()).await?;
-        match status {
-            StatusCode::ACCEPTED => {
-                tracing::info!("Deletion accepted, waiting to try again...");
-                tokio::time::sleep(retry_period).await;
-                retry_period = max_retry_period;
-            }
-            StatusCode::NOT_FOUND => {
-                tracing::info!("Deletion complete");
-                return json_response(StatusCode::OK, ());
-            }
-            _ => {
-                tracing::warn!("Unexpected status {status}");
-                return json_response(status, ());
-            }
-        }
-
-        let now = Instant::now();
-        if now + retry_period > started_at + max_wait {
-            tracing::info!("Deletion timed out waiting for 404");
-            // REQUEST_TIMEOUT would be more appropriate, but CONFLICT is already part of
-            // the pageserver's swagger definition for this endpoint, and has the same desired
-            // effect of causing the control plane to retry later.
-            return json_response(StatusCode::CONFLICT, ());
-        }
-    }
-}
-
 async fn handle_tenant_location_config(
    service: Arc<Service>,
    mut req: Request<Body>,
@@ -283,13 +237,17 @@ async fn handle_tenant_delete(
    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
    check_permissions(&req, Scope::PageServerApi)?;

-    deletion_wrapper(service, move |service| async move {
-        service
-            .tenant_delete(tenant_id)
-            .await
-            .and_then(map_reqwest_hyper_status)
-    })
-    .await
+    let status_code = service
+        .tenant_delete(tenant_id)
+        .await
+        .and_then(map_reqwest_hyper_status)?;
+
+    if status_code == StatusCode::NOT_FOUND {
+        // The pageserver uses 404 for successful deletion, but we use 200
+        json_response(StatusCode::OK, ())
+    } else {
+        json_response(status_code, ())
+    }
 }

 async fn handle_tenant_timeline_create(
@@ -317,6 +275,51 @@ async fn handle_tenant_timeline_delete(

    let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;

+    // For timeline deletions, which both implement an "initially return 202, then 404 once
+    // we're done" semantic, we wrap with a retry loop to expose a simpler API upstream.
+    async fn deletion_wrapper<R, F>(service: Arc<Service>, f: F) -> Result<Response<Body>, ApiError>
+    where
+        R: std::future::Future<Output = Result<StatusCode, ApiError>> + Send + 'static,
+        F: Fn(Arc<Service>) -> R + Send + Sync + 'static,
+    {
+        let started_at = Instant::now();
+        // To keep deletion reasonably snappy for small tenants, initially check after 1 second if deletion
+        // completed.
+        let mut retry_period = Duration::from_secs(1);
+        // On subsequent retries, wait longer.
+        let max_retry_period = Duration::from_secs(5);
+        // Enable callers with a 30 second request timeout to reliably get a response
+        let max_wait = Duration::from_secs(25);
+
+        loop {
+            let status = f(service.clone()).await?;
+            match status {
+                StatusCode::ACCEPTED => {
+                    tracing::info!("Deletion accepted, waiting to try again...");
+                    tokio::time::sleep(retry_period).await;
+                    retry_period = max_retry_period;
+                }
+                StatusCode::NOT_FOUND => {
+                    tracing::info!("Deletion complete");
+                    return json_response(StatusCode::OK, ());
+                }
+                _ => {
+                    tracing::warn!("Unexpected status {status}");
+                    return json_response(status, ());
+                }
+            }
+
+            let now = Instant::now();
+            if now + retry_period > started_at + max_wait {
+                tracing::info!("Deletion timed out waiting for 404");
+                // REQUEST_TIMEOUT would be more appropriate, but CONFLICT is already part of
+                // the pageserver's swagger definition for this endpoint, and has the same desired
+                // effect of causing the control plane to retry later.
+                return json_response(StatusCode::CONFLICT, ());
+            }
+        }
+    }
+
    deletion_wrapper(service, move |service| async move {
        service
            .tenant_timeline_delete(tenant_id, timeline_id)