From 6a35a3e9f149798df1b1761ee64099d8d75fbe90 Mon Sep 17 00:00:00 2001 From: "John G. Crowley" <53502854+johngcrowley@users.noreply.github.com> Date: Wed, 25 Mar 2026 03:27:03 -0500 Subject: [PATCH] HCC, resolved GCS upload permit deadlock, SK generation delete bug-fix. (#12873) ## Problem **HCC SafeKeepers** * Currently, the `hcc_base_url` flag is set to `None`, disabling automatic timeline pull from other SafeKeepers on restart. We can manually call `pull_timeline` but would prefer to use the Hadron functionality. **GCS Semaphore Permit Deadlock on Upload** * GCS `upload` trait implementation's call of `put_object` is duplicating semaphore permit acquisition, creating a deadlock. Each `upload` acquires a permit, then calls `put_object`, which tries to acquire a second permit from the already-exhausted pool, times out, retries, and so on indefinitely. **Storage Controller delete API for SafeKeepers Bug** * Noticed this while doing a PITR and reusing an old Timeline ID (that had been previously deleted). * `DELETE` timeline endpoint in Storage Controller fails to delete the TL due to generation number mismatch between the [Pending Op](https://github.com/neondatabase/neon/blob/main/storage_controller/src/service/safekeeper_service.rs#L565) (gen = `i32::MAX`) and the [Schedule Request](https://github.com/neondatabase/neon/blob/main/storage_controller/src/service/safekeeper_service.rs#L582) (gen = SK.generation). The extant Pending Op [blocks the deletion](https://github.com/neondatabase/neon/blob/main/storage_controller/src/service/safekeeper_reconciler.rs#L462) of the database record for the TL on [condition that the request generation == the pending op generation](https://github.com/neondatabase/neon/blob/main/storage_controller/src/persistence.rs#L1844), which only happens when the Storage Controller is restarted, [where pending operations are reloaded](https://github.com/neondatabase/neon/blob/main/storage_controller/src/service/safekeeper_reconciler.rs#L162), and the request generation is set to `i32::MAX`. 
* If the same Timeline ID is used later after its `DELETE`, the old `start_lsn` value therefore remains in the Storage Controller database and will cause Compute's WalProposer to crashloop as it thinks it starts from the prior timeline's (of that same ID's) LSN. ## Summary of changes * Activate `hcc_base_url` (Hadron) argument for SafeKeeper binary to enable automatic timeline pull from other SafeKeepers on start. * Remove the nested permit acquisition and timeout wrapper from `put_object` in GCS client, as `put_object` is only called by the `upload` trait implementation. * Set the Pending Op generation number to SafeKeeper gen to allow timeline deletion from Storage Controller database without having to bounce a Storage Controller pod. --- libs/remote_storage/src/gcs_bucket.rs | 29 +++++++++++-------- safekeeper/src/bin/safekeeper.rs | 4 ++- .../src/service/safekeeper_service.rs | 2 +- 3 files changed, 21 insertions(+), 14 deletions(-) diff --git a/libs/remote_storage/src/gcs_bucket.rs b/libs/remote_storage/src/gcs_bucket.rs index a4f6db0b18..b21825ddb3 100644 --- a/libs/remote_storage/src/gcs_bucket.rs +++ b/libs/remote_storage/src/gcs_bucket.rs @@ -323,8 +323,14 @@ impl GCSBucket { cancel: &CancellationToken, metadata: Option, ) -> anyhow::Result<()> { + + // we removed the semaphore permit here which was duplicative of the outer upload() trait + // impl call. put_object only called by upload(), so safe to do. upload handles permit and + // timeout, we were getting deadlock with upload concurrency at 32, and 32 upload permits + // then trying to get a permit inside their put_object calls, timing out, retrying, + // ad-infinitum. + let kind = RequestKind::Put; - let _permit = self.permit(kind, cancel).await?; let started_at = start_measuring_requests(kind); let multipart_uri = format!( @@ -372,21 +378,21 @@ impl GCSBucket { .headers(headers) .send(); - let upload = tokio::time::timeout(self.timeout, upload); - let res = tokio::select! 
{ res = upload => res, _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()), }; - if let Ok(inner) = &res { - let started_at = ScopeGuard::into_inner(started_at); - crate::metrics::BUCKET_METRICS - .req_seconds - .observe_elapsed(kind, inner, started_at); - } + // not if let-ing an Ok(inner), since res is not double-Result<>-wrapped with the tokio + // timeout, observe_elapsed's AttemptedOutcome trait obj expects + // &Result which &res directly is, and it can handle the Err case. + let started_at = ScopeGuard::into_inner(started_at); + crate::metrics::BUCKET_METRICS + .req_seconds + .observe_elapsed(kind, &res, started_at); + match res { - Ok(Ok(res)) => { + Ok(res) => { if !res.status().is_success() { match res.status() { _ => Err(anyhow::anyhow!("GCS PUT error \n\t {:?}", res)), @@ -410,8 +416,7 @@ impl GCSBucket { Ok(()) } } - Ok(Err(reqw)) => Err(reqw.into()), - Err(_timeout) => Err(TimeoutOrCancel::Timeout.into()), + Err(reqw) => Err(reqw.into()), } } diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 2ec541b6f0..6632819e1f 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -263,6 +263,8 @@ struct Args { /* BEGIN_HADRON */ #[arg(long)] enable_pull_timeline_on_startup: bool, + #[arg(long)] + hcc_base_url: Option, /// How often to scan entire data-dir for total disk usage #[arg(long, value_parser=humantime::parse_duration, default_value = DEFAULT_GLOBAL_DISK_CHECK_INTERVAL)] global_disk_check_interval: Duration, @@ -459,7 +461,7 @@ async fn main() -> anyhow::Result<()> { /* BEGIN_HADRON */ advertise_pg_addr_tenant_only: None, enable_pull_timeline_on_startup: args.enable_pull_timeline_on_startup, - hcc_base_url: None, + hcc_base_url: args.hcc_base_url, global_disk_check_interval: args.global_disk_check_interval, max_global_disk_usage_ratio: args.max_global_disk_usage_ratio, /* END_HADRON */ diff --git a/storage_controller/src/service/safekeeper_service.rs 
b/storage_controller/src/service/safekeeper_service.rs index 689d341b6a..fc004429a6 100644 --- a/storage_controller/src/service/safekeeper_service.rs +++ b/storage_controller/src/service/safekeeper_service.rs @@ -585,7 +585,7 @@ impl Service { host_list: Vec::new(), tenant_id, timeline_id: Some(timeline_id), - generation: tl.generation as u32, + generation: i32::MAX as u32, kind: SafekeeperTimelineOpKind::Delete, }; locked.safekeeper_reconcilers.schedule_request(req);