diff --git a/libs/remote_storage/src/gcs_bucket.rs b/libs/remote_storage/src/gcs_bucket.rs index a4f6db0b18..b21825ddb3 100644 --- a/libs/remote_storage/src/gcs_bucket.rs +++ b/libs/remote_storage/src/gcs_bucket.rs @@ -323,8 +323,14 @@ impl GCSBucket { cancel: &CancellationToken, metadata: Option<StorageMetadata>, ) -> anyhow::Result<()> { + + // We removed the semaphore permit here; it duplicated the one taken by the outer upload() + // trait impl. put_object is only called by upload(), so this is safe: upload() handles both + // the permit and the timeout. With upload concurrency at 32, all 32 upload permits were held + // while each call then tried to acquire a second permit inside put_object, timing out and + // retrying ad infinitum — a deadlock. + let kind = RequestKind::Put; - let _permit = self.permit(kind, cancel).await?; let started_at = start_measuring_requests(kind); let multipart_uri = format!( @@ -372,21 +378,21 @@ impl GCSBucket { .headers(headers) .send(); - let upload = tokio::time::timeout(self.timeout, upload); - let res = tokio::select! { res = upload => res, _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()), }; - if let Ok(inner) = &res { - let started_at = ScopeGuard::into_inner(started_at); - crate::metrics::BUCKET_METRICS - .req_seconds - .observe_elapsed(kind, inner, started_at); - } + // Not `if let`-ing an Ok(inner): without the tokio timeout wrapper, res is no longer a + // double-Result<>-wrapped value. observe_elapsed's AttemptedOutcome trait obj expects a + // &Result<T, E>, which &res directly is, and it can handle the Err case. 
+ let started_at = ScopeGuard::into_inner(started_at); + crate::metrics::BUCKET_METRICS + .req_seconds + .observe_elapsed(kind, &res, started_at); + match res { - Ok(Ok(res)) => { + Ok(res) => { if !res.status().is_success() { match res.status() { _ => Err(anyhow::anyhow!("GCS PUT error \n\t {:?}", res)), @@ -410,8 +416,7 @@ impl GCSBucket { Ok(()) } } - Ok(Err(reqw)) => Err(reqw.into()), - Err(_timeout) => Err(TimeoutOrCancel::Timeout.into()), + Err(reqw) => Err(reqw.into()), } } diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 2ec541b6f0..6632819e1f 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -263,6 +263,8 @@ struct Args { /* BEGIN_HADRON */ #[arg(long)] enable_pull_timeline_on_startup: bool, + #[arg(long)] + hcc_base_url: Option<String>, /// How often to scan entire data-dir for total disk usage #[arg(long, value_parser=humantime::parse_duration, default_value = DEFAULT_GLOBAL_DISK_CHECK_INTERVAL)] global_disk_check_interval: Duration, @@ -459,7 +461,7 @@ async fn main() -> anyhow::Result<()> { /* BEGIN_HADRON */ advertise_pg_addr_tenant_only: None, enable_pull_timeline_on_startup: args.enable_pull_timeline_on_startup, - hcc_base_url: None, + hcc_base_url: args.hcc_base_url, global_disk_check_interval: args.global_disk_check_interval, max_global_disk_usage_ratio: args.max_global_disk_usage_ratio, /* END_HADRON */ diff --git a/storage_controller/src/service/safekeeper_service.rs b/storage_controller/src/service/safekeeper_service.rs index 689d341b6a..fc004429a6 100644 --- a/storage_controller/src/service/safekeeper_service.rs +++ b/storage_controller/src/service/safekeeper_service.rs @@ -585,7 +585,7 @@ impl Service { host_list: Vec::new(), tenant_id, timeline_id: Some(timeline_id), - generation: tl.generation as u32, + generation: i32::MAX as u32, kind: SafekeeperTimelineOpKind::Delete, }; locked.safekeeper_reconcilers.schedule_request(req);