# HCC, resolved GCS upload permit deadlock, SK generation delete bug-fix. (#12873)
## Problem

**HCC SafeKeepers**

* Currently, the `hcc_base_url` flag is hard-coded to `None`, which disables automatic timeline pull from other SafeKeepers on restart. We can call `pull_timeline` manually, but we would prefer to use the Hadron functionality.

**GCS Semaphore Permit Deadlock on Upload**

* The GCS `upload` trait implementation's call to `put_object` duplicates semaphore permit acquisition, creating a deadlock (made concrete in the sketch below): each `upload` acquires a permit and then calls `put_object`, which tries to acquire a second one. Once every permit is held by an `upload`, there is nothing left to acquire, so the inner call times out, retries, times out again, and so on.

**Storage Controller delete API for SafeKeepers Bug**

* Noticed this while doing a PITR that reused an old Timeline ID (one that had previously been deleted).
* The `DELETE` timeline endpoint in the Storage Controller fails to delete the timeline because of a generation-number mismatch between the [Pending Op](https://github.com/neondatabase/neon/blob/main/storage_controller/src/service/safekeeper_service.rs#L565) (gen = `i32::MAX`) and the [Schedule Request](https://github.com/neondatabase/neon/blob/main/storage_controller/src/service/safekeeper_service.rs#L582) (gen = SafeKeeper generation). The extant Pending Op [blocks the deletion](https://github.com/neondatabase/neon/blob/main/storage_controller/src/service/safekeeper_reconciler.rs#L462) of the timeline's database record on the [condition that the request generation equals the pending op generation](https://github.com/neondatabase/neon/blob/main/storage_controller/src/persistence.rs#L1844). That condition is only met after the Storage Controller restarts, [when pending operations are reloaded](https://github.com/neondatabase/neon/blob/main/storage_controller/src/service/safekeeper_reconciler.rs#L162) and the request generation is set to `i32::MAX`.
* If the same Timeline ID is reused after its `DELETE`, the old `start_lsn` therefore remains in the Storage Controller database, and Compute's WalProposer crashloops because it believes it starts from the prior timeline's LSN.

## Summary of changes

* Activate the `hcc_base_url` (Hadron) argument for the SafeKeeper binary to enable automatic timeline pull from other SafeKeepers on start.
* Remove the nested permit acquisition and timeout wrapper from `put_object` in the GCS client; `put_object` is only called by the `upload` trait implementation, which already handles both.
* Set the Schedule Request's generation to the Pending Op's generation (`i32::MAX`) so the timeline can be deleted from the Storage Controller database without having to bounce a Storage Controller pod.
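To make the permit deadlock concrete, here is a minimal, self-contained sketch of the failure mode. It is illustrative only: the `upload`/`put_object` names are borrowed from this PR, but the tokio `Semaphore`, the 100 ms timeout, and the rest of the scaffold are assumptions, not the Neon code.

```rust
use std::sync::Arc;
use tokio::sync::Semaphore;
use tokio::time::{Duration, timeout};

// Inner call: before the fix, put_object tried to take its own permit even
// though its caller already held one.
async fn put_object(sem: Arc<Semaphore>) {
    match timeout(Duration::from_millis(100), sem.acquire()).await {
        Ok(Ok(_permit)) => println!("inner permit acquired"),
        Ok(Err(_closed)) => println!("semaphore closed"),
        // Under saturation every inner acquire lands here; a retry loop
        // around put_object would simply time out again, forever.
        Err(_elapsed) => println!("inner acquire timed out"),
    }
}

// Outer call: holds its permit for the entire duration of put_object.
async fn upload(sem: Arc<Semaphore>) {
    let _permit = sem.acquire().await.unwrap();
    put_object(sem.clone()).await;
}

#[tokio::main]
async fn main() {
    // 32 permits and 32 concurrent uploads: all permits are held by the
    // outer calls, so no inner acquire can ever succeed.
    let sem = Arc::new(Semaphore::new(32));
    let tasks: Vec<_> = (0..32).map(|_| tokio::spawn(upload(sem.clone()))).collect();
    for t in tasks {
        t.await.unwrap();
    }
}
```

With the fix, only `upload` acquires a permit, so the semaphore bounds concurrency without ever blocking on itself.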
GCS client (`impl GCSBucket`), removing the nested permit and timeout from `put_object`:

```diff
@@ -323,8 +323,14 @@ impl GCSBucket {
         cancel: &CancellationToken,
         metadata: Option<StorageMetadata>,
     ) -> anyhow::Result<()> {
+
+        // we removed the semaphore permit here which was duplicative of the outer upload() trait
+        // impl call. put_object only called by upload(), so safe to do. upload handles permit and
+        // timeout, we were getting deadlock with upload concurrency at 32, and 32 upload permits
+        // then trying to get a permit inside their put_object calls, timing out, retrying,
+        // ad-infinitum.
+
         let kind = RequestKind::Put;
-        let _permit = self.permit(kind, cancel).await?;
         let started_at = start_measuring_requests(kind);

         let multipart_uri = format!(
@@ -372,21 +378,21 @@ impl GCSBucket {
             .headers(headers)
             .send();

-        let upload = tokio::time::timeout(self.timeout, upload);
-
         let res = tokio::select! {
             res = upload => res,
             _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
         };

-        if let Ok(inner) = &res {
-            let started_at = ScopeGuard::into_inner(started_at);
-            crate::metrics::BUCKET_METRICS
-                .req_seconds
-                .observe_elapsed(kind, inner, started_at);
-        }
+        // not if let-ing an Ok(inner), since res is not double-Result<>-wrapped with the tokio
+        // timeout, observe_elapsed's AttemptedOutcome trait obj expects
+        // &Result<reqwest::Response> which &res directly is, and it can handle the Err case.
+        let started_at = ScopeGuard::into_inner(started_at);
+        crate::metrics::BUCKET_METRICS
+            .req_seconds
+            .observe_elapsed(kind, &res, started_at);

         match res {
-            Ok(Ok(res)) => {
+            Ok(res) => {
                 if !res.status().is_success() {
                     match res.status() {
                         _ => Err(anyhow::anyhow!("GCS PUT error \n\t {:?}", res)),
@@ -410,8 +416,7 @@ impl GCSBucket {
                     Ok(())
                 }
             }
-            Ok(Err(reqw)) => Err(reqw.into()),
-            Err(_timeout) => Err(TimeoutOrCancel::Timeout.into()),
+            Err(reqw) => Err(reqw.into()),
         }
     }
```
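The flattened match arms above follow from the `Result` shape: `tokio::time::timeout` wraps the future's output in a second `Result`, which is what the removed `Ok(Ok(res))` / `Ok(Err(reqw))` / `Err(_timeout)` arms were unwrapping. A standalone sketch with assumed toy names, not the Neon code:

```rust
use tokio::time::{Duration, timeout};

async fn fallible() -> Result<u32, std::io::Error> {
    Ok(42)
}

#[tokio::main]
async fn main() {
    // With the wrapper the output is Result<Result<u32, io::Error>, Elapsed>,
    // hence the old Ok(Ok(res)) / Ok(Err(reqw)) / Err(_timeout) arms.
    match timeout(Duration::from_secs(1), fallible()).await {
        Ok(Ok(v)) => println!("value: {v}"),
        Ok(Err(e)) => println!("request error: {e}"),
        Err(_elapsed) => println!("timed out"),
    }

    // Without it the output is a plain Result<u32, io::Error>,
    // hence the new Ok(res) / Err(reqw) arms.
    match fallible().await {
        Ok(v) => println!("value: {v}"),
        Err(e) => println!("request error: {e}"),
    }
}
```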
Safekeeper binary (`struct Args` and `main`), adding the flag and wiring it through:

```diff
@@ -263,6 +263,8 @@ struct Args {
     /* BEGIN_HADRON */
     #[arg(long)]
     enable_pull_timeline_on_startup: bool,
+    #[arg(long)]
+    hcc_base_url: Option<url::Url>,
     /// How often to scan entire data-dir for total disk usage
     #[arg(long, value_parser=humantime::parse_duration, default_value = DEFAULT_GLOBAL_DISK_CHECK_INTERVAL)]
     global_disk_check_interval: Duration,
@@ -459,7 +461,7 @@ async fn main() -> anyhow::Result<()> {
         /* BEGIN_HADRON */
         advertise_pg_addr_tenant_only: None,
         enable_pull_timeline_on_startup: args.enable_pull_timeline_on_startup,
-        hcc_base_url: None,
+        hcc_base_url: args.hcc_base_url,
         global_disk_check_interval: args.global_disk_check_interval,
         max_global_disk_usage_ratio: args.max_global_disk_usage_ratio,
         /* END_HADRON */
```
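For reference, a minimal sketch of how such an optional URL flag parses with clap's derive API; the field mirrors the diff, while the rest of the program (and the example invocation in the comment) is an assumed scaffold:

```rust
use clap::Parser;

#[derive(Parser, Debug)]
struct Args {
    /// Base URL of the Hadron Cluster Coordinator; when set, the safekeeper
    /// can pull timelines from its peers on startup instead of requiring a
    /// manual pull_timeline call.
    #[arg(long)]
    hcc_base_url: Option<url::Url>,
}

fn main() {
    // e.g. `safekeeper --hcc-base-url http://hcc.internal:8080/`
    let args = Args::parse();
    println!("hcc_base_url = {:?}", args.hcc_base_url);
}
```

Leaving the flag unset keeps `hcc_base_url` as `None`, preserving the previous behavior.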
Storage Controller (`impl Service` in `safekeeper_service.rs`), making the schedule request carry the Pending Op's generation:

```diff
@@ -585,7 +585,7 @@ impl Service {
             host_list: Vec::new(),
             tenant_id,
             timeline_id: Some(timeline_id),
-            generation: tl.generation as u32,
+            generation: i32::MAX as u32,
             kind: SafekeeperTimelineOpKind::Delete,
         };
         locked.safekeeper_reconcilers.schedule_request(req);
```
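A toy model of the generation guard that was blocking deletion; the names here are assumed for illustration, and the real check lives in the `storage_controller` persistence code linked above:

```rust
// Pending op row inserted by the DELETE endpoint (generation = i32::MAX).
struct PendingOp {
    generation: u32,
}

// The timeline's database record may only be deleted when the request
// generation matches the pending op's generation.
fn may_delete(pending: &PendingOp, request_generation: u32) -> bool {
    request_generation == pending.generation
}

fn main() {
    let pending = PendingOp { generation: i32::MAX as u32 };

    // Before the fix: the schedule request carried the safekeeper's
    // generation, the check failed, and the stale start_lsn survived.
    assert!(!may_delete(&pending, 3));

    // After the fix: the request uses i32::MAX as well, so the delete goes
    // through without restarting the Storage Controller.
    assert!(may_delete(&pending, i32::MAX as u32));
}
```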