pageserver: improve handling of archival_config calls during Timeline shutdown (#9415)

## Problem

In test `test_timeline_offloading`, we see failures like:
```
PageserverApiException: queue is in state Stopped
```

Example failure:
https://neon-github-public-dev.s3.amazonaws.com/reports/main/11356917668/index.html#testresult/ff0e348a78a974ee/retries

## Summary of changes

- Amend code paths that handle errors from RemoteTimelineClient to check
for cancellation and emit the Cancelled error variant in these cases
(will give clients a 503 to retry)
- Remove the implicit `#[from]` for the Other error case, to make it
harder to add code that accidentally squashes errors into this
(500-equivalent) error variant.

This would be neater if we made RemoteTimelineClient return a structured
error instead of anyhow::Error, but that's a bigger refactor.

I'm not sure if the test really intends to hit this path, but the error
handling fix makes sense either way.
This commit is contained in:
John Spray
2024-10-16 13:39:58 +01:00
committed by GitHub
parent bc6b8cee01
commit 89a65a9e5a

View File

@@ -67,7 +67,7 @@ use self::metadata::TimelineMetadata;
use self::mgr::GetActiveTenantError;
use self::mgr::GetTenantError;
use self::remote_timeline_client::upload::upload_index_part;
use self::remote_timeline_client::RemoteTimelineClient;
use self::remote_timeline_client::{RemoteTimelineClient, WaitCompletionError};
use self::timeline::uninit::TimelineCreateGuard;
use self::timeline::uninit::TimelineExclusionError;
use self::timeline::uninit::UninitializedTimeline;
@@ -632,7 +632,7 @@ pub enum TimelineArchivalError {
AlreadyInProgress,
#[error(transparent)]
Other(#[from] anyhow::Error),
Other(anyhow::Error),
}
impl Debug for TimelineArchivalError {
@@ -1602,7 +1602,8 @@ impl Tenant {
"failed to load remote timeline {} for tenant {}",
timeline_id, self.tenant_shard_id
)
})?;
})
.map_err(TimelineArchivalError::Other)?;
let timelines = self.timelines.lock().unwrap();
if let Some(timeline) = timelines.get(&timeline_id) {
let mut offloaded_timelines = self.timelines_offloaded.lock().unwrap();
@@ -1672,9 +1673,19 @@ impl Tenant {
};
// Third part: upload new timeline archival state and block until it is present in S3
let upload_needed = timeline
let upload_needed = match timeline
.remote_client
.schedule_index_upload_for_timeline_archival_state(new_state)?;
.schedule_index_upload_for_timeline_archival_state(new_state)
{
Ok(upload_needed) => upload_needed,
Err(e) => {
if timeline.cancel.is_cancelled() {
return Err(TimelineArchivalError::Cancelled);
} else {
return Err(TimelineArchivalError::Other(e));
}
}
};
if upload_needed {
info!("Uploading new state");
@@ -1685,7 +1696,14 @@ impl Tenant {
tracing::warn!("reached timeout for waiting on upload queue");
return Err(TimelineArchivalError::Timeout);
};
v.map_err(|e| TimelineArchivalError::Other(anyhow::anyhow!(e)))?;
v.map_err(|e| match e {
WaitCompletionError::NotInitialized(e) => {
TimelineArchivalError::Other(anyhow::anyhow!(e))
}
WaitCompletionError::UploadQueueShutDownOrStopped => {
TimelineArchivalError::Cancelled
}
})?;
}
Ok(())
}