fix(pageserver): drain upload queue before offloading timeline (#9682)

It is possible that, at the point we shut down the timeline, there are
still layer files that we have not uploaded yet.

## Summary of changes

* If the upload queue is not empty, do not offload the timeline.
* Shut down the timeline gracefully using the flush mode to
ensure all local files are uploaded before deleting the timeline
directory.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
This commit is contained in:
Alex Chi Z.
2024-11-08 14:28:55 -05:00
committed by GitHub
parent ab47804d00
commit af8238ae52
4 changed files with 12 additions and 7 deletions

View File

@@ -2002,9 +2002,9 @@ async fn timeline_offload_handler(
"timeline has attached children".into(),
));
}
if !timeline.can_offload() {
if let (false, reason) = timeline.can_offload() {
return Err(ApiError::PreconditionFailed(
"Timeline::can_offload() returned false".into(),
format!("Timeline::can_offload() check failed: {}", reason) .into(),
));
}
offload_timeline(&tenant, &timeline)

View File

@@ -2493,7 +2493,8 @@ impl Tenant {
timelines_to_compact_or_offload = timelines
.iter()
.filter_map(|(timeline_id, timeline)| {
let (is_active, can_offload) = (timeline.is_active(), timeline.can_offload());
let (is_active, (can_offload, _)) =
(timeline.is_active(), timeline.can_offload());
let has_no_unoffloaded_children = {
!timelines
.iter()

View File

@@ -1570,12 +1570,16 @@ impl Timeline {
///
/// This is necessary but not sufficient for offloading of the timeline as it might have
/// child timelines that are not offloaded yet.
pub(crate) fn can_offload(&self) -> bool {
pub(crate) fn can_offload(&self) -> (bool, &'static str) {
if self.remote_client.is_archived() != Some(true) {
return false;
return (false, "the timeline is not archived");
}
if !self.remote_client.no_pending_work() {
// if the remote client is still processing some work, we can't offload
return (false, "the upload queue is not drained yet");
}
true
(true, "ok")
}
/// Outermost timeline compaction operation; downloads needed layers. Returns whether we have pending

View File

@@ -58,7 +58,7 @@ pub(crate) async fn offload_timeline(
}
// Now that the Timeline is in Stopping state, request all the related tasks to shut down.
timeline.shutdown(super::ShutdownMode::Hard).await;
timeline.shutdown(super::ShutdownMode::Flush).await;
// TODO extend guard mechanism above with method
// to make deletions possible while offloading is in progress