mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-20 14:40:37 +00:00
fix: on-demand downloads can outlive timeline shutdown (#7051)
## Problem Before this PR, it was possible that on-demand downloads were started after `Timeline::shutdown()`. For example, we have observed a walreceiver-connection-handler-initiated on-demand download that was started after `Timeline::shutdown()`s final `task_mgr::shutdown_tasks()` call. The underlying issue is that `task_mgr::shutdown_tasks()` isn't sticky, i.e., new tasks can be spawned during or after `task_mgr::shutdown_tasks()`. Cc: https://github.com/neondatabase/neon/issues/4175 in lieu of a more specific issue for task_mgr. We already decided we want to get rid of it anyways. Original investigation: https://neondb.slack.com/archives/C033RQ5SPDH/p1709824952465949 ## Changes - enter gate while downloading - use timeline cancellation token for cancelling download thereby, fixes #7054 Entering the gate might also remove recent "kept the gate from closing" in staging.
This commit is contained in:
@@ -272,9 +272,6 @@ pub enum TaskKind {
|
||||
// Task that uploads a file to remote storage
|
||||
RemoteUploadTask,
|
||||
|
||||
// Task that downloads a file from remote storage
|
||||
RemoteDownloadTask,
|
||||
|
||||
// task that handles the initial downloading of all tenants
|
||||
InitialLoad,
|
||||
|
||||
|
||||
@@ -880,23 +880,18 @@ impl LayerInner {
|
||||
) -> Result<heavier_once_cell::InitPermit, DownloadError> {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
let task_name = format!("download layer {}", self);
|
||||
|
||||
let (tx, rx) = tokio::sync::oneshot::channel();
|
||||
|
||||
// this is sadly needed because of task_mgr::shutdown_tasks, otherwise we cannot
|
||||
// block tenant::mgr::remove_tenant_from_memory.
|
||||
|
||||
let this: Arc<Self> = self.clone();
|
||||
|
||||
crate::task_mgr::spawn(
|
||||
&tokio::runtime::Handle::current(),
|
||||
crate::task_mgr::TaskKind::RemoteDownloadTask,
|
||||
Some(self.desc.tenant_shard_id),
|
||||
Some(self.desc.timeline_id),
|
||||
&task_name,
|
||||
false,
|
||||
async move {
|
||||
let guard = timeline
|
||||
.gate
|
||||
.enter()
|
||||
.map_err(|_| DownloadError::DownloadCancelled)?;
|
||||
|
||||
tokio::task::spawn(async move {
|
||||
|
||||
let _guard = guard;
|
||||
|
||||
let client = timeline
|
||||
.remote_client
|
||||
@@ -906,7 +901,7 @@ impl LayerInner {
|
||||
let result = client.download_layer_file(
|
||||
&this.desc.filename(),
|
||||
&this.metadata(),
|
||||
&crate::task_mgr::shutdown_token()
|
||||
&timeline.cancel
|
||||
)
|
||||
.await;
|
||||
|
||||
@@ -929,7 +924,6 @@ impl LayerInner {
|
||||
|
||||
tokio::select! {
|
||||
_ = tokio::time::sleep(backoff) => {},
|
||||
_ = crate::task_mgr::shutdown_token().cancelled_owned() => {},
|
||||
_ = timeline.cancel.cancelled() => {},
|
||||
};
|
||||
|
||||
@@ -959,11 +953,10 @@ impl LayerInner {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
.in_current_span(),
|
||||
);
|
||||
|
||||
match rx.await {
|
||||
Ok((Ok(()), permit)) => {
|
||||
if let Some(reason) = self
|
||||
|
||||
Reference in New Issue
Block a user