fix: allow dropping from UploadQueue by spawn_blocking

This commit is contained in:
Joonas Koivunen
2023-08-25 16:15:49 +03:00
parent bdfc895642
commit effc151244

View File

@@ -373,49 +373,58 @@ impl Drop for LayerInner {
// SEMITODO: this could be spawn_blocking or not; we are only doing the filesystem delete
// right now, later this will be a submit to the global deletion queue.
let span = tracing::info_span!(parent: None, "layer_drop", tenant_id = %self.layer_desc().tenant_id, timeline_id = %self.layer_desc().timeline_id, layer = %self);
let _g = span.entered();
let span = tracing::info_span!(parent: None, "layer_gc", tenant_id = %self.layer_desc().tenant_id, timeline_id = %self.layer_desc().timeline_id, layer = %self);
let mut removed = false;
match std::fs::remove_file(&self.path) {
Ok(()) => {
removed = true;
}
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
// until we no longer do detaches by removing all local files before removing the
// tenant from the global map, we will always get these errors even if we knew what
// is the latest state.
//
// we currently do not track the latest state, so we'll also end up here on evicted
// layers.
}
Err(e) => {
tracing::error!(layer = %self, "failed to remove garbage collected layer: {e}");
}
}
let path = std::mem::take(&mut self.path);
let file_name = self.layer_desc().filename();
let file_size = self.layer_desc().file_size;
let timeline = self.timeline.clone();
if let Some(timeline) = self.timeline.upgrade() {
if removed {
timeline
.metrics
.resident_physical_size_gauge
.sub(self.layer_desc().file_size);
}
if let Some(remote_client) = timeline.remote_client.as_ref() {
let res =
remote_client.schedule_layer_file_deletion(&[self.layer_desc().filename()]);
crate::task_mgr::BACKGROUND_RUNTIME.spawn_blocking(move || {
let _g = span.entered();
if let Err(e) = res {
if !timeline.is_active() {
// downgrade the warning to info maybe?
}
tracing::warn!(layer=%self, "scheduling deletion on drop failed: {e:#}");
let mut removed = false;
match std::fs::remove_file(path) {
Ok(()) => {
removed = true;
}
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
// until we no longer do detaches by removing all local files before removing the
// tenant from the global map, we will always get these errors even if we knew what
// is the latest state.
//
// we currently do not track the latest state, so we'll also end up here on evicted
// layers.
}
Err(e) => {
tracing::error!("failed to remove garbage collected layer: {e}");
}
}
} else {
// no need to nag that timeline is gone: under normal situation on
// task_mgr::remove_tenant_from_memory the timeline is gone before we get dropped.
}
if let Some(timeline) = timeline.upgrade() {
if removed {
timeline.metrics.resident_physical_size_gauge.sub(file_size);
}
if let Some(remote_client) = timeline.remote_client.as_ref() {
let res = remote_client.schedule_layer_file_deletion(&[file_name]);
if let Err(e) = res {
// test_timeline_deletion_with_files_stuck_in_upload_queue is good at
// demonstrating this deadlock (without spawn_blocking): stop will drop
// queued items, which will have ResidentLayer's, and those drops would try
// to re-entrantly lock the RemoteTimelineClient inner state.
if !timeline.is_active() {
tracing::info!("scheduling deletion on drop failed: {e:#}");
} else {
tracing::warn!("scheduling deletion on drop failed: {e:#}");
}
}
}
} else {
// no need to nag that timeline is gone: under normal situation on
// task_mgr::remove_tenant_from_memory the timeline is gone before we get dropped.
}
});
}
}