Compare commits

...

2 Commits

Author SHA1 Message Date
Christian Schwarz
01898af391 clippy 2023-11-24 09:15:40 +00:00
Christian Schwarz
85f0867db8 failure during timeline creation: always clean up timeline dir
Context: https://app.incident.io/neondb/incidents/103

Before this change, there's the following race condition if the tenant
gets deleted during timeline creation:

TBD, copy from https://neondb.slack.com/archives/C066ZFAJU85/p1700751858049319

The root of the problem is that we bail out of timeline creation without
cleaning up the uninit marker or timeline directory.

This PR changes the code to do that, and with our new policy for
infallible local IO, it also takes the opportunity to clean stuff up
around `TimelineUninitMark` & `UninitializedTimeline`.

I also added a missing fsync of the common parent directory between
timelines_dir removal and uninit marker removal.

We could probably get rid of the entire uninit mark idea, as
we no longer treat local FS state as the source of truth, and we only
upload to remote storage after successful creation (right?).
2023-11-24 09:12:17 +00:00
2 changed files with 123 additions and 79 deletions

View File

@@ -86,7 +86,6 @@ use crate::tenant::storage_layer::ImageLayer;
use crate::InitializationOrder;
use crate::tenant::timeline::delete::DeleteTimelineFlow;
use crate::tenant::timeline::uninit::cleanup_timeline_directory;
use crate::virtual_file::VirtualFile;
use crate::walredo::PostgresRedoManager;
use crate::TEMP_FILE_SUFFIX;
@@ -3091,7 +3090,7 @@ impl Tenant {
.await
{
error!("Failed to create initial files for timeline {tenant_id}/{new_timeline_id}, cleaning up: {e:?}");
cleanup_timeline_directory(uninit_mark);
drop(uninit_mark); // does the cleanup
return Err(e);
}
@@ -3143,20 +3142,8 @@ impl Tenant {
"Timeline {timeline_path} already exists, cannot create its uninit mark file",
);
let uninit_mark_path = self
.conf
.timeline_uninit_mark_file_path(tenant_id, timeline_id);
fs::File::create(&uninit_mark_path)
.context("Failed to create uninit mark file")
.and_then(|_| {
crashsafe::fsync_file_and_parent(&uninit_mark_path)
.context("Failed to fsync uninit mark file")
})
.with_context(|| {
format!("Failed to crate uninit mark for timeline {tenant_id}/{timeline_id}")
})?;
let uninit_mark = TimelineUninitMark::new(uninit_mark_path, timeline_path);
let uninit_mark = TimelineUninitMark::new(self.conf, tenant_id, timeline_id)
.context("create uninit mark")?;
Ok(uninit_mark)
}

View File

@@ -2,10 +2,20 @@ use std::{collections::hash_map::Entry, fs, sync::Arc};
use anyhow::Context;
use camino::Utf8PathBuf;
use tracing::{error, info, info_span, warn};
use utils::{crashsafe, fs_ext, id::TimelineId, lsn::Lsn};
use tracing::{info, info_span, warn};
use utils::{
crashsafe,
id::{TenantId, TimelineId},
lsn::Lsn,
};
use crate::{context::RequestContext, import_datadir, tenant::Tenant};
use crate::{
config::PageServerConf,
context::RequestContext,
import_datadir,
tenant::Tenant,
virtual_file::{on_fatal_io_error, MaybeFatalIo},
};
use super::Timeline;
@@ -62,13 +72,8 @@ impl<'t> UninitializedTimeline<'t> {
"Found freshly initialized timeline {tenant_id}/{timeline_id} in the tenant map"
),
Entry::Vacant(v) => {
uninit_mark.remove_uninit_mark().with_context(|| {
format!(
"Failed to remove uninit mark file for timeline {tenant_id}/{timeline_id}"
)
})?;
uninit_mark.remove_uninit_mark();
v.insert(Arc::clone(&new_timeline));
new_timeline.maybe_spawn_flush_loop();
}
}
@@ -126,29 +131,6 @@ impl<'t> UninitializedTimeline<'t> {
}
}
impl Drop for UninitializedTimeline<'_> {
fn drop(&mut self) {
if let Some((_, uninit_mark)) = self.raw_timeline.take() {
let _entered = info_span!("drop_uninitialized_timeline", tenant_id = %self.owning_tenant.tenant_id, timeline_id = %self.timeline_id).entered();
error!("Timeline got dropped without initializing, cleaning its files");
cleanup_timeline_directory(uninit_mark);
}
}
}
pub(crate) fn cleanup_timeline_directory(uninit_mark: TimelineUninitMark) {
let timeline_path = &uninit_mark.timeline_path;
match fs_ext::ignore_absent_files(|| fs::remove_dir_all(timeline_path)) {
Ok(()) => {
info!("Timeline dir {timeline_path:?} removed successfully, removing the uninit mark")
}
Err(e) => {
error!("Failed to clean up uninitialized timeline directory {timeline_path:?}: {e:?}")
}
}
drop(uninit_mark); // mark handles its deletion on drop, gets retained if timeline dir exists
}
/// An uninit mark file, created along the timeline dir to ensure the timeline either gets fully initialized and loaded into pageserver's memory,
/// or gets removed eventually.
///
@@ -158,58 +140,133 @@ pub(crate) struct TimelineUninitMark {
uninit_mark_deleted: bool,
uninit_mark_path: Utf8PathBuf,
pub(crate) timeline_path: Utf8PathBuf,
common_parent: Utf8PathBuf,
}
impl TimelineUninitMark {
pub(crate) fn new(uninit_mark_path: Utf8PathBuf, timeline_path: Utf8PathBuf) -> Self {
Self {
pub(crate) fn new(
conf: &'static PageServerConf,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> anyhow::Result<Self> {
let timeline_path = conf.timeline_path(&tenant_id, &timeline_id);
let uninit_mark_path = conf.timeline_uninit_mark_file_path(tenant_id, timeline_id);
// assert they share the same parent
let timeline_parent_path = timeline_path
.parent()
.expect("timeline_path must have a parent");
let uninit_mark_parent_path = uninit_mark_path
.parent()
.expect("uninit mark path must have a parent");
assert_eq!(timeline_parent_path, uninit_mark_parent_path);
let common_parent = uninit_mark_parent_path;
// crate the uninit file
let _ = fs::OpenOptions::new()
.create_new(true)
.write(true)
.open(&uninit_mark_path)
.context("create uninit mark file")?;
crashsafe::fsync_file_and_parent(common_parent).context("fsync uninit mark file")?;
Ok(Self {
uninit_mark_deleted: false,
common_parent: common_parent.to_owned(),
uninit_mark_path,
timeline_path,
}
})
}
fn remove_uninit_mark(mut self) -> anyhow::Result<()> {
if !self.uninit_mark_deleted {
self.delete_mark_file_if_present()?;
}
fn remove_uninit_mark(mut self) {
// remove the uninit mark
fs::remove_file(&self.uninit_mark_path).fatal_err(&format!(
"TimelineUninitMark::drop: remove_file uninit mark: {}",
self.uninit_mark_path
));
Ok(())
}
// fsync to persist the removal
crashsafe::fsync(&self.common_parent).fatal_err(&format!(
"TimelineUninitMark::drop: fsync common parent dir: {}",
self.common_parent
));
fn delete_mark_file_if_present(&mut self) -> anyhow::Result<()> {
let uninit_mark_file = &self.uninit_mark_path;
let uninit_mark_parent = uninit_mark_file
.parent()
.with_context(|| format!("Uninit mark file {uninit_mark_file:?} has no parent"))?;
fs_ext::ignore_absent_files(|| fs::remove_file(uninit_mark_file)).with_context(|| {
format!("Failed to remove uninit mark file at path {uninit_mark_file:?}")
})?;
crashsafe::fsync(uninit_mark_parent).context("Failed to fsync uninit mark parent")?;
self.uninit_mark_deleted = true;
Ok(())
}
}
impl Drop for TimelineUninitMark {
fn drop(&mut self) {
if !self.uninit_mark_deleted {
if self.timeline_path.exists() {
error!(
"Uninit mark {} is not removed, timeline {} stays uninitialized",
self.uninit_mark_path, self.timeline_path
)
} else {
// unblock later timeline creation attempts
warn!(
"Removing intermediate uninit mark file {}",
// unblock later timeline creation attempts
let _entered =
info_span!("TimelineUninitMark_drop", timeline_path=%self.timeline_path).entered();
warn!("removing timeline dir and uninit mark file");
// sanity-check: ensure the uninit mark file still exists on disk
let uninit_mark_file_exists = self.uninit_mark_path.try_exists().fatal_err(&format!(
"TimelineUninitMark::drop: stat() uninit mark file: {}",
self.uninit_mark_path
));
if !uninit_mark_file_exists {
panic!(
"uninit mark file assumed to exists but doesn't: {}",
self.uninit_mark_path
);
if let Err(e) = self.delete_mark_file_if_present() {
error!("Failed to remove the uninit mark file: {e}")
}
// recursively delete `timeline_path`, ignoring NotFound errors and aborting the process on all others.
match fs::remove_dir_all(&self.timeline_path) {
Ok(()) => {
info!("timeline dir removed successfully");
}
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
// this can happen both if the timeline_path does not exist
// and if the timeline_path exists and there's another thread
// still operating on that directory and our remove_dir_all call
// effectively got hit by time-of-check vs time-of-use.
// Disambiguate by calling remove_dir against the timeline_path
match std::fs::remove_dir(&self.timeline_path) {
Ok(()) => {
warn!("retrying timeline dir removal succeeded after NotFound, this is indicative of a race condition");
}
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
// this is the good case: the first NotFound was because the dir didn't exist
info!("timeline dir does not exist");
}
Err(e) => {
on_fatal_io_error(&e, &format!("TimelineUninitMark::drop: remove_dir_all failed with NotFound, then remove_dir failed: {}", self.timeline_path));
}
}
}
Err(e) => {
on_fatal_io_error(
&e,
&format!(
"TimelineUninitMark::drop: delete timeline directory: {:?}",
self.timeline_path
),
);
}
}
// fsync to order timelines_dir removal before unint mark removal
crashsafe::fsync(&self.common_parent).fatal_err(&format!(
"TimelineUninitMark::drop: fsync after timeline dir removal: {}",
self.common_parent,
));
// remove the uninit mark
fs::remove_file(&self.common_parent).fatal_err(&format!(
"TimelineUninitMark::drop: remove_file uninit mark: {}",
self.common_parent,
));
// fsync to persist the removal
crashsafe::fsync(&self.common_parent).fatal_err(&format!(
"TimelineUninitMark::drop: fsync common parent dir: {}",
self.common_parent,
));
}
}
}