fsync directory before mark file removal (#4986)

## Problem

Deletions can be possibly reordered. Use fsync to avoid the case when
mark file doesnt exist but other tenant/timeline files do.

See added comments.

resolves #4987
This commit is contained in:
Dmitry Rodionov
2023-08-15 19:24:23 +03:00
committed by GitHub
parent 207919f5eb
commit 52c2c69351
5 changed files with 31 additions and 7 deletions

View File

@@ -111,6 +111,10 @@ pub fn fsync(path: &Path) -> io::Result<()> {
.map_err(|e| io::Error::new(e.kind(), format!("Failed to fsync file {path:?}: {e}")))
}
pub async fn fsync_async(path: impl AsRef<std::path::Path>) -> Result<(), std::io::Error> {
tokio::fs::File::open(path).await?.sync_all().await
}
#[cfg(test)]
mod tests {
use tempfile::tempdir;

View File

@@ -212,6 +212,19 @@ async fn cleanup_remaining_fs_traces(
))?
});
// Make sure previous deletions are ordered before mark removal.
// Otherwise there is no guarantee that they reach the disk before mark deletion.
// So its possible for mark to reach disk first and for other deletions
// to be reordered later and thus missed if a crash occurs.
// Note that we dont need to sync after mark file is removed
// because we can tolerate the case when mark file reappears on startup.
let tenant_path = &conf.tenant_path(tenant_id);
if tenant_path.exists() {
crashsafe::fsync_async(&conf.tenant_path(tenant_id))
.await
.context("fsync_pre_mark_remove")?;
}
rm(conf.tenant_deleted_mark_file_path(tenant_id), false).await?;
fail::fail_point!("tenant-delete-before-remove-tenant-dir", |_| {

View File

@@ -11,7 +11,7 @@ use std::time::Duration;
use anyhow::{anyhow, Context};
use tokio::fs;
use tokio::io::AsyncWriteExt;
use utils::backoff;
use utils::{backoff, crashsafe};
use crate::config::PageServerConf;
use crate::tenant::storage_layer::LayerFileName;
@@ -23,10 +23,6 @@ use utils::id::{TenantId, TimelineId};
use super::index::{IndexPart, LayerFileMetadata};
use super::{FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES};
async fn fsync_path(path: impl AsRef<std::path::Path>) -> Result<(), std::io::Error> {
fs::File::open(path).await?.sync_all().await
}
static MAX_DOWNLOAD_DURATION: Duration = Duration::from_secs(120);
///
@@ -150,7 +146,7 @@ pub async fn download_layer_file<'a>(
})
.map_err(DownloadError::Other)?;
fsync_path(&local_path)
crashsafe::fsync_async(&local_path)
.await
.with_context(|| format!("Could not fsync layer file {}", local_path.display(),))
.map_err(DownloadError::Other)?;

View File

@@ -279,6 +279,17 @@ async fn cleanup_remaining_timeline_fs_traces(
Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm-dir"))?
});
// Make sure previous deletions are ordered before mark removal.
// Otherwise there is no guarantee that they reach the disk before mark deletion.
// So its possible for mark to reach disk first and for other deletions
// to be reordered later and thus missed if a crash occurs.
// Note that we dont need to sync after mark file is removed
// because we can tolerate the case when mark file reappears on startup.
let timeline_path = conf.timelines_path(&tenant_id);
crashsafe::fsync_async(timeline_path)
.await
.context("fsync_pre_mark_remove")?;
// Remove delete mark
tokio::fs::remove_file(conf.timeline_delete_mark_file_path(tenant_id, timeline_id))
.await

View File

@@ -284,4 +284,4 @@ MANY_SMALL_LAYERS_TENANT_CONFIG = {
def poll_for_remote_storage_iterations(remote_storage_kind: RemoteStorageKind) -> int:
return 20 if remote_storage_kind is RemoteStorageKind.REAL_S3 else 8
return 30 if remote_storage_kind is RemoteStorageKind.REAL_S3 else 10