From 80f10d5ced19493d6d491f87db11211d80839adb Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 4 Sep 2023 08:31:55 +0100 Subject: [PATCH] pageserver: safe deletion for tenant directories (#5182) ## Problem If a pageserver crashes partway through deleting a tenant's directory, it might leave a partial state that confuses a subsequent startup/attach. ## Summary of changes Rename tenant directory to a temporary path before deleting. Timeline deletions already have deletion markers to provide safety. In future, it would be nice to exploit this to send responses to detach requests earlier: https://github.com/neondatabase/neon/issues/5183 --- pageserver/src/tenant/mgr.rs | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index dd5f34b6b2..05be9393a0 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -22,8 +22,9 @@ use crate::task_mgr::{self, TaskKind}; use crate::tenant::config::TenantConfOpt; use crate::tenant::delete::DeleteTenantFlow; use crate::tenant::{create_tenant_files, CreateTenantFilesMode, Tenant, TenantState}; -use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME}; +use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX}; +use utils::crashsafe::path_with_suffix_extension; use utils::fs_ext::PathExt; use utils::generation::Generation; use utils::id::{TenantId, TimelineId}; @@ -60,6 +61,29 @@ impl TenantsMap { } } +/// This is "safe" in that that it won't leave behind a partially deleted directory +/// at the original path, because we rename with TEMP_FILE_SUFFIX before starting deleting +/// the contents. +/// +/// This is pageserver-specific, as it relies on future processes after a crash to check +/// for TEMP_FILE_SUFFIX when loading things. +async fn safe_remove_tenant_dir_all(path: impl AsRef) -> std::io::Result<()> { + let parent = path + .as_ref() + .parent() + // It is invalid to call this function with a relative path. Tenant directories + // should always have a parent. + .ok_or(std::io::Error::new( + std::io::ErrorKind::InvalidInput, + "Path must be absolute", + ))?; + + let tmp_path = path_with_suffix_extension(&path, TEMP_FILE_SUFFIX); + fs::rename(&path, &tmp_path).await?; + fs::File::open(parent).await?.sync_all().await?; + fs::remove_dir_all(tmp_path).await +} + static TENANTS: Lazy> = Lazy::new(|| RwLock::new(TenantsMap::Initializing)); /// Initialize repositories with locally available timelines. @@ -92,6 +116,8 @@ pub async fn init_tenant_mgr( "Found temporary tenant directory, removing: {}", tenant_dir_path.display() ); + // No need to use safe_remove_tenant_dir_all because this is already + // a temporary path if let Err(e) = fs::remove_dir_all(&tenant_dir_path).await { error!( "Failed to remove temporary directory '{}': {:?}", @@ -490,7 +516,7 @@ async fn detach_tenant0( ) -> Result<(), TenantStateError> { let local_files_cleanup_operation = |tenant_id_to_clean| async move { let local_tenant_directory = conf.tenant_path(&tenant_id_to_clean); - fs::remove_dir_all(&local_tenant_directory) + safe_remove_tenant_dir_all(&local_tenant_directory) .await .with_context(|| { format!("local tenant directory {local_tenant_directory:?} removal")