From 75d583c04a1c68a3bc859151e12ccc3bb6514862 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Wed, 21 Jun 2023 14:25:58 +0300 Subject: [PATCH] Tenant::load: fix uninit timeline marker processing (#4458) ## Problem During timeline creation we create special mark file which presense indicates that initialization didnt complete successfully. In case of a crash restart we can remove such half-initialized timeline and following retry from control plane side should perform another attempt. So in case of a possible crash restart during initial loading we have following picture: ``` timelines | - ___uninit | - | - | ``` We call `std::fs::read_dir` to walk files in `timelines` directory one by one. If we see uninit file we proceed with deletion of both, timeline directory and uninit file. If we see timeline we check if uninit file exists and do the same cleanup. But in fact its possible to get both branches to be true at the same time. Result of readdir doesnt reflect following directory state modifications. So you can still get "valid" entry on the next iteration of the loop despite the fact that it was deleted in one of the previous iterations of the loop. To see that you can apply the following patch (it disables uninit mark cleanup on successful timeline creation): ```diff diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 4beb2664..b3cdad8f 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -224,11 +224,6 @@ impl UninitializedTimeline<'_> { ) })?; } - uninit_mark.remove_uninit_mark().with_context(|| { - format!( - "Failed to remove uninit mark file for timeline {tenant_id}/{timeline_id}" - ) - })?; v.insert(Arc::clone(&new_timeline)); new_timeline.maybe_spawn_flush_loop(); ``` And perform the following steps: ```bash neon_local init neon_local start neon_local tenant create neon_local stop neon_local start ``` The error is: ```log INFO load{tenant_id=X}:blocking: Found an uninit mark file .neon/tenants/X/timelines/Y.___uninit, removing the timeline and its uninit mark 2023-06-09T18:43:41.664247Z ERROR load{tenant_id=X}: load failed, setting tenant state to Broken: failed to load metadata Caused by: 0: Failed to read metadata bytes from path .neon/tenants/X/timelines/Y/metadata 1: No such file or directory (os error 2) ``` So uninit mark got deleted together with timeline directory but we still got directory entry for it and tried to load it. The bug prevented tenant from being successfully loaded. ## Summary of changes Ideally I think we shouldnt place uninit marks in the same directory as timeline directories but move them to separate directory and gather them as an input to actual listing, but that would be sort of an on-disk format change, so just check whether entries are still valid before operating on them. --- libs/utils/src/http/error.rs | 3 +- pageserver/src/http/routes.rs | 14 +- pageserver/src/page_service.rs | 6 +- pageserver/src/tenant.rs | 337 ++++++++++-------- pageserver/src/tenant/mgr.rs | 6 +- .../src/tenant/remote_timeline_client.rs | 2 +- .../walreceiver/connection_manager.rs | 2 +- safekeeper/src/timeline.rs | 2 +- 8 files changed, 208 insertions(+), 164 deletions(-) diff --git a/libs/utils/src/http/error.rs b/libs/utils/src/http/error.rs index f9c06453df..527e486fd0 100644 --- a/libs/utils/src/http/error.rs +++ b/libs/utils/src/http/error.rs @@ -1,5 +1,6 @@ use hyper::{header, Body, Response, StatusCode}; use serde::{Deserialize, Serialize}; +use std::error::Error as StdError; use thiserror::Error; use tracing::error; @@ -15,7 +16,7 @@ pub enum ApiError { Unauthorized(String), #[error("NotFound: {0}")] - NotFound(anyhow::Error), + NotFound(Box), #[error("Conflict: {0}")] Conflict(String), diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index fc8da70cc0..5bec07b74a 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -142,7 +142,7 @@ impl From for ApiError { impl From for ApiError { fn from(tse: TenantStateError) -> ApiError { match tse { - TenantStateError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid)), + TenantStateError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()), _ => ApiError::InternalServerError(anyhow::Error::new(tse)), } } @@ -151,7 +151,7 @@ impl From for ApiError { impl From for ApiError { fn from(tse: GetTenantError) -> ApiError { match tse { - GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid)), + GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()), e @ GetTenantError::NotActive(_) => { // Why is this not `ApiError::NotFound`? // Because we must be careful to never return 404 for a tenant if it does @@ -169,7 +169,7 @@ impl From for ApiError { fn from(e: SetNewTenantConfigError) -> ApiError { match e { SetNewTenantConfigError::GetTenant(tid) => { - ApiError::NotFound(anyhow!("tenant {}", tid)) + ApiError::NotFound(anyhow!("tenant {}", tid).into()) } e @ SetNewTenantConfigError::Persist(_) => { ApiError::InternalServerError(anyhow::Error::new(e)) @@ -182,7 +182,7 @@ impl From for ApiError { fn from(value: crate::tenant::DeleteTimelineError) -> Self { use crate::tenant::DeleteTimelineError::*; match value { - NotFound => ApiError::NotFound(anyhow::anyhow!("timeline not found")), + NotFound => ApiError::NotFound(anyhow::anyhow!("timeline not found").into()), HasChildren(children) => ApiError::PreconditionFailed( format!("Cannot delete timeline which has child timelines: {children:?}") .into_boxed_str(), @@ -397,7 +397,7 @@ async fn timeline_detail_handler( let timeline = tenant .get_timeline(timeline_id, false) - .map_err(ApiError::NotFound)?; + .map_err(|e| ApiError::NotFound(e.into()))?; let timeline_info = build_timeline_info( &timeline, @@ -1061,7 +1061,7 @@ async fn timeline_download_remote_layers_handler_get( let info = timeline .get_download_all_remote_layers_task_info() .context("task never started since last pageserver process start") - .map_err(ApiError::NotFound)?; + .map_err(|e| ApiError::NotFound(e.into()))?; json_response(StatusCode::OK, info) } @@ -1072,7 +1072,7 @@ async fn active_timeline_of_active_tenant( let tenant = mgr::get_tenant(tenant_id, true).await?; tenant .get_timeline(timeline_id, true) - .map_err(ApiError::NotFound) + .map_err(|e| ApiError::NotFound(e.into())) } async fn always_panic_handler( diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 9e9285a009..31ad45790c 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -390,7 +390,9 @@ impl PageServerHandler { }; // Check that the timeline exists - let timeline = tenant.get_timeline(timeline_id, true)?; + let timeline = tenant + .get_timeline(timeline_id, true) + .map_err(|e| anyhow::anyhow!(e))?; // switch client to COPYBOTH pgb.write_message_noflush(&BeMessage::CopyBothResponse)?; @@ -1230,6 +1232,6 @@ async fn get_active_tenant_timeline( .map_err(GetActiveTimelineError::Tenant)?; let timeline = tenant .get_timeline(timeline_id, true) - .map_err(GetActiveTimelineError::Timeline)?; + .map_err(|e| GetActiveTimelineError::Timeline(anyhow::anyhow!(e)))?; Ok(timeline) } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 7fdd047c96..0e8d6b1287 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -421,6 +421,21 @@ remote: } } +#[derive(Debug, thiserror::Error, PartialEq, Eq)] +pub enum GetTimelineError { + #[error("Timeline {tenant_id}/{timeline_id} is not active, state: {state:?}")] + NotActive { + tenant_id: TenantId, + timeline_id: TimelineId, + state: TimelineState, + }, + #[error("Timeline {tenant_id}/{timeline_id} was not found")] + NotFound { + tenant_id: TenantId, + timeline_id: TimelineId, + }, +} + #[derive(Debug, thiserror::Error)] pub enum DeleteTimelineError { #[error("NotFound")] @@ -946,6 +961,117 @@ impl Tenant { tenant } + pub fn scan_and_sort_timelines_dir( + self: Arc, + ) -> anyhow::Result> { + let timelines_dir = self.conf.timelines_path(&self.tenant_id); + let mut timelines_to_load: HashMap = HashMap::new(); + + for entry in + std::fs::read_dir(&timelines_dir).context("list timelines directory for tenant")? + { + let entry = entry.context("read timeline dir entry")?; + let timeline_dir = entry.path(); + + if crate::is_temporary(&timeline_dir) { + info!( + "Found temporary timeline directory, removing: {}", + timeline_dir.display() + ); + if let Err(e) = std::fs::remove_dir_all(&timeline_dir) { + error!( + "Failed to remove temporary directory '{}': {:?}", + timeline_dir.display(), + e + ); + } + } else if is_uninit_mark(&timeline_dir) { + if !timeline_dir.exists() { + warn!( + "Timeline dir entry become invalid: {}", + timeline_dir.display() + ); + continue; + } + let timeline_uninit_mark_file = &timeline_dir; + info!( + "Found an uninit mark file {}, removing the timeline and its uninit mark", + timeline_uninit_mark_file.display() + ); + let timeline_id = timeline_uninit_mark_file + .file_stem() + .and_then(OsStr::to_str) + .unwrap_or_default() + .parse::() + .with_context(|| { + format!( + "Could not parse timeline id out of the timeline uninit mark name {}", + timeline_uninit_mark_file.display() + ) + })?; + let timeline_dir = self.conf.timeline_path(&timeline_id, &self.tenant_id); + if let Err(e) = + remove_timeline_and_uninit_mark(&timeline_dir, timeline_uninit_mark_file) + { + error!("Failed to clean up uninit marked timeline: {e:?}"); + } + } else { + if !timeline_dir.exists() { + warn!( + "Timeline dir entry become invalid: {}", + timeline_dir.display() + ); + continue; + } + let timeline_id = timeline_dir + .file_name() + .and_then(OsStr::to_str) + .unwrap_or_default() + .parse::() + .with_context(|| { + format!( + "Could not parse timeline id out of the timeline dir name {}", + timeline_dir.display() + ) + })?; + let timeline_uninit_mark_file = self + .conf + .timeline_uninit_mark_file_path(self.tenant_id, timeline_id); + if timeline_uninit_mark_file.exists() { + info!( + %timeline_id, + "Found an uninit mark file, removing the timeline and its uninit mark", + ); + if let Err(e) = + remove_timeline_and_uninit_mark(&timeline_dir, &timeline_uninit_mark_file) + { + error!("Failed to clean up uninit marked timeline: {e:?}"); + } + continue; + } + + let file_name = entry.file_name(); + if let Ok(timeline_id) = + file_name.to_str().unwrap_or_default().parse::() + { + let metadata = load_metadata(self.conf, timeline_id, self.tenant_id) + .context("failed to load metadata")?; + timelines_to_load.insert(timeline_id, metadata); + } else { + // A file or directory that doesn't look like a timeline ID + warn!( + "unexpected file or directory in timelines directory: {}", + file_name.to_string_lossy() + ); + } + } + } + + // Sort the array of timeline IDs into tree-order, so that parent comes before + // all its children. + tree_sort_timelines(timelines_to_load) + } + /// /// Background task to load in-memory data structures for this tenant, from /// files on disk. Used at pageserver startup. @@ -962,110 +1088,16 @@ impl Tenant { utils::failpoint_sleep_millis_async!("before-loading-tenant"); - // TODO split this into two functions, scan and actual load - // Load in-memory state to reflect the local files on disk // // Scan the directory, peek into the metadata file of each timeline, and // collect a list of timelines and their ancestors. - let tenant_id = self.tenant_id; - let conf = self.conf; let span = info_span!("blocking"); + let cloned = Arc::clone(self); let sorted_timelines: Vec<(_, _)> = tokio::task::spawn_blocking(move || { let _g = span.entered(); - let mut timelines_to_load: HashMap = HashMap::new(); - let timelines_dir = conf.timelines_path(&tenant_id); - - for entry in - std::fs::read_dir(&timelines_dir).context("list timelines directory for tenant")? - { - let entry = entry.context("read timeline dir entry")?; - let timeline_dir = entry.path(); - - if crate::is_temporary(&timeline_dir) { - info!( - "Found temporary timeline directory, removing: {}", - timeline_dir.display() - ); - if let Err(e) = std::fs::remove_dir_all(&timeline_dir) { - error!( - "Failed to remove temporary directory '{}': {:?}", - timeline_dir.display(), - e - ); - } - } else if is_uninit_mark(&timeline_dir) { - let timeline_uninit_mark_file = &timeline_dir; - info!( - "Found an uninit mark file {}, removing the timeline and its uninit mark", - timeline_uninit_mark_file.display() - ); - let timeline_id = timeline_uninit_mark_file - .file_stem() - .and_then(OsStr::to_str) - .unwrap_or_default() - .parse::() - .with_context(|| { - format!( - "Could not parse timeline id out of the timeline uninit mark name {}", - timeline_uninit_mark_file.display() - ) - })?; - let timeline_dir = conf.timeline_path(&timeline_id, &tenant_id); - if let Err(e) = - remove_timeline_and_uninit_mark(&timeline_dir, timeline_uninit_mark_file) - { - error!("Failed to clean up uninit marked timeline: {e:?}"); - } - } else { - let timeline_id = timeline_dir - .file_name() - .and_then(OsStr::to_str) - .unwrap_or_default() - .parse::() - .with_context(|| { - format!( - "Could not parse timeline id out of the timeline dir name {}", - timeline_dir.display() - ) - })?; - let timeline_uninit_mark_file = - conf.timeline_uninit_mark_file_path(tenant_id, timeline_id); - if timeline_uninit_mark_file.exists() { - info!( - %timeline_id, - "Found an uninit mark file, removing the timeline and its uninit mark", - ); - if let Err(e) = remove_timeline_and_uninit_mark( - &timeline_dir, - &timeline_uninit_mark_file, - ) { - error!("Failed to clean up uninit marked timeline: {e:?}"); - } - continue; - } - - let file_name = entry.file_name(); - if let Ok(timeline_id) = - file_name.to_str().unwrap_or_default().parse::() - { - let metadata = load_metadata(conf, timeline_id, tenant_id) - .context("failed to load metadata")?; - timelines_to_load.insert(timeline_id, metadata); - } else { - // A file or directory that doesn't look like a timeline ID - warn!( - "unexpected file or directory in timelines directory: {}", - file_name.to_string_lossy() - ); - } - } - } - - // Sort the array of timeline IDs into tree-order, so that parent comes before - // all its children. - tree_sort_timelines(timelines_to_load) + cloned.scan_and_sort_timelines_dir() }) .await .context("load spawn_blocking") @@ -1213,19 +1245,21 @@ impl Tenant { &self, timeline_id: TimelineId, active_only: bool, - ) -> anyhow::Result> { + ) -> Result, GetTimelineError> { let timelines_accessor = self.timelines.lock().unwrap(); - let timeline = timelines_accessor.get(&timeline_id).with_context(|| { - format!("Timeline {}/{} was not found", self.tenant_id, timeline_id) - })?; + let timeline = timelines_accessor + .get(&timeline_id) + .ok_or(GetTimelineError::NotFound { + tenant_id: self.tenant_id, + timeline_id, + })?; if active_only && !timeline.is_active() { - anyhow::bail!( - "Timeline {}/{} is not active, state: {:?}", - self.tenant_id, + Err(GetTimelineError::NotActive { + tenant_id: self.tenant_id, timeline_id, - timeline.current_state() - ) + state: timeline.current_state(), + }) } else { Ok(Arc::clone(timeline)) } @@ -3375,9 +3409,8 @@ where #[cfg(test)] pub mod harness { use bytes::{Bytes, BytesMut}; - use once_cell::sync::Lazy; use once_cell::sync::OnceCell; - use std::sync::{Arc, RwLock, RwLockReadGuard, RwLockWriteGuard}; + use std::sync::Arc; use std::{fs, path::PathBuf}; use utils::logging; use utils::lsn::Lsn; @@ -3410,8 +3443,6 @@ pub mod harness { buf.freeze() } - static LOCK: Lazy> = Lazy::new(|| RwLock::new(())); - impl From for TenantConfOpt { fn from(tenant_conf: TenantConf) -> Self { Self { @@ -3438,33 +3469,16 @@ pub mod harness { } } - pub struct TenantHarness<'a> { + pub struct TenantHarness { pub conf: &'static PageServerConf, pub tenant_conf: TenantConf, pub tenant_id: TenantId, - - pub lock_guard: ( - Option>, - Option>, - ), } static LOG_HANDLE: OnceCell<()> = OnceCell::new(); - impl<'a> TenantHarness<'a> { + impl TenantHarness { pub fn create(test_name: &'static str) -> anyhow::Result { - Self::create_internal(test_name, false) - } - pub fn create_exclusive(test_name: &'static str) -> anyhow::Result { - Self::create_internal(test_name, true) - } - fn create_internal(test_name: &'static str, exclusive: bool) -> anyhow::Result { - let lock_guard = if exclusive { - (None, Some(LOCK.write().unwrap())) - } else { - (Some(LOCK.read().unwrap()), None) - }; - LOG_HANDLE.get_or_init(|| { logging::init( logging::LogFormat::Test, @@ -3500,7 +3514,6 @@ pub mod harness { conf, tenant_conf, tenant_id, - lock_guard, }) } @@ -3525,26 +3538,12 @@ pub mod harness { self.tenant_id, None, )); - // populate tenant with locally available timelines - let mut timelines_to_load = HashMap::new(); - for timeline_dir_entry in fs::read_dir(self.conf.timelines_path(&self.tenant_id)) - .expect("should be able to read timelines dir") - { - let timeline_dir_entry = timeline_dir_entry?; - let timeline_id: TimelineId = timeline_dir_entry - .path() - .file_name() - .unwrap() - .to_string_lossy() - .parse()?; - - let timeline_metadata = load_metadata(self.conf, timeline_id, self.tenant_id)?; - timelines_to_load.insert(timeline_id, timeline_metadata); - } tenant .load(None, ctx) .instrument(info_span!("try_load", tenant_id=%self.tenant_id)) .await?; + + // TODO reuse Tenant::activate (needs broker) tenant.state.send_replace(TenantState::Active); for timeline in tenant.timelines.lock().unwrap().values() { timeline.set_state(TimelineState::Active); @@ -4070,9 +4069,13 @@ mod tests { std::fs::write(metadata_path, metadata_bytes)?; let err = harness.try_load(&ctx).await.err().expect("should fail"); - assert!(err - .to_string() - .starts_with("Failed to parse metadata bytes from path")); + // get all the stack with all .context, not tonly the last one + let message = format!("{err:#}"); + let expected = "Failed to parse metadata bytes from path"; + assert!( + message.contains(expected), + "message '{message}' expected to contain {expected}" + ); let mut found_error_message = false; let mut err_source = err.source(); @@ -4506,6 +4509,44 @@ mod tests { assert!(expect_initdb_optimization); assert!(initdb_optimization_count > 0); } + Ok(()) + } + + #[tokio::test] + async fn test_uninit_mark_crash() -> anyhow::Result<()> { + let name = "test_uninit_mark_crash"; + let harness = TenantHarness::create(name)?; + { + let (tenant, ctx) = harness.load().await; + let tline = + tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + // Keeps uninit mark in place + std::mem::forget(tline); + } + + let (tenant, _) = harness.load().await; + match tenant.get_timeline(TIMELINE_ID, false) { + Ok(_) => panic!("timeline should've been removed during load"), + Err(e) => { + assert_eq!( + e, + GetTimelineError::NotFound { + tenant_id: tenant.tenant_id, + timeline_id: TIMELINE_ID, + } + ) + } + } + + assert!(!harness + .conf + .timeline_path(&TIMELINE_ID, &tenant.tenant_id) + .exists()); + + assert!(!harness + .conf + .timeline_uninit_mark_file_path(tenant.tenant_id, TIMELINE_ID) + .exists()); Ok(()) } diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 7e123c3fbd..09b825d2e9 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -675,7 +675,7 @@ pub async fn immediate_gc( .get(&tenant_id) .map(Arc::clone) .with_context(|| format!("tenant {tenant_id}")) - .map_err(ApiError::NotFound)?; + .map_err(|e| ApiError::NotFound(e.into()))?; let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon()); // Use tenant's pitr setting @@ -724,11 +724,11 @@ pub async fn immediate_compact( .get(&tenant_id) .map(Arc::clone) .with_context(|| format!("tenant {tenant_id}")) - .map_err(ApiError::NotFound)?; + .map_err(|e| ApiError::NotFound(e.into()))?; let timeline = tenant .get_timeline(timeline_id, true) - .map_err(ApiError::NotFound)?; + .map_err(|e| ApiError::NotFound(e.into()))?; // Run in task_mgr to avoid race with tenant_detach operation let ctx = ctx.detached_child(TaskKind::Compaction, DownloadBehavior::Download); diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 8db2bc4eb2..7808b64d35 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -1367,7 +1367,7 @@ mod tests { struct TestSetup { runtime: &'static tokio::runtime::Runtime, entered_runtime: EnterGuard<'static>, - harness: TenantHarness<'static>, + harness: TenantHarness, tenant: Arc, tenant_ctx: RequestContext, remote_fs_dir: PathBuf, diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index 83dfc5f598..fa23ae765d 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -1321,7 +1321,7 @@ mod tests { const DUMMY_SAFEKEEPER_HOST: &str = "safekeeper_connstr"; - async fn dummy_state(harness: &TenantHarness<'_>) -> ConnectionManagerState { + async fn dummy_state(harness: &TenantHarness) -> ConnectionManagerState { let (tenant, ctx) = harness.load().await; let timeline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x8), crate::DEFAULT_PG_VERSION, &ctx) diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 52c3e8d4be..30036cc7f2 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -266,7 +266,7 @@ impl From for ApiError { fn from(te: TimelineError) -> ApiError { match te { TimelineError::NotFound(ttid) => { - ApiError::NotFound(anyhow!("timeline {} not found", ttid)) + ApiError::NotFound(anyhow!("timeline {} not found", ttid).into()) } _ => ApiError::InternalServerError(anyhow!("{}", te)), }