diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 5723a512f6..2f824cc453 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -16,7 +16,7 @@ use tokio::{ io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt}, }; use tracing::*; -use utils::crashsafe_dir::path_with_suffix_extension; +use utils::crashsafe::path_with_suffix_extension; use crate::{Download, DownloadError, RemoteObjectId}; diff --git a/libs/utils/src/crashsafe_dir.rs b/libs/utils/src/crashsafe.rs similarity index 84% rename from libs/utils/src/crashsafe_dir.rs rename to libs/utils/src/crashsafe.rs index 032ab0a916..3726779cb2 100644 --- a/libs/utils/src/crashsafe_dir.rs +++ b/libs/utils/src/crashsafe.rs @@ -12,16 +12,8 @@ pub fn create_dir(path: impl AsRef) -> io::Result<()> { let path = path.as_ref(); fs::create_dir(path)?; - File::open(path)?.sync_all()?; - - if let Some(parent) = path.parent() { - File::open(parent)?.sync_all() - } else { - Err(io::Error::new( - io::ErrorKind::InvalidInput, - "can't find parent", - )) - } + fsync_file_and_parent(path)?; + Ok(()) } /// Similar to [`std::fs::create_dir_all`], except we fsync all @@ -65,12 +57,12 @@ pub fn create_dir_all(path: impl AsRef) -> io::Result<()> { // Fsync the created directories from child to parent. for &path in dirs_to_create.iter() { - File::open(path)?.sync_all()?; + fsync(path)?; } // If we created any new directories, fsync the parent. if !dirs_to_create.is_empty() { - File::open(path)?.sync_all()?; + fsync(path)?; } Ok(()) @@ -92,6 +84,33 @@ pub fn path_with_suffix_extension(original_path: impl AsRef, suffix: &str) .with_extension(new_extension.as_ref()) } +pub fn fsync_file_and_parent(file_path: &Path) -> io::Result<()> { + let parent = file_path.parent().ok_or_else(|| { + io::Error::new( + io::ErrorKind::Other, + format!("File {file_path:?} has no parent"), + ) + })?; + + fsync(file_path)?; + fsync(parent)?; + Ok(()) +} + +pub fn fsync(path: &Path) -> io::Result<()> { + File::open(path) + .map_err(|e| io::Error::new(e.kind(), format!("Failed to open the file {path:?}: {e}"))) + .and_then(|file| { + file.sync_all().map_err(|e| { + io::Error::new( + e.kind(), + format!("Failed to sync file {path:?} data and metadata: {e}"), + ) + }) + }) + .map_err(|e| io::Error::new(e.kind(), format!("Failed to fsync file {path:?}: {e}"))) +} + #[cfg(test)] mod tests { use tempfile::tempdir; diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 2c80556446..f1f48f5a90 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -22,8 +22,8 @@ pub mod pq_proto; // dealing with connstring parsing and handy access to it's parts pub mod connstring; -// helper functions for creating and fsyncing directories/trees -pub mod crashsafe_dir; +// helper functions for creating and fsyncing +pub mod crashsafe; // common authentication routines pub mod auth; diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 12f594077e..9317dd5dd7 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -87,7 +87,7 @@ fn main() -> anyhow::Result<()> { let tenants_path = conf.tenants_path(); if !tenants_path.exists() { - utils::crashsafe_dir::create_dir_all(conf.tenants_path()).with_context(|| { + utils::crashsafe::create_dir_all(conf.tenants_path()).with_context(|| { format!( "Failed to create tenants root dir at '{}'", tenants_path.display() diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 6e3c7baad8..b797866e43 100644 
--- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -7,6 +7,7 @@ use anyhow::{anyhow, bail, ensure, Context, Result}; use remote_storage::RemoteStorageConfig; use std::env; +use utils::crashsafe::path_with_suffix_extension; use std::path::{Path, PathBuf}; use std::str::FromStr; @@ -24,6 +25,7 @@ use crate::tenant_config::{TenantConf, TenantConfOpt}; /// The name of the metadata file pageserver creates per timeline. pub const METADATA_FILE_NAME: &str = "metadata"; +pub const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit"; const TENANT_CONFIG_NAME: &str = "config"; pub mod defaults { @@ -364,6 +366,17 @@ impl PageServerConf { self.timelines_path(tenant_id).join(timeline_id.to_string()) } + pub fn timeline_uninit_mark_file_path( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> PathBuf { + path_with_suffix_extension( + self.timeline_path(&timeline_id, &tenant_id), + TIMELINE_UNINIT_MARK_SUFFIX, + ) + } + /// Points to a place in pageserver's local directory, /// where certain timeline's metadata file should be located. pub fn metadata_path(&self, timeline_id: TimelineId, tenant_id: TenantId) -> PathBuf { diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index 23c4351b4e..ee3dc684e3 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -43,19 +43,19 @@ pub fn get_lsn_from_controlfile(path: &Path) -> Result { /// The code that deals with the checkpoint would not work right if the /// cluster was not shut down cleanly. pub fn import_timeline_from_postgres_datadir( - path: &Path, tline: &Timeline, - lsn: Lsn, + pgdata_path: &Path, + pgdata_lsn: Lsn, ) -> Result<()> { let mut pg_control: Option = None; // TODO this shoud be start_lsn, which is not necessarily equal to end_lsn (aka lsn) // Then fishing out pg_control would be unnecessary - let mut modification = tline.begin_modification(lsn); + let mut modification = tline.begin_modification(pgdata_lsn); modification.init_empty()?; // Import all but pg_wal - let all_but_wal = WalkDir::new(path) + let all_but_wal = WalkDir::new(pgdata_path) .into_iter() .filter_entry(|entry| !entry.path().ends_with("pg_wal")); for entry in all_but_wal { @@ -63,7 +63,7 @@ pub fn import_timeline_from_postgres_datadir( let metadata = entry.metadata().expect("error getting dir entry metadata"); if metadata.is_file() { let absolute_path = entry.path(); - let relative_path = absolute_path.strip_prefix(path)?; + let relative_path = absolute_path.strip_prefix(pgdata_path)?; let file = File::open(absolute_path)?; let len = metadata.len() as usize; @@ -84,7 +84,7 @@ pub fn import_timeline_from_postgres_datadir( "Postgres cluster was not shut down cleanly" ); ensure!( - pg_control.checkPointCopy.redo == lsn.0, + pg_control.checkPointCopy.redo == pgdata_lsn.0, "unexpected checkpoint REDO pointer" ); @@ -92,10 +92,10 @@ pub fn import_timeline_from_postgres_datadir( // this reads the checkpoint record itself, advancing the tip of the timeline to // *after* the checkpoint record. And crucially, it initializes the 'prev_lsn'. 
import_wal( - &path.join("pg_wal"), + &pgdata_path.join("pg_wal"), tline, Lsn(pg_control.checkPointCopy.redo), - lsn, + pgdata_lsn, )?; Ok(()) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 795a99058d..9b2bb3114d 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -32,7 +32,7 @@ use utils::{ use crate::basebackup; use crate::config::{PageServerConf, ProfilingConfig}; -use crate::import_datadir::{import_basebackup_from_tar, import_wal_from_tar}; +use crate::import_datadir::import_wal_from_tar; use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME}; use crate::profiling::profpoint_start; use crate::reltag::RelTag; @@ -500,11 +500,8 @@ impl PageServerHandler { task_mgr::associate_with(Some(tenant_id), Some(timeline_id)); // Create empty timeline info!("creating new timeline"); - let timeline = tenant_mgr::get_tenant(tenant_id, true)?.create_empty_timeline( - timeline_id, - base_lsn, - pg_version, - )?; + let tenant = tenant_mgr::get_tenant(tenant_id, true)?; + let timeline = tenant.create_empty_timeline(timeline_id, base_lsn, pg_version)?; // TODO mark timeline as not ready until it reaches end_lsn. // We might have some wal to import as well, and we should prevent compute @@ -527,7 +524,8 @@ impl PageServerHandler { // - use block_in_place() let mut copyin_stream = Box::pin(copyin_stream(pgb)); let reader = SyncIoBridge::new(StreamReader::new(&mut copyin_stream)); - tokio::task::block_in_place(|| import_basebackup_from_tar(&timeline, reader, base_lsn))?; + tokio::task::block_in_place(|| timeline.import_basebackup_from_tar(reader, base_lsn))?; + timeline.initialize()?; // Drain the rest of the Copy data let mut bytes_after_tar = 0; @@ -544,12 +542,6 @@ impl PageServerHandler { // It wouldn't work if base came from vanilla postgres though, // since we discard some log files. - // Flush data to disk, then upload to s3 - info!("flushing layers"); - timeline.checkpoint(CheckpointConfig::Flush)?; - - timeline.launch_wal_receiver()?; - info!("done"); Ok(()) } diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index fc9867dc05..424ce4769a 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -1403,7 +1403,9 @@ pub fn create_test_timeline( timeline_id: utils::id::TimelineId, pg_version: u32, ) -> Result> { - let tline = tenant.create_empty_timeline(timeline_id, Lsn(8), pg_version)?; + let tline = tenant + .create_empty_timeline(timeline_id, Lsn(8), pg_version)? 
+ .initialize()?; let mut m = tline.begin_modification(Lsn(8)); m.init_empty()?; m.commit()?; diff --git a/pageserver/src/storage_sync/download.rs b/pageserver/src/storage_sync/download.rs index 61ef164f14..6f9b2e2071 100644 --- a/pageserver/src/storage_sync/download.rs +++ b/pageserver/src/storage_sync/download.rs @@ -22,7 +22,7 @@ use crate::{ TEMP_FILE_SUFFIX, }; use utils::{ - crashsafe_dir::path_with_suffix_extension, + crashsafe::path_with_suffix_extension, id::{TenantId, TenantTimelineId, TimelineId}, }; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 47ef9284b8..93c473f0fe 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -14,7 +14,7 @@ use anyhow::{bail, ensure, Context, Result}; use tokio::sync::watch; use tracing::*; -use utils::crashsafe_dir::path_with_suffix_extension; +use utils::crashsafe::path_with_suffix_extension; use std::cmp::min; use std::collections::hash_map::Entry; @@ -23,10 +23,12 @@ use std::collections::HashMap; use std::fs; use std::fs::File; use std::fs::OpenOptions; +use std::io; use std::io::Write; use std::num::NonZeroU64; use std::ops::Bound::Included; use std::path::Path; +use std::path::PathBuf; use std::process::Command; use std::process::Stdio; use std::sync::Arc; @@ -49,7 +51,7 @@ pub use pageserver_api::models::TenantState; use toml_edit; use utils::{ - crashsafe_dir, + crashsafe, id::{TenantId, TimelineId}, lsn::{Lsn, RecordLsn}, }; @@ -120,6 +122,216 @@ pub struct Tenant { upload_layers: bool, } +/// A timeline with some of its files on disk, being initialized. +/// This struct ensures the atomicity of the timeline init: it's either properly created and inserted into pageserver's memory, or +/// its local files are removed. In the worst case of a crash, an uninit mark file is left behind, which causes the directory +/// to be removed on the next restart. +/// +/// The caller is responsible for filling in the timeline data properly before the final init. +#[must_use] +pub struct UninitializedTimeline<'t> { + owning_tenant: &'t Tenant, + timeline_id: TimelineId, + raw_timeline: Option<(Timeline, TimelineUninitMark)>, +} + +/// An uninit mark file, created alongside the timeline dir to ensure the timeline either gets fully initialized and loaded into pageserver's memory, +/// or gets removed eventually. +/// +/// XXX: it's important to create it next to the timeline dir, not inside it, to ensure the timeline dir gets removed first. +#[must_use] +struct TimelineUninitMark { + uninit_mark_deleted: bool, + uninit_mark_path: PathBuf, + timeline_path: PathBuf, +} + +impl UninitializedTimeline<'_> { + /// Ensures timeline data is valid, loads it into pageserver's memory and removes the uninit mark file on success.
+ pub fn initialize(self) -> anyhow::Result<Arc<Timeline>> { + let mut timelines = self.owning_tenant.timelines.lock().unwrap(); + self.initialize_with_lock(&mut timelines, true) + } + + fn initialize_with_lock( + mut self, + timelines: &mut HashMap<TimelineId, Arc<Timeline>>, + load_layer_map: bool, + ) -> anyhow::Result<Arc<Timeline>> { + let timeline_id = self.timeline_id; + let tenant_id = self.owning_tenant.tenant_id; + + let (new_timeline, uninit_mark) = self.raw_timeline.take().with_context(|| { + format!("No timeline for initialization found for {tenant_id}/{timeline_id}") + })?; + let new_timeline = Arc::new(new_timeline); + + let new_disk_consistent_lsn = new_timeline.get_disk_consistent_lsn(); + // TODO it would be good to ensure that, but apparently a lot of our testing is dependent on that, at least + // ensure!(new_disk_consistent_lsn.is_valid(), + // "Timeline {tenant_id}/{timeline_id} has invalid disk_consistent_lsn and cannot be initialized"); + + match timelines.entry(timeline_id) { + Entry::Occupied(_) => anyhow::bail!( + "Found freshly initialized timeline {tenant_id}/{timeline_id} in the tenant map" + ), + Entry::Vacant(v) => { + if load_layer_map { + new_timeline + .load_layer_map(new_disk_consistent_lsn) + .with_context(|| { + format!( + "Failed to load layermap for timeline {tenant_id}/{timeline_id}" + ) + })?; + } + uninit_mark.remove_uninit_mark().with_context(|| { + format!( + "Failed to remove uninit mark file for timeline {tenant_id}/{timeline_id}" + ) + })?; + v.insert(Arc::clone(&new_timeline)); + new_timeline.launch_wal_receiver().with_context(|| { + format!("Failed to launch walreceiver for timeline {tenant_id}/{timeline_id}") + })?; + } + } + + Ok(new_timeline) + } + + /// Prepares timeline data by loading it from the basebackup archive. + pub fn import_basebackup_from_tar( + &self, + reader: impl std::io::Read, + base_lsn: Lsn, + ) -> anyhow::Result<()> { + let raw_timeline = self.raw_timeline()?; + import_datadir::import_basebackup_from_tar(raw_timeline, reader, base_lsn).with_context( + || { + format!( + "Failed to import basebackup for timeline {}/{}", + self.owning_tenant.tenant_id, self.timeline_id + ) + }, + )?; + + fail::fail_point!("before-checkpoint-new-timeline", |_| { + bail!("failpoint before-checkpoint-new-timeline"); + }); + + raw_timeline + .checkpoint(CheckpointConfig::Flush) + .with_context(|| { + format!( + "Failed to checkpoint after basebackup import for timeline {}/{}", + self.owning_tenant.tenant_id, self.timeline_id + ) + })?; + Ok(()) + } + + fn raw_timeline(&self) -> anyhow::Result<&Timeline> { + Ok(&self + .raw_timeline + .as_ref() + .with_context(|| { + format!( + "No raw timeline {}/{} found", + self.owning_tenant.tenant_id, self.timeline_id + ) + })?
+ .0) + } +} + +impl Drop for UninitializedTimeline<'_> { + fn drop(&mut self) { + if let Some((_, uninit_mark)) = self.raw_timeline.take() { + let _entered = info_span!("drop_uninitialized_timeline", tenant = %self.owning_tenant.tenant_id, timeline = %self.timeline_id).entered(); + error!("Timeline got dropped without initializing, cleaning its files"); + cleanup_timeline_directory(uninit_mark); + } + } +} + +fn cleanup_timeline_directory(uninit_mark: TimelineUninitMark) { + let timeline_path = &uninit_mark.timeline_path; + match ignore_absent_files(|| fs::remove_dir_all(timeline_path)) { + Ok(()) => { + info!("Timeline dir {timeline_path:?} removed successfully, removing the uninit mark") + } + Err(e) => { + error!("Failed to clean up uninitialized timeline directory {timeline_path:?}: {e:?}") + } + } + drop(uninit_mark); // mark handles its deletion on drop, gets retained if timeline dir exists +} + +impl TimelineUninitMark { + /// Useful for initializing timelines, existing on disk after the restart. + pub fn dummy() -> Self { + Self { + uninit_mark_deleted: true, + uninit_mark_path: PathBuf::new(), + timeline_path: PathBuf::new(), + } + } + + fn new(uninit_mark_path: PathBuf, timeline_path: PathBuf) -> Self { + Self { + uninit_mark_deleted: false, + uninit_mark_path, + timeline_path, + } + } + + fn remove_uninit_mark(mut self) -> anyhow::Result<()> { + if !self.uninit_mark_deleted { + self.delete_mark_file_if_present()?; + } + + Ok(()) + } + + fn delete_mark_file_if_present(&mut self) -> Result<(), anyhow::Error> { + let uninit_mark_file = &self.uninit_mark_path; + let uninit_mark_parent = uninit_mark_file + .parent() + .with_context(|| format!("Uninit mark file {uninit_mark_file:?} has no parent"))?; + ignore_absent_files(|| fs::remove_file(&uninit_mark_file)).with_context(|| { + format!("Failed to remove uninit mark file at path {uninit_mark_file:?}") + })?; + crashsafe::fsync(uninit_mark_parent).context("Failed to fsync uninit mark parent")?; + self.uninit_mark_deleted = true; + + Ok(()) + } +} + +impl Drop for TimelineUninitMark { + fn drop(&mut self) { + if !self.uninit_mark_deleted { + if self.timeline_path.exists() { + error!( + "Uninit mark {} is not removed, timeline {} stays uninitialized", + self.uninit_mark_path.display(), + self.timeline_path.display() + ) + } else { + // unblock later timeline creation attempts + warn!( + "Removing intermediate uninit mark file {}", + self.uninit_mark_path.display() + ); + if let Err(e) = self.delete_mark_file_if_present() { + error!("Failed to remove the uninit mark file: {e}") + } + } + } + } +} + /// A repository corresponds to one .neon directory. One repository holds multiple /// timelines, forked off from the same initial call to 'initdb'. impl Tenant { @@ -162,19 +374,10 @@ impl Tenant { new_timeline_id: TimelineId, initdb_lsn: Lsn, pg_version: u32, - ) -> Result> { - // XXX: keep the lock to avoid races during timeline creation - let mut timelines = self.timelines.lock().unwrap(); - - anyhow::ensure!( - timelines.get(&new_timeline_id).is_none(), - "Timeline {new_timeline_id} already exists" - ); - - let timeline_path = self.conf.timeline_path(&new_timeline_id, &self.tenant_id); - if timeline_path.exists() { - bail!("Timeline directory already exists, but timeline is missing in repository map. 
This is a bug.") - } + ) -> anyhow::Result { + let timelines = self.timelines.lock().unwrap(); + let timeline_uninit_mark = self.create_timeline_uninit_mark(new_timeline_id, &timelines)?; + drop(timelines); let new_metadata = TimelineMetadata::new( Lsn(0), @@ -185,11 +388,13 @@ impl Tenant { initdb_lsn, pg_version, ); - let new_timeline = - self.create_initialized_timeline(new_timeline_id, new_metadata, &mut timelines)?; - new_timeline.layers.write().unwrap().next_open_layer_at = Some(initdb_lsn); - - Ok(new_timeline) + self.prepare_timeline( + new_timeline_id, + new_metadata, + timeline_uninit_mark, + true, + None, + ) } /// Create a new timeline. @@ -205,14 +410,10 @@ impl Tenant { ancestor_timeline_id: Option, mut ancestor_start_lsn: Option, pg_version: u32, - ) -> Result>> { + ) -> anyhow::Result>> { let new_timeline_id = new_timeline_id.unwrap_or_else(TimelineId::generate); - if self - .conf - .timeline_path(&new_timeline_id, &self.tenant_id) - .exists() - { + if self.get_timeline(new_timeline_id).is_ok() { debug!("timeline {new_timeline_id} already exists"); return Ok(None); } @@ -391,21 +592,32 @@ impl Tenant { timeline_id, metadata.pg_version() ); - let ancestor = metadata - .ancestor_timeline() - .and_then(|ancestor_timeline_id| timelines_accessor.get(&ancestor_timeline_id)) - .cloned(); - match timelines_accessor.entry(timeline_id) { - Entry::Occupied(_) => warn!( + + if timelines_accessor.contains_key(&timeline_id) { + warn!( "Timeline {}/{} already exists in the tenant map, skipping its initialization", self.tenant_id, timeline_id - ), - Entry::Vacant(v) => { - let timeline = self - .initialize_new_timeline(timeline_id, metadata, ancestor) - .with_context(|| format!("Failed to initialize timeline {timeline_id}"))?; - v.insert(timeline); - } + ); + continue; + } else { + let ancestor = metadata + .ancestor_timeline() + .and_then(|ancestor_timeline_id| timelines_accessor.get(&ancestor_timeline_id)) + .cloned(); + let timeline = UninitializedTimeline { + owning_tenant: self, + timeline_id, + raw_timeline: Some(( + self.create_timeline_data(timeline_id, metadata, ancestor) + .with_context(|| { + format!("Failed to initialize timeline {timeline_id}") + })?, + TimelineUninitMark::dummy(), + )), + }; + let initialized_timeline = + timeline.initialize_with_lock(&mut timelines_accessor, true)?; + timelines_accessor.insert(timeline_id, initialized_timeline); } } @@ -599,12 +811,12 @@ impl Tenant { self.tenant_conf.write().unwrap().update(&new_tenant_conf); } - fn initialize_new_timeline( + fn create_timeline_data( &self, new_timeline_id: TimelineId, new_metadata: TimelineMetadata, ancestor: Option>, - ) -> anyhow::Result> { + ) -> anyhow::Result { if let Some(ancestor_timeline_id) = new_metadata.ancestor_timeline() { anyhow::ensure!( ancestor.is_some(), @@ -612,9 +824,8 @@ impl Tenant { ) } - let new_disk_consistent_lsn = new_metadata.disk_consistent_lsn(); let pg_version = new_metadata.pg_version(); - let new_timeline = Arc::new(Timeline::new( + Ok(Timeline::new( self.conf, Arc::clone(&self.tenant_conf), new_metadata, @@ -624,15 +835,7 @@ impl Tenant { Arc::clone(&self.walredo_mgr), self.upload_layers, pg_version, - )); - - new_timeline - .load_layer_map(new_disk_consistent_lsn) - .context("failed to load layermap")?; - - new_timeline.launch_wal_receiver()?; - - Ok(new_timeline) + )) } pub fn new( @@ -914,11 +1117,14 @@ impl Tenant { src: TimelineId, dst: TimelineId, start_lsn: Option, - ) -> Result> { + ) -> anyhow::Result> { // We need to hold this lock to prevent GC from starting at 
the same time. GC scans the directory to learn // about timelines, so otherwise a race condition is possible, where we create new timeline and GC // concurrently removes data that is needed by the new timeline. let _gc_cs = self.gc_cs.lock().unwrap(); + let timelines = self.timelines.lock().unwrap(); + let timeline_uninit_mark = self.create_timeline_uninit_mark(dst, &timelines)?; + drop(timelines); // In order for the branch creation task to not wait for GC/compaction, // we need to make sure that the starting LSN of the child branch is not out of scope midway by // 1. holding the GC lock to prevent overwritting timeline's GC data // 2. checking both the latest GC cutoff LSN and latest GC info of the source timeline // Step 2 is to avoid initializing the new branch using data removed by past GC iterations // or in-queue GC iterations. - // XXX: keep the lock to avoid races during timeline creation - let mut timelines = self.timelines.lock().unwrap(); - let src_timeline = timelines - .get(&src) - // message about timeline being remote is one .context up in the stack - .ok_or_else(|| anyhow::anyhow!("unknown timeline id: {src}"))?; + let src_timeline = self.get_timeline(src).with_context(|| { + format!( + "No ancestor {} found for timeline {}/{}", + src, self.tenant_id, dst + ) + })?; let latest_gc_cutoff_lsn = src_timeline.get_latest_gc_cutoff_lsn(); @@ -988,7 +1194,17 @@ impl Tenant { src_timeline.initdb_lsn, src_timeline.pg_version, ); - let new_timeline = self.create_initialized_timeline(dst, metadata, &mut timelines)?; + let mut timelines = self.timelines.lock().unwrap(); + let new_timeline = self + .prepare_timeline( + dst, + metadata, + timeline_uninit_mark, + false, + Some(src_timeline), + )? + .initialize_with_lock(&mut timelines, true)?; + drop(timelines); info!("branched timeline {dst} from {src} at {start_lsn}"); Ok(new_timeline) @@ -1000,7 +1216,10 @@ impl Tenant { &self, timeline_id: TimelineId, pg_version: u32, - ) -> Result<Arc<Timeline>> { + ) -> anyhow::Result<Arc<Timeline>> { + let timelines = self.timelines.lock().unwrap(); + let timeline_uninit_mark = self.create_timeline_uninit_mark(timeline_id, &timelines)?; + drop(timelines); // create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/` // temporary directory for basebackup files for the given timeline. let initdb_path = path_with_suffix_extension( @@ -1010,24 +1229,65 @@ impl Tenant { TEMP_FILE_SUFFIX, ); - // Init temporarily repo to get bootstrap data + // an uninit mark was placed earlier, so nothing else can access this timeline's files + // the current initdb has not run yet, so remove whatever was left from previous runs + if initdb_path.exists() { + fs::remove_dir_all(&initdb_path).with_context(|| { + format!( + "Failed to remove already existing initdb directory: {}", + initdb_path.display() + ) + })?; + } + // Init a temporary repo to get bootstrap data; this creates a directory at `initdb_path` run_initdb(self.conf, &initdb_path, pg_version)?; - let pgdata_path = initdb_path; - - let lsn = import_datadir::get_lsn_from_controlfile(&pgdata_path)?.align(); + // this new directory is only needed during bootstrap, so schedule its removal right away + scopeguard::defer!
{ + if let Err(e) = fs::remove_dir_all(&initdb_path) { + // this is unlikely, but we will remove the directory on pageserver restart or another bootstrap call + error!("Failed to remove temporary initdb directory '{}': {}", initdb_path.display(), e); + } + } + let pgdata_path = &initdb_path; + let pgdata_lsn = import_datadir::get_lsn_from_controlfile(pgdata_path)?.align(); // Import the contents of the data directory at the initial checkpoint // LSN, and any WAL after that. // Initdb lsn will be equal to last_record_lsn which will be set after import. - // Because we know it upfront avoid having an option or dummy zero value by passing it to create_empty_timeline. - let timeline = self.create_empty_timeline(timeline_id, lsn, pg_version)?; - import_datadir::import_timeline_from_postgres_datadir(&pgdata_path, &*timeline, lsn)?; + // Because we know it upfront, we avoid having an option or dummy zero value by passing it to the metadata. + let new_metadata = TimelineMetadata::new( + Lsn(0), + None, + None, + Lsn(0), + pgdata_lsn, + pgdata_lsn, + pg_version, + ); + let raw_timeline = + self.prepare_timeline(timeline_id, new_metadata, timeline_uninit_mark, true, None)?; + + let tenant_id = raw_timeline.owning_tenant.tenant_id; + let unfinished_timeline = raw_timeline.raw_timeline()?; + import_datadir::import_timeline_from_postgres_datadir( + unfinished_timeline, + pgdata_path, + pgdata_lsn, + ) + .with_context(|| { + format!("Failed to import pgdatadir for timeline {tenant_id}/{timeline_id}") + })?; fail::fail_point!("before-checkpoint-new-timeline", |_| { - bail!("failpoint before-checkpoint-new-timeline"); + anyhow::bail!("failpoint before-checkpoint-new-timeline"); }); + unfinished_timeline + .checkpoint(CheckpointConfig::Forced) + .with_context(|| format!("Failed to checkpoint after pgdatadir import for timeline {tenant_id}/{timeline_id}"))?; - timeline.checkpoint(CheckpointConfig::Forced)?; + let mut timelines = self.timelines.lock().unwrap(); + let timeline = raw_timeline.initialize_with_lock(&mut timelines, false)?; + drop(timelines); info!( "created root timeline {} timeline.lsn {}", @@ -1035,25 +1295,65 @@ impl Tenant { timeline.get_last_record_lsn() ); - // Remove temp dir. We don't need it anymore - fs::remove_dir_all(pgdata_path)?; - Ok(timeline) } - fn create_initialized_timeline( + /// Creates intermediate timeline structure and its files, without loading it into memory. + /// It's up to the caller to import the necessary data and load the timeline into memory.
+ fn prepare_timeline( &self, new_timeline_id: TimelineId, new_metadata: TimelineMetadata, - timelines: &mut MutexGuard<HashMap<TimelineId, Arc<Timeline>>>, - ) -> Result<Arc<Timeline>> { - crashsafe_dir::create_dir_all(self.conf.timeline_path(&new_timeline_id, &self.tenant_id)) - .with_context(|| { - format!( - "Failed to create timeline {}/{} directory", - new_timeline_id, self.tenant_id - ) - })?; + uninit_mark: TimelineUninitMark, + init_layers: bool, + ancestor: Option<Arc<Timeline>>, + ) -> anyhow::Result<UninitializedTimeline> { + let tenant_id = self.tenant_id; + + match self.create_timeline_files( + &uninit_mark.timeline_path, + new_timeline_id, + new_metadata, + ancestor, + ) { + Ok(new_timeline) => { + if init_layers { + new_timeline.layers.write().unwrap().next_open_layer_at = + Some(new_timeline.initdb_lsn); + } + debug!( + "Successfully created initial files for timeline {tenant_id}/{new_timeline_id}" + ); + Ok(UninitializedTimeline { + owning_tenant: self, + timeline_id: new_timeline_id, + raw_timeline: Some((new_timeline, uninit_mark)), + }) + } + Err(e) => { + error!("Failed to create initial files for timeline {tenant_id}/{new_timeline_id}, cleaning up: {e:?}"); + cleanup_timeline_directory(uninit_mark); + Err(e) + } + } + } + + fn create_timeline_files( + &self, + timeline_path: &Path, + new_timeline_id: TimelineId, + new_metadata: TimelineMetadata, + ancestor: Option<Arc<Timeline>>, + ) -> anyhow::Result<Timeline> { + let timeline_data = self + .create_timeline_data(new_timeline_id, new_metadata.clone(), ancestor) + .context("Failed to create timeline data structure")?; + crashsafe::create_dir_all(timeline_path).context("Failed to create timeline directory")?; + + fail::fail_point!("after-timeline-uninit-mark-creation", |_| { + anyhow::bail!("failpoint after-timeline-uninit-mark-creation"); + }); + save_metadata( self.conf, new_timeline_id, @@ -1061,37 +1361,49 @@ impl Tenant { &new_metadata, true, ) - .with_context(|| { - format!( - "Failed to create timeline {}/{} metadata", - new_timeline_id, self.tenant_id - ) - })?; + .context("Failed to create timeline metadata")?; - let ancestor = new_metadata - .ancestor_timeline() - .and_then(|ancestor_timeline_id| timelines.get(&ancestor_timeline_id)) - .cloned(); - let new_timeline = self - .initialize_new_timeline(new_timeline_id, new_metadata, ancestor) + Ok(timeline_data) + } + + /// Attempts to create an uninit mark file for the timeline initialization. + /// Bails if the timeline is already loaded into memory (i.e. initialized before), or if the uninit mark file already exists. + /// + /// This way, we need to hold the timelines lock only for a small amount of time during the mark check/creation per timeline init.
+ fn create_timeline_uninit_mark( + &self, + timeline_id: TimelineId, + timelines: &MutexGuard<HashMap<TimelineId, Arc<Timeline>>>, + ) -> anyhow::Result<TimelineUninitMark> { + let tenant_id = self.tenant_id; + + anyhow::ensure!( + timelines.get(&timeline_id).is_none(), + "Timeline {tenant_id}/{timeline_id} already exists in pageserver's memory" + ); + let timeline_path = self.conf.timeline_path(&timeline_id, &tenant_id); + anyhow::ensure!( + !timeline_path.exists(), + "Timeline {} already exists, cannot create its uninit mark file", + timeline_path.display() + ); + + let uninit_mark_path = self + .conf + .timeline_uninit_mark_file_path(tenant_id, timeline_id); + fs::File::create(&uninit_mark_path) + .context("Failed to create uninit mark file") + .and_then(|_| { + crashsafe::fsync_file_and_parent(&uninit_mark_path) + .context("Failed to fsync uninit mark file") + }) .with_context(|| { - format!( - "Failed to initialize timeline {}/{}", - new_timeline_id, self.tenant_id - ) + format!("Failed to create uninit mark for timeline {tenant_id}/{timeline_id}") })?; - match timelines.entry(new_timeline_id) { - Entry::Occupied(_) => bail!( - "Found freshly initialized timeline {} in the tenant map", - new_timeline_id - ), - Entry::Vacant(v) => { - v.insert(Arc::clone(&new_timeline)); - } - } + let uninit_mark = TimelineUninitMark::new(uninit_mark_path, timeline_path); - Ok(new_timeline) + Ok(uninit_mark) } } @@ -1111,7 +1423,7 @@ fn run_initdb( initdb_lib_dir.display(), ); - let initdb_output = Command::new(initdb_bin_path) + let initdb_output = Command::new(&initdb_bin_path) .args(&["-D", &initdb_target_dir.to_string_lossy()]) .args(&["-U", &conf.superuser]) .args(&["-E", "utf8"]) @@ -1124,7 +1436,13 @@ .env("DYLD_LIBRARY_PATH", &initdb_lib_dir) .stdout(Stdio::null()) .output() - .context("failed to execute initdb")?; + .with_context(|| { + format!( + "failed to execute {} at target dir {}", + initdb_bin_path.display(), + initdb_target_dir.display() + ) + })?; if !initdb_output.status.success() { bail!( "initdb failed: '{}'", @@ -1163,6 +1481,19 @@ pub fn dump_layerfile_from_path(path: &Path, verbose: bool) -> Result<()> { Ok(()) } +fn ignore_absent_files<F>(fs_operation: F) -> io::Result<()> +where + F: Fn() -> io::Result<()>, +{ + fs_operation().or_else(|e| { + if e.kind() == io::ErrorKind::NotFound { + Ok(()) + } else { + Err(e) + } + }) +} + #[cfg(test)] pub mod harness { use bytes::{Bytes, BytesMut}; @@ -1379,7 +1710,9 @@ mod tests { #[test] fn test_basic() -> Result<()> { let tenant = TenantHarness::create("test_basic")?.load(); - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; + let tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? + .initialize()?; let writer = tline.writer(); writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?; @@ -1401,13 +1734,18 @@ mod tests { #[test] fn no_duplicate_timelines() -> Result<()> { let tenant = TenantHarness::create("no_duplicate_timelines")?.load(); - let _ = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; + let _ = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
+ .initialize()?; match tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION) { Ok(_) => panic!("duplicate timeline creation should fail"), Err(e) => assert_eq!( e.to_string(), - format!("Timeline {TIMELINE_ID} already exists") + format!( + "Timeline {}/{} already exists in pageserver's memory", + tenant.tenant_id, TIMELINE_ID + ) ), } @@ -1427,7 +1765,9 @@ mod tests { #[test] fn test_branch() -> Result<()> { let tenant = TenantHarness::create("test_branch")?.load(); - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; + let tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? + .initialize()?; let writer = tline.writer(); use std::str::from_utf8; @@ -1522,7 +1862,9 @@ mod tests { let tenant = TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")? .load(); - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; + let tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? + .initialize()?; make_some_layers(tline.as_ref(), Lsn(0x20))?; // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50 @@ -1552,7 +1894,9 @@ mod tests { let tenant = TenantHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")?.load(); - tenant.create_empty_timeline(TIMELINE_ID, Lsn(0x50), DEFAULT_PG_VERSION)?; + tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0x50), DEFAULT_PG_VERSION)? + .initialize()?; // try to branch at lsn 0x25, should fail because initdb lsn is 0x50 match tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) { Ok(_) => panic!("branching should have failed"), @@ -1596,7 +1940,9 @@ mod tests { fn test_retain_data_in_parent_which_is_needed_for_child() -> Result<()> { let tenant = TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child")?.load(); - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; + let tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? + .initialize()?; make_some_layers(tline.as_ref(), Lsn(0x20))?; tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; @@ -1613,7 +1959,9 @@ mod tests { fn test_parent_keeps_data_forever_after_branching() -> Result<()> { let tenant = TenantHarness::create("test_parent_keeps_data_forever_after_branching")?.load(); - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; + let tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? + .initialize()?; make_some_layers(tline.as_ref(), Lsn(0x20))?; tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; @@ -1641,8 +1989,9 @@ mod tests { let harness = TenantHarness::create(TEST_NAME)?; { let tenant = harness.load(); - let tline = - tenant.create_empty_timeline(TIMELINE_ID, Lsn(0x8000), DEFAULT_PG_VERSION)?; + let tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0x8000), DEFAULT_PG_VERSION)? + .initialize()?; make_some_layers(tline.as_ref(), Lsn(0x8000))?; tline.checkpoint(CheckpointConfig::Forced)?; } @@ -1662,7 +2011,9 @@ mod tests { // create two timelines { let tenant = harness.load(); - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; + let tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? 
+ .initialize()?; make_some_layers(tline.as_ref(), Lsn(0x20))?; tline.checkpoint(CheckpointConfig::Forced)?; @@ -1698,7 +2049,9 @@ mod tests { let harness = TenantHarness::create(TEST_NAME)?; let tenant = harness.load(); - tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; + tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? + .initialize()?; drop(tenant); let metadata_path = harness.timeline_path(&TIMELINE_ID).join(METADATA_FILE_NAME); @@ -1735,7 +2088,9 @@ mod tests { #[test] fn test_images() -> Result<()> { let tenant = TenantHarness::create("test_images")?.load(); - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; + let tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? + .initialize()?; let writer = tline.writer(); writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?; @@ -1785,7 +2140,9 @@ mod tests { #[test] fn test_bulk_insert() -> Result<()> { let tenant = TenantHarness::create("test_bulk_insert")?.load(); - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; + let tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? + .initialize()?; let mut lsn = Lsn(0x10); @@ -1825,7 +2182,9 @@ mod tests { #[test] fn test_random_updates() -> Result<()> { let tenant = TenantHarness::create("test_random_updates")?.load(); - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; + let tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? + .initialize()?; const NUM_KEYS: usize = 1000; @@ -1895,7 +2254,9 @@ mod tests { #[test] fn test_traverse_branches() -> Result<()> { let tenant = TenantHarness::create("test_traverse_branches")?.load(); - let mut tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; + let mut tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? + .initialize()?; const NUM_KEYS: usize = 1000; @@ -1974,7 +2335,9 @@ mod tests { #[test] fn test_traverse_ancestors() -> Result<()> { let tenant = TenantHarness::create("test_traverse_ancestors")?.load(); - let mut tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; + let mut tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? 
+ .initialize()?; const NUM_KEYS: usize = 100; const NUM_TLINES: usize = 50; diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index b2c927d4fc..f1db50bf7f 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -12,7 +12,7 @@ use tracing::*; use remote_storage::GenericRemoteStorage; -use crate::config::{PageServerConf, METADATA_FILE_NAME}; +use crate::config::{PageServerConf, METADATA_FILE_NAME, TIMELINE_UNINIT_MARK_SUFFIX}; use crate::http::models::TenantInfo; use crate::storage_sync::index::{LayerFileMetadata, RemoteIndex, RemoteTimelineIndex}; use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData, TimelineLocalFiles}; @@ -24,7 +24,7 @@ use crate::tenant_config::TenantConfOpt; use crate::walredo::PostgresRedoManager; use crate::TEMP_FILE_SUFFIX; -use utils::crashsafe_dir::{self, path_with_suffix_extension}; +use utils::crashsafe::{self, path_with_suffix_extension}; use utils::id::{TenantId, TimelineId}; mod tenants_state { @@ -265,58 +265,98 @@ fn create_tenant_files( temporary_tenant_dir.display() ); - let temporary_tenant_timelines_dir = rebase_directory( - &conf.timelines_path(&tenant_id), - &target_tenant_directory, - &temporary_tenant_dir, - )?; - let temporary_tenant_config_path = rebase_directory( - &conf.tenant_config_path(tenant_id), - &target_tenant_directory, - &temporary_tenant_dir, - )?; - // top-level dir may exist if we are creating it through CLI - crashsafe_dir::create_dir_all(&temporary_tenant_dir).with_context(|| { + crashsafe::create_dir_all(&temporary_tenant_dir).with_context(|| { format!( "could not create temporary tenant directory {}", temporary_tenant_dir.display() ) })?; - // first, create a config in the top-level temp directory, fsync the file - Tenant::persist_tenant_config(&temporary_tenant_config_path, tenant_conf, true)?; - // then, create a subdirectory in the top-level temp directory, fsynced - crashsafe_dir::create_dir(&temporary_tenant_timelines_dir).with_context(|| { + + let creation_result = try_create_target_tenant_dir( + conf, + tenant_conf, + tenant_id, + &temporary_tenant_dir, + &target_tenant_directory, + ); + + if creation_result.is_err() { + error!("Failed to create directory structure for tenant {tenant_id}, cleaning tmp data"); + if let Err(e) = fs::remove_dir_all(&temporary_tenant_dir) { + error!("Failed to remove temporary tenant directory {temporary_tenant_dir:?}: {e}") + } else if let Err(e) = crashsafe::fsync(&temporary_tenant_dir) { + error!( + "Failed to fsync removed temporary tenant directory {temporary_tenant_dir:?}: {e}" + ) + } + } + + creation_result +} + +fn try_create_target_tenant_dir( + conf: &'static PageServerConf, + tenant_conf: TenantConfOpt, + tenant_id: TenantId, + temporary_tenant_dir: &Path, + target_tenant_directory: &Path, +) -> Result<(), anyhow::Error> { + let temporary_tenant_timelines_dir = rebase_directory( + &conf.timelines_path(&tenant_id), + target_tenant_directory, + temporary_tenant_dir, + ) + .with_context(|| format!("Failed to resolve tenant {tenant_id} temporary timelines dir"))?; + let temporary_tenant_config_path = rebase_directory( + &conf.tenant_config_path(tenant_id), + target_tenant_directory, + temporary_tenant_dir, + ) + .with_context(|| format!("Failed to resolve tenant {tenant_id} temporary config path"))?; + + Tenant::persist_tenant_config(&temporary_tenant_config_path, tenant_conf, true).with_context( + || { + format!( + "Failed to write tenant {} config to {}", + tenant_id, + temporary_tenant_config_path.display() + ) + }, 
+ )?; + crashsafe::create_dir(&temporary_tenant_timelines_dir).with_context(|| { format!( - "could not create temporary tenant timelines directory {}", + "could not create tenant {} temporary timelines directory {}", + tenant_id, temporary_tenant_timelines_dir.display() ) })?; - fail::fail_point!("tenant-creation-before-tmp-rename", |_| { anyhow::bail!("failpoint tenant-creation-before-tmp-rename"); }); - // move-rename tmp directory with all files synced into a permanent directory, fsync its parent - fs::rename(&temporary_tenant_dir, &target_tenant_directory).with_context(|| { + fs::rename(&temporary_tenant_dir, target_tenant_directory).with_context(|| { format!( - "failed to move temporary tenant directory {} into the permanent one {}", + "failed to move tenant {} temporary directory {} into the permanent one {}", + tenant_id, temporary_tenant_dir.display(), target_tenant_directory.display() ) })?; let target_dir_parent = target_tenant_directory.parent().with_context(|| { format!( - "Failed to get tenant dir parent for {}", + "Failed to get tenant {} dir parent for {}", + tenant_id, target_tenant_directory.display() ) })?; - fs::File::open(target_dir_parent)?.sync_all()?; - - info!( - "created tenant directory structure in {}", - target_tenant_directory.display() - ); + crashsafe::fsync(target_dir_parent).with_context(|| { + format!( + "Failed to fsync renamed directory's parent {} for tenant {}", + target_dir_parent.display(), + tenant_id, + ) + })?; Ok(()) } @@ -602,6 +642,15 @@ fn is_temporary(path: &Path) -> bool { } } +fn is_uninit_mark(path: &Path) -> bool { + match path.file_name() { + Some(name) => name + .to_string_lossy() + .ends_with(TIMELINE_UNINIT_MARK_SUFFIX), + None => false, + } +} + fn collect_timelines_for_tenant( config: &'static PageServerConf, tenant_path: &Path, @@ -644,28 +693,74 @@ fn collect_timelines_for_tenant( e ); } + } else if is_uninit_mark(&timeline_dir) { + let timeline_uninit_mark_file = &timeline_dir; + info!( + "Found an uninit mark file {}, removing the timeline and its uninit mark", + timeline_uninit_mark_file.display() + ); + let timeline_id = timeline_uninit_mark_file + .file_stem() + .and_then(OsStr::to_str) + .unwrap_or_default() + .parse::() + .with_context(|| { + format!( + "Could not parse timeline id out of the timeline uninit mark name {}", + timeline_uninit_mark_file.display() + ) + })?; + let timeline_dir = config.timeline_path(&timeline_id, &tenant_id); + if let Err(e) = + remove_timeline_and_uninit_mark(&timeline_dir, timeline_uninit_mark_file) + { + error!("Failed to clean up uninit marked timeline: {e:?}"); + } } else { - match collect_timeline_files(&timeline_dir) { - Ok((timeline_id, metadata, timeline_files)) => { - tenant_timelines.insert( - timeline_id, - TimelineLocalFiles::collected(metadata, timeline_files), - ); + let timeline_id = timeline_dir + .file_name() + .and_then(OsStr::to_str) + .unwrap_or_default() + .parse::() + .with_context(|| { + format!( + "Could not parse timeline id out of the timeline dir name {}", + timeline_dir.display() + ) + })?; + let timeline_uninit_mark_file = + config.timeline_uninit_mark_file_path(tenant_id, timeline_id); + if timeline_uninit_mark_file.exists() { + info!("Found an uninit mark file for timeline {tenant_id}/{timeline_id}, removing the timeline and its uninit mark"); + if let Err(e) = remove_timeline_and_uninit_mark( + &timeline_dir, + &timeline_uninit_mark_file, + ) { + error!("Failed to clean up uninit marked timeline: {e:?}"); } - Err(e) => { - error!( - "Failed to process timeline 
dir contents at '{}', reason: {:?}", - timeline_dir.display(), - e - ); - match remove_if_empty(&timeline_dir) { - Ok(true) => info!( - "Removed empty timeline directory {}", - timeline_dir.display() - ), - Ok(false) => (), - Err(e) => { - error!("Failed to remove empty timeline directory: {e:?}") + } else { + match collect_timeline_files(&timeline_dir) { + Ok((metadata, timeline_files)) => { + tenant_timelines.insert( + timeline_id, + TimelineLocalFiles::collected(metadata, timeline_files), + ); + } + Err(e) => { + error!( + "Failed to process timeline dir contents at '{}', reason: {:?}", + timeline_dir.display(), + e + ); + match remove_if_empty(&timeline_dir) { + Ok(true) => info!( + "Removed empty timeline directory {}", + timeline_dir.display() + ), + Ok(false) => (), + Err(e) => { + error!("Failed to remove empty timeline directory: {e:?}") + } } } } @@ -688,24 +783,41 @@ fn collect_timelines_for_tenant( Ok((tenant_id, TenantAttachData::Ready(tenant_timelines))) } +fn remove_timeline_and_uninit_mark(timeline_dir: &Path, uninit_mark: &Path) -> anyhow::Result<()> { + fs::remove_dir_all(&timeline_dir) + .or_else(|e| { + if e.kind() == std::io::ErrorKind::NotFound { + // we can leave the uninit mark without a timeline dir, + // just remove the mark then + Ok(()) + } else { + Err(e) + } + }) + .with_context(|| { + format!( + "Failed to remove uninit-marked timeline directory {}", + timeline_dir.display() + ) + })?; + fs::remove_file(&uninit_mark).with_context(|| { + format!( + "Failed to remove timeline uninit mark file {}", + uninit_mark.display() + ) + })?; + + Ok(()) +} + // discover timeline files and extract timeline metadata // NOTE: ephemeral files are excluded from the list fn collect_timeline_files( timeline_dir: &Path, -) -> anyhow::Result<( - TimelineId, - TimelineMetadata, - HashMap<PathBuf, LayerFileMetadata>, -)> { +) -> anyhow::Result<(TimelineMetadata, HashMap<PathBuf, LayerFileMetadata>)> { let mut timeline_files = HashMap::new(); let mut timeline_metadata_path = None; - let timeline_id = timeline_dir - .file_name() - .and_then(OsStr::to_str) - .unwrap_or_default() - .parse::<TimelineId>() - .context("Could not parse timeline id out of the timeline dir name")?; let timeline_dir_entries = fs::read_dir(&timeline_dir).context("Failed to list timeline dir contents")?; for entry in timeline_dir_entries { @@ -754,5 +866,5 @@ "Timeline has no ancestor and no layer files" ); - Ok((timeline_id, metadata, timeline_files)) + Ok((metadata, timeline_files)) } diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs index 29179e9871..01389e52f4 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/walreceiver/connection_manager.rs @@ -1374,7 +1374,9 @@ mod tests { timeline: harness .load() .create_empty_timeline(TIMELINE_ID, Lsn(0), crate::DEFAULT_PG_VERSION) - .expect("Failed to create an empty timeline for dummy wal connection manager"), + .expect("Failed to create an empty timeline for dummy wal connection manager") + .initialize() + .unwrap(), wal_connect_timeout: Duration::from_secs(1), lagging_wal_timeout: Duration::from_secs(1), max_lsn_wal_lag: NonZeroU64::new(1024 * 1024).unwrap(), diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 1dd27caba6..b8874a0223 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -35,7 +35,7 @@ use std::sync::Mutex; use std::time::Duration; use std::time::Instant; use tracing::*; -use utils::crashsafe_dir::path_with_suffix_extension; +use
utils::crashsafe::path_with_suffix_extension; use utils::{bin_ser::BeSer, id::TenantId, lsn::Lsn, nonblock::set_nonblock}; use crate::metrics::{ diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index 7baa67935d..101cce9ffc 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -111,18 +111,20 @@ def test_create_multiple_timelines_parallel(neon_simple_env: NeonEnv): future.result() -def test_fix_broken_timelines_on_startup(neon_simple_env: NeonEnv): +def test_timeline_init_break_before_checkpoint(neon_simple_env: NeonEnv): env = neon_simple_env pageserver_http = env.pageserver.http_client() tenant_id, _ = env.neon_cli.create_tenant() + timelines_dir = env.repo_dir / "tenants" / str(tenant_id) / "timelines" old_tenant_timelines = env.neon_cli.list_timelines(tenant_id) + initial_timeline_dirs = [d for d in timelines_dir.iterdir()] - # Introduce failpoint when creating a new timeline + # Introduce failpoint during timeline init (some intermediate files are on disk), before it's checkpointed. pageserver_http.configure_failpoints(("before-checkpoint-new-timeline", "return")) with pytest.raises(Exception, match="before-checkpoint-new-timeline"): - _ = env.neon_cli.create_timeline("test_fix_broken_timelines", tenant_id) + _ = env.neon_cli.create_timeline("test_timeline_init_break_before_checkpoint", tenant_id) # Restart the page server env.neon_cli.pageserver_stop(immediate=True) @@ -133,3 +135,36 @@ def test_fix_broken_timelines_on_startup(neon_simple_env: NeonEnv): assert ( new_tenant_timelines == old_tenant_timelines ), f"Pageserver after restart should ignore non-initialized timelines for tenant {tenant_id}" + + timeline_dirs = [d for d in timelines_dir.iterdir()] + assert ( + timeline_dirs == initial_timeline_dirs + ), "pageserver should clean its temp timeline files on timeline creation failure" + + +def test_timeline_create_break_after_uninit_mark(neon_simple_env: NeonEnv): + env = neon_simple_env + pageserver_http = env.pageserver.http_client() + + tenant_id, _ = env.neon_cli.create_tenant() + + timelines_dir = env.repo_dir / "tenants" / str(tenant_id) / "timelines" + old_tenant_timelines = env.neon_cli.list_timelines(tenant_id) + initial_timeline_dirs = [d for d in timelines_dir.iterdir()] + + # Introduce failpoint when creating a new timeline uninit mark, before any other files were created + pageserver_http.configure_failpoints(("after-timeline-uninit-mark-creation", "return")) + with pytest.raises(Exception, match="after-timeline-uninit-mark-creation"): + _ = env.neon_cli.create_timeline("test_timeline_create_break_after_uninit_mark", tenant_id) + + # Creating the timeline didn't finish. The other timelines on tenant should still be present and work normally. 
+ # "New" timeline is not present in the list, allowing pageserver to retry the same request + new_tenant_timelines = env.neon_cli.list_timelines(tenant_id) + assert ( + new_tenant_timelines == old_tenant_timelines + ), f"Pageserver after restart should ignore non-initialized timelines for tenant {tenant_id}" + + timeline_dirs = [d for d in timelines_dir.iterdir()] + assert ( + timeline_dirs == initial_timeline_dirs + ), "pageserver should clean its temp timeline files on timeline creation failure" diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index 5910b4f74f..c888c6f7ee 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -105,15 +105,11 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build with pytest.raises(Exception): import_tar(corrupt_base_tar, wal_tar) - # Clean up - # TODO it should clean itself - client = env.pageserver.http_client() - client.timeline_delete(tenant, timeline) - # Importing correct backup works import_tar(base_tar, wal_tar) # Wait for data to land in s3 + client = env.pageserver.http_client() wait_for_last_record_lsn(client, tenant, timeline, Lsn(end_lsn)) wait_for_upload(client, tenant, timeline, Lsn(end_lsn)) diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 37c5a130e2..4ffea60950 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -23,7 +23,7 @@ def test_tenant_creation_fails(neon_simple_env: NeonEnv): initial_tenants = sorted( map(lambda t: t.split()[0], neon_simple_env.neon_cli.list_tenants().stdout.splitlines()) ) - initial_tenant_dirs = set([d for d in tenants_dir.iterdir()]) + initial_tenant_dirs = [d for d in tenants_dir.iterdir()] pageserver_http = neon_simple_env.pageserver.http_client() pageserver_http.configure_failpoints(("tenant-creation-before-tmp-rename", "return")) @@ -35,26 +35,10 @@ def test_tenant_creation_fails(neon_simple_env: NeonEnv): ) assert initial_tenants == new_tenants, "should not create new tenants" - new_tenant_dirs = list(set([d for d in tenants_dir.iterdir()]) - initial_tenant_dirs) - assert len(new_tenant_dirs) == 1, "should have new tenant directory created" - tmp_tenant_dir = new_tenant_dirs[0] - assert str(tmp_tenant_dir).endswith( - ".___temp" - ), "new tenant directory created should be a temporary one" - - neon_simple_env.pageserver.stop() - neon_simple_env.pageserver.start() - - tenants_after_restart = sorted( - map(lambda t: t.split()[0], neon_simple_env.neon_cli.list_tenants().stdout.splitlines()) - ) - dirs_after_restart = set([d for d in tenants_dir.iterdir()]) + new_tenant_dirs = [d for d in tenants_dir.iterdir()] assert ( - tenants_after_restart == initial_tenants - ), "should load all non-corrupt tenants after restart" - assert ( - dirs_after_restart == initial_tenant_dirs - ), "pageserver should clean its temp tenant dirs on restart" + new_tenant_dirs == initial_tenant_dirs + ), "pageserver should clean its temp tenant dirs on tenant creation failure" def test_tenants_normal_work(neon_env_builder: NeonEnvBuilder):