From 306a47c4fab6f9c0c2cf83af64848750ff772335 Mon Sep 17 00:00:00 2001
From: Kirill Bulatov
Date: Thu, 20 Oct 2022 14:19:17 +0300
Subject: [PATCH] Use uninit mark files during timeline init for atomic creation (#2489)

Part of https://github.com/neondatabase/neon/pull/2239

Regular, from-scratch timeline creation involves running initdb in a
separate directory, importing the data from that directory into the
pageserver, and finally starting the timeline-related background tasks.

This PR ensures we don't leave behind any directories that are not marked
as temporary, and that the pageserver removes such directories on restart,
allowing timeline creation to be retried with the same IDs if needed.

It would be good to later rewrite the logic to use a temporary directory,
similar to what tenant creation does, but that is currently harder than
this change, so it is not done here.
---
 libs/remote_storage/src/local_fs.rs           |   2 +-
 .../src/{crashsafe_dir.rs => crashsafe.rs}    |  43 +-
 libs/utils/src/lib.rs                         |   4 +-
 pageserver/src/bin/pageserver.rs              |   2 +-
 pageserver/src/config.rs                      |  13 +
 pageserver/src/import_datadir.rs              |  16 +-
 pageserver/src/page_service.rs                |  18 +-
 pageserver/src/pgdatadir_mapping.rs           |   4 +-
 pageserver/src/storage_sync/download.rs       |   2 +-
 pageserver/src/tenant.rs                      | 619 ++++++++++++++----
 pageserver/src/tenant_mgr.rs                  | 236 +++++--
 .../src/walreceiver/connection_manager.rs     |   4 +-
 pageserver/src/walredo.rs                     |   2 +-
 test_runner/regress/test_broken_timeline.py   |  41 +-
 test_runner/regress/test_import.py            |   6 +-
 test_runner/regress/test_tenants.py           |  24 +-
 16 files changed, 777 insertions(+), 259 deletions(-)
 rename libs/utils/src/{crashsafe_dir.rs => crashsafe.rs} (84%)

diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs
index 5723a512f6..2f824cc453 100644
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -16,7 +16,7 @@ use tokio::{
     io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt},
 };
 use tracing::*;
-use utils::crashsafe_dir::path_with_suffix_extension;
+use utils::crashsafe::path_with_suffix_extension;
 
 use crate::{Download, DownloadError, RemoteObjectId};
 
diff --git a/libs/utils/src/crashsafe_dir.rs b/libs/utils/src/crashsafe.rs
similarity index 84%
rename from libs/utils/src/crashsafe_dir.rs
rename to libs/utils/src/crashsafe.rs
index 032ab0a916..3726779cb2 100644
--- a/libs/utils/src/crashsafe_dir.rs
+++ b/libs/utils/src/crashsafe.rs
@@ -12,16 +12,8 @@ pub fn create_dir(path: impl AsRef<Path>) -> io::Result<()> {
     let path = path.as_ref();
 
     fs::create_dir(path)?;
-    File::open(path)?.sync_all()?;
-
-    if let Some(parent) = path.parent() {
-        File::open(parent)?.sync_all()
-    } else {
-        Err(io::Error::new(
-            io::ErrorKind::InvalidInput,
-            "can't find parent",
-        ))
-    }
+    fsync_file_and_parent(path)?;
+    Ok(())
 }
 
 /// Similar to [`std::fs::create_dir_all`], except we fsync all
@@ -65,12 +57,12 @@ pub fn create_dir_all(path: impl AsRef<Path>) -> io::Result<()> {
 
     // Fsync the created directories from child to parent.
     for &path in dirs_to_create.iter() {
-        File::open(path)?.sync_all()?;
+        fsync(path)?;
    }
 
     // If we created any new directories, fsync the parent.
if !dirs_to_create.is_empty() { - File::open(path)?.sync_all()?; + fsync(path)?; } Ok(()) @@ -92,6 +84,33 @@ pub fn path_with_suffix_extension(original_path: impl AsRef, suffix: &str) .with_extension(new_extension.as_ref()) } +pub fn fsync_file_and_parent(file_path: &Path) -> io::Result<()> { + let parent = file_path.parent().ok_or_else(|| { + io::Error::new( + io::ErrorKind::Other, + format!("File {file_path:?} has no parent"), + ) + })?; + + fsync(file_path)?; + fsync(parent)?; + Ok(()) +} + +pub fn fsync(path: &Path) -> io::Result<()> { + File::open(path) + .map_err(|e| io::Error::new(e.kind(), format!("Failed to open the file {path:?}: {e}"))) + .and_then(|file| { + file.sync_all().map_err(|e| { + io::Error::new( + e.kind(), + format!("Failed to sync file {path:?} data and metadata: {e}"), + ) + }) + }) + .map_err(|e| io::Error::new(e.kind(), format!("Failed to fsync file {path:?}: {e}"))) +} + #[cfg(test)] mod tests { use tempfile::tempdir; diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 2c80556446..f1f48f5a90 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -22,8 +22,8 @@ pub mod pq_proto; // dealing with connstring parsing and handy access to it's parts pub mod connstring; -// helper functions for creating and fsyncing directories/trees -pub mod crashsafe_dir; +// helper functions for creating and fsyncing +pub mod crashsafe; // common authentication routines pub mod auth; diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 12f594077e..9317dd5dd7 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -87,7 +87,7 @@ fn main() -> anyhow::Result<()> { let tenants_path = conf.tenants_path(); if !tenants_path.exists() { - utils::crashsafe_dir::create_dir_all(conf.tenants_path()).with_context(|| { + utils::crashsafe::create_dir_all(conf.tenants_path()).with_context(|| { format!( "Failed to create tenants root dir at '{}'", tenants_path.display() diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 6e3c7baad8..b797866e43 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -7,6 +7,7 @@ use anyhow::{anyhow, bail, ensure, Context, Result}; use remote_storage::RemoteStorageConfig; use std::env; +use utils::crashsafe::path_with_suffix_extension; use std::path::{Path, PathBuf}; use std::str::FromStr; @@ -24,6 +25,7 @@ use crate::tenant_config::{TenantConf, TenantConfOpt}; /// The name of the metadata file pageserver creates per timeline. pub const METADATA_FILE_NAME: &str = "metadata"; +pub const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit"; const TENANT_CONFIG_NAME: &str = "config"; pub mod defaults { @@ -364,6 +366,17 @@ impl PageServerConf { self.timelines_path(tenant_id).join(timeline_id.to_string()) } + pub fn timeline_uninit_mark_file_path( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> PathBuf { + path_with_suffix_extension( + self.timeline_path(&timeline_id, &tenant_id), + TIMELINE_UNINIT_MARK_SUFFIX, + ) + } + /// Points to a place in pageserver's local directory, /// where certain timeline's metadata file should be located. 
pub fn metadata_path(&self, timeline_id: TimelineId, tenant_id: TenantId) -> PathBuf { diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index 23c4351b4e..ee3dc684e3 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -43,19 +43,19 @@ pub fn get_lsn_from_controlfile(path: &Path) -> Result { /// The code that deals with the checkpoint would not work right if the /// cluster was not shut down cleanly. pub fn import_timeline_from_postgres_datadir( - path: &Path, tline: &Timeline, - lsn: Lsn, + pgdata_path: &Path, + pgdata_lsn: Lsn, ) -> Result<()> { let mut pg_control: Option = None; // TODO this shoud be start_lsn, which is not necessarily equal to end_lsn (aka lsn) // Then fishing out pg_control would be unnecessary - let mut modification = tline.begin_modification(lsn); + let mut modification = tline.begin_modification(pgdata_lsn); modification.init_empty()?; // Import all but pg_wal - let all_but_wal = WalkDir::new(path) + let all_but_wal = WalkDir::new(pgdata_path) .into_iter() .filter_entry(|entry| !entry.path().ends_with("pg_wal")); for entry in all_but_wal { @@ -63,7 +63,7 @@ pub fn import_timeline_from_postgres_datadir( let metadata = entry.metadata().expect("error getting dir entry metadata"); if metadata.is_file() { let absolute_path = entry.path(); - let relative_path = absolute_path.strip_prefix(path)?; + let relative_path = absolute_path.strip_prefix(pgdata_path)?; let file = File::open(absolute_path)?; let len = metadata.len() as usize; @@ -84,7 +84,7 @@ pub fn import_timeline_from_postgres_datadir( "Postgres cluster was not shut down cleanly" ); ensure!( - pg_control.checkPointCopy.redo == lsn.0, + pg_control.checkPointCopy.redo == pgdata_lsn.0, "unexpected checkpoint REDO pointer" ); @@ -92,10 +92,10 @@ pub fn import_timeline_from_postgres_datadir( // this reads the checkpoint record itself, advancing the tip of the timeline to // *after* the checkpoint record. And crucially, it initializes the 'prev_lsn'. import_wal( - &path.join("pg_wal"), + &pgdata_path.join("pg_wal"), tline, Lsn(pg_control.checkPointCopy.redo), - lsn, + pgdata_lsn, )?; Ok(()) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 795a99058d..9b2bb3114d 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -32,7 +32,7 @@ use utils::{ use crate::basebackup; use crate::config::{PageServerConf, ProfilingConfig}; -use crate::import_datadir::{import_basebackup_from_tar, import_wal_from_tar}; +use crate::import_datadir::import_wal_from_tar; use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME}; use crate::profiling::profpoint_start; use crate::reltag::RelTag; @@ -500,11 +500,8 @@ impl PageServerHandler { task_mgr::associate_with(Some(tenant_id), Some(timeline_id)); // Create empty timeline info!("creating new timeline"); - let timeline = tenant_mgr::get_tenant(tenant_id, true)?.create_empty_timeline( - timeline_id, - base_lsn, - pg_version, - )?; + let tenant = tenant_mgr::get_tenant(tenant_id, true)?; + let timeline = tenant.create_empty_timeline(timeline_id, base_lsn, pg_version)?; // TODO mark timeline as not ready until it reaches end_lsn. 
// We might have some wal to import as well, and we should prevent compute @@ -527,7 +524,8 @@ impl PageServerHandler { // - use block_in_place() let mut copyin_stream = Box::pin(copyin_stream(pgb)); let reader = SyncIoBridge::new(StreamReader::new(&mut copyin_stream)); - tokio::task::block_in_place(|| import_basebackup_from_tar(&timeline, reader, base_lsn))?; + tokio::task::block_in_place(|| timeline.import_basebackup_from_tar(reader, base_lsn))?; + timeline.initialize()?; // Drain the rest of the Copy data let mut bytes_after_tar = 0; @@ -544,12 +542,6 @@ impl PageServerHandler { // It wouldn't work if base came from vanilla postgres though, // since we discard some log files. - // Flush data to disk, then upload to s3 - info!("flushing layers"); - timeline.checkpoint(CheckpointConfig::Flush)?; - - timeline.launch_wal_receiver()?; - info!("done"); Ok(()) } diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index fc9867dc05..424ce4769a 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -1403,7 +1403,9 @@ pub fn create_test_timeline( timeline_id: utils::id::TimelineId, pg_version: u32, ) -> Result> { - let tline = tenant.create_empty_timeline(timeline_id, Lsn(8), pg_version)?; + let tline = tenant + .create_empty_timeline(timeline_id, Lsn(8), pg_version)? + .initialize()?; let mut m = tline.begin_modification(Lsn(8)); m.init_empty()?; m.commit()?; diff --git a/pageserver/src/storage_sync/download.rs b/pageserver/src/storage_sync/download.rs index 61ef164f14..6f9b2e2071 100644 --- a/pageserver/src/storage_sync/download.rs +++ b/pageserver/src/storage_sync/download.rs @@ -22,7 +22,7 @@ use crate::{ TEMP_FILE_SUFFIX, }; use utils::{ - crashsafe_dir::path_with_suffix_extension, + crashsafe::path_with_suffix_extension, id::{TenantId, TenantTimelineId, TimelineId}, }; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 47ef9284b8..93c473f0fe 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -14,7 +14,7 @@ use anyhow::{bail, ensure, Context, Result}; use tokio::sync::watch; use tracing::*; -use utils::crashsafe_dir::path_with_suffix_extension; +use utils::crashsafe::path_with_suffix_extension; use std::cmp::min; use std::collections::hash_map::Entry; @@ -23,10 +23,12 @@ use std::collections::HashMap; use std::fs; use std::fs::File; use std::fs::OpenOptions; +use std::io; use std::io::Write; use std::num::NonZeroU64; use std::ops::Bound::Included; use std::path::Path; +use std::path::PathBuf; use std::process::Command; use std::process::Stdio; use std::sync::Arc; @@ -49,7 +51,7 @@ pub use pageserver_api::models::TenantState; use toml_edit; use utils::{ - crashsafe_dir, + crashsafe, id::{TenantId, TimelineId}, lsn::{Lsn, RecordLsn}, }; @@ -120,6 +122,216 @@ pub struct Tenant { upload_layers: bool, } +/// A timeline with some of its files on disk, being initialized. +/// This struct ensures the atomicity of the timeline init: it's either properly created and inserted into pageserver's memory, or +/// its local files are removed. In the worst case of a crash, an uninit mark file is left behind, which causes the directory +/// to be removed on next restart. +/// +/// The caller is responsible for proper timeline data filling before the final init. 
+#[must_use]
+pub struct UninitializedTimeline<'t> {
+    owning_tenant: &'t Tenant,
+    timeline_id: TimelineId,
+    raw_timeline: Option<(Timeline, TimelineUninitMark)>,
+}
+
+/// An uninit mark file, created along the timeline dir to ensure the timeline either gets fully initialized and loaded into pageserver's memory,
+/// or gets removed eventually.
+///
+/// XXX: it's important to create it near the timeline dir, not inside it, to ensure the timeline dir gets removed first.
+#[must_use]
+struct TimelineUninitMark {
+    uninit_mark_deleted: bool,
+    uninit_mark_path: PathBuf,
+    timeline_path: PathBuf,
+}
+
+impl UninitializedTimeline<'_> {
+    /// Ensures timeline data is valid, loads it into pageserver's memory and removes the uninit mark file on success.
+    pub fn initialize(self) -> anyhow::Result<Arc<Timeline>> {
+        let mut timelines = self.owning_tenant.timelines.lock().unwrap();
+        self.initialize_with_lock(&mut timelines, true)
+    }
+
+    fn initialize_with_lock(
+        mut self,
+        timelines: &mut HashMap<TimelineId, Arc<Timeline>>,
+        load_layer_map: bool,
+    ) -> anyhow::Result<Arc<Timeline>> {
+        let timeline_id = self.timeline_id;
+        let tenant_id = self.owning_tenant.tenant_id;
+
+        let (new_timeline, uninit_mark) = self.raw_timeline.take().with_context(|| {
+            format!("No timeline for initialization found for {tenant_id}/{timeline_id}")
+        })?;
+        let new_timeline = Arc::new(new_timeline);
+
+        let new_disk_consistent_lsn = new_timeline.get_disk_consistent_lsn();
+        // TODO it would be good to ensure that, but apparently a lot of our testing depends on it not being enforced
+        // ensure!(new_disk_consistent_lsn.is_valid(),
+        //     "Timeline {tenant_id}/{timeline_id} has invalid disk_consistent_lsn and cannot be initialized");
+
+        match timelines.entry(timeline_id) {
+            Entry::Occupied(_) => anyhow::bail!(
+                "Found freshly initialized timeline {tenant_id}/{timeline_id} in the tenant map"
+            ),
+            Entry::Vacant(v) => {
+                if load_layer_map {
+                    new_timeline
+                        .load_layer_map(new_disk_consistent_lsn)
+                        .with_context(|| {
+                            format!(
+                                "Failed to load layermap for timeline {tenant_id}/{timeline_id}"
+                            )
+                        })?;
+                }
+                uninit_mark.remove_uninit_mark().with_context(|| {
+                    format!(
+                        "Failed to remove uninit mark file for timeline {tenant_id}/{timeline_id}"
+                    )
+                })?;
+                v.insert(Arc::clone(&new_timeline));
+                new_timeline.launch_wal_receiver().with_context(|| {
+                    format!("Failed to launch walreceiver for timeline {tenant_id}/{timeline_id}")
+                })?;
+            }
+        }
+
+        Ok(new_timeline)
+    }
+
+    /// Prepares timeline data by loading it from the basebackup archive.
+    pub fn import_basebackup_from_tar(
+        &self,
+        reader: impl std::io::Read,
+        base_lsn: Lsn,
+    ) -> anyhow::Result<()> {
+        let raw_timeline = self.raw_timeline()?;
+        import_datadir::import_basebackup_from_tar(raw_timeline, reader, base_lsn).with_context(
+            || {
+                format!(
+                    "Failed to import basebackup for timeline {}/{}",
+                    self.owning_tenant.tenant_id, self.timeline_id
+                )
+            },
+        )?;
+
+        fail::fail_point!("before-checkpoint-new-timeline", |_| {
+            bail!("failpoint before-checkpoint-new-timeline");
+        });
+
+        raw_timeline
+            .checkpoint(CheckpointConfig::Flush)
+            .with_context(|| {
+                format!(
+                    "Failed to checkpoint after basebackup import for timeline {}/{}",
+                    self.owning_tenant.tenant_id, self.timeline_id
+                )
+            })?;
+        Ok(())
+    }
+
+    fn raw_timeline(&self) -> anyhow::Result<&Timeline> {
+        Ok(&self
+            .raw_timeline
+            .as_ref()
+            .with_context(|| {
+                format!(
+                    "No raw timeline {}/{} found",
+                    self.owning_tenant.tenant_id, self.timeline_id
+                )
+            })?
+ .0) + } +} + +impl Drop for UninitializedTimeline<'_> { + fn drop(&mut self) { + if let Some((_, uninit_mark)) = self.raw_timeline.take() { + let _entered = info_span!("drop_uninitialized_timeline", tenant = %self.owning_tenant.tenant_id, timeline = %self.timeline_id).entered(); + error!("Timeline got dropped without initializing, cleaning its files"); + cleanup_timeline_directory(uninit_mark); + } + } +} + +fn cleanup_timeline_directory(uninit_mark: TimelineUninitMark) { + let timeline_path = &uninit_mark.timeline_path; + match ignore_absent_files(|| fs::remove_dir_all(timeline_path)) { + Ok(()) => { + info!("Timeline dir {timeline_path:?} removed successfully, removing the uninit mark") + } + Err(e) => { + error!("Failed to clean up uninitialized timeline directory {timeline_path:?}: {e:?}") + } + } + drop(uninit_mark); // mark handles its deletion on drop, gets retained if timeline dir exists +} + +impl TimelineUninitMark { + /// Useful for initializing timelines, existing on disk after the restart. + pub fn dummy() -> Self { + Self { + uninit_mark_deleted: true, + uninit_mark_path: PathBuf::new(), + timeline_path: PathBuf::new(), + } + } + + fn new(uninit_mark_path: PathBuf, timeline_path: PathBuf) -> Self { + Self { + uninit_mark_deleted: false, + uninit_mark_path, + timeline_path, + } + } + + fn remove_uninit_mark(mut self) -> anyhow::Result<()> { + if !self.uninit_mark_deleted { + self.delete_mark_file_if_present()?; + } + + Ok(()) + } + + fn delete_mark_file_if_present(&mut self) -> Result<(), anyhow::Error> { + let uninit_mark_file = &self.uninit_mark_path; + let uninit_mark_parent = uninit_mark_file + .parent() + .with_context(|| format!("Uninit mark file {uninit_mark_file:?} has no parent"))?; + ignore_absent_files(|| fs::remove_file(&uninit_mark_file)).with_context(|| { + format!("Failed to remove uninit mark file at path {uninit_mark_file:?}") + })?; + crashsafe::fsync(uninit_mark_parent).context("Failed to fsync uninit mark parent")?; + self.uninit_mark_deleted = true; + + Ok(()) + } +} + +impl Drop for TimelineUninitMark { + fn drop(&mut self) { + if !self.uninit_mark_deleted { + if self.timeline_path.exists() { + error!( + "Uninit mark {} is not removed, timeline {} stays uninitialized", + self.uninit_mark_path.display(), + self.timeline_path.display() + ) + } else { + // unblock later timeline creation attempts + warn!( + "Removing intermediate uninit mark file {}", + self.uninit_mark_path.display() + ); + if let Err(e) = self.delete_mark_file_if_present() { + error!("Failed to remove the uninit mark file: {e}") + } + } + } + } +} + /// A repository corresponds to one .neon directory. One repository holds multiple /// timelines, forked off from the same initial call to 'initdb'. impl Tenant { @@ -162,19 +374,10 @@ impl Tenant { new_timeline_id: TimelineId, initdb_lsn: Lsn, pg_version: u32, - ) -> Result> { - // XXX: keep the lock to avoid races during timeline creation - let mut timelines = self.timelines.lock().unwrap(); - - anyhow::ensure!( - timelines.get(&new_timeline_id).is_none(), - "Timeline {new_timeline_id} already exists" - ); - - let timeline_path = self.conf.timeline_path(&new_timeline_id, &self.tenant_id); - if timeline_path.exists() { - bail!("Timeline directory already exists, but timeline is missing in repository map. 
This is a bug.") - } + ) -> anyhow::Result { + let timelines = self.timelines.lock().unwrap(); + let timeline_uninit_mark = self.create_timeline_uninit_mark(new_timeline_id, &timelines)?; + drop(timelines); let new_metadata = TimelineMetadata::new( Lsn(0), @@ -185,11 +388,13 @@ impl Tenant { initdb_lsn, pg_version, ); - let new_timeline = - self.create_initialized_timeline(new_timeline_id, new_metadata, &mut timelines)?; - new_timeline.layers.write().unwrap().next_open_layer_at = Some(initdb_lsn); - - Ok(new_timeline) + self.prepare_timeline( + new_timeline_id, + new_metadata, + timeline_uninit_mark, + true, + None, + ) } /// Create a new timeline. @@ -205,14 +410,10 @@ impl Tenant { ancestor_timeline_id: Option, mut ancestor_start_lsn: Option, pg_version: u32, - ) -> Result>> { + ) -> anyhow::Result>> { let new_timeline_id = new_timeline_id.unwrap_or_else(TimelineId::generate); - if self - .conf - .timeline_path(&new_timeline_id, &self.tenant_id) - .exists() - { + if self.get_timeline(new_timeline_id).is_ok() { debug!("timeline {new_timeline_id} already exists"); return Ok(None); } @@ -391,21 +592,32 @@ impl Tenant { timeline_id, metadata.pg_version() ); - let ancestor = metadata - .ancestor_timeline() - .and_then(|ancestor_timeline_id| timelines_accessor.get(&ancestor_timeline_id)) - .cloned(); - match timelines_accessor.entry(timeline_id) { - Entry::Occupied(_) => warn!( + + if timelines_accessor.contains_key(&timeline_id) { + warn!( "Timeline {}/{} already exists in the tenant map, skipping its initialization", self.tenant_id, timeline_id - ), - Entry::Vacant(v) => { - let timeline = self - .initialize_new_timeline(timeline_id, metadata, ancestor) - .with_context(|| format!("Failed to initialize timeline {timeline_id}"))?; - v.insert(timeline); - } + ); + continue; + } else { + let ancestor = metadata + .ancestor_timeline() + .and_then(|ancestor_timeline_id| timelines_accessor.get(&ancestor_timeline_id)) + .cloned(); + let timeline = UninitializedTimeline { + owning_tenant: self, + timeline_id, + raw_timeline: Some(( + self.create_timeline_data(timeline_id, metadata, ancestor) + .with_context(|| { + format!("Failed to initialize timeline {timeline_id}") + })?, + TimelineUninitMark::dummy(), + )), + }; + let initialized_timeline = + timeline.initialize_with_lock(&mut timelines_accessor, true)?; + timelines_accessor.insert(timeline_id, initialized_timeline); } } @@ -599,12 +811,12 @@ impl Tenant { self.tenant_conf.write().unwrap().update(&new_tenant_conf); } - fn initialize_new_timeline( + fn create_timeline_data( &self, new_timeline_id: TimelineId, new_metadata: TimelineMetadata, ancestor: Option>, - ) -> anyhow::Result> { + ) -> anyhow::Result { if let Some(ancestor_timeline_id) = new_metadata.ancestor_timeline() { anyhow::ensure!( ancestor.is_some(), @@ -612,9 +824,8 @@ impl Tenant { ) } - let new_disk_consistent_lsn = new_metadata.disk_consistent_lsn(); let pg_version = new_metadata.pg_version(); - let new_timeline = Arc::new(Timeline::new( + Ok(Timeline::new( self.conf, Arc::clone(&self.tenant_conf), new_metadata, @@ -624,15 +835,7 @@ impl Tenant { Arc::clone(&self.walredo_mgr), self.upload_layers, pg_version, - )); - - new_timeline - .load_layer_map(new_disk_consistent_lsn) - .context("failed to load layermap")?; - - new_timeline.launch_wal_receiver()?; - - Ok(new_timeline) + )) } pub fn new( @@ -914,11 +1117,14 @@ impl Tenant { src: TimelineId, dst: TimelineId, start_lsn: Option, - ) -> Result> { + ) -> anyhow::Result> { // We need to hold this lock to prevent GC from starting at 
the same time. GC scans the directory to learn
        // about timelines, so otherwise a race condition is possible, where we create new timeline and GC
        // concurrently removes data that is needed by the new timeline.
         let _gc_cs = self.gc_cs.lock().unwrap();
+        let timelines = self.timelines.lock().unwrap();
+        let timeline_uninit_mark = self.create_timeline_uninit_mark(dst, &timelines)?;
+        drop(timelines);
 
         // In order for the branch creation task to not wait for GC/compaction,
         // we need to make sure that the starting LSN of the child branch is not out of scope midway by
         // 1. holding the GC lock to prevent overwritting timeline's GC data
         // 2. checking both the latest GC cutoff LSN and latest GC info of the source timeline
         // Step 2 is to avoid initializing the new branch using data removed by past GC iterations
         // or in-queue GC iterations.
-        // XXX: keep the lock to avoid races during timeline creation
-        let mut timelines = self.timelines.lock().unwrap();
-        let src_timeline = timelines
-            .get(&src)
-            // message about timeline being remote is one .context up in the stack
-            .ok_or_else(|| anyhow::anyhow!("unknown timeline id: {src}"))?;
+        let src_timeline = self.get_timeline(src).with_context(|| {
+            format!(
+                "No ancestor {} found for timeline {}/{}",
+                src, self.tenant_id, dst
+            )
+        })?;
 
         let latest_gc_cutoff_lsn = src_timeline.get_latest_gc_cutoff_lsn();
 
@@ -988,7 +1194,17 @@ impl Tenant {
             src_timeline.initdb_lsn,
             src_timeline.pg_version,
         );
-        let new_timeline = self.create_initialized_timeline(dst, metadata, &mut timelines)?;
+        let mut timelines = self.timelines.lock().unwrap();
+        let new_timeline = self
+            .prepare_timeline(
+                dst,
+                metadata,
+                timeline_uninit_mark,
+                false,
+                Some(src_timeline),
+            )?
+            .initialize_with_lock(&mut timelines, true)?;
+        drop(timelines);
 
         info!("branched timeline {dst} from {src} at {start_lsn}");
 
         Ok(new_timeline)
@@ -1000,7 +1216,10 @@ impl Tenant {
         &self,
         timeline_id: TimelineId,
         pg_version: u32,
-    ) -> Result<Arc<Timeline>> {
+    ) -> anyhow::Result<Arc<Timeline>> {
+        let timelines = self.timelines.lock().unwrap();
+        let timeline_uninit_mark = self.create_timeline_uninit_mark(timeline_id, &timelines)?;
+        drop(timelines);
         // create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/`
         // temporary directory for basebackup files for the given timeline.
         let initdb_path = path_with_suffix_extension(
@@ -1010,24 +1229,65 @@ impl Tenant {
             TEMP_FILE_SUFFIX,
         );
 
-        // Init temporarily repo to get bootstrap data
+        // An uninit mark was placed before; nothing else can access this timeline's files.
+        // The current initdb has not run yet, so remove whatever was left from previous runs.
+        if initdb_path.exists() {
+            fs::remove_dir_all(&initdb_path).with_context(|| {
+                format!(
+                    "Failed to remove already existing initdb directory: {}",
+                    initdb_path.display()
+                )
+            })?;
+        }
+        // Init a temporary repo to get the bootstrap data; this creates a directory at `initdb_path`
         run_initdb(self.conf, &initdb_path, pg_version)?;
-        let pgdata_path = initdb_path;
-
-        let lsn = import_datadir::get_lsn_from_controlfile(&pgdata_path)?.align();
+        // This new directory is short-lived: schedule its removal right after bootstrap, we don't need it afterwards
+        scopeguard::defer! {
+            if let Err(e) = fs::remove_dir_all(&initdb_path) {
+                // this is unlikely, but we will remove the directory on pageserver restart or another bootstrap call
+                error!("Failed to remove temporary initdb directory '{}': {}", initdb_path.display(), e);
+            }
+        }
+        let pgdata_path = &initdb_path;
+        let pgdata_lsn = import_datadir::get_lsn_from_controlfile(pgdata_path)?.align();
 
         // Import the contents of the data directory at the initial checkpoint
         // LSN, and any WAL after that.
         // Initdb lsn will be equal to last_record_lsn which will be set after import.
-        // Because we know it upfront avoid having an option or dummy zero value by passing it to create_empty_timeline.
-        let timeline = self.create_empty_timeline(timeline_id, lsn, pg_version)?;
-        import_datadir::import_timeline_from_postgres_datadir(&pgdata_path, &*timeline, lsn)?;
+        // Because we know it upfront, we avoid an Option or a dummy zero value by passing it to the metadata.
+        let new_metadata = TimelineMetadata::new(
+            Lsn(0),
+            None,
+            None,
+            Lsn(0),
+            pgdata_lsn,
+            pgdata_lsn,
+            pg_version,
+        );
+        let raw_timeline =
+            self.prepare_timeline(timeline_id, new_metadata, timeline_uninit_mark, true, None)?;
+
+        let tenant_id = raw_timeline.owning_tenant.tenant_id;
+        let unfinished_timeline = raw_timeline.raw_timeline()?;
+        import_datadir::import_timeline_from_postgres_datadir(
+            unfinished_timeline,
+            pgdata_path,
+            pgdata_lsn,
+        )
+        .with_context(|| {
+            format!("Failed to import pgdatadir for timeline {tenant_id}/{timeline_id}")
+        })?;
 
         fail::fail_point!("before-checkpoint-new-timeline", |_| {
-            bail!("failpoint before-checkpoint-new-timeline");
+            anyhow::bail!("failpoint before-checkpoint-new-timeline");
         });
+        unfinished_timeline
+            .checkpoint(CheckpointConfig::Forced)
+            .with_context(|| format!("Failed to checkpoint after pgdatadir import for timeline {tenant_id}/{timeline_id}"))?;
 
-        timeline.checkpoint(CheckpointConfig::Forced)?;
+        let mut timelines = self.timelines.lock().unwrap();
+        let timeline = raw_timeline.initialize_with_lock(&mut timelines, false)?;
+        drop(timelines);
 
         info!(
             "created root timeline {} timeline.lsn {}",
@@ -1035,25 +1295,65 @@ impl Tenant {
             timeline.get_last_record_lsn()
         );
 
-        // Remove temp dir. We don't need it anymore
-        fs::remove_dir_all(pgdata_path)?;
-
         Ok(timeline)
     }
 
-    fn create_initialized_timeline(
+    /// Creates the intermediate timeline structure and its files, without loading it into memory.
+    /// It's up to the caller to import the necessary data and then load the timeline into memory.
+    fn prepare_timeline(
         &self,
         new_timeline_id: TimelineId,
         new_metadata: TimelineMetadata,
-        timelines: &mut MutexGuard<HashMap<TimelineId, Arc<Timeline>>>,
-    ) -> Result<Arc<Timeline>> {
-        crashsafe_dir::create_dir_all(self.conf.timeline_path(&new_timeline_id, &self.tenant_id))
-            .with_context(|| {
-                format!(
-                    "Failed to create timeline {}/{} directory",
-                    new_timeline_id, self.tenant_id
-                )
-            })?;
+        uninit_mark: TimelineUninitMark,
+        init_layers: bool,
+        ancestor: Option<Arc<Timeline>>,
+    ) -> anyhow::Result<UninitializedTimeline> {
+        let tenant_id = self.tenant_id;
+
+        match self.create_timeline_files(
+            &uninit_mark.timeline_path,
+            new_timeline_id,
+            new_metadata,
+            ancestor,
+        ) {
+            Ok(new_timeline) => {
+                if init_layers {
+                    new_timeline.layers.write().unwrap().next_open_layer_at =
+                        Some(new_timeline.initdb_lsn);
+                }
+                debug!(
+                    "Successfully created initial files for timeline {tenant_id}/{new_timeline_id}"
+                );
+                Ok(UninitializedTimeline {
+                    owning_tenant: self,
+                    timeline_id: new_timeline_id,
+                    raw_timeline: Some((new_timeline, uninit_mark)),
+                })
+            }
+            Err(e) => {
+                error!("Failed to create initial files for timeline {tenant_id}/{new_timeline_id}, cleaning up: {e:?}");
+                cleanup_timeline_directory(uninit_mark);
+                Err(e)
+            }
+        }
+    }
+
+    fn create_timeline_files(
+        &self,
+        timeline_path: &Path,
+        new_timeline_id: TimelineId,
+        new_metadata: TimelineMetadata,
+        ancestor: Option<Arc<Timeline>>,
+    ) -> anyhow::Result<Timeline> {
+        let timeline_data = self
+            .create_timeline_data(new_timeline_id, new_metadata.clone(), ancestor)
+            .context("Failed to create timeline data structure")?;
+        crashsafe::create_dir_all(timeline_path).context("Failed to create timeline directory")?;
+
+        fail::fail_point!("after-timeline-uninit-mark-creation", |_| {
+            anyhow::bail!("failpoint after-timeline-uninit-mark-creation");
+        });
+
         save_metadata(
             self.conf,
             new_timeline_id,
@@ -1061,37 +1361,49 @@ impl Tenant {
             &new_metadata,
             true,
         )
-        .with_context(|| {
-            format!(
-                "Failed to create timeline {}/{} metadata",
-                new_timeline_id, self.tenant_id
-            )
-        })?;
+        .context("Failed to create timeline metadata")?;
 
-        let ancestor = new_metadata
-            .ancestor_timeline()
-            .and_then(|ancestor_timeline_id| timelines.get(&ancestor_timeline_id))
-            .cloned();
-        let new_timeline = self
-            .initialize_new_timeline(new_timeline_id, new_metadata, ancestor)
+        Ok(timeline_data)
+    }
+
+    /// Attempts to create an uninit mark file for the timeline initialization.
+    /// Bails if the timeline is already loaded into memory (i.e. initialized before), or the uninit mark file already exists.
+    ///
+    /// This way, we need to hold the timelines lock only for a small amount of time during the mark check/creation per timeline init.
+    fn create_timeline_uninit_mark(
+        &self,
+        timeline_id: TimelineId,
+        timelines: &MutexGuard<HashMap<TimelineId, Arc<Timeline>>>,
+    ) -> anyhow::Result<TimelineUninitMark> {
+        let tenant_id = self.tenant_id;
+
+        anyhow::ensure!(
+            timelines.get(&timeline_id).is_none(),
+            "Timeline {tenant_id}/{timeline_id} already exists in pageserver's memory"
+        );
+        let timeline_path = self.conf.timeline_path(&timeline_id, &tenant_id);
+        anyhow::ensure!(
+            !timeline_path.exists(),
+            "Timeline {} already exists, cannot create its uninit mark file",
+            timeline_path.display()
+        );
+
+        let uninit_mark_path = self
+            .conf
+            .timeline_uninit_mark_file_path(tenant_id, timeline_id);
+        fs::File::create(&uninit_mark_path)
+            .context("Failed to create uninit mark file")
+            .and_then(|_| {
+                crashsafe::fsync_file_and_parent(&uninit_mark_path)
+                    .context("Failed to fsync uninit mark file")
+            })
             .with_context(|| {
-                format!(
-                    "Failed to initialize timeline {}/{}",
-                    new_timeline_id, self.tenant_id
-                )
+                format!("Failed to create uninit mark for timeline {tenant_id}/{timeline_id}")
             })?;
 
-        match timelines.entry(new_timeline_id) {
-            Entry::Occupied(_) => bail!(
-                "Found freshly initialized timeline {} in the tenant map",
-                new_timeline_id
-            ),
-            Entry::Vacant(v) => {
-                v.insert(Arc::clone(&new_timeline));
-            }
-        }
+        let uninit_mark = TimelineUninitMark::new(uninit_mark_path, timeline_path);
 
-        Ok(new_timeline)
+        Ok(uninit_mark)
     }
 }
 
@@ -1111,7 +1423,7 @@ fn run_initdb(
         initdb_lib_dir.display(),
     );
 
-    let initdb_output = Command::new(initdb_bin_path)
+    let initdb_output = Command::new(&initdb_bin_path)
         .args(&["-D", &initdb_target_dir.to_string_lossy()])
         .args(&["-U", &conf.superuser])
         .args(&["-E", "utf8"])
@@ -1124,7 +1436,13 @@ fn run_initdb(
         .env("DYLD_LIBRARY_PATH", &initdb_lib_dir)
         .stdout(Stdio::null())
         .output()
-        .context("failed to execute initdb")?;
+        .with_context(|| {
+            format!(
+                "failed to execute {} at target dir {}",
+                initdb_bin_path.display(),
+                initdb_target_dir.display()
+            )
+        })?;
     if !initdb_output.status.success() {
         bail!(
             "initdb failed: '{}'",
@@ -1163,6 +1481,19 @@ pub fn dump_layerfile_from_path(path: &Path, verbose: bool) -> Result<()> {
     Ok(())
 }
 
+fn ignore_absent_files<F>(fs_operation: F) -> io::Result<()>
+where
+    F: Fn() -> io::Result<()>,
+{
+    fs_operation().or_else(|e| {
+        if e.kind() == io::ErrorKind::NotFound {
+            Ok(())
+        } else {
+            Err(e)
+        }
+    })
+}
+
 #[cfg(test)]
 pub mod harness {
     use bytes::{Bytes, BytesMut};
@@ -1379,7 +1710,9 @@ mod tests {
     #[test]
     fn test_basic() -> Result<()> {
         let tenant = TenantHarness::create("test_basic")?.load();
-        let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?;
+        let tline = tenant
+            .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
+            .initialize()?;
 
         let writer = tline.writer();
         writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?;
@@ -1401,13 +1734,18 @@ mod tests {
     #[test]
     fn no_duplicate_timelines() -> Result<()> {
         let tenant = TenantHarness::create("no_duplicate_timelines")?.load();
-        let _ = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?;
+        let _ = tenant
+            .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
+ .initialize()?; match tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION) { Ok(_) => panic!("duplicate timeline creation should fail"), Err(e) => assert_eq!( e.to_string(), - format!("Timeline {TIMELINE_ID} already exists") + format!( + "Timeline {}/{} already exists in pageserver's memory", + tenant.tenant_id, TIMELINE_ID + ) ), } @@ -1427,7 +1765,9 @@ mod tests { #[test] fn test_branch() -> Result<()> { let tenant = TenantHarness::create("test_branch")?.load(); - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; + let tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? + .initialize()?; let writer = tline.writer(); use std::str::from_utf8; @@ -1522,7 +1862,9 @@ mod tests { let tenant = TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")? .load(); - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; + let tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? + .initialize()?; make_some_layers(tline.as_ref(), Lsn(0x20))?; // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50 @@ -1552,7 +1894,9 @@ mod tests { let tenant = TenantHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")?.load(); - tenant.create_empty_timeline(TIMELINE_ID, Lsn(0x50), DEFAULT_PG_VERSION)?; + tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0x50), DEFAULT_PG_VERSION)? + .initialize()?; // try to branch at lsn 0x25, should fail because initdb lsn is 0x50 match tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) { Ok(_) => panic!("branching should have failed"), @@ -1596,7 +1940,9 @@ mod tests { fn test_retain_data_in_parent_which_is_needed_for_child() -> Result<()> { let tenant = TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child")?.load(); - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; + let tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? + .initialize()?; make_some_layers(tline.as_ref(), Lsn(0x20))?; tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; @@ -1613,7 +1959,9 @@ mod tests { fn test_parent_keeps_data_forever_after_branching() -> Result<()> { let tenant = TenantHarness::create("test_parent_keeps_data_forever_after_branching")?.load(); - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; + let tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? + .initialize()?; make_some_layers(tline.as_ref(), Lsn(0x20))?; tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; @@ -1641,8 +1989,9 @@ mod tests { let harness = TenantHarness::create(TEST_NAME)?; { let tenant = harness.load(); - let tline = - tenant.create_empty_timeline(TIMELINE_ID, Lsn(0x8000), DEFAULT_PG_VERSION)?; + let tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0x8000), DEFAULT_PG_VERSION)? + .initialize()?; make_some_layers(tline.as_ref(), Lsn(0x8000))?; tline.checkpoint(CheckpointConfig::Forced)?; } @@ -1662,7 +2011,9 @@ mod tests { // create two timelines { let tenant = harness.load(); - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; + let tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? 
+ .initialize()?; make_some_layers(tline.as_ref(), Lsn(0x20))?; tline.checkpoint(CheckpointConfig::Forced)?; @@ -1698,7 +2049,9 @@ mod tests { let harness = TenantHarness::create(TEST_NAME)?; let tenant = harness.load(); - tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; + tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? + .initialize()?; drop(tenant); let metadata_path = harness.timeline_path(&TIMELINE_ID).join(METADATA_FILE_NAME); @@ -1735,7 +2088,9 @@ mod tests { #[test] fn test_images() -> Result<()> { let tenant = TenantHarness::create("test_images")?.load(); - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; + let tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? + .initialize()?; let writer = tline.writer(); writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?; @@ -1785,7 +2140,9 @@ mod tests { #[test] fn test_bulk_insert() -> Result<()> { let tenant = TenantHarness::create("test_bulk_insert")?.load(); - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; + let tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? + .initialize()?; let mut lsn = Lsn(0x10); @@ -1825,7 +2182,9 @@ mod tests { #[test] fn test_random_updates() -> Result<()> { let tenant = TenantHarness::create("test_random_updates")?.load(); - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; + let tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? + .initialize()?; const NUM_KEYS: usize = 1000; @@ -1895,7 +2254,9 @@ mod tests { #[test] fn test_traverse_branches() -> Result<()> { let tenant = TenantHarness::create("test_traverse_branches")?.load(); - let mut tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; + let mut tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? + .initialize()?; const NUM_KEYS: usize = 1000; @@ -1974,7 +2335,9 @@ mod tests { #[test] fn test_traverse_ancestors() -> Result<()> { let tenant = TenantHarness::create("test_traverse_ancestors")?.load(); - let mut tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; + let mut tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? 
+ .initialize()?; const NUM_KEYS: usize = 100; const NUM_TLINES: usize = 50; diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index b2c927d4fc..f1db50bf7f 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -12,7 +12,7 @@ use tracing::*; use remote_storage::GenericRemoteStorage; -use crate::config::{PageServerConf, METADATA_FILE_NAME}; +use crate::config::{PageServerConf, METADATA_FILE_NAME, TIMELINE_UNINIT_MARK_SUFFIX}; use crate::http::models::TenantInfo; use crate::storage_sync::index::{LayerFileMetadata, RemoteIndex, RemoteTimelineIndex}; use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData, TimelineLocalFiles}; @@ -24,7 +24,7 @@ use crate::tenant_config::TenantConfOpt; use crate::walredo::PostgresRedoManager; use crate::TEMP_FILE_SUFFIX; -use utils::crashsafe_dir::{self, path_with_suffix_extension}; +use utils::crashsafe::{self, path_with_suffix_extension}; use utils::id::{TenantId, TimelineId}; mod tenants_state { @@ -265,58 +265,98 @@ fn create_tenant_files( temporary_tenant_dir.display() ); - let temporary_tenant_timelines_dir = rebase_directory( - &conf.timelines_path(&tenant_id), - &target_tenant_directory, - &temporary_tenant_dir, - )?; - let temporary_tenant_config_path = rebase_directory( - &conf.tenant_config_path(tenant_id), - &target_tenant_directory, - &temporary_tenant_dir, - )?; - // top-level dir may exist if we are creating it through CLI - crashsafe_dir::create_dir_all(&temporary_tenant_dir).with_context(|| { + crashsafe::create_dir_all(&temporary_tenant_dir).with_context(|| { format!( "could not create temporary tenant directory {}", temporary_tenant_dir.display() ) })?; - // first, create a config in the top-level temp directory, fsync the file - Tenant::persist_tenant_config(&temporary_tenant_config_path, tenant_conf, true)?; - // then, create a subdirectory in the top-level temp directory, fsynced - crashsafe_dir::create_dir(&temporary_tenant_timelines_dir).with_context(|| { + + let creation_result = try_create_target_tenant_dir( + conf, + tenant_conf, + tenant_id, + &temporary_tenant_dir, + &target_tenant_directory, + ); + + if creation_result.is_err() { + error!("Failed to create directory structure for tenant {tenant_id}, cleaning tmp data"); + if let Err(e) = fs::remove_dir_all(&temporary_tenant_dir) { + error!("Failed to remove temporary tenant directory {temporary_tenant_dir:?}: {e}") + } else if let Err(e) = crashsafe::fsync(&temporary_tenant_dir) { + error!( + "Failed to fsync removed temporary tenant directory {temporary_tenant_dir:?}: {e}" + ) + } + } + + creation_result +} + +fn try_create_target_tenant_dir( + conf: &'static PageServerConf, + tenant_conf: TenantConfOpt, + tenant_id: TenantId, + temporary_tenant_dir: &Path, + target_tenant_directory: &Path, +) -> Result<(), anyhow::Error> { + let temporary_tenant_timelines_dir = rebase_directory( + &conf.timelines_path(&tenant_id), + target_tenant_directory, + temporary_tenant_dir, + ) + .with_context(|| format!("Failed to resolve tenant {tenant_id} temporary timelines dir"))?; + let temporary_tenant_config_path = rebase_directory( + &conf.tenant_config_path(tenant_id), + target_tenant_directory, + temporary_tenant_dir, + ) + .with_context(|| format!("Failed to resolve tenant {tenant_id} temporary config path"))?; + + Tenant::persist_tenant_config(&temporary_tenant_config_path, tenant_conf, true).with_context( + || { + format!( + "Failed to write tenant {} config to {}", + tenant_id, + temporary_tenant_config_path.display() + ) + }, 
+ )?; + crashsafe::create_dir(&temporary_tenant_timelines_dir).with_context(|| { format!( - "could not create temporary tenant timelines directory {}", + "could not create tenant {} temporary timelines directory {}", + tenant_id, temporary_tenant_timelines_dir.display() ) })?; - fail::fail_point!("tenant-creation-before-tmp-rename", |_| { anyhow::bail!("failpoint tenant-creation-before-tmp-rename"); }); - // move-rename tmp directory with all files synced into a permanent directory, fsync its parent - fs::rename(&temporary_tenant_dir, &target_tenant_directory).with_context(|| { + fs::rename(&temporary_tenant_dir, target_tenant_directory).with_context(|| { format!( - "failed to move temporary tenant directory {} into the permanent one {}", + "failed to move tenant {} temporary directory {} into the permanent one {}", + tenant_id, temporary_tenant_dir.display(), target_tenant_directory.display() ) })?; let target_dir_parent = target_tenant_directory.parent().with_context(|| { format!( - "Failed to get tenant dir parent for {}", + "Failed to get tenant {} dir parent for {}", + tenant_id, target_tenant_directory.display() ) })?; - fs::File::open(target_dir_parent)?.sync_all()?; - - info!( - "created tenant directory structure in {}", - target_tenant_directory.display() - ); + crashsafe::fsync(target_dir_parent).with_context(|| { + format!( + "Failed to fsync renamed directory's parent {} for tenant {}", + target_dir_parent.display(), + tenant_id, + ) + })?; Ok(()) } @@ -602,6 +642,15 @@ fn is_temporary(path: &Path) -> bool { } } +fn is_uninit_mark(path: &Path) -> bool { + match path.file_name() { + Some(name) => name + .to_string_lossy() + .ends_with(TIMELINE_UNINIT_MARK_SUFFIX), + None => false, + } +} + fn collect_timelines_for_tenant( config: &'static PageServerConf, tenant_path: &Path, @@ -644,28 +693,74 @@ fn collect_timelines_for_tenant( e ); } + } else if is_uninit_mark(&timeline_dir) { + let timeline_uninit_mark_file = &timeline_dir; + info!( + "Found an uninit mark file {}, removing the timeline and its uninit mark", + timeline_uninit_mark_file.display() + ); + let timeline_id = timeline_uninit_mark_file + .file_stem() + .and_then(OsStr::to_str) + .unwrap_or_default() + .parse::() + .with_context(|| { + format!( + "Could not parse timeline id out of the timeline uninit mark name {}", + timeline_uninit_mark_file.display() + ) + })?; + let timeline_dir = config.timeline_path(&timeline_id, &tenant_id); + if let Err(e) = + remove_timeline_and_uninit_mark(&timeline_dir, timeline_uninit_mark_file) + { + error!("Failed to clean up uninit marked timeline: {e:?}"); + } } else { - match collect_timeline_files(&timeline_dir) { - Ok((timeline_id, metadata, timeline_files)) => { - tenant_timelines.insert( - timeline_id, - TimelineLocalFiles::collected(metadata, timeline_files), - ); + let timeline_id = timeline_dir + .file_name() + .and_then(OsStr::to_str) + .unwrap_or_default() + .parse::() + .with_context(|| { + format!( + "Could not parse timeline id out of the timeline dir name {}", + timeline_dir.display() + ) + })?; + let timeline_uninit_mark_file = + config.timeline_uninit_mark_file_path(tenant_id, timeline_id); + if timeline_uninit_mark_file.exists() { + info!("Found an uninit mark file for timeline {tenant_id}/{timeline_id}, removing the timeline and its uninit mark"); + if let Err(e) = remove_timeline_and_uninit_mark( + &timeline_dir, + &timeline_uninit_mark_file, + ) { + error!("Failed to clean up uninit marked timeline: {e:?}"); } - Err(e) => { - error!( - "Failed to process timeline 
dir contents at '{}', reason: {:?}",
-                        timeline_dir.display(),
-                        e
-                    );
-                    match remove_if_empty(&timeline_dir) {
-                        Ok(true) => info!(
-                            "Removed empty timeline directory {}",
-                            timeline_dir.display()
-                        ),
-                        Ok(false) => (),
-                        Err(e) => {
-                            error!("Failed to remove empty timeline directory: {e:?}")
+            } else {
+                let timeline_id = timeline_dir
+                    .file_name()
+                    .and_then(OsStr::to_str)
+                    .unwrap_or_default()
+                    .parse::<TimelineId>()
+                    .with_context(|| {
+                        format!(
+                            "Could not parse timeline id out of the timeline dir name {}",
+                            timeline_dir.display()
+                        )
+                    })?;
+                let timeline_uninit_mark_file =
+                    config.timeline_uninit_mark_file_path(tenant_id, timeline_id);
+                if timeline_uninit_mark_file.exists() {
+                    info!("Found an uninit mark file for timeline {tenant_id}/{timeline_id}, removing the timeline and its uninit mark");
+                    if let Err(e) = remove_timeline_and_uninit_mark(
+                        &timeline_dir,
+                        &timeline_uninit_mark_file,
+                    ) {
+                        error!("Failed to clean up uninit marked timeline: {e:?}");
                     }
+                } else {
+                    match collect_timeline_files(&timeline_dir) {
+                        Ok((metadata, timeline_files)) => {
+                            tenant_timelines.insert(
+                                timeline_id,
+                                TimelineLocalFiles::collected(metadata, timeline_files),
+                            );
+                        }
+                        Err(e) => {
+                            error!(
+                                "Failed to process timeline dir contents at '{}', reason: {:?}",
+                                timeline_dir.display(),
+                                e
+                            );
+                            match remove_if_empty(&timeline_dir) {
+                                Ok(true) => info!(
+                                    "Removed empty timeline directory {}",
+                                    timeline_dir.display()
+                                ),
+                                Ok(false) => (),
+                                Err(e) => {
+                                    error!("Failed to remove empty timeline directory: {e:?}")
+                                }
                             }
                         }
                     }
                 }
 
@@ -688,24 +783,41 @@ fn collect_timelines_for_tenant(
     Ok((tenant_id, TenantAttachData::Ready(tenant_timelines)))
 }
 
+fn remove_timeline_and_uninit_mark(timeline_dir: &Path, uninit_mark: &Path) -> anyhow::Result<()> {
+    fs::remove_dir_all(&timeline_dir)
+        .or_else(|e| {
+            if e.kind() == std::io::ErrorKind::NotFound {
+                // we can leave the uninit mark without a timeline dir,
+                // just remove the mark then
+                Ok(())
+            } else {
+                Err(e)
+            }
+        })
+        .with_context(|| {
+            format!(
+                "Failed to remove uninit-marked timeline directory {}",
+                timeline_dir.display()
+            )
+        })?;
+    fs::remove_file(&uninit_mark).with_context(|| {
+        format!(
+            "Failed to remove timeline uninit mark file {}",
+            uninit_mark.display()
+        )
+    })?;
+
+    Ok(())
+}
+
 // discover timeline files and extract timeline metadata
 // NOTE: ephemeral files are excluded from the list
 fn collect_timeline_files(
     timeline_dir: &Path,
-) -> anyhow::Result<(
-    TimelineId,
-    TimelineMetadata,
-    HashMap<PathBuf, LayerFileMetadata>,
-)> {
+) -> anyhow::Result<(TimelineMetadata, HashMap<PathBuf, LayerFileMetadata>)> {
     let mut timeline_files = HashMap::new();
     let mut timeline_metadata_path = None;
 
-    let timeline_id = timeline_dir
-        .file_name()
-        .and_then(OsStr::to_str)
-        .unwrap_or_default()
-        .parse::<TimelineId>()
-        .context("Could not parse timeline id out of the timeline dir name")?;
     let timeline_dir_entries =
         fs::read_dir(&timeline_dir).context("Failed to list timeline dir contents")?;
     for entry in timeline_dir_entries {
@@ -754,5 +866,5 @@ fn collect_timeline_files(
         "Timeline has no ancestor and no layer files"
     );
 
-    Ok((timeline_id, metadata, timeline_files))
+    Ok((metadata, timeline_files))
 }
diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs
index 29179e9871..01389e52f4 100644
--- a/pageserver/src/walreceiver/connection_manager.rs
+++ b/pageserver/src/walreceiver/connection_manager.rs
@@ -1374,7 +1374,9 @@ mod tests {
             timeline: harness
                 .load()
                 .create_empty_timeline(TIMELINE_ID, Lsn(0), crate::DEFAULT_PG_VERSION)
-                .expect("Failed to create an empty timeline for dummy wal connection manager"),
+                .expect("Failed to create an empty timeline for dummy wal connection manager")
+                .initialize()
+                .unwrap(),
             wal_connect_timeout: Duration::from_secs(1),
             lagging_wal_timeout: Duration::from_secs(1),
             max_lsn_wal_lag: NonZeroU64::new(1024 * 1024).unwrap(),
diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs
index 1dd27caba6..b8874a0223 100644
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -35,7 +35,7 @@ use std::sync::Mutex;
 use std::time::Duration;
 use std::time::Instant;
 use tracing::*;
-use utils::crashsafe_dir::path_with_suffix_extension;
+use utils::crashsafe::path_with_suffix_extension;
 use utils::{bin_ser::BeSer, id::TenantId, lsn::Lsn, nonblock::set_nonblock};
 
 use crate::metrics::{
diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py
index 7baa67935d..101cce9ffc 100644
--- a/test_runner/regress/test_broken_timeline.py
+++ b/test_runner/regress/test_broken_timeline.py
@@ -111,18 +111,20 @@ def test_create_multiple_timelines_parallel(neon_simple_env: NeonEnv):
         future.result()
 
 
-def test_fix_broken_timelines_on_startup(neon_simple_env: NeonEnv):
+def test_timeline_init_break_before_checkpoint(neon_simple_env: NeonEnv):
     env = neon_simple_env
     pageserver_http = env.pageserver.http_client()
 
     tenant_id, _ = env.neon_cli.create_tenant()
+    timelines_dir = env.repo_dir / "tenants" / str(tenant_id) / "timelines"
     old_tenant_timelines = env.neon_cli.list_timelines(tenant_id)
+    initial_timeline_dirs = [d for d in timelines_dir.iterdir()]
 
-    # Introduce failpoint when creating a new timeline
+    # Introduce a failpoint during timeline init (some intermediate files are on disk), before it's checkpointed.
    pageserver_http.configure_failpoints(("before-checkpoint-new-timeline", "return"))
     with pytest.raises(Exception, match="before-checkpoint-new-timeline"):
-        _ = env.neon_cli.create_timeline("test_fix_broken_timelines", tenant_id)
+        _ = env.neon_cli.create_timeline("test_timeline_init_break_before_checkpoint", tenant_id)
 
     # Restart the page server
     env.neon_cli.pageserver_stop(immediate=True)
@@ -133,3 +135,36 @@ def test_timeline_init_break_before_checkpoint(neon_simple_env: NeonEnv):
     assert (
         new_tenant_timelines == old_tenant_timelines
     ), f"Pageserver after restart should ignore non-initialized timelines for tenant {tenant_id}"
+
+    timeline_dirs = [d for d in timelines_dir.iterdir()]
+    assert (
+        timeline_dirs == initial_timeline_dirs
+    ), "pageserver should clean its temp timeline files on timeline creation failure"
+
+
+def test_timeline_create_break_after_uninit_mark(neon_simple_env: NeonEnv):
+    env = neon_simple_env
+    pageserver_http = env.pageserver.http_client()
+
+    tenant_id, _ = env.neon_cli.create_tenant()
+
+    timelines_dir = env.repo_dir / "tenants" / str(tenant_id) / "timelines"
+    old_tenant_timelines = env.neon_cli.list_timelines(tenant_id)
+    initial_timeline_dirs = [d for d in timelines_dir.iterdir()]
+
+    # Introduce a failpoint when creating the new timeline's uninit mark, before any other files are created
+    pageserver_http.configure_failpoints(("after-timeline-uninit-mark-creation", "return"))
+    with pytest.raises(Exception, match="after-timeline-uninit-mark-creation"):
+        _ = env.neon_cli.create_timeline("test_timeline_create_break_after_uninit_mark", tenant_id)
+
+    # Creating the timeline didn't finish. The other timelines on the tenant should still be present and work normally.
+    # The "new" timeline is not present in the list, allowing the pageserver to retry the same creation request
+    new_tenant_timelines = env.neon_cli.list_timelines(tenant_id)
+    assert (
+        new_tenant_timelines == old_tenant_timelines
+    ), f"Pageserver should not list the non-initialized timeline for tenant {tenant_id}"
+
+    timeline_dirs = [d for d in timelines_dir.iterdir()]
+    assert (
+        timeline_dirs == initial_timeline_dirs
+    ), "pageserver should clean its temp timeline files on timeline creation failure"
diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py
index 5910b4f74f..c888c6f7ee 100644
--- a/test_runner/regress/test_import.py
+++ b/test_runner/regress/test_import.py
@@ -105,15 +105,11 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build
     with pytest.raises(Exception):
         import_tar(corrupt_base_tar, wal_tar)
 
-    # Clean up
-    # TODO it should clean itself
-    client = env.pageserver.http_client()
-    client.timeline_delete(tenant, timeline)
-
     # Importing correct backup works
     import_tar(base_tar, wal_tar)
 
     # Wait for data to land in s3
+    client = env.pageserver.http_client()
     wait_for_last_record_lsn(client, tenant, timeline, Lsn(end_lsn))
     wait_for_upload(client, tenant, timeline, Lsn(end_lsn))
 
diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py
index 37c5a130e2..4ffea60950 100644
--- a/test_runner/regress/test_tenants.py
+++ b/test_runner/regress/test_tenants.py
@@ -23,7 +23,7 @@ def test_tenant_creation_fails(neon_simple_env: NeonEnv):
     initial_tenants = sorted(
         map(lambda t: t.split()[0], neon_simple_env.neon_cli.list_tenants().stdout.splitlines())
     )
-    initial_tenant_dirs = set([d for d in tenants_dir.iterdir()])
+    initial_tenant_dirs = [d for d in tenants_dir.iterdir()]
 
     pageserver_http = neon_simple_env.pageserver.http_client()
     pageserver_http.configure_failpoints(("tenant-creation-before-tmp-rename", "return"))
@@ -35,26 +35,10 @@ def test_tenant_creation_fails(neon_simple_env: NeonEnv):
     )
     assert initial_tenants == new_tenants, "should not create new tenants"
 
-    new_tenant_dirs = list(set([d for d in tenants_dir.iterdir()]) - initial_tenant_dirs)
-    assert len(new_tenant_dirs) == 1, "should have new tenant directory created"
-    tmp_tenant_dir = new_tenant_dirs[0]
-    assert str(tmp_tenant_dir).endswith(
-        ".___temp"
-    ), "new tenant directory created should be a temporary one"
-
-    neon_simple_env.pageserver.stop()
-    neon_simple_env.pageserver.start()
-
-    tenants_after_restart = sorted(
-        map(lambda t: t.split()[0], neon_simple_env.neon_cli.list_tenants().stdout.splitlines())
-    )
-    dirs_after_restart = set([d for d in tenants_dir.iterdir()])
+    new_tenant_dirs = [d for d in tenants_dir.iterdir()]
     assert (
-        tenants_after_restart == initial_tenants
-    ), "should load all non-corrupt tenants after restart"
-    assert (
-        dirs_after_restart == initial_tenant_dirs
-    ), "pageserver should clean its temp tenant dirs on restart"
+        new_tenant_dirs == initial_tenant_dirs
+    ), "pageserver should clean its temp tenant dirs on tenant creation failure"
 
 
 def test_tenants_normal_work(neon_env_builder: NeonEnvBuilder):
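
Reviewer note: the sketch below distills the crash-safety protocol this patch spreads across Tenant::create_timeline_uninit_mark, UninitializedTimeline, and the startup cleanup in tenant_mgr, in one self-contained place. It is a simplified illustration under stated assumptions, not the pageserver code: plain std only, no tenant locks or layer maps, only a file-level fsync, and all helper names here (uninit_mark_path, create_timeline_atomically, cleanup_uninitialized) are invented for the example. uninit_mark_path merely mirrors what the patch's path_with_suffix_extension produces for dot-free timeline ids.

use std::fs::{self, File};
use std::io;
use std::path::{Path, PathBuf};

const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit";

// `<timelines>/<id>` -> `<timelines>/<id>.___uninit`; next to, not inside,
// the timeline dir, so the dir can be removed first and the mark last.
fn uninit_mark_path(timeline_dir: &Path) -> PathBuf {
    timeline_dir.with_extension(TIMELINE_UNINIT_MARK_SUFFIX)
}

// Mark first, then create and fill the directory, then drop the mark.
// A crash at any point leaves either no traces at all, or a mark that
// flags the directory as garbage for the next startup.
fn create_timeline_atomically(
    timeline_dir: &Path,
    fill: impl FnOnce(&Path) -> io::Result<()>,
) -> io::Result<()> {
    let mark = uninit_mark_path(timeline_dir);
    File::create(&mark)?.sync_all()?; // the patch also fsyncs the mark's parent dir
    fs::create_dir_all(timeline_dir)?;
    match fill(timeline_dir) {
        Ok(()) => fs::remove_file(&mark), // success: the timeline counts as initialized
        Err(e) => {
            // Failure: the timeline dir goes first and the mark goes last,
            // so a crash mid-cleanup still leaves the mark behind.
            let _ = fs::remove_dir_all(timeline_dir);
            let _ = fs::remove_file(&mark);
            Err(e)
        }
    }
}

// Startup pass: a surviving mark means creation was interrupted, so remove
// the (possibly partial) timeline dir, then the mark, freeing the id for reuse.
fn cleanup_uninitialized(timelines_dir: &Path) -> io::Result<()> {
    for entry in fs::read_dir(timelines_dir)? {
        let mark = entry?.path();
        let is_mark = mark
            .file_name()
            .map_or(false, |n| n.to_string_lossy().ends_with(TIMELINE_UNINIT_MARK_SUFFIX));
        if !is_mark {
            continue;
        }
        // `<id>.___uninit` -> `<id>`: the dir the interrupted creation left behind
        match fs::remove_dir_all(mark.with_extension("")) {
            Ok(()) => {}
            Err(e) if e.kind() == io::ErrorKind::NotFound => {} // dir was never created
            Err(e) => return Err(e),
        }
        fs::remove_file(&mark)?;
    }
    Ok(())
}

fn main() -> io::Result<()> {
    let timelines = Path::new("/tmp/uninit-mark-demo");
    fs::create_dir_all(timelines)?;
    cleanup_uninitialized(timelines)?; // what pageserver startup does per tenant
    create_timeline_atomically(&timelines.join("deadbeef"), |dir| {
        fs::write(dir.join("metadata"), b"stand-in for real timeline init work")
    })
}

The ordering is the load-bearing part: the mark is durable before the timeline directory exists, and it is removed only once the directory is either fully initialized or fully deleted, so after any crash the directory's validity can be judged from the mark alone.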