mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-14 08:52:56 +00:00
crash-safe and resumable tenant attach
This change introduces a marker file $repo/tenants/$tenant_id/attaching that is present while a tenant is in Attaching state. When pageserver restarts, we use it to resume the tenant attach operation. Before this change, a crash during tenant attach would result in one of the following: 1. crash upon restart due to missing metadata file (IIRC) 2. "successful" loading of the tenant with a subset of timelines
This commit is contained in:
committed by
Dmitry Rodionov
parent
c4c4558736
commit
bb6dbd2f43
45
libs/utils/src/fs_ext.rs
Normal file
45
libs/utils/src/fs_ext.rs
Normal file
@@ -0,0 +1,45 @@
|
||||
/// Extensions to `std::fs` types.
|
||||
use std::{fs, io, path::Path};
|
||||
|
||||
pub trait PathExt {
|
||||
/// Returns an error if `self` is not a directory.
|
||||
fn is_empty_dir(&self) -> io::Result<bool>;
|
||||
}
|
||||
|
||||
impl<P> PathExt for P
|
||||
where
|
||||
P: AsRef<Path>,
|
||||
{
|
||||
fn is_empty_dir(&self) -> io::Result<bool> {
|
||||
Ok(fs::read_dir(self)?.into_iter().next().is_none())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use std::path::PathBuf;
|
||||
|
||||
#[test]
|
||||
fn is_empty_dir() {
|
||||
use super::PathExt;
|
||||
|
||||
let dir = tempfile::tempdir().unwrap();
|
||||
let dir_path = dir.path();
|
||||
|
||||
// test positive case
|
||||
assert!(
|
||||
dir_path.is_empty_dir().expect("test failure"),
|
||||
"new tempdir should be empty"
|
||||
);
|
||||
|
||||
// invoke on a file to ensure it returns an error
|
||||
let file_path: PathBuf = dir_path.join("testfile");
|
||||
let f = std::fs::File::create(&file_path).unwrap();
|
||||
drop(f);
|
||||
assert!(file_path.is_empty_dir().is_err());
|
||||
|
||||
// do it again on a path, we know to be nonexistent
|
||||
std::fs::remove_file(&file_path).unwrap();
|
||||
assert!(file_path.is_empty_dir().is_err());
|
||||
}
|
||||
}
|
||||
@@ -48,6 +48,8 @@ pub mod nonblock;
|
||||
// Default signal handling
|
||||
pub mod signals;
|
||||
|
||||
pub mod fs_ext;
|
||||
|
||||
/// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
|
||||
///
|
||||
/// we have several cases:
|
||||
|
||||
@@ -23,7 +23,7 @@ use utils::{
|
||||
postgres_backend::AuthType,
|
||||
};
|
||||
|
||||
use crate::tenant::TIMELINES_SEGMENT_NAME;
|
||||
use crate::tenant::{TENANT_ATTACHING_MARKER_FILENAME, TIMELINES_SEGMENT_NAME};
|
||||
use crate::tenant_config::{TenantConf, TenantConfOpt};
|
||||
|
||||
/// The name of the metadata file pageserver creates per timeline.
|
||||
@@ -390,6 +390,11 @@ impl PageServerConf {
|
||||
self.tenants_path().join(tenant_id.to_string())
|
||||
}
|
||||
|
||||
pub fn tenant_attaching_mark_file_path(&self, tenant_id: &TenantId) -> PathBuf {
|
||||
self.tenant_path(&tenant_id)
|
||||
.join(TENANT_ATTACHING_MARKER_FILENAME)
|
||||
}
|
||||
|
||||
/// Points to a place in pageserver's local directory,
|
||||
/// where certain tenant's tenantconf file should be located.
|
||||
pub fn tenant_config_path(&self, tenant_id: TenantId) -> PathBuf {
|
||||
|
||||
@@ -129,6 +129,7 @@ mod upload;
|
||||
|
||||
// re-export this
|
||||
pub use download::list_remote_timelines;
|
||||
pub use download::is_temp_download_file;
|
||||
|
||||
use std::collections::{HashMap, VecDeque};
|
||||
use std::fmt::Debug;
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
//! Helper functions to download files from remote storage with a RemoteStorage
|
||||
use std::collections::HashSet;
|
||||
use std::path::Path;
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use futures::stream::{FuturesUnordered, StreamExt};
|
||||
@@ -36,6 +37,21 @@ pub async fn download_layer_file<'a>(
|
||||
download_layer_file_guts(conf, storage, tenant_id, timeline_id, path, layer_metadata).await
|
||||
}
|
||||
|
||||
const TEMP_DOWNLOAD_EXTENSION: &str = "temp_download";
|
||||
|
||||
pub fn is_temp_download_file(path: &Path) -> bool {
|
||||
let extension = path.extension().map(|pname| {
|
||||
pname
|
||||
.to_str()
|
||||
.expect("paths passed to this function must be valid Rust strings")
|
||||
});
|
||||
match extension {
|
||||
Some(TEMP_DOWNLOAD_EXTENSION) => true,
|
||||
Some(_) => false,
|
||||
None => false,
|
||||
}
|
||||
}
|
||||
|
||||
async fn download_layer_file_guts<'a>(
|
||||
conf: &'static PageServerConf,
|
||||
storage: &'a GenericRemoteStorage,
|
||||
@@ -44,8 +60,6 @@ async fn download_layer_file_guts<'a>(
|
||||
path: &'a RelativePath,
|
||||
layer_metadata: &'a LayerFileMetadata,
|
||||
) -> anyhow::Result<u64> {
|
||||
const TEMP_DOWNLOAD_EXTENSION: &str = "temp_download";
|
||||
|
||||
let timeline_path = conf.timeline_path(&timeline_id, &tenant_id);
|
||||
|
||||
let local_path = path.to_local_path(&timeline_path);
|
||||
|
||||
@@ -104,6 +104,8 @@ pub use crate::tenant::timeline::WalReceiverInfo;
|
||||
/// Parts of the `.neon/tenants/<tenant_id>/timelines/<timeline_id>` directory prefix.
|
||||
pub const TIMELINES_SEGMENT_NAME: &str = "timelines";
|
||||
|
||||
pub const TENANT_ATTACHING_MARKER_FILENAME: &str = "attaching";
|
||||
|
||||
///
|
||||
/// Tenant consists of multiple timelines. Keep them in a hash table.
|
||||
///
|
||||
@@ -498,6 +500,27 @@ impl Tenant {
|
||||
///
|
||||
#[instrument(skip(self), fields(tenant_id=%self.tenant_id))]
|
||||
async fn attach(self: &Arc<Tenant>) -> anyhow::Result<()> {
|
||||
// Create directory with marker file to indicate attaching state.
|
||||
// The load_local_tenants() function in tenant_mgr relies on the marker file
|
||||
// to determine whether a tenant has finished attaching.
|
||||
let tenant_dir = self.conf.tenant_path(&self.tenant_id);
|
||||
let marker_file = self.conf.tenant_attaching_mark_file_path(&self.tenant_id);
|
||||
debug_assert_eq!(marker_file.parent().unwrap(), tenant_dir);
|
||||
if tenant_dir.exists() {
|
||||
if !marker_file.is_file() {
|
||||
anyhow::bail!(
|
||||
"calling Tenant::attach with a tenant directory that doesn't have the attaching marker file:\ntenant_dir: {}\nmarker_file: {}",
|
||||
tenant_dir.display(), marker_file.display());
|
||||
}
|
||||
} else {
|
||||
crashsafe::create_dir_all(&tenant_dir).context("create tenant directory")?;
|
||||
fs::File::create(&marker_file).context("create tenant attaching marker file")?;
|
||||
crashsafe::fsync_file_and_parent(&marker_file)
|
||||
.context("fsync tenant attaching marker file and parent")?;
|
||||
}
|
||||
debug_assert!(tenant_dir.is_dir());
|
||||
debug_assert!(marker_file.is_file());
|
||||
|
||||
// Get list of remote timelines
|
||||
// download index files for every tenant timeline
|
||||
info!("listing remote timelines");
|
||||
@@ -517,19 +540,18 @@ impl Tenant {
|
||||
for (timeline_id, index_part) in remote_timelines.iter() {
|
||||
let remote_metadata = index_part.parse_metadata().with_context(|| {
|
||||
format!(
|
||||
"Failed to parse metadata file from remote storage for tenant {}",
|
||||
self.tenant_id
|
||||
"Failed to parse metadata file from remote storage for tenant {} timeline {}",
|
||||
self.tenant_id, timeline_id
|
||||
)
|
||||
})?;
|
||||
timeline_ancestors.insert(*timeline_id, remote_metadata);
|
||||
index_parts.insert(*timeline_id, index_part);
|
||||
}
|
||||
|
||||
let sorted_timelines = tree_sort_timelines(timeline_ancestors)?;
|
||||
|
||||
// For every timeline, download the metadata file, scan the local directory,
|
||||
// and build a layer map that contains an entry for each remote and local
|
||||
// layer file.
|
||||
let sorted_timelines = tree_sort_timelines(timeline_ancestors)?;
|
||||
for (timeline_id, _metadata) in sorted_timelines {
|
||||
// TODO again handle early failure
|
||||
self.load_remote_timeline(
|
||||
@@ -537,9 +559,20 @@ impl Tenant {
|
||||
index_parts[&timeline_id],
|
||||
remote_storage.clone(),
|
||||
)
|
||||
.await?
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"failed to load remote timeline {} for tenant {}",
|
||||
timeline_id, self.tenant_id
|
||||
)
|
||||
})?;
|
||||
}
|
||||
|
||||
std::fs::remove_file(&marker_file)
|
||||
.with_context(|| format!("unlink attach marker file {}", marker_file.display()))?;
|
||||
crashsafe::fsync(marker_file.parent().expect("marker file has parent dir"))
|
||||
.context("fsync tenant directory after unlinking attach marker file")?;
|
||||
|
||||
// FIXME: Check if the state has changed to Stopping while we were downloading stuff
|
||||
// We're ready for business.
|
||||
// Change to active state under the hood spawns background loops
|
||||
|
||||
@@ -930,6 +930,8 @@ impl Timeline {
|
||||
num_layers += 1;
|
||||
} else if fname == METADATA_FILE_NAME || fname.ends_with(".old") {
|
||||
// ignore these
|
||||
} else if crate::storage_sync::is_temp_download_file(&direntry_path) {
|
||||
info!("skipping temp download file, reconcile_with_remote will resume / clean up: {}", fname);
|
||||
} else if is_ephemeral_file(&fname) {
|
||||
// Delete any old ephemeral files
|
||||
trace!("deleting old ephemeral file in timeline dir: {}", fname);
|
||||
|
||||
@@ -20,6 +20,7 @@ use crate::walredo::PostgresRedoManager;
|
||||
use crate::TEMP_FILE_SUFFIX;
|
||||
|
||||
use utils::crashsafe::{self, path_with_suffix_extension};
|
||||
use utils::fs_ext::PathExt;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
mod tenants_state {
|
||||
@@ -79,16 +80,25 @@ pub fn init_tenant_mgr(
|
||||
);
|
||||
}
|
||||
} else {
|
||||
number_of_tenants += 1;
|
||||
if let Err(e) =
|
||||
load_local_tenant(conf, &tenant_dir_path, remote_storage.clone())
|
||||
{
|
||||
error!(
|
||||
match load_local_tenant(conf, &tenant_dir_path, remote_storage.clone()) {
|
||||
Ok(Some(_)) => number_of_tenants += 1,
|
||||
Ok(None) => {
|
||||
// This case happens if we crash during attach before creating the attach marker file
|
||||
if let Err(e) = std::fs::remove_dir(&tenant_dir_path) {
|
||||
error!(
|
||||
"Failed to remove empty tenant directory '{}': {e:#}",
|
||||
tenant_dir_path.display()
|
||||
)
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
error!(
|
||||
"Failed to collect tenant files from dir '{}' for entry {:?}, reason: {:#}",
|
||||
tenants_dir.display(),
|
||||
dir_entry,
|
||||
e
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -114,7 +124,19 @@ fn load_local_tenant(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_path: &Path,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> anyhow::Result<Option<()>> {
|
||||
if !tenant_path.is_dir() {
|
||||
anyhow::bail!("tenant_path is not a directory: {tenant_path:?}")
|
||||
}
|
||||
|
||||
let is_empty = tenant_path
|
||||
.is_empty_dir()
|
||||
.context("check whether tenant_path is an empty dir")?;
|
||||
if is_empty {
|
||||
info!("skipping empty tenant directory {tenant_path:?}");
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let tenant_id = tenant_path
|
||||
.file_name()
|
||||
.and_then(OsStr::to_str)
|
||||
@@ -122,11 +144,33 @@ fn load_local_tenant(
|
||||
.parse::<TenantId>()
|
||||
.context("Could not parse tenant id out of the tenant dir name")?;
|
||||
|
||||
// Start loading the tenant into memory. It will initially be in Loading
|
||||
// state.
|
||||
let tenant = Tenant::spawn_load(conf, tenant_id, remote_storage)?;
|
||||
let tenant = if conf.tenant_attaching_mark_file_path(&tenant_id).exists() {
|
||||
info!("tenant {tenant_id} has attaching mark file, resuming its attach operation");
|
||||
if let Some(remote_storage) = remote_storage {
|
||||
Tenant::spawn_attach(conf, tenant_id, &remote_storage)?
|
||||
} else {
|
||||
warn!("tenant {tenant_id} has attaching mark file, but pageserver has no remote storage configured");
|
||||
// XXX there should be a constructor to make a broken tenant
|
||||
// TODO should we use Tenant::load_tenant_config() here?
|
||||
let tenant_conf = TenantConfOpt::default();
|
||||
let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id));
|
||||
let tenant = Tenant::new(
|
||||
TenantState::Broken,
|
||||
conf,
|
||||
tenant_conf,
|
||||
wal_redo_manager,
|
||||
tenant_id,
|
||||
remote_storage,
|
||||
);
|
||||
Arc::new(tenant)
|
||||
}
|
||||
} else {
|
||||
info!("tenant {tenant_id} is assumed to be loadable, starting load operation");
|
||||
// Start loading the tenant into memory. It will initially be in Loading state.
|
||||
Tenant::spawn_load(conf, tenant_id, remote_storage)?
|
||||
};
|
||||
tenants_state::write_tenants().insert(tenant_id, tenant);
|
||||
Ok(())
|
||||
return Ok(Some(()));
|
||||
}
|
||||
|
||||
///
|
||||
|
||||
@@ -63,9 +63,11 @@ def test_remote_storage_backup_and_restore(
|
||||
)
|
||||
env.pageserver.allowed_errors.append(".*No timelines to attach received.*")
|
||||
|
||||
env.pageserver.allowed_errors.append(".*Tenant download is already in progress.*")
|
||||
env.pageserver.allowed_errors.append(".*Failed to get local tenant state.*")
|
||||
env.pageserver.allowed_errors.append(".*No metadata file found in the timeline directory.*")
|
||||
# FIXME retry downloads without throwing errors
|
||||
env.pageserver.allowed_errors.append(".*failed to load remote timeline.*")
|
||||
# we have a bunch of pytest.raises for this below
|
||||
env.pageserver.allowed_errors.append(".*tenant already exists.*")
|
||||
|
||||
pageserver_http = env.pageserver.http_client()
|
||||
pg = env.postgres.create_start("main")
|
||||
@@ -118,19 +120,22 @@ def test_remote_storage_backup_and_restore(
|
||||
time.sleep(10)
|
||||
|
||||
# assert cannot attach timeline that is scheduled for download
|
||||
with pytest.raises(Exception, match="Conflict: Tenant download is already in progress"):
|
||||
# FIXME implement layer download retries
|
||||
with pytest.raises(Exception, match="tenant already exists, current state: Broken"):
|
||||
client.tenant_attach(tenant_id)
|
||||
|
||||
tenant_status = client.tenant_status(tenant_id)
|
||||
log.info("Tenant status with active failpoint: %s", tenant_status)
|
||||
assert tenant_status["has_in_progress_downloads"] is True
|
||||
# FIXME implement layer download retries
|
||||
# assert tenant_status["has_in_progress_downloads"] is True
|
||||
|
||||
# trigger temporary download files removal
|
||||
env.pageserver.stop()
|
||||
env.pageserver.start()
|
||||
|
||||
client.tenant_attach(tenant_id)
|
||||
|
||||
# ensure that an initiated attach operation survives pageserver restart
|
||||
with pytest.raises(Exception, match="tenant already exists"):
|
||||
client.tenant_attach(tenant_id)
|
||||
log.info("waiting for timeline redownload")
|
||||
wait_until(
|
||||
number_of_iterations=20,
|
||||
|
||||
Reference in New Issue
Block a user