crash-safe and resumable tenant attach

This change introduces a marker file

  $repo/tenants/$tenant_id/attaching

that is present while a tenant is in Attaching state.

When the pageserver restarts, we use the marker to resume the tenant attach operation.
Before this change, a crash during tenant attach would result in one of the following:
1. a crash upon restart due to a missing metadata file (IIRC), or
2. a "successful" load of the tenant with only a subset of its timelines.
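
A hedged sketch of the ordering that makes this crash-safe (the two wrapper functions and the
crate::config import path are assumptions for illustration; the real sequence is inlined in
Tenant::attach, see the diff below):

  use anyhow::Result;
  use utils::{crashsafe, id::TenantId};
  use crate::config::PageServerConf; // assumed module path

  // Start of attach: make "attach in progress" durable before any timeline is downloaded.
  fn create_attach_marker(conf: &'static PageServerConf, tenant_id: &TenantId) -> Result<()> {
      let marker = conf.tenant_attaching_mark_file_path(tenant_id);
      let tenant_dir = marker.parent().expect("marker lives inside the tenant dir");
      crashsafe::create_dir_all(tenant_dir)?;
      std::fs::File::create(&marker)?;
      crashsafe::fsync_file_and_parent(&marker)?;
      Ok(())
  }

  // End of attach: make the removal durable so the next restart does a normal load.
  fn remove_attach_marker(conf: &'static PageServerConf, tenant_id: &TenantId) -> Result<()> {
      let marker = conf.tenant_attaching_mark_file_path(tenant_id);
      std::fs::remove_file(&marker)?;
      crashsafe::fsync(marker.parent().expect("marker lives inside the tenant dir"))?;
      Ok(())
  }
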
Author: Christian Schwarz
Date: 2022-11-14 13:08:45 -05:00
Committed by: Dmitry Rodionov
Parent: c4c4558736
Commit: bb6dbd2f43

9 changed files with 175 additions and 24 deletions

libs/utils/src/fs_ext.rs (new file)

@@ -0,0 +1,45 @@
/// Extensions to `std::fs` types.
use std::{fs, io, path::Path};

pub trait PathExt {
    /// Returns an error if `self` is not a directory.
    fn is_empty_dir(&self) -> io::Result<bool>;
}

impl<P> PathExt for P
where
    P: AsRef<Path>,
{
    fn is_empty_dir(&self) -> io::Result<bool> {
        Ok(fs::read_dir(self)?.into_iter().next().is_none())
    }
}

#[cfg(test)]
mod test {
    use std::path::PathBuf;

    #[test]
    fn is_empty_dir() {
        use super::PathExt;

        let dir = tempfile::tempdir().unwrap();
        let dir_path = dir.path();

        // test positive case
        assert!(
            dir_path.is_empty_dir().expect("test failure"),
            "new tempdir should be empty"
        );

        // invoke on a file to ensure it returns an error
        let file_path: PathBuf = dir_path.join("testfile");
        let f = std::fs::File::create(&file_path).unwrap();
        drop(f);
        assert!(file_path.is_empty_dir().is_err());

        // do it again on a path we know to be nonexistent
        std::fs::remove_file(&file_path).unwrap();
        assert!(file_path.is_empty_dir().is_err());
    }
}
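
A hedged usage sketch: is_empty_dir errors out (rather than returning false) when the path is
missing or is not a directory, so callers should propagate the error instead of treating it as
"non-empty". The helper below is made up; tenant_mgr.rs uses the same pattern:

  use anyhow::Context;
  use std::path::Path;
  use utils::fs_ext::PathExt;

  fn dir_has_content(path: &Path) -> anyhow::Result<bool> {
      let is_empty = path
          .is_empty_dir()
          .with_context(|| format!("check whether {} is an empty dir", path.display()))?;
      Ok(!is_empty)
  }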


@@ -48,6 +48,8 @@ pub mod nonblock;
// Default signal handling
pub mod signals;
pub mod fs_ext;
/// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
///
/// we have several cases:


@@ -23,7 +23,7 @@ use utils::{
postgres_backend::AuthType,
};
use crate::tenant::TIMELINES_SEGMENT_NAME;
use crate::tenant::{TENANT_ATTACHING_MARKER_FILENAME, TIMELINES_SEGMENT_NAME};
use crate::tenant_config::{TenantConf, TenantConfOpt};
/// The name of the metadata file pageserver creates per timeline.
@@ -390,6 +390,11 @@ impl PageServerConf {
self.tenants_path().join(tenant_id.to_string())
}
pub fn tenant_attaching_mark_file_path(&self, tenant_id: &TenantId) -> PathBuf {
self.tenant_path(&tenant_id)
.join(TENANT_ATTACHING_MARKER_FILENAME)
}
/// Points to a place in pageserver's local directory,
/// where certain tenant's tenantconf file should be located.
pub fn tenant_config_path(&self, tenant_id: TenantId) -> PathBuf {
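
For orientation, a hedged example of what the new helper resolves to (assumes a
conf: &PageServerConf and a tenant_id: TenantId in scope; TENANT_ATTACHING_MARKER_FILENAME is
"attaching", see tenant.rs below):

  let marker = conf.tenant_attaching_mark_file_path(&tenant_id);
  // e.g. <tenants_path>/<tenant_id>/attaching, the same path the commit message describes
  assert!(marker.ends_with(TENANT_ATTACHING_MARKER_FILENAME));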


@@ -129,6 +129,7 @@ mod upload;
// re-export this
pub use download::list_remote_timelines;
pub use download::is_temp_download_file;
use std::collections::{HashMap, VecDeque};
use std::fmt::Debug;


@@ -1,5 +1,6 @@
//! Helper functions to download files from remote storage with a RemoteStorage
use std::collections::HashSet;
use std::path::Path;
use anyhow::{bail, Context};
use futures::stream::{FuturesUnordered, StreamExt};
@@ -36,6 +37,21 @@ pub async fn download_layer_file<'a>(
download_layer_file_guts(conf, storage, tenant_id, timeline_id, path, layer_metadata).await
}
const TEMP_DOWNLOAD_EXTENSION: &str = "temp_download";

pub fn is_temp_download_file(path: &Path) -> bool {
    let extension = path.extension().map(|pname| {
        pname
            .to_str()
            .expect("paths passed to this function must be valid Rust strings")
    });
    match extension {
        Some(TEMP_DOWNLOAD_EXTENSION) => true,
        Some(_) => false,
        None => false,
    }
}
async fn download_layer_file_guts<'a>(
conf: &'static PageServerConf,
storage: &'a GenericRemoteStorage,
@@ -44,8 +60,6 @@ async fn download_layer_file_guts<'a>(
path: &'a RelativePath,
layer_metadata: &'a LayerFileMetadata,
) -> anyhow::Result<u64> {
const TEMP_DOWNLOAD_EXTENSION: &str = "temp_download";
let timeline_path = conf.timeline_path(&timeline_id, &tenant_id);
let local_path = path.to_local_path(&timeline_path);
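
A hedged usage example of the new predicate, written as a test (file names are made up; only the
extension matters):

  #[cfg(test)]
  mod temp_download_example {
      use super::is_temp_download_file;
      use std::path::Path;

      #[test]
      fn recognizes_temp_download_extension() {
          // Left behind by an interrupted layer download; the timeline directory scan skips
          // such files so reconcile_with_remote can resume or clean them up (see timeline.rs below).
          assert!(is_temp_download_file(Path::new("some_layer_file.temp_download")));
          // Anything else, e.g. the metadata file, is not a temp download.
          assert!(!is_temp_download_file(Path::new("metadata")));
      }
  }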


@@ -104,6 +104,8 @@ pub use crate::tenant::timeline::WalReceiverInfo;
/// Parts of the `.neon/tenants/<tenant_id>/timelines/<timeline_id>` directory prefix.
pub const TIMELINES_SEGMENT_NAME: &str = "timelines";
pub const TENANT_ATTACHING_MARKER_FILENAME: &str = "attaching";
///
/// Tenant consists of multiple timelines. Keep them in a hash table.
///
@@ -498,6 +500,27 @@ impl Tenant {
///
#[instrument(skip(self), fields(tenant_id=%self.tenant_id))]
async fn attach(self: &Arc<Tenant>) -> anyhow::Result<()> {
// Create directory with marker file to indicate attaching state.
// The load_local_tenants() function in tenant_mgr relies on the marker file
// to determine whether a tenant has finished attaching.
let tenant_dir = self.conf.tenant_path(&self.tenant_id);
let marker_file = self.conf.tenant_attaching_mark_file_path(&self.tenant_id);
debug_assert_eq!(marker_file.parent().unwrap(), tenant_dir);
if tenant_dir.exists() {
if !marker_file.is_file() {
anyhow::bail!(
"calling Tenant::attach with a tenant directory that doesn't have the attaching marker file:\ntenant_dir: {}\nmarker_file: {}",
tenant_dir.display(), marker_file.display());
}
} else {
crashsafe::create_dir_all(&tenant_dir).context("create tenant directory")?;
fs::File::create(&marker_file).context("create tenant attaching marker file")?;
crashsafe::fsync_file_and_parent(&marker_file)
.context("fsync tenant attaching marker file and parent")?;
}
debug_assert!(tenant_dir.is_dir());
debug_assert!(marker_file.is_file());
// Get list of remote timelines
// download index files for every tenant timeline
info!("listing remote timelines");
@@ -517,19 +540,18 @@ impl Tenant {
for (timeline_id, index_part) in remote_timelines.iter() {
let remote_metadata = index_part.parse_metadata().with_context(|| {
format!(
"Failed to parse metadata file from remote storage for tenant {}",
self.tenant_id
"Failed to parse metadata file from remote storage for tenant {} timeline {}",
self.tenant_id, timeline_id
)
})?;
timeline_ancestors.insert(*timeline_id, remote_metadata);
index_parts.insert(*timeline_id, index_part);
}
let sorted_timelines = tree_sort_timelines(timeline_ancestors)?;
// For every timeline, download the metadata file, scan the local directory,
// and build a layer map that contains an entry for each remote and local
// layer file.
let sorted_timelines = tree_sort_timelines(timeline_ancestors)?;
for (timeline_id, _metadata) in sorted_timelines {
// TODO again handle early failure
self.load_remote_timeline(
@@ -537,9 +559,20 @@ impl Tenant {
index_parts[&timeline_id],
remote_storage.clone(),
)
.await?
.await
.with_context(|| {
format!(
"failed to load remote timeline {} for tenant {}",
timeline_id, self.tenant_id
)
})?;
}
std::fs::remove_file(&marker_file)
.with_context(|| format!("unlink attach marker file {}", marker_file.display()))?;
crashsafe::fsync(marker_file.parent().expect("marker file has parent dir"))
.context("fsync tenant directory after unlinking attach marker file")?;
// FIXME: Check if the state has changed to Stopping while we were downloading stuff
// We're ready for business.
// Change to active state under the hood spawns background loops
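
For reference, a hedged walkthrough of the crash windows this ordering leaves behind (the
recovery side is tenant_mgr::load_local_tenant in the next section):

  // crash before the marker file is durable       -> the tenant dir may exist but be empty;
  //                                                   startup removes the empty directory
  // crash while remote timelines are downloading  -> marker still present; startup re-runs the
  //                                                   attach from the beginning (it is resumable)
  // crash after remove_file, before the dir fsync -> the marker may still be visible; the attach
  //                                                   simply runs again on the next start
  // crash after the final fsync                   -> marker gone; startup performs a normal load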


@@ -930,6 +930,8 @@ impl Timeline {
num_layers += 1;
} else if fname == METADATA_FILE_NAME || fname.ends_with(".old") {
// ignore these
} else if crate::storage_sync::is_temp_download_file(&direntry_path) {
info!("skipping temp download file, reconcile_with_remote will resume / clean up: {}", fname);
} else if is_ephemeral_file(&fname) {
// Delete any old ephemeral files
trace!("deleting old ephemeral file in timeline dir: {}", fname);


@@ -20,6 +20,7 @@ use crate::walredo::PostgresRedoManager;
use crate::TEMP_FILE_SUFFIX;
use utils::crashsafe::{self, path_with_suffix_extension};
use utils::fs_ext::PathExt;
use utils::id::{TenantId, TimelineId};
mod tenants_state {
@@ -79,16 +80,25 @@ pub fn init_tenant_mgr(
);
}
} else {
number_of_tenants += 1;
if let Err(e) =
load_local_tenant(conf, &tenant_dir_path, remote_storage.clone())
{
error!(
match load_local_tenant(conf, &tenant_dir_path, remote_storage.clone()) {
Ok(Some(_)) => number_of_tenants += 1,
Ok(None) => {
// This case happens if we crash during attach before creating the attach marker file
if let Err(e) = std::fs::remove_dir(&tenant_dir_path) {
error!(
"Failed to remove empty tenant directory '{}': {e:#}",
tenant_dir_path.display()
)
}
}
Err(e) => {
error!(
"Failed to collect tenant files from dir '{}' for entry {:?}, reason: {:#}",
tenants_dir.display(),
dir_entry,
e
);
}
}
}
}
@@ -114,7 +124,19 @@ fn load_local_tenant(
conf: &'static PageServerConf,
tenant_path: &Path,
remote_storage: Option<GenericRemoteStorage>,
) -> anyhow::Result<()> {
) -> anyhow::Result<Option<()>> {
if !tenant_path.is_dir() {
anyhow::bail!("tenant_path is not a directory: {tenant_path:?}")
}
let is_empty = tenant_path
.is_empty_dir()
.context("check whether tenant_path is an empty dir")?;
if is_empty {
info!("skipping empty tenant directory {tenant_path:?}");
return Ok(None);
}
let tenant_id = tenant_path
.file_name()
.and_then(OsStr::to_str)
@@ -122,11 +144,33 @@ fn load_local_tenant(
.parse::<TenantId>()
.context("Could not parse tenant id out of the tenant dir name")?;
// Start loading the tenant into memory. It will initially be in Loading
// state.
let tenant = Tenant::spawn_load(conf, tenant_id, remote_storage)?;
let tenant = if conf.tenant_attaching_mark_file_path(&tenant_id).exists() {
info!("tenant {tenant_id} has attaching mark file, resuming its attach operation");
if let Some(remote_storage) = remote_storage {
Tenant::spawn_attach(conf, tenant_id, &remote_storage)?
} else {
warn!("tenant {tenant_id} has attaching mark file, but pageserver has no remote storage configured");
// XXX there should be a constructor to make a broken tenant
// TODO should we use Tenant::load_tenant_config() here?
let tenant_conf = TenantConfOpt::default();
let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id));
let tenant = Tenant::new(
TenantState::Broken,
conf,
tenant_conf,
wal_redo_manager,
tenant_id,
remote_storage,
);
Arc::new(tenant)
}
} else {
info!("tenant {tenant_id} is assumed to be loadable, starting load operation");
// Start loading the tenant into memory. It will initially be in Loading state.
Tenant::spawn_load(conf, tenant_id, remote_storage)?
};
tenants_state::write_tenants().insert(tenant_id, tenant);
Ok(())
return Ok(Some(()));
}
///
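
Condensed into a hedged decision sketch (the enum and function are hypothetical; the real logic
is load_local_tenant above):

  use std::io;
  use std::path::Path;
  use utils::fs_ext::PathExt;

  enum StartupAction {
      RemoveEmptyDir, // crash happened before the attach marker was created
      ResumeAttach,   // Tenant::spawn_attach
      MarkBroken,     // marker present, but no remote storage configured to resume from
      NormalLoad,     // Tenant::spawn_load
  }

  fn decide(tenant_dir: &Path, attach_marker: &Path, have_remote_storage: bool) -> io::Result<StartupAction> {
      if tenant_dir.is_empty_dir()? {
          Ok(StartupAction::RemoveEmptyDir)
      } else if attach_marker.exists() {
          if have_remote_storage {
              Ok(StartupAction::ResumeAttach)
          } else {
              Ok(StartupAction::MarkBroken)
          }
      } else {
          Ok(StartupAction::NormalLoad)
      }
  }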


@@ -63,9 +63,11 @@ def test_remote_storage_backup_and_restore(
)
env.pageserver.allowed_errors.append(".*No timelines to attach received.*")
env.pageserver.allowed_errors.append(".*Tenant download is already in progress.*")
env.pageserver.allowed_errors.append(".*Failed to get local tenant state.*")
env.pageserver.allowed_errors.append(".*No metadata file found in the timeline directory.*")
# FIXME retry downloads without throwing errors
env.pageserver.allowed_errors.append(".*failed to load remote timeline.*")
# we have a bunch of pytest.raises for this below
env.pageserver.allowed_errors.append(".*tenant already exists.*")
pageserver_http = env.pageserver.http_client()
pg = env.postgres.create_start("main")
@@ -118,19 +120,22 @@ def test_remote_storage_backup_and_restore(
time.sleep(10)
# assert cannot attach timeline that is scheduled for download
with pytest.raises(Exception, match="Conflict: Tenant download is already in progress"):
# FIXME implement layer download retries
with pytest.raises(Exception, match="tenant already exists, current state: Broken"):
client.tenant_attach(tenant_id)
tenant_status = client.tenant_status(tenant_id)
log.info("Tenant status with active failpoint: %s", tenant_status)
assert tenant_status["has_in_progress_downloads"] is True
# FIXME implement layer download retries
# assert tenant_status["has_in_progress_downloads"] is True
# trigger temporary download files removal
env.pageserver.stop()
env.pageserver.start()
client.tenant_attach(tenant_id)
# ensure that an initiated attach operation survives pageserver restart
with pytest.raises(Exception, match="tenant already exists"):
client.tenant_attach(tenant_id)
log.info("waiting for timeline redownload")
wait_until(
number_of_iterations=20,