stop writing metadata file (#6769)

Building atop #6777, this PR removes the code that writes the `metadata`
file and adds a piece of migration code that removes any remaining
`metadata` files.

We'll remove the migration code after this PR has been deployed.

part of https://github.com/neondatabase/neon/issues/6663

More cleanups are punted to a follow-up issue, as they touch a lot of code:
https://github.com/neondatabase/neon/issues/6890
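
For context, the migration boils down to an idempotent unlink of the legacy file plus an fsync of the containing timeline directory, so the removal survives a crash. A minimal standalone sketch of that pattern (hypothetical helper name and paths, not the actual pageserver code, which lives in `load_tenant_config` below):

```rust
use std::fs;
use std::io;
use std::path::Path;

/// Hypothetical helper mirroring the migration: remove one timeline's
/// legacy `metadata` file and make the unlink durable.
fn remove_legacy_metadata_file(timeline_dir: &Path) -> io::Result<()> {
    let metadata_path = timeline_dir.join("metadata");
    match fs::remove_file(&metadata_path) {
        Ok(()) => {
            // fsync the directory (not the now-gone file) so the unlink
            // is on disk before we consider this timeline migrated.
            fs::File::open(timeline_dir)?.sync_all()?;
            println!("removed legacy metadata file at {}", metadata_path.display());
        }
        // Already gone, or never written: nothing to do. This version of
        // the software no longer writes the file, so we're done either way.
        Err(e) if e.kind() == io::ErrorKind::NotFound => {}
        Err(e) => return Err(e),
    }
    Ok(())
}
```

The `NotFound` arm is what makes this safe to re-run on every startup until the migration code itself is removed.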
Author: Christian Schwarz
Date: 2024-02-23 14:33:47 +01:00
Committed by: GitHub
Parent: 6f8f7c7de9
Commit: cd449d66ea
12 changed files with 95 additions and 270 deletions

View File

@@ -39,7 +39,7 @@ use crate::tenant::{
};
use crate::virtual_file;
use crate::{
-    IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME,
+    IGNORED_TENANT_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME,
TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX,
};
@@ -826,17 +826,6 @@ impl PageServerConf {
.join(connection_id.to_string())
}
-    /// Points to a place in pageserver's local directory,
-    /// where certain timeline's metadata file should be located.
-    pub fn metadata_path(
-        &self,
-        tenant_shard_id: &TenantShardId,
-        timeline_id: &TimelineId,
-    ) -> Utf8PathBuf {
-        self.timeline_path(tenant_shard_id, timeline_id)
-            .join(METADATA_FILE_NAME)
-    }
/// Turns storage remote path of a file into its local path.
pub fn local_path(&self, remote_path: &RemotePath) -> Utf8PathBuf {
remote_path.with_base(&self.workdir)

View File

@@ -169,15 +169,6 @@ pub fn is_delete_mark(path: &Utf8Path) -> bool {
ends_with_suffix(path, TIMELINE_DELETE_MARK_SUFFIX)
}
-fn is_walkdir_io_not_found(e: &walkdir::Error) -> bool {
-    if let Some(e) = e.io_error() {
-        if e.kind() == std::io::ErrorKind::NotFound {
-            return true;
-        }
-    }
-    false
-}
/// During pageserver startup, we need to order operations not to exhaust tokio worker threads by
/// blocking.
///

View File

@@ -172,9 +172,6 @@ pub(crate) mod throttle;
pub(crate) use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
pub(crate) use timeline::{LogicalSizeCalculationCause, PageReconstructError, Timeline};
-// re-export for use in remote_timeline_client.rs
-pub use crate::tenant::metadata::save_metadata;
// re-export for use in walreceiver
pub use crate::tenant::timeline::WalReceiverInfo;
@@ -1151,17 +1148,6 @@ impl Tenant {
None
};
-        // timeline loading after attach expects to find metadata file for each metadata
-        save_metadata(
-            self.conf,
-            &self.tenant_shard_id,
-            &timeline_id,
-            &remote_metadata,
-        )
-        .await
-        .context("save_metadata")
-        .map_err(LoadLocalTimelineError::Load)?;
self.timeline_init_and_sync(
timeline_id,
resources,
@@ -3293,10 +3279,7 @@ impl Tenant {
timeline_struct.init_empty_layer_map(start_lsn);
-        if let Err(e) = self
-            .create_timeline_files(&uninit_mark.timeline_path, &new_timeline_id, new_metadata)
-            .await
-        {
+        if let Err(e) = self.create_timeline_files(&uninit_mark.timeline_path).await {
error!("Failed to create initial files for timeline {tenant_shard_id}/{new_timeline_id}, cleaning up: {e:?}");
cleanup_timeline_directory(uninit_mark);
return Err(e);
@@ -3313,26 +3296,13 @@ impl Tenant {
))
}
-    async fn create_timeline_files(
-        &self,
-        timeline_path: &Utf8Path,
-        new_timeline_id: &TimelineId,
-        new_metadata: &TimelineMetadata,
-    ) -> anyhow::Result<()> {
+    async fn create_timeline_files(&self, timeline_path: &Utf8Path) -> anyhow::Result<()> {
crashsafe::create_dir(timeline_path).context("Failed to create timeline directory")?;
fail::fail_point!("after-timeline-uninit-mark-creation", |_| {
anyhow::bail!("failpoint after-timeline-uninit-mark-creation");
});
-        save_metadata(
-            self.conf,
-            &self.tenant_shard_id,
-            new_timeline_id,
-            new_metadata,
-        )
-        .await
-        .context("Failed to create timeline metadata")?;
Ok(())
}

View File

@@ -8,20 +8,11 @@
//!
//! [`remote_timeline_client`]: super::remote_timeline_client
-use std::io::{self};
-use anyhow::{ensure, Context};
-use pageserver_api::shard::TenantShardId;
+use anyhow::ensure;
use serde::{de::Error, Deserialize, Serialize, Serializer};
use thiserror::Error;
use utils::bin_ser::SerializeError;
-use utils::crashsafe::path_with_suffix_extension;
use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn};
-use crate::config::PageServerConf;
-use crate::virtual_file::VirtualFile;
-use crate::TEMP_FILE_SUFFIX;
/// Use special format number to enable backward compatibility.
const METADATA_FORMAT_VERSION: u16 = 4;
@@ -268,32 +259,6 @@ impl Serialize for TimelineMetadata {
}
}
-/// Save timeline metadata to file
-#[tracing::instrument(skip_all, fields(%tenant_id=tenant_shard_id.tenant_id, %shard_id=tenant_shard_id.shard_slug(), %timeline_id))]
-pub async fn save_metadata(
-    conf: &'static PageServerConf,
-    tenant_shard_id: &TenantShardId,
-    timeline_id: &TimelineId,
-    data: &TimelineMetadata,
-) -> anyhow::Result<()> {
-    let path = conf.metadata_path(tenant_shard_id, timeline_id);
-    let temp_path = path_with_suffix_extension(&path, TEMP_FILE_SUFFIX);
-    let metadata_bytes = data.to_bytes().context("serialize metadata")?;
-    VirtualFile::crashsafe_overwrite(&path, &temp_path, metadata_bytes)
-        .await
-        .context("write metadata")?;
-    Ok(())
-}
-#[derive(Error, Debug)]
-pub enum LoadMetadataError {
-    #[error(transparent)]
-    Read(#[from] io::Error),
-    #[error(transparent)]
-    Decode(#[from] anyhow::Error),
-}
#[cfg(test)]
mod tests {
use super::*;

View File

@@ -42,7 +42,7 @@ use crate::tenant::config::{
use crate::tenant::delete::DeleteTenantFlow;
use crate::tenant::span::debug_assert_current_span_has_tenant_id;
use crate::tenant::{AttachedTenantConf, SpawnMode, Tenant, TenantState};
-use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX};
+use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TEMP_FILE_SUFFIX};
use utils::crashsafe::path_with_suffix_extension;
use utils::fs_ext::PathExt;
@@ -359,12 +359,6 @@ fn load_tenant_config(
return Ok(None);
}
-    let tenant_ignore_mark_file = tenant_dir_path.join(IGNORED_TENANT_FILE_NAME);
-    if tenant_ignore_mark_file.exists() {
-        info!("Found an ignore mark file {tenant_ignore_mark_file:?}, skipping the tenant");
-        return Ok(None);
-    }
let tenant_shard_id = match tenant_dir_path
.file_name()
.unwrap_or_default()
@@ -377,6 +371,59 @@ fn load_tenant_config(
}
};
+    // Clean up legacy `metadata` files.
+    // Doing it here because every single tenant directory is visited here.
+    // In any later code, there's different treatment of tenant dirs
+    // ... depending on whether the tenant is in re-attach response or not
+    // ... depending on whether the tenant is ignored or not
+    assert_eq!(
+        &conf.tenant_path(&tenant_shard_id),
+        &tenant_dir_path,
+        "later use of conf....path() methods would be dubious"
+    );
+    let timelines: Vec<TimelineId> = match conf.timelines_path(&tenant_shard_id).read_dir_utf8() {
+        Ok(iter) => {
+            let mut timelines = Vec::new();
+            for res in iter {
+                let p = res?;
+                let Some(timeline_id) = p.file_name().parse::<TimelineId>().ok() else {
+                    // skip any entries that aren't TimelineId, such as
+                    // - *.___temp dirs
+                    // - unfinished initdb uploads (test_non_uploaded_root_timeline_is_deleted_after_restart)
+                    continue;
+                };
+                timelines.push(timeline_id);
+            }
+            timelines
+        }
+        Err(e) if e.kind() == std::io::ErrorKind::NotFound => vec![],
+        Err(e) => return Err(anyhow::anyhow!(e)),
+    };
+    for timeline_id in timelines {
+        let timeline_path = &conf.timeline_path(&tenant_shard_id, &timeline_id);
+        let metadata_path = timeline_path.join(METADATA_FILE_NAME);
+        match std::fs::remove_file(&metadata_path) {
+            Ok(()) => {
+                crashsafe::fsync(timeline_path)
+                    .context("fsync timeline dir after removing legacy metadata file")?;
+                info!("removed legacy metadata file at {metadata_path}");
+            }
+            Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
+                // something removed the file earlier, or it was never there
+                // We don't care, this software version doesn't write it again, so, we're good.
+            }
+            Err(e) => {
+                anyhow::bail!("remove legacy metadata file: {e}: {metadata_path}");
+            }
+        }
+    }
+    let tenant_ignore_mark_file = tenant_dir_path.join(IGNORED_TENANT_FILE_NAME);
+    if tenant_ignore_mark_file.exists() {
+        info!("Found an ignore mark file {tenant_ignore_mark_file:?}, skipping the tenant");
+        return Ok(None);
+    }
Ok(Some((
tenant_shard_id,
Tenant::load_tenant_config(conf, &tenant_shard_id),

View File

@@ -45,7 +45,7 @@ use rand::Rng;
use remote_storage::{DownloadError, GenericRemoteStorage};
use tokio_util::sync::CancellationToken;
-use tracing::{info_span, instrument, Instrument};
+use tracing::{info_span, instrument, warn, Instrument};
use utils::{
backoff, completion::Barrier, crashsafe::path_with_suffix_extension, fs_ext, id::TimelineId,
};
@@ -791,6 +791,7 @@ async fn init_timeline_state(
let file_name = file_path.file_name().expect("created it from the dentry");
if file_name == METADATA_FILE_NAME {
// Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant.
+            warn!(path=?dentry.path(), "found legacy metadata file, these should have been removed in load_tenant_config");
continue;
} else if crate::is_temporary(&file_path) {
// Temporary files are frequently left behind from restarting during downloads

View File

@@ -54,7 +54,7 @@ use crate::pgdatadir_mapping::DirectoryKind;
use crate::tenant::timeline::logical_size::CurrentLogicalSize;
use crate::tenant::{
layer_map::{LayerMap, SearchResult},
-    metadata::{save_metadata, TimelineMetadata},
+    metadata::TimelineMetadata,
par_fsync,
};
use crate::{
@@ -345,7 +345,7 @@ pub struct Timeline {
///
/// Must only be taken in two places:
/// - [`Timeline::compact`] (this file)
-    /// - [`delete::delete_local_layer_files`]
+    /// - [`delete::delete_local_timeline_directory`]
///
/// Timeline deletion will acquire both compaction and gc locks in whatever order.
compaction_lock: tokio::sync::Mutex<()>,
@@ -354,7 +354,7 @@ pub struct Timeline {
///
/// Must only be taken in two places:
/// - [`Timeline::gc`] (this file)
-    /// - [`delete::delete_local_layer_files`]
+    /// - [`delete::delete_local_timeline_directory`]
///
/// Timeline deletion will acquire both compaction and gc locks in whatever order.
gc_lock: tokio::sync::Mutex<()>,
@@ -1845,7 +1845,11 @@ impl Timeline {
discovered_layers.push((file_name, file_size));
continue;
}
-                Discovered::Metadata | Discovered::IgnoredBackup => {
+                Discovered::Metadata => {
+                    warn!("found legacy metadata file, these should have been removed in load_tenant_config");
+                    continue;
+                }
+                Discovered::IgnoredBackup => {
continue;
}
Discovered::Unknown(file_name) => {
@@ -2352,7 +2356,7 @@ impl Timeline {
fail::fail_point!("timeline-calculate-logical-size-check-dir-exists", |_| {
if !self
.conf
-                .metadata_path(&self.tenant_shard_id, &self.timeline_id)
+                .timeline_path(&self.tenant_shard_id, &self.timeline_id)
.exists()
{
error!("timeline-calculate-logical-size-pre metadata file does not exist")
@@ -3207,7 +3211,7 @@ impl Timeline {
// The new on-disk layers are now in the layer map. We can remove the
// in-memory layer from the map now. The flushed layer is stored in
// the mapping in `create_delta_layer`.
-        let metadata = {
+        {
let mut guard = self.layers.write().await;
if self.cancel.is_cancelled() {
@@ -3221,9 +3225,7 @@ impl Timeline {
self.disk_consistent_lsn.store(disk_consistent_lsn);
// Schedule remote uploads that will reflect our new disk_consistent_lsn
-                Some(self.schedule_uploads(disk_consistent_lsn, layers_to_upload)?)
-            } else {
-                None
+                self.schedule_uploads(disk_consistent_lsn, layers_to_upload)?;
}
// release lock on 'layers'
};
@@ -3238,22 +3240,6 @@ impl Timeline {
// This failpoint is used by another test case `test_pageserver_recovery`.
fail_point!("flush-frozen-exit");
-        // Update the metadata file, with new 'disk_consistent_lsn'
-        //
-        // TODO: This perhaps should be done in 'flush_frozen_layers', after flushing
-        // *all* the layers, to avoid fsyncing the file multiple times.
-        // If we updated our disk_consistent_lsn, persist the updated metadata to local disk.
-        if let Some(metadata) = metadata {
-            save_metadata(
-                self.conf,
-                &self.tenant_shard_id,
-                &self.timeline_id,
-                &metadata,
-            )
-            .await
-            .context("save_metadata")?;
-        }
Ok(())
}
@@ -3309,25 +3295,6 @@ impl Timeline {
Ok(metadata)
}
-    async fn update_metadata_file(
-        &self,
-        disk_consistent_lsn: Lsn,
-        layers_to_upload: impl IntoIterator<Item = ResidentLayer>,
-    ) -> anyhow::Result<()> {
-        let metadata = self.schedule_uploads(disk_consistent_lsn, layers_to_upload)?;
-        save_metadata(
-            self.conf,
-            &self.tenant_shard_id,
-            &self.timeline_id,
-            &metadata,
-        )
-        .await
-        .context("save_metadata")?;
-        Ok(())
-    }
pub(crate) async fn preserve_initdb_archive(&self) -> anyhow::Result<()> {
if let Some(remote_client) = &self.remote_client {
remote_client
@@ -4660,18 +4627,11 @@ impl Timeline {
.replace((new_gc_cutoff, wanted_image_layers.to_keyspace()));
if !layers_to_remove.is_empty() {
-            // Persist the new GC cutoff value in the metadata file, before
-            // we actually remove anything.
-            //
-            // This does not in fact have any effect as we no longer consider local metadata unless
-            // running without remote storage.
-            //
+            // Persist the new GC cutoff value before we actually remove anything.
// This unconditionally schedules also an index_part.json update, even though, we will
// be doing one a bit later with the unlinked gc'd layers.
//
// TODO: remove when implementing <https://github.com/neondatabase/neon/issues/4099>.
-            self.update_metadata_file(self.disk_consistent_lsn.load(), None)
-                .await?;
+            let disk_consistent_lsn = self.disk_consistent_lsn.load();
+            self.schedule_uploads(disk_consistent_lsn, None)?;
let gc_layers = layers_to_remove
.iter()

View File

@@ -6,7 +6,7 @@ use std::{
use anyhow::Context;
use pageserver_api::{models::TimelineState, shard::TenantShardId};
use tokio::sync::OwnedMutexGuard;
-use tracing::{debug, error, info, instrument, warn, Instrument};
+use tracing::{debug, error, info, instrument, Instrument};
use utils::{crashsafe, fs_ext, id::TimelineId};
use crate::{
@@ -124,7 +124,7 @@ async fn set_deleted_in_remote_index(timeline: &Timeline) -> Result<(), DeleteTi
/// No timeout here, GC & Compaction should be responsive to the
/// `TimelineState::Stopping` change.
// pub(super): documentation link
-pub(super) async fn delete_local_layer_files(
+pub(super) async fn delete_local_timeline_directory(
conf: &PageServerConf,
tenant_shard_id: TenantShardId,
timeline: &Timeline,
@@ -149,8 +149,6 @@ pub(super) async fn delete_local_layer_files(
// NB: This need not be atomic because the deleted flag in the IndexPart
// will be observed during tenant/timeline load. The deletion will be resumed there.
//
-    // For configurations without remote storage, we guarantee crash-safety by persisting delete mark file.
-    //
// Note that here we do not bail out on std::io::ErrorKind::NotFound.
// This can happen if we're called a second time, e.g.,
// because of a previous failure/cancellation at/after
@@ -158,72 +156,21 @@ pub(super) async fn delete_local_layer_files(
//
// ErrorKind::NotFound can also happen if we race with tenant detach, because,
// no locks are shared.
-    //
-    // For now, log and continue.
-    // warn! level is technically not appropriate for the
-    // first case because we should expect retries to happen.
-    // But the error is so rare, it seems better to get attention if it happens.
-    //
-    // Note that metadata removal is skipped, this is not technically needed,
-    // but allows to reuse timeline loading code during resumed deletion.
-    // (we always expect that metadata is in place when timeline is being loaded)
+    tokio::fs::remove_dir_all(local_timeline_directory)
+        .await
+        .or_else(fs_ext::ignore_not_found)
+        .context("remove local timeline directory")?;
-    #[cfg(feature = "testing")]
-    let mut counter = 0;
-    // Timeline directory may not exist if we failed to delete mark file and request was retried.
-    if !local_timeline_directory.exists() {
-        return Ok(());
-    }
-    let metadata_path = conf.metadata_path(&tenant_shard_id, &timeline.timeline_id);
-    for entry in walkdir::WalkDir::new(&local_timeline_directory).contents_first(true) {
-        #[cfg(feature = "testing")]
-        {
-            counter += 1;
-            if counter == 2 {
-                fail::fail_point!("timeline-delete-during-rm", |_| {
-                    Err(anyhow::anyhow!("failpoint: timeline-delete-during-rm"))?
-                });
-            }
-        }
-        let entry = entry?;
-        if entry.path() == metadata_path {
-            debug!("found metadata, skipping");
-            continue;
-        }
-        if entry.path() == local_timeline_directory {
-            // Keeping directory because metadata file is still there
-            debug!("found timeline dir itself, skipping");
-            continue;
-        }
-        let metadata = match entry.metadata() {
-            Ok(metadata) => metadata,
-            Err(e) => {
-                if crate::is_walkdir_io_not_found(&e) {
-                    warn!(
-                        timeline_dir=?local_timeline_directory,
-                        path=?entry.path().display(),
-                        "got not found err while removing timeline dir, proceeding anyway"
-                    );
-                    continue;
-                }
-                anyhow::bail!(e);
-            }
-        };
-        if metadata.is_dir() {
-            warn!(path=%entry.path().display(), "unexpected directory under timeline dir");
-            tokio::fs::remove_dir(entry.path()).await
-        } else {
-            tokio::fs::remove_file(entry.path()).await
-        }
-        .with_context(|| format!("Failed to remove: {}", entry.path().display()))?;
-    }
+    // Make sure previous deletions are ordered before mark removal.
+    // Otherwise there is no guarantee that they reach the disk before mark deletion.
+    // So its possible for mark to reach disk first and for other deletions
+    // to be reordered later and thus missed if a crash occurs.
+    // Note that we dont need to sync after mark file is removed
+    // because we can tolerate the case when mark file reappears on startup.
+    let timeline_path = conf.timelines_path(&tenant_shard_id);
+    crashsafe::fsync_async(timeline_path)
+        .await
+        .context("fsync_pre_mark_remove")?;
info!("finished deleting layer files, releasing locks");
drop(guards);
@@ -254,39 +201,6 @@ async fn cleanup_remaining_timeline_fs_traces(
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
) -> anyhow::Result<()> {
-    // Remove local metadata
-    tokio::fs::remove_file(conf.metadata_path(&tenant_shard_id, &timeline_id))
-        .await
-        .or_else(fs_ext::ignore_not_found)
-        .context("remove metadata")?;
-    fail::fail_point!("timeline-delete-after-rm-metadata", |_| {
-        Err(anyhow::anyhow!(
-            "failpoint: timeline-delete-after-rm-metadata"
-        ))?
-    });
-    // Remove timeline dir
-    tokio::fs::remove_dir(conf.timeline_path(&tenant_shard_id, &timeline_id))
-        .await
-        .or_else(fs_ext::ignore_not_found)
-        .context("timeline dir")?;
-    fail::fail_point!("timeline-delete-after-rm-dir", |_| {
-        Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm-dir"))?
-    });
-    // Make sure previous deletions are ordered before mark removal.
-    // Otherwise there is no guarantee that they reach the disk before mark deletion.
-    // So its possible for mark to reach disk first and for other deletions
-    // to be reordered later and thus missed if a crash occurs.
-    // Note that we dont need to sync after mark file is removed
-    // because we can tolerate the case when mark file reappears on startup.
-    let timeline_path = conf.timelines_path(&tenant_shard_id);
-    crashsafe::fsync_async(timeline_path)
-        .await
-        .context("fsync_pre_mark_remove")?;
// Remove delete mark
// TODO: once we are confident that no more exist in the field, remove this
// line. It cleans up a legacy marker file that might in rare cases be present.
@@ -552,15 +466,12 @@ impl DeleteTimelineFlow {
tenant: &Tenant,
timeline: &Timeline,
) -> Result<(), DeleteTimelineError> {
-        delete_local_layer_files(conf, tenant.tenant_shard_id, timeline).await?;
+        delete_local_timeline_directory(conf, tenant.tenant_shard_id, timeline).await?;
delete_remote_layers_and_index(timeline).await?;
pausable_failpoint!("in_progress_delete");
-        cleanup_remaining_timeline_fs_traces(conf, tenant.tenant_shard_id, timeline.timeline_id)
-            .await?;
remove_timeline_from_tenant(tenant, timeline.timeline_id, &guard).await?;
*guard = Self::Finished;

View File

@@ -694,10 +694,8 @@ def test_empty_branch_remote_storage_upload_on_restart(neon_env_builder: NeonEnv
# index upload is now hitting the failpoint, it should block the shutdown
env.pageserver.stop(immediate=True)
-    local_metadata = (
-        env.pageserver.timeline_dir(env.initial_tenant, new_branch_timeline_id) / "metadata"
-    )
-    assert local_metadata.is_file()
+    timeline_dir = env.pageserver.timeline_dir(env.initial_tenant, new_branch_timeline_id)
+    assert timeline_dir.is_dir()
assert isinstance(env.pageserver_remote_storage, LocalFsStorage)

View File

@@ -130,7 +130,6 @@ FAILPOINTS = [
"timeline-delete-before-index-deleted-at",
"timeline-delete-before-rm",
"timeline-delete-before-index-delete",
"timeline-delete-after-rm-dir",
]
FAILPOINTS_BEFORE_BACKGROUND = [

View File

@@ -157,10 +157,7 @@ def switch_pg_to_new_pageserver(
timeline_to_detach_local_path = origin_ps.timeline_dir(tenant_id, timeline_id)
files_before_detach = os.listdir(timeline_to_detach_local_path)
-    assert (
-        "metadata" in files_before_detach
-    ), f"Regular timeline {timeline_to_detach_local_path} should have the metadata file, but got: {files_before_detach}"
assert (
-        len(files_before_detach) >= 2
+        len(files_before_detach) >= 1
), f"Regular timeline {timeline_to_detach_local_path} should have at least one layer file, but got {files_before_detach}"
return timeline_to_detach_local_path

View File

@@ -136,12 +136,9 @@ DELETE_FAILPOINTS = [
"timeline-delete-before-index-deleted-at",
"timeline-delete-before-schedule",
"timeline-delete-before-rm",
"timeline-delete-during-rm",
"timeline-delete-after-rm",
"timeline-delete-before-index-delete",
"timeline-delete-after-index-delete",
"timeline-delete-after-rm-metadata",
"timeline-delete-after-rm-dir",
]
@@ -801,7 +798,7 @@ def test_timeline_delete_resumed_on_attach(
)
# failpoint before we remove index_part from s3
-    failpoint = "timeline-delete-during-rm"
+    failpoint = "timeline-delete-after-rm"
ps_http.configure_failpoints((failpoint, "return"))
env.pageserver.allowed_errors.extend(