Mirror of https://github.com/neondatabase/neon.git, synced 2026-05-14 11:40:38 +00:00
stop writing metadata file (#6769)
Building atop #6777, this PR removes the code that writes the `metadata` file and adds a piece of migration code that removes any remaining `metadata` files. We'll remove the migration code after this PR has been deployed. Part of https://github.com/neondatabase/neon/issues/6663. More cleanups are punted to a follow-up issue, as they touch a lot of code: https://github.com/neondatabase/neon/issues/6890
committed by GitHub
parent 6f8f7c7de9
commit cd449d66ea
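In short, the migration code added to `load_tenant_config` (see the hunk for that function below) visits every timeline directory of a tenant, deletes any leftover legacy `metadata` file, and fsyncs the timeline directory so the removal is durable. The following is a minimal, hypothetical sketch of that cleanup step using only the standard library; the actual code in the diff uses the pageserver's `camino` paths, `crashsafe::fsync`, and `anyhow` error handling, and the helper name here is made up for illustration:

use std::{fs, io, path::Path};

/// Best-effort removal of one timeline directory's legacy `metadata` file (sketch).
/// Returns Ok(true) if a file was removed, Ok(false) if none was present.
fn remove_legacy_metadata(timeline_dir: &Path) -> io::Result<bool> {
    let metadata_path = timeline_dir.join("metadata");
    match fs::remove_file(&metadata_path) {
        Ok(()) => {
            // Make the unlink durable by fsyncing the containing directory
            // (Unix-only trick; the real code calls crashsafe::fsync here).
            fs::File::open(timeline_dir)?.sync_all()?;
            println!("removed legacy metadata file at {}", metadata_path.display());
            Ok(true)
        }
        // Already gone or never written: fine, this software version no longer writes it.
        Err(e) if e.kind() == io::ErrorKind::NotFound => Ok(false),
        Err(e) => Err(e),
    }
}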
@@ -39,7 +39,7 @@ use crate::tenant::{
};
use crate::virtual_file;
use crate::{
- IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME,
+ IGNORED_TENANT_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME,
TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX,
};

@@ -826,17 +826,6 @@ impl PageServerConf {
.join(connection_id.to_string())
}

- /// Points to a place in pageserver's local directory,
- /// where certain timeline's metadata file should be located.
- pub fn metadata_path(
- &self,
- tenant_shard_id: &TenantShardId,
- timeline_id: &TimelineId,
- ) -> Utf8PathBuf {
- self.timeline_path(tenant_shard_id, timeline_id)
- .join(METADATA_FILE_NAME)
- }

/// Turns storage remote path of a file into its local path.
pub fn local_path(&self, remote_path: &RemotePath) -> Utf8PathBuf {
remote_path.with_base(&self.workdir)

@@ -169,15 +169,6 @@ pub fn is_delete_mark(path: &Utf8Path) -> bool {
ends_with_suffix(path, TIMELINE_DELETE_MARK_SUFFIX)
}

- fn is_walkdir_io_not_found(e: &walkdir::Error) -> bool {
- if let Some(e) = e.io_error() {
- if e.kind() == std::io::ErrorKind::NotFound {
- return true;
- }
- }
- false
- }

/// During pageserver startup, we need to order operations not to exhaust tokio worker threads by
/// blocking.
///

@@ -172,9 +172,6 @@ pub(crate) mod throttle;
pub(crate) use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
pub(crate) use timeline::{LogicalSizeCalculationCause, PageReconstructError, Timeline};

- // re-export for use in remote_timeline_client.rs
- pub use crate::tenant::metadata::save_metadata;

// re-export for use in walreceiver
pub use crate::tenant::timeline::WalReceiverInfo;

@@ -1151,17 +1148,6 @@ impl Tenant {
None
};

- // timeline loading after attach expects to find metadata file for each metadata
- save_metadata(
- self.conf,
- &self.tenant_shard_id,
- &timeline_id,
- &remote_metadata,
- )
- .await
- .context("save_metadata")
- .map_err(LoadLocalTimelineError::Load)?;

self.timeline_init_and_sync(
timeline_id,
resources,

@@ -3293,10 +3279,7 @@ impl Tenant {

timeline_struct.init_empty_layer_map(start_lsn);

- if let Err(e) = self
- .create_timeline_files(&uninit_mark.timeline_path, &new_timeline_id, new_metadata)
- .await
- {
+ if let Err(e) = self.create_timeline_files(&uninit_mark.timeline_path).await {
error!("Failed to create initial files for timeline {tenant_shard_id}/{new_timeline_id}, cleaning up: {e:?}");
cleanup_timeline_directory(uninit_mark);
return Err(e);

@@ -3313,26 +3296,13 @@ impl Tenant {
))
}

- async fn create_timeline_files(
- &self,
- timeline_path: &Utf8Path,
- new_timeline_id: &TimelineId,
- new_metadata: &TimelineMetadata,
- ) -> anyhow::Result<()> {
+ async fn create_timeline_files(&self, timeline_path: &Utf8Path) -> anyhow::Result<()> {
crashsafe::create_dir(timeline_path).context("Failed to create timeline directory")?;

fail::fail_point!("after-timeline-uninit-mark-creation", |_| {
anyhow::bail!("failpoint after-timeline-uninit-mark-creation");
});

- save_metadata(
- self.conf,
- &self.tenant_shard_id,
- new_timeline_id,
- new_metadata,
- )
- .await
- .context("Failed to create timeline metadata")?;
Ok(())
}

@@ -8,20 +8,11 @@
//!
//! [`remote_timeline_client`]: super::remote_timeline_client

use std::io::{self};

- use anyhow::{ensure, Context};
- use pageserver_api::shard::TenantShardId;
+ use anyhow::ensure;
use serde::{de::Error, Deserialize, Serialize, Serializer};
use thiserror::Error;
use utils::bin_ser::SerializeError;
- use utils::crashsafe::path_with_suffix_extension;
use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn};

- use crate::config::PageServerConf;
- use crate::virtual_file::VirtualFile;
- use crate::TEMP_FILE_SUFFIX;

/// Use special format number to enable backward compatibility.
const METADATA_FORMAT_VERSION: u16 = 4;

@@ -268,32 +259,6 @@ impl Serialize for TimelineMetadata {
}
}

- /// Save timeline metadata to file
- #[tracing::instrument(skip_all, fields(%tenant_id=tenant_shard_id.tenant_id, %shard_id=tenant_shard_id.shard_slug(), %timeline_id))]
- pub async fn save_metadata(
- conf: &'static PageServerConf,
- tenant_shard_id: &TenantShardId,
- timeline_id: &TimelineId,
- data: &TimelineMetadata,
- ) -> anyhow::Result<()> {
- let path = conf.metadata_path(tenant_shard_id, timeline_id);
- let temp_path = path_with_suffix_extension(&path, TEMP_FILE_SUFFIX);
- let metadata_bytes = data.to_bytes().context("serialize metadata")?;
- VirtualFile::crashsafe_overwrite(&path, &temp_path, metadata_bytes)
- .await
- .context("write metadata")?;
- Ok(())
- }

- #[derive(Error, Debug)]
- pub enum LoadMetadataError {
- #[error(transparent)]
- Read(#[from] io::Error),

- #[error(transparent)]
- Decode(#[from] anyhow::Error),
- }

#[cfg(test)]
mod tests {
use super::*;

@@ -42,7 +42,7 @@ use crate::tenant::config::{
use crate::tenant::delete::DeleteTenantFlow;
use crate::tenant::span::debug_assert_current_span_has_tenant_id;
use crate::tenant::{AttachedTenantConf, SpawnMode, Tenant, TenantState};
- use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX};
+ use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TEMP_FILE_SUFFIX};

use utils::crashsafe::path_with_suffix_extension;
use utils::fs_ext::PathExt;

@@ -359,12 +359,6 @@ fn load_tenant_config(
return Ok(None);
}

- let tenant_ignore_mark_file = tenant_dir_path.join(IGNORED_TENANT_FILE_NAME);
- if tenant_ignore_mark_file.exists() {
- info!("Found an ignore mark file {tenant_ignore_mark_file:?}, skipping the tenant");
- return Ok(None);
- }

let tenant_shard_id = match tenant_dir_path
.file_name()
.unwrap_or_default()

@@ -377,6 +371,59 @@ fn load_tenant_config(
}
};

+ // Clean up legacy `metadata` files.
+ // Doing it here because every single tenant directory is visited here.
+ // In any later code, there's different treatment of tenant dirs
+ // ... depending on whether the tenant is in re-attach response or not
+ // ... epending on whether the tenant is ignored or not
+ assert_eq!(
+ &conf.tenant_path(&tenant_shard_id),
+ &tenant_dir_path,
+ "later use of conf....path() methods would be dubious"
+ );
+ let timelines: Vec<TimelineId> = match conf.timelines_path(&tenant_shard_id).read_dir_utf8() {
+ Ok(iter) => {
+ let mut timelines = Vec::new();
+ for res in iter {
+ let p = res?;
+ let Some(timeline_id) = p.file_name().parse::<TimelineId>().ok() else {
+ // skip any entries that aren't TimelineId, such as
+ // - *.___temp dirs
+ // - unfinished initdb uploads (test_non_uploaded_root_timeline_is_deleted_after_restart)
+ continue;
+ };
+ timelines.push(timeline_id);
+ }
+ timelines
+ }
+ Err(e) if e.kind() == std::io::ErrorKind::NotFound => vec![],
+ Err(e) => return Err(anyhow::anyhow!(e)),
+ };
+ for timeline_id in timelines {
+ let timeline_path = &conf.timeline_path(&tenant_shard_id, &timeline_id);
+ let metadata_path = timeline_path.join(METADATA_FILE_NAME);
+ match std::fs::remove_file(&metadata_path) {
+ Ok(()) => {
+ crashsafe::fsync(timeline_path)
+ .context("fsync timeline dir after removing legacy metadata file")?;
+ info!("removed legacy metadata file at {metadata_path}");
+ }
+ Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
+ // something removed the file earlier, or it was never there
+ // We don't care, this software version doesn't write it again, so, we're good.
+ }
+ Err(e) => {
+ anyhow::bail!("remove legacy metadata file: {e}: {metadata_path}");
+ }
+ }
+ }

+ let tenant_ignore_mark_file = tenant_dir_path.join(IGNORED_TENANT_FILE_NAME);
+ if tenant_ignore_mark_file.exists() {
+ info!("Found an ignore mark file {tenant_ignore_mark_file:?}, skipping the tenant");
+ return Ok(None);
+ }

Ok(Some((
tenant_shard_id,
Tenant::load_tenant_config(conf, &tenant_shard_id),

@@ -45,7 +45,7 @@ use rand::Rng;
use remote_storage::{DownloadError, GenericRemoteStorage};

use tokio_util::sync::CancellationToken;
- use tracing::{info_span, instrument, Instrument};
+ use tracing::{info_span, instrument, warn, Instrument};
use utils::{
backoff, completion::Barrier, crashsafe::path_with_suffix_extension, fs_ext, id::TimelineId,
};

@@ -791,6 +791,7 @@ async fn init_timeline_state(
let file_name = file_path.file_name().expect("created it from the dentry");
if file_name == METADATA_FILE_NAME {
// Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant.
+ warn!(path=?dentry.path(), "found legacy metadata file, these should have been removed in load_tenant_config");
continue;
} else if crate::is_temporary(&file_path) {
// Temporary files are frequently left behind from restarting during downloads

@@ -54,7 +54,7 @@ use crate::pgdatadir_mapping::DirectoryKind;
use crate::tenant::timeline::logical_size::CurrentLogicalSize;
use crate::tenant::{
layer_map::{LayerMap, SearchResult},
- metadata::{save_metadata, TimelineMetadata},
+ metadata::TimelineMetadata,
par_fsync,
};
use crate::{

@@ -345,7 +345,7 @@ pub struct Timeline {
///
/// Must only be taken in two places:
/// - [`Timeline::compact`] (this file)
- /// - [`delete::delete_local_layer_files`]
+ /// - [`delete::delete_local_timeline_directory`]
///
/// Timeline deletion will acquire both compaction and gc locks in whatever order.
compaction_lock: tokio::sync::Mutex<()>,

@@ -354,7 +354,7 @@ pub struct Timeline {
///
/// Must only be taken in two places:
/// - [`Timeline::gc`] (this file)
- /// - [`delete::delete_local_layer_files`]
+ /// - [`delete::delete_local_timeline_directory`]
///
/// Timeline deletion will acquire both compaction and gc locks in whatever order.
gc_lock: tokio::sync::Mutex<()>,

@@ -1845,7 +1845,11 @@ impl Timeline {
discovered_layers.push((file_name, file_size));
continue;
}
- Discovered::Metadata | Discovered::IgnoredBackup => {
+ Discovered::Metadata => {
+ warn!("found legacy metadata file, these should have been removed in load_tenant_config");
continue;
}
+ Discovered::IgnoredBackup => {
+ continue;
+ }
Discovered::Unknown(file_name) => {

@@ -2352,7 +2356,7 @@ impl Timeline {
fail::fail_point!("timeline-calculate-logical-size-check-dir-exists", |_| {
if !self
.conf
- .metadata_path(&self.tenant_shard_id, &self.timeline_id)
+ .timeline_path(&self.tenant_shard_id, &self.timeline_id)
.exists()
{
error!("timeline-calculate-logical-size-pre metadata file does not exist")

@@ -3207,7 +3211,7 @@ impl Timeline {
// The new on-disk layers are now in the layer map. We can remove the
// in-memory layer from the map now. The flushed layer is stored in
// the mapping in `create_delta_layer`.
- let metadata = {
+ {
let mut guard = self.layers.write().await;

if self.cancel.is_cancelled() {

@@ -3221,9 +3225,7 @@ impl Timeline {
self.disk_consistent_lsn.store(disk_consistent_lsn);

// Schedule remote uploads that will reflect our new disk_consistent_lsn
- Some(self.schedule_uploads(disk_consistent_lsn, layers_to_upload)?)
- } else {
- None
+ self.schedule_uploads(disk_consistent_lsn, layers_to_upload)?;
}
// release lock on 'layers'
};

@@ -3238,22 +3240,6 @@ impl Timeline {
// This failpoint is used by another test case `test_pageserver_recovery`.
fail_point!("flush-frozen-exit");

- // Update the metadata file, with new 'disk_consistent_lsn'
- //
- // TODO: This perhaps should be done in 'flush_frozen_layers', after flushing
- // *all* the layers, to avoid fsyncing the file multiple times.

- // If we updated our disk_consistent_lsn, persist the updated metadata to local disk.
- if let Some(metadata) = metadata {
- save_metadata(
- self.conf,
- &self.tenant_shard_id,
- &self.timeline_id,
- &metadata,
- )
- .await
- .context("save_metadata")?;
- }
Ok(())
}

@@ -3309,25 +3295,6 @@ impl Timeline {
Ok(metadata)
}

- async fn update_metadata_file(
- &self,
- disk_consistent_lsn: Lsn,
- layers_to_upload: impl IntoIterator<Item = ResidentLayer>,
- ) -> anyhow::Result<()> {
- let metadata = self.schedule_uploads(disk_consistent_lsn, layers_to_upload)?;

- save_metadata(
- self.conf,
- &self.tenant_shard_id,
- &self.timeline_id,
- &metadata,
- )
- .await
- .context("save_metadata")?;

- Ok(())
- }

pub(crate) async fn preserve_initdb_archive(&self) -> anyhow::Result<()> {
if let Some(remote_client) = &self.remote_client {
remote_client

@@ -4660,18 +4627,11 @@ impl Timeline {
.replace((new_gc_cutoff, wanted_image_layers.to_keyspace()));

if !layers_to_remove.is_empty() {
- // Persist the new GC cutoff value in the metadata file, before
- // we actually remove anything.
- //
- // This does not in fact have any effect as we no longer consider local metadata unless
- // running without remote storage.
- //
+ // Persist the new GC cutoff value before we actually remove anything.
// This unconditionally schedules also an index_part.json update, even though, we will
// be doing one a bit later with the unlinked gc'd layers.
//
// TODO: remove when implementing <https://github.com/neondatabase/neon/issues/4099>.
- self.update_metadata_file(self.disk_consistent_lsn.load(), None)
- .await?;
+ let disk_consistent_lsn = self.disk_consistent_lsn.load();
+ self.schedule_uploads(disk_consistent_lsn, None)?;

let gc_layers = layers_to_remove
.iter()

@@ -6,7 +6,7 @@ use std::{
use anyhow::Context;
use pageserver_api::{models::TimelineState, shard::TenantShardId};
use tokio::sync::OwnedMutexGuard;
- use tracing::{debug, error, info, instrument, warn, Instrument};
+ use tracing::{debug, error, info, instrument, Instrument};
use utils::{crashsafe, fs_ext, id::TimelineId};

use crate::{

@@ -124,7 +124,7 @@ async fn set_deleted_in_remote_index(timeline: &Timeline) -> Result<(), DeleteTi
/// No timeout here, GC & Compaction should be responsive to the
/// `TimelineState::Stopping` change.
// pub(super): documentation link
- pub(super) async fn delete_local_layer_files(
+ pub(super) async fn delete_local_timeline_directory(
conf: &PageServerConf,
tenant_shard_id: TenantShardId,
timeline: &Timeline,

@@ -149,8 +149,6 @@ pub(super) async fn delete_local_layer_files(
// NB: This need not be atomic because the deleted flag in the IndexPart
// will be observed during tenant/timeline load. The deletion will be resumed there.
//
- // For configurations without remote storage, we guarantee crash-safety by persising delete mark file.
- //
// Note that here we do not bail out on std::io::ErrorKind::NotFound.
// This can happen if we're called a second time, e.g.,
// because of a previous failure/cancellation at/after

@@ -158,72 +156,21 @@ pub(super) async fn delete_local_layer_files(
//
// ErrorKind::NotFound can also happen if we race with tenant detach, because,
// no locks are shared.
//
- // For now, log and continue.
- // warn! level is technically not appropriate for the
- // first case because we should expect retries to happen.
- // But the error is so rare, it seems better to get attention if it happens.
- //
- // Note that metadata removal is skipped, this is not technically needed,
- // but allows to reuse timeline loading code during resumed deletion.
- // (we always expect that metadata is in place when timeline is being loaded)
+ tokio::fs::remove_dir_all(local_timeline_directory)
+ .await
+ .or_else(fs_ext::ignore_not_found)
+ .context("remove local timeline directory")?;

- #[cfg(feature = "testing")]
- let mut counter = 0;

- // Timeline directory may not exist if we failed to delete mark file and request was retried.
- if !local_timeline_directory.exists() {
- return Ok(());
- }

- let metadata_path = conf.metadata_path(&tenant_shard_id, &timeline.timeline_id);

- for entry in walkdir::WalkDir::new(&local_timeline_directory).contents_first(true) {
- #[cfg(feature = "testing")]
- {
- counter += 1;
- if counter == 2 {
- fail::fail_point!("timeline-delete-during-rm", |_| {
- Err(anyhow::anyhow!("failpoint: timeline-delete-during-rm"))?
- });
- }
- }

- let entry = entry?;
- if entry.path() == metadata_path {
- debug!("found metadata, skipping");
- continue;
- }

- if entry.path() == local_timeline_directory {
- // Keeping directory because metedata file is still there
- debug!("found timeline dir itself, skipping");
- continue;
- }

- let metadata = match entry.metadata() {
- Ok(metadata) => metadata,
- Err(e) => {
- if crate::is_walkdir_io_not_found(&e) {
- warn!(
- timeline_dir=?local_timeline_directory,
- path=?entry.path().display(),
- "got not found err while removing timeline dir, proceeding anyway"
- );
- continue;
- }
- anyhow::bail!(e);
- }
- };

- if metadata.is_dir() {
- warn!(path=%entry.path().display(), "unexpected directory under timeline dir");
- tokio::fs::remove_dir(entry.path()).await
- } else {
- tokio::fs::remove_file(entry.path()).await
- }
- .with_context(|| format!("Failed to remove: {}", entry.path().display()))?;
- }
+ // Make sure previous deletions are ordered before mark removal.
+ // Otherwise there is no guarantee that they reach the disk before mark deletion.
+ // So its possible for mark to reach disk first and for other deletions
+ // to be reordered later and thus missed if a crash occurs.
+ // Note that we dont need to sync after mark file is removed
+ // because we can tolerate the case when mark file reappears on startup.
+ let timeline_path = conf.timelines_path(&tenant_shard_id);
+ crashsafe::fsync_async(timeline_path)
+ .await
+ .context("fsync_pre_mark_remove")?;

info!("finished deleting layer files, releasing locks");
drop(guards);

@@ -254,39 +201,6 @@ async fn cleanup_remaining_timeline_fs_traces(
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
) -> anyhow::Result<()> {
- // Remove local metadata
- tokio::fs::remove_file(conf.metadata_path(&tenant_shard_id, &timeline_id))
- .await
- .or_else(fs_ext::ignore_not_found)
- .context("remove metadata")?;

- fail::fail_point!("timeline-delete-after-rm-metadata", |_| {
- Err(anyhow::anyhow!(
- "failpoint: timeline-delete-after-rm-metadata"
- ))?
- });

- // Remove timeline dir
- tokio::fs::remove_dir(conf.timeline_path(&tenant_shard_id, &timeline_id))
- .await
- .or_else(fs_ext::ignore_not_found)
- .context("timeline dir")?;

- fail::fail_point!("timeline-delete-after-rm-dir", |_| {
- Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm-dir"))?
- });

- // Make sure previous deletions are ordered before mark removal.
- // Otherwise there is no guarantee that they reach the disk before mark deletion.
- // So its possible for mark to reach disk first and for other deletions
- // to be reordered later and thus missed if a crash occurs.
- // Note that we dont need to sync after mark file is removed
- // because we can tolerate the case when mark file reappears on startup.
- let timeline_path = conf.timelines_path(&tenant_shard_id);
- crashsafe::fsync_async(timeline_path)
- .await
- .context("fsync_pre_mark_remove")?;

// Remove delete mark
// TODO: once we are confident that no more exist in the field, remove this
// line. It cleans up a legacy marker file that might in rare cases be present.

@@ -552,15 +466,12 @@ impl DeleteTimelineFlow {
tenant: &Tenant,
timeline: &Timeline,
) -> Result<(), DeleteTimelineError> {
- delete_local_layer_files(conf, tenant.tenant_shard_id, timeline).await?;
+ delete_local_timeline_directory(conf, tenant.tenant_shard_id, timeline).await?;

delete_remote_layers_and_index(timeline).await?;

pausable_failpoint!("in_progress_delete");

- cleanup_remaining_timeline_fs_traces(conf, tenant.tenant_shard_id, timeline.timeline_id)
- .await?;

remove_timeline_from_tenant(tenant, timeline.timeline_id, &guard).await?;

*guard = Self::Finished;

@@ -694,10 +694,8 @@ def test_empty_branch_remote_storage_upload_on_restart(neon_env_builder: NeonEnv
# index upload is now hitting the failpoint, it should block the shutdown
env.pageserver.stop(immediate=True)

- local_metadata = (
- env.pageserver.timeline_dir(env.initial_tenant, new_branch_timeline_id) / "metadata"
- )
- assert local_metadata.is_file()
+ timeline_dir = env.pageserver.timeline_dir(env.initial_tenant, new_branch_timeline_id)
+ assert timeline_dir.is_dir()

assert isinstance(env.pageserver_remote_storage, LocalFsStorage)

@@ -130,7 +130,6 @@ FAILPOINTS = [
"timeline-delete-before-index-deleted-at",
"timeline-delete-before-rm",
"timeline-delete-before-index-delete",
- "timeline-delete-after-rm-dir",
]

FAILPOINTS_BEFORE_BACKGROUND = [

@@ -157,10 +157,7 @@ def switch_pg_to_new_pageserver(
timeline_to_detach_local_path = origin_ps.timeline_dir(tenant_id, timeline_id)
files_before_detach = os.listdir(timeline_to_detach_local_path)
- assert (
- "metadata" in files_before_detach
- ), f"Regular timeline {timeline_to_detach_local_path} should have the metadata file, but got: {files_before_detach}"
assert (
- len(files_before_detach) >= 2
+ len(files_before_detach) >= 1
), f"Regular timeline {timeline_to_detach_local_path} should have at least one layer file, but got {files_before_detach}"

return timeline_to_detach_local_path

@@ -136,12 +136,9 @@ DELETE_FAILPOINTS = [
"timeline-delete-before-index-deleted-at",
"timeline-delete-before-schedule",
"timeline-delete-before-rm",
- "timeline-delete-during-rm",
"timeline-delete-after-rm",
"timeline-delete-before-index-delete",
"timeline-delete-after-index-delete",
- "timeline-delete-after-rm-metadata",
- "timeline-delete-after-rm-dir",
]

@@ -801,7 +798,7 @@ def test_timeline_delete_resumed_on_attach(
)

# failpoint before we remove index_part from s3
- failpoint = "timeline-delete-during-rm"
+ failpoint = "timeline-delete-after-rm"
ps_http.configure_failpoints((failpoint, "return"))

env.pageserver.allowed_errors.extend(