diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index b2064cb7fe..e667645c0a 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -110,7 +110,6 @@ impl TenantState { // So, return `Maybe` while Attaching, making Console wait for the attach task to finish. Self::Attaching | Self::Activating(ActivatingFrom::Attaching) => Maybe, // tenant mgr startup distinguishes attaching from loading via marker file. - // If it's loading, there is no attach marker file, i.e., attach had finished in the past. Self::Loading | Self::Activating(ActivatingFrom::Loading) => Attached, // We only reach Active after successful load / attach. // So, call atttachment status Attached. diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index 152929ecd3..f0521d27c5 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -23,8 +23,8 @@ use tracing::debug; use crate::s3_bucket::RequestKind; use crate::{ - AzureConfig, ConcurrencyLimiter, Download, DownloadError, RemotePath, RemoteStorage, - StorageMetadata, + AzureConfig, ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath, + RemoteStorage, StorageMetadata, }; pub struct AzureBlobStorage { @@ -184,10 +184,11 @@ fn to_download_error(error: azure_core::Error) -> DownloadError { #[async_trait::async_trait] impl RemoteStorage for AzureBlobStorage { - async fn list_prefixes( + async fn list( &self, prefix: Option<&RemotePath>, - ) -> Result, DownloadError> { + mode: ListingMode, + ) -> anyhow::Result { // get the passed prefix or if it is not set use prefix_in_bucket value let list_prefix = prefix .map(|p| self.relative_path_to_name(p)) @@ -195,16 +196,19 @@ impl RemoteStorage for AzureBlobStorage { .map(|mut p| { // required to end with a separator // otherwise request will return only the entry of a prefix - if !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) { + if matches!(mode, ListingMode::WithDelimiter) + && !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) + { p.push(REMOTE_STORAGE_PREFIX_SEPARATOR); } p }); - let mut builder = self - .client - .list_blobs() - .delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string()); + let mut builder = self.client.list_blobs(); + + if let ListingMode::WithDelimiter = mode { + builder = builder.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string()); + } if let Some(prefix) = list_prefix { builder = builder.prefix(Cow::from(prefix.to_owned())); @@ -215,46 +219,23 @@ impl RemoteStorage for AzureBlobStorage { } let mut response = builder.into_stream(); - let mut res = Vec::new(); - while let Some(entry) = response.next().await { - let entry = entry.map_err(to_download_error)?; - let name_iter = entry + let mut res = Listing::default(); + while let Some(l) = response.next().await { + let entry = l.map_err(to_download_error)?; + let prefix_iter = entry .blobs .prefixes() .map(|prefix| self.name_to_relative_path(&prefix.name)); - res.extend(name_iter); - } - Ok(res) - } + res.prefixes.extend(prefix_iter); - async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result> { - let folder_name = folder - .map(|p| self.relative_path_to_name(p)) - .or_else(|| self.prefix_in_container.clone()); - - let mut builder = self.client.list_blobs(); - - if let Some(folder_name) = folder_name { - builder = builder.prefix(Cow::from(folder_name.to_owned())); - } - - if let Some(limit) = self.max_keys_per_list_response { - builder = builder.max_results(MaxResults::new(limit)); - } 
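To make the new surface concrete, here is a minimal usage sketch of the unified listing API this patch introduces (`GenericRemoteStorage::list`, `ListingMode`, `Listing`); the `inspect_prefix` helper itself is hypothetical and only illustrates the two modes.

```rust
use remote_storage::{GenericRemoteStorage, ListingMode, RemotePath};

/// Hypothetical helper: look at one prefix both ways.
async fn inspect_prefix(
    storage: &GenericRemoteStorage,
    prefix: &RemotePath,
) -> anyhow::Result<()> {
    // WithDelimiter: '/'-separated listing; immediate "subdirectories" land in `prefixes`.
    let shallow = storage
        .list(Some(prefix), ListingMode::WithDelimiter)
        .await?;

    // NoDelimiter: flat listing; every object shows up in `keys`, `prefixes` stays empty.
    let deep = storage
        .list(Some(prefix), ListingMode::NoDelimiter)
        .await?;

    println!(
        "{} immediate prefixes, {} keys in total",
        shallow.prefixes.len(),
        deep.keys.len()
    );
    Ok(())
}
```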
- - let mut response = builder.into_stream(); - let mut res = Vec::new(); - while let Some(l) = response.next().await { - let entry = l.map_err(anyhow::Error::new)?; - let name_iter = entry + let blob_iter = entry .blobs .blobs() - .map(|bl| self.name_to_relative_path(&bl.name)); - res.extend(name_iter); + .map(|k| self.name_to_relative_path(&k.name)); + res.keys.extend(blob_iter); } Ok(res) } - async fn upload( &self, mut from: impl AsyncRead + Unpin + Send + Sync + 'static, diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 435364d83a..9b5b340db0 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -129,6 +129,22 @@ impl RemotePath { } } +/// We don't need callers to be able to pass arbitrary delimiters: just control +/// whether listings will use a '/' separator or not. +/// +/// The WithDelimiter mode will populate `prefixes` and `keys` in the result. The +/// NoDelimiter mode will only populate `keys`. +pub enum ListingMode { + WithDelimiter, + NoDelimiter, +} + +#[derive(Default)] +pub struct Listing { + pub prefixes: Vec, + pub keys: Vec, +} + /// Storage (potentially remote) API to manage its state. /// This storage tries to be unaware of any layered repository context, /// providing basic CRUD operations for storage files. @@ -141,8 +157,13 @@ pub trait RemoteStorage: Send + Sync + 'static { async fn list_prefixes( &self, prefix: Option<&RemotePath>, - ) -> Result, DownloadError>; - + ) -> Result, DownloadError> { + let result = self + .list(prefix, ListingMode::WithDelimiter) + .await? + .prefixes; + Ok(result) + } /// Lists all files in directory "recursively" /// (not really recursively, because AWS has a flat namespace) /// Note: This is subtely different than list_prefixes, @@ -154,7 +175,16 @@ pub trait RemoteStorage: Send + Sync + 'static { /// whereas, /// list_prefixes("foo/bar/") = ["cat", "dog"] /// See `test_real_s3.rs` for more details. - async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result>; + async fn list_files(&self, prefix: Option<&RemotePath>) -> anyhow::Result> { + let result = self.list(prefix, ListingMode::NoDelimiter).await?.keys; + Ok(result) + } + + async fn list( + &self, + prefix: Option<&RemotePath>, + _mode: ListingMode, + ) -> anyhow::Result; /// Streams the local file contents into remote into the remote storage entry. async fn upload( @@ -205,6 +235,9 @@ pub enum DownloadError { BadInput(anyhow::Error), /// The file was not found in the remote storage. NotFound, + /// A cancellation token aborted the download, typically during + /// tenant detach or process shutdown. + Cancelled, /// The file was found in the remote storage, but the download failed. 
Other(anyhow::Error), } @@ -215,6 +248,7 @@ impl std::fmt::Display for DownloadError { DownloadError::BadInput(e) => { write!(f, "Failed to download a remote file due to user input: {e}") } + DownloadError::Cancelled => write!(f, "Cancelled, shutting down"), DownloadError::NotFound => write!(f, "No file found for the remote object id given"), DownloadError::Other(e) => write!(f, "Failed to download a remote file: {e:?}"), } @@ -234,6 +268,19 @@ pub enum GenericRemoteStorage { } impl GenericRemoteStorage { + pub async fn list( + &self, + prefix: Option<&RemotePath>, + mode: ListingMode, + ) -> anyhow::Result { + match self { + Self::LocalFs(s) => s.list(prefix, mode).await, + Self::AwsS3(s) => s.list(prefix, mode).await, + Self::AzureBlob(s) => s.list(prefix, mode).await, + Self::Unreliable(s) => s.list(prefix, mode).await, + } + } + // A function for listing all the files in a "directory" // Example: // list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"] diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 3d32b6b631..1be50ce565 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -15,7 +15,7 @@ use tokio::{ use tracing::*; use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty}; -use crate::{Download, DownloadError, RemotePath}; +use crate::{Download, DownloadError, Listing, ListingMode, RemotePath}; use super::{RemoteStorage, StorageMetadata}; @@ -75,7 +75,7 @@ impl LocalFs { } #[cfg(test)] - async fn list(&self) -> anyhow::Result> { + async fn list_all(&self) -> anyhow::Result> { Ok(get_all_files(&self.storage_root, true) .await? .into_iter() @@ -89,52 +89,10 @@ impl LocalFs { }) .collect()) } -} - -#[async_trait::async_trait] -impl RemoteStorage for LocalFs { - async fn list_prefixes( - &self, - prefix: Option<&RemotePath>, - ) -> Result, DownloadError> { - let path = match prefix { - Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)), - None => Cow::Borrowed(&self.storage_root), - }; - - let prefixes_to_filter = get_all_files(path.as_ref(), false) - .await - .map_err(DownloadError::Other)?; - - let mut prefixes = Vec::with_capacity(prefixes_to_filter.len()); - - // filter out empty directories to mirror s3 behavior. - for prefix in prefixes_to_filter { - if prefix.is_dir() - && is_directory_empty(&prefix) - .await - .map_err(DownloadError::Other)? 
- { - continue; - } - - prefixes.push( - prefix - .strip_prefix(&self.storage_root) - .context("Failed to strip prefix") - .and_then(RemotePath::new) - .expect( - "We list files for storage root, hence should be able to remote the prefix", - ), - ) - } - - Ok(prefixes) - } // recursively lists all files in a directory, // mirroring the `list_files` for `s3_bucket` - async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result> { + async fn list_recursive(&self, folder: Option<&RemotePath>) -> anyhow::Result> { let full_path = match folder { Some(folder) => folder.with_base(&self.storage_root), None => self.storage_root.clone(), @@ -186,6 +144,70 @@ impl RemoteStorage for LocalFs { Ok(files) } +} + +#[async_trait::async_trait] +impl RemoteStorage for LocalFs { + async fn list( + &self, + prefix: Option<&RemotePath>, + mode: ListingMode, + ) -> Result { + let mut result = Listing::default(); + + if let ListingMode::NoDelimiter = mode { + let keys = self + .list_recursive(prefix) + .await + .map_err(DownloadError::Other)?; + + result.keys = keys + .into_iter() + .filter(|k| { + let path = k.with_base(&self.storage_root); + !path.is_dir() + }) + .collect(); + + return Ok(result); + } + + let path = match prefix { + Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)), + None => Cow::Borrowed(&self.storage_root), + }; + + let prefixes_to_filter = get_all_files(path.as_ref(), false) + .await + .map_err(DownloadError::Other)?; + + // filter out empty directories to mirror s3 behavior. + for prefix in prefixes_to_filter { + if prefix.is_dir() + && is_directory_empty(&prefix) + .await + .map_err(DownloadError::Other)? + { + continue; + } + + let stripped = prefix + .strip_prefix(&self.storage_root) + .context("Failed to strip prefix") + .and_then(RemotePath::new) + .expect( + "We list files for storage root, hence should be able to remote the prefix", + ); + + if prefix.is_dir() { + result.prefixes.push(stripped); + } else { + result.keys.push(stripped); + } + } + + Ok(result) + } async fn upload( &self, @@ -479,7 +501,7 @@ mod fs_tests { let target_path_1 = upload_dummy_file(&storage, "upload_1", None).await?; assert_eq!( - storage.list().await?, + storage.list_all().await?, vec![target_path_1.clone()], "Should list a single file after first upload" ); @@ -667,7 +689,7 @@ mod fs_tests { let upload_target = upload_dummy_file(&storage, upload_name, None).await?; storage.delete(&upload_target).await?; - assert!(storage.list().await?.is_empty()); + assert!(storage.list_all().await?.is_empty()); storage .delete(&upload_target) @@ -725,6 +747,43 @@ mod fs_tests { Ok(()) } + #[tokio::test] + async fn list() -> anyhow::Result<()> { + // No delimiter: should recursively list everything + let storage = create_storage()?; + let child = upload_dummy_file(&storage, "grandparent/parent/child", None).await?; + let uncle = upload_dummy_file(&storage, "grandparent/uncle", None).await?; + + let listing = storage.list(None, ListingMode::NoDelimiter).await?; + assert!(listing.prefixes.is_empty()); + assert_eq!(listing.keys, [uncle.clone(), child.clone()].to_vec()); + + // Delimiter: should only go one deep + let listing = storage.list(None, ListingMode::WithDelimiter).await?; + + assert_eq!( + listing.prefixes, + [RemotePath::from_string("timelines").unwrap()].to_vec() + ); + assert!(listing.keys.is_empty()); + + // Delimiter & prefix + let listing = storage + .list( + Some(&RemotePath::from_string("timelines/some_timeline/grandparent").unwrap()), + ListingMode::WithDelimiter, + ) + .await?; 
+ assert_eq!( + listing.prefixes, + [RemotePath::from_string("timelines/some_timeline/grandparent/parent").unwrap()] + .to_vec() + ); + assert_eq!(listing.keys, [uncle.clone()].to_vec()); + + Ok(()) + } + async fn upload_dummy_file( storage: &LocalFs, name: &str, @@ -777,7 +836,7 @@ mod fs_tests { } async fn list_files_sorted(storage: &LocalFs) -> anyhow::Result> { - let mut files = storage.list().await?; + let mut files = storage.list_all().await?; files.sort_by(|a, b| a.0.cmp(&b.0)); Ok(files) } diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index fc94281666..560a2c14e9 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -30,8 +30,8 @@ use tracing::debug; use super::StorageMetadata; use crate::{ - ConcurrencyLimiter, Download, DownloadError, RemotePath, RemoteStorage, S3Config, - MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR, + ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, + S3Config, MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR, }; pub(super) mod metrics; @@ -299,13 +299,13 @@ impl AsyncRead for TimedDownload { #[async_trait::async_trait] impl RemoteStorage for S3Bucket { - /// See the doc for `RemoteStorage::list_prefixes` - /// Note: it wont include empty "directories" - async fn list_prefixes( + async fn list( &self, prefix: Option<&RemotePath>, - ) -> Result, DownloadError> { + mode: ListingMode, + ) -> Result { let kind = RequestKind::List; + let mut result = Listing::default(); // get the passed prefix or if it is not set use prefix_in_bucket value let list_prefix = prefix @@ -314,28 +314,33 @@ impl RemoteStorage for S3Bucket { .map(|mut p| { // required to end with a separator // otherwise request will return only the entry of a prefix - if !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) { + if matches!(mode, ListingMode::WithDelimiter) + && !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) + { p.push(REMOTE_STORAGE_PREFIX_SEPARATOR); } p }); - let mut document_keys = Vec::new(); - let mut continuation_token = None; loop { let _guard = self.permit(kind).await; let started_at = start_measuring_requests(kind); - let fetch_response = self + let mut request = self .client .list_objects_v2() .bucket(self.bucket_name.clone()) .set_prefix(list_prefix.clone()) .set_continuation_token(continuation_token) - .delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string()) - .set_max_keys(self.max_keys_per_list_response) + .set_max_keys(self.max_keys_per_list_response); + + if let ListingMode::WithDelimiter = mode { + request = request.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string()); + } + + let response = request .send() .await .context("Failed to list S3 prefixes") @@ -345,71 +350,35 @@ impl RemoteStorage for S3Bucket { metrics::BUCKET_METRICS .req_seconds - .observe_elapsed(kind, &fetch_response, started_at); + .observe_elapsed(kind, &response, started_at); - let fetch_response = fetch_response?; + let response = response?; - document_keys.extend( - fetch_response - .common_prefixes - .unwrap_or_default() - .into_iter() + let keys = response.contents().unwrap_or_default(); + let empty = Vec::new(); + let prefixes = response.common_prefixes.as_ref().unwrap_or(&empty); + + tracing::info!("list: {} prefixes, {} keys", prefixes.len(), keys.len()); + + for object in keys { + let object_path = object.key().expect("response does not contain a key"); + let remote_path = self.s3_object_to_relative_path(object_path); + result.keys.push(remote_path); + } + + 
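The S3 implementation pages through results with continuation tokens. A library-agnostic sketch of that loop shape, where `Page` and the `fetch_page` closure are hypothetical stand-ins for the `list_objects_v2` request and response:

```rust
/// Hypothetical stand-in for one ListObjectsV2-style response page.
struct Page {
    keys: Vec<String>,
    next_continuation_token: Option<String>,
}

/// Drain every page, mirroring the continuation-token loop in `S3Bucket::list`.
async fn drain_pages<F, Fut>(mut fetch_page: F) -> anyhow::Result<Vec<String>>
where
    F: FnMut(Option<String>) -> Fut,
    Fut: std::future::Future<Output = anyhow::Result<Page>>,
{
    let mut all_keys = Vec::new();
    let mut continuation_token: Option<String> = None;
    loop {
        // Each request carries the token returned by the previous page, if any.
        let page = fetch_page(continuation_token.take()).await?;
        all_keys.extend(page.keys);
        match page.next_continuation_token {
            Some(token) => continuation_token = Some(token),
            None => break, // last page reached
        }
    }
    Ok(all_keys)
}
```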
result.prefixes.extend( + prefixes + .iter() .filter_map(|o| Some(self.s3_object_to_relative_path(o.prefix()?))), ); - continuation_token = match fetch_response.next_continuation_token { + continuation_token = match response.next_continuation_token { Some(new_token) => Some(new_token), None => break, }; } - Ok(document_keys) - } - - /// See the doc for `RemoteStorage::list_files` - async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result> { - let kind = RequestKind::List; - - let folder_name = folder - .map(|p| self.relative_path_to_s3_object(p)) - .or_else(|| self.prefix_in_bucket.clone()); - - // AWS may need to break the response into several parts - let mut continuation_token = None; - let mut all_files = vec![]; - loop { - let _guard = self.permit(kind).await; - let started_at = start_measuring_requests(kind); - - let response = self - .client - .list_objects_v2() - .bucket(self.bucket_name.clone()) - .set_prefix(folder_name.clone()) - .set_continuation_token(continuation_token) - .set_max_keys(self.max_keys_per_list_response) - .send() - .await - .context("Failed to list files in S3 bucket"); - - let started_at = ScopeGuard::into_inner(started_at); - metrics::BUCKET_METRICS - .req_seconds - .observe_elapsed(kind, &response, started_at); - - let response = response?; - - for object in response.contents().unwrap_or_default() { - let object_path = object.key().expect("response does not contain a key"); - let remote_path = self.s3_object_to_relative_path(object_path); - all_files.push(remote_path); - } - match response.next_continuation_token { - Some(new_token) => continuation_token = Some(new_token), - None => break, - } - } - Ok(all_files) + Ok(result) } async fn upload( diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs index 6d6a5c1d24..cd13db1923 100644 --- a/libs/remote_storage/src/simulate_failures.rs +++ b/libs/remote_storage/src/simulate_failures.rs @@ -5,7 +5,9 @@ use std::collections::hash_map::Entry; use std::collections::HashMap; use std::sync::Mutex; -use crate::{Download, DownloadError, RemotePath, RemoteStorage, StorageMetadata}; +use crate::{ + Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata, +}; pub struct UnreliableWrapper { inner: crate::GenericRemoteStorage, @@ -95,6 +97,15 @@ impl RemoteStorage for UnreliableWrapper { self.inner.list_files(folder).await } + async fn list( + &self, + prefix: Option<&RemotePath>, + mode: ListingMode, + ) -> Result { + self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))?; + self.inner.list(prefix, mode).await + } + async fn upload( &self, data: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static, diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index fe62a7299a..a58c08e29b 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -33,8 +33,7 @@ use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig; use crate::tenant::config::TenantConf; use crate::tenant::config::TenantConfOpt; use crate::tenant::{ - TENANTS_SEGMENT_NAME, TENANT_ATTACHING_MARKER_FILENAME, TENANT_DELETED_MARKER_FILE_NAME, - TIMELINES_SEGMENT_NAME, + TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME, }; use crate::{ IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_LOCATION_CONFIG_NAME, @@ -633,11 +632,6 @@ impl PageServerConf { self.tenants_path().join(tenant_id.to_string()) } - pub fn tenant_attaching_mark_file_path(&self, tenant_id: &TenantId) -> Utf8PathBuf 
{ - self.tenant_path(tenant_id) - .join(TENANT_ATTACHING_MARKER_FILENAME) - } - pub fn tenant_ignore_mark_file_path(&self, tenant_id: &TenantId) -> Utf8PathBuf { self.tenant_path(tenant_id).join(IGNORED_TENANT_FILE_NAME) } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 919291a7e7..3a426ac87b 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -24,8 +24,8 @@ use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use tracing::*; use utils::completion; -use utils::completion::Completion; use utils::crashsafe::path_with_suffix_extension; +use utils::fs_ext; use std::cmp::min; use std::collections::hash_map::Entry; @@ -78,7 +78,6 @@ use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart; use crate::tenant::storage_layer::DeltaLayer; use crate::tenant::storage_layer::ImageLayer; use crate::InitializationOrder; -use crate::METADATA_FILE_NAME; use crate::tenant::timeline::delete::DeleteTimelineFlow; use crate::tenant::timeline::uninit::cleanup_timeline_directory; @@ -152,8 +151,6 @@ pub const TENANTS_SEGMENT_NAME: &str = "tenants"; /// Parts of the `.neon/tenants//timelines/` directory prefix. pub const TIMELINES_SEGMENT_NAME: &str = "timelines"; -pub const TENANT_ATTACHING_MARKER_FILENAME: &str = "attaching"; - pub const TENANT_DELETED_MARKER_FILE_NAME: &str = "deleted"; /// References to shared objects that are passed into each tenant, such @@ -192,6 +189,18 @@ struct TimelinePreload { index_part: Result, } +pub(crate) struct TenantPreload { + deleting: bool, + timelines: HashMap, +} + +/// When we spawn a tenant, there is a special mode for tenant creation that +/// avoids trying to read anything from remote storage. +pub(crate) enum SpawnMode { + Normal, + Create, +} + /// /// Tenant consists of multiple timelines. Keep them in a hash table. /// @@ -390,12 +399,6 @@ pub enum CreateTimelineError { Other(#[from] anyhow::Error), } -/// spawn_attach argument for whether the caller is using attachment markers -pub(super) enum AttachMarkerMode { - Expect, - Ignore, -} - struct TenantDirectoryScan { sorted_timelines_to_load: Vec<(TimelineId, TimelineMetadata)>, timelines_to_resume_deletion: Vec<(TimelineId, Option)>, @@ -408,7 +411,6 @@ enum CreateTimelineCause { impl Tenant { /// Yet another helper for timeline initialization. - /// Contains the common part of `load_local_timeline` and `load_remote_timeline`. /// /// - Initializes the Timeline struct and inserts it into the tenant's hash map /// - Scans the local timeline directory for layer files and builds the layer map @@ -508,7 +510,6 @@ impl Tenant { Ok(()) } - /// /// Attach a tenant that's available in cloud storage. /// /// This returns quickly, after just creating the in-memory object @@ -518,16 +519,17 @@ impl Tenant { /// finishes. You can use wait_until_active() to wait for the task to /// complete. /// - pub(crate) fn spawn_attach( + #[allow(clippy::too_many_arguments)] + pub(crate) fn spawn( conf: &'static PageServerConf, tenant_id: TenantId, resources: TenantSharedResources, attached_conf: AttachedTenantConf, + init_order: Option, tenants: &'static tokio::sync::RwLock, - expect_marker: AttachMarkerMode, + mode: SpawnMode, ctx: &RequestContext, ) -> anyhow::Result> { - // TODO dedup with spawn_load let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new( conf, tenant_id, ))); @@ -561,22 +563,61 @@ impl Tenant { false, async move { // Ideally we should use Tenant::set_broken_no_wait, but it is not supposed to be used when tenant is in loading state. 
- let make_broken = |t: &Tenant, err: anyhow::Error| { - error!("attach failed, setting tenant state to Broken: {err:?}"); - t.state.send_modify(|state| { - assert_eq!( - *state, - TenantState::Attaching, + let make_broken = + |t: &Tenant, err: anyhow::Error| { + error!("attach failed, setting tenant state to Broken: {err:?}"); + t.state.send_modify(|state| { + // The Stopping case is for when we have passed control on to DeleteTenantFlow: + // if it errors, we will call make_broken when tenant is already in Stopping. + assert!( + matches!(*state, TenantState::Attaching | TenantState::Stopping { .. }), "the attach task owns the tenant state until activation is complete" ); - *state = TenantState::broken_from_reason(err.to_string()); - }); + + *state = TenantState::broken_from_reason(err.to_string()); + }); + }; + + let mut init_order = init_order; + // take the completion because initial tenant loading will complete when all of + // these tasks complete. + let _completion = init_order + .as_mut() + .and_then(|x| x.initial_tenant_load.take()); + let remote_load_completion = init_order + .as_mut() + .and_then(|x| x.initial_tenant_load_remote.take()); + + let preload = match mode { + SpawnMode::Create => {None}, + SpawnMode::Normal => { + match &remote_storage { + Some(remote_storage) => Some( + match tenant_clone + .preload(remote_storage, task_mgr::shutdown_token()) + .instrument( + tracing::info_span!(parent: None, "attach_preload", tenant_id=%tenant_id), + ) + .await { + Ok(p) => p, + Err(e) => { + make_broken(&tenant_clone, anyhow::anyhow!(e)); + return Ok(()); + } + }, + ), + None => None, + } + } }; + // Remote preload is complete. + drop(remote_load_completion); + let pending_deletion = { match DeleteTenantFlow::should_resume_deletion( conf, - remote_storage.as_ref(), + preload.as_ref().map(|p| p.deleting).unwrap_or(false), &tenant_clone, ) .await @@ -592,10 +633,27 @@ impl Tenant { info!("pending_deletion {}", pending_deletion.is_some()); if let Some(deletion) = pending_deletion { + // as we are no longer loading, signal completion by dropping + // the completion while we resume deletion + drop(_completion); + // do not hold to initial_logical_size_attempt as it will prevent loading from proceeding without timeout + let _ = init_order + .as_mut() + .and_then(|x| x.initial_logical_size_attempt.take()); + let background_jobs_can_start = + init_order.as_ref().map(|x| &x.background_jobs_can_start); + if let Some(background) = background_jobs_can_start { + info!("waiting for backgound jobs barrier"); + background.clone().wait().await; + info!("ready for backgound jobs barrier"); + } + match DeleteTenantFlow::resume_from_attach( deletion, &tenant_clone, + preload, tenants, + init_order, &ctx, ) .await @@ -608,7 +666,7 @@ impl Tenant { } } - match tenant_clone.attach(&ctx, expect_marker).await { + match tenant_clone.attach(init_order, preload, &ctx).await { Ok(()) => { info!("attach finished, activating"); tenant_clone.activate(broker_client, None, &ctx); @@ -628,6 +686,42 @@ impl Tenant { Ok(tenant) } + pub(crate) async fn preload( + self: &Arc, + remote_storage: &GenericRemoteStorage, + cancel: CancellationToken, + ) -> anyhow::Result { + // Get list of remote timelines + // download index files for every tenant timeline + info!("listing remote timelines"); + let (remote_timeline_ids, other_keys) = remote_timeline_client::list_remote_timelines( + remote_storage, + self.tenant_id, + cancel.clone(), + ) + .await?; + + let deleting = other_keys.contains(TENANT_DELETED_MARKER_FILE_NAME); + 
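`preload` derives `deleting` from the non-timeline keys returned by `list_remote_timelines`. A self-contained sketch of that partitioning, where plain strings stand in for `TimelineId` and the 32-hex-character check is an assumption about how timeline IDs are rendered:

```rust
use std::collections::HashSet;

/// Split a raw "timelines/" listing into timeline IDs and other keys
/// (such as the "deleted" tenant marker).
fn split_listing(names: Vec<String>) -> (HashSet<String>, HashSet<String>) {
    let mut timeline_ids = HashSet::new();
    let mut other_keys = HashSet::new();
    for name in names {
        if name.len() == 32 && name.chars().all(|c| c.is_ascii_hexdigit()) {
            timeline_ids.insert(name);
        } else {
            // e.g. the "deleted" marker, or unexpected junk that preload warns about
            other_keys.insert(name);
        }
    }
    (timeline_ids, other_keys)
}
```

A caller would then check `other_keys.contains("deleted")`, as `preload` does with `TENANT_DELETED_MARKER_FILE_NAME`.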
info!( + "found {} timelines, deleting={}", + remote_timeline_ids.len(), + deleting + ); + + for k in other_keys { + if k != TENANT_DELETED_MARKER_FILE_NAME { + warn!("Unexpected non timeline key {k}"); + } + } + + Ok(TenantPreload { + deleting, + timelines: self + .load_timeline_metadata(remote_timeline_ids, remote_storage, cancel) + .await?, + }) + } + /// /// Background task that downloads all data for a tenant and brings it to Active state. /// @@ -635,89 +729,62 @@ impl Tenant { /// async fn attach( self: &Arc, + mut init_order: Option, + preload: Option, ctx: &RequestContext, - expect_marker: AttachMarkerMode, ) -> anyhow::Result<()> { span::debug_assert_current_span_has_tenant_id(); - let marker_file = self.conf.tenant_attaching_mark_file_path(&self.tenant_id); - if let AttachMarkerMode::Expect = expect_marker { - if !tokio::fs::try_exists(&marker_file) - .await - .context("check for existence of marker file")? - { - anyhow::bail!( - "implementation error: marker file should exist at beginning of this function" - ); + crate::failpoint_support::sleep_millis_async!("before-attaching-tenant"); + + let preload = match preload { + Some(p) => p, + None => { + // Deprecated dev mode: load from local disk state instead of remote storage + // https://github.com/neondatabase/neon/issues/5624 + return self.load_local(init_order, ctx).await; } - } + }; - // Get list of remote timelines - // download index files for every tenant timeline - info!("listing remote timelines"); - - let remote_storage = self - .remote_storage - .as_ref() - .ok_or_else(|| anyhow::anyhow!("cannot attach without remote storage"))?; - - let remote_timeline_ids = - remote_timeline_client::list_remote_timelines(remote_storage, self.tenant_id).await?; - - info!("found {} timelines", remote_timeline_ids.len()); - - // Download & parse index parts - let mut part_downloads = JoinSet::new(); - for timeline_id in remote_timeline_ids { - let client = RemoteTimelineClient::new( - remote_storage.clone(), - self.deletion_queue_client.clone(), - self.conf, - self.tenant_id, - timeline_id, - self.generation, - ); - part_downloads.spawn( - async move { - debug!("starting index part download"); - - let index_part = client - .download_index_file() - .await - .context("download index file")?; - - debug!("finished index part download"); - - Result::<_, anyhow::Error>::Ok((timeline_id, client, index_part)) - } - .map(move |res| { - res.with_context(|| format!("download index part for timeline {timeline_id}")) - }) - .instrument(info_span!("download_index_part", %timeline_id)), - ); - } + // Signal that we have completed remote phase + init_order + .as_mut() + .and_then(|x| x.initial_tenant_load_remote.take()); let mut timelines_to_resume_deletions = vec![]; - // Wait for all the download tasks to complete & collect results. 
let mut remote_index_and_client = HashMap::new(); let mut timeline_ancestors = HashMap::new(); - while let Some(result) = part_downloads.join_next().await { - // NB: we already added timeline_id as context to the error - let result: Result<_, anyhow::Error> = result.context("joinset task join")?; - let (timeline_id, client, index_part) = result?; - debug!("successfully downloaded index part for timeline {timeline_id}"); + let mut existent_timelines = HashSet::new(); + for (timeline_id, preload) in preload.timelines { + // In this context a timeline "exists" if it has any content in remote storage: this will + // be our cue to not delete any corresponding local directory + existent_timelines.insert(timeline_id); + + let index_part = match preload.index_part { + Ok(i) => { + debug!("remote index part exists for timeline {timeline_id}"); + i + } + Err(e) => { + // Timeline creation is not atomic: we might upload a layer but no index_part. We expect + // that the creation will be retried by the control plane and eventually result in + // a valid loadable state. + warn!(%timeline_id, "Failed to load index_part from remote storage, failed creation? ({e})"); + continue; + } + }; match index_part { MaybeDeletedIndexPart::IndexPart(index_part) => { timeline_ancestors.insert(timeline_id, index_part.metadata.clone()); - remote_index_and_client.insert(timeline_id, (index_part, client)); + remote_index_and_client.insert(timeline_id, (index_part, preload.client)); } MaybeDeletedIndexPart::Deleted(index_part) => { info!( "timeline {} is deleted, picking to resume deletion", timeline_id ); - timelines_to_resume_deletions.push((timeline_id, index_part, client)); + timelines_to_resume_deletions.push((timeline_id, index_part, preload.client)); } } } @@ -771,12 +838,9 @@ impl Tenant { .map_err(LoadLocalTimelineError::ResumeDeletion)?; } - if let AttachMarkerMode::Expect = expect_marker { - std::fs::remove_file(&marker_file) - .with_context(|| format!("unlink attach marker file {marker_file}"))?; - crashsafe::fsync(marker_file.parent().expect("marker file has parent dir")) - .context("fsync tenant directory after unlinking attach marker file")?; - } + // The local filesystem contents are a cache of what's in the remote IndexPart; + // IndexPart is the source of truth. + self.clean_up_timelines(&existent_timelines)?; crate::failpoint_support::sleep_millis_async!("attach-before-activate"); @@ -785,6 +849,69 @@ impl Tenant { Ok(()) } + /// Check for any local timeline directories that are temporary, or do not correspond to a + /// timeline that still exists: this can happen if we crashed during a deletion/creation, or + /// if a timeline was deleted while the tenant was attached to a different pageserver. + fn clean_up_timelines(&self, existent_timelines: &HashSet) -> anyhow::Result<()> { + let timelines_dir = self.conf.timelines_path(&self.tenant_id); + + let entries = match timelines_dir.read_dir_utf8() { + Ok(d) => d, + Err(e) => { + if e.kind() == std::io::ErrorKind::NotFound { + return Ok(()); + } else { + return Err(e).context("list timelines directory for tenant"); + } + } + }; + + for entry in entries { + let entry = entry.context("read timeline dir entry")?; + let entry_path = entry.path(); + + let purge = if crate::is_temporary(entry_path) + // TODO: uninit_mark isn't needed any more, since uninitialized timelines are already + // covered by the check that the timeline must exist in remote storage. 
+ || is_uninit_mark(entry_path) + || crate::is_delete_mark(entry_path) + { + true + } else { + match TimelineId::try_from(entry_path.file_name()) { + Ok(i) => { + // Purge if the timeline ID does not exist in remote storage: remote storage is the authority. + !existent_timelines.contains(&i) + } + Err(e) => { + tracing::warn!( + "Unparseable directory in timelines directory: {entry_path}, ignoring ({e})" + ); + // Do not purge junk: if we don't recognize it, be cautious and leave it for a human. + false + } + } + }; + + if purge { + tracing::info!("Purging stale timeline dentry {entry_path}"); + if let Err(e) = match entry.file_type() { + Ok(t) => if t.is_dir() { + std::fs::remove_dir_all(entry_path) + } else { + std::fs::remove_file(entry_path) + } + .or_else(fs_ext::ignore_not_found), + Err(e) => Err(e), + } { + tracing::warn!("Failed to purge stale timeline dentry {entry_path}: {e}"); + } + } + } + + Ok(()) + } + /// Get sum of all remote timelines sizes /// /// This function relies on the index_part instead of listing the remote storage @@ -874,152 +1001,6 @@ impl Tenant { )) } - /// Load a tenant that's available on local disk - /// - /// This is used at pageserver startup, to rebuild the in-memory - /// structures from on-disk state. This is similar to attaching a tenant, - /// but the index files already exist on local disk, as well as some layer - /// files. - /// - /// If the loading fails for some reason, the Tenant will go into Broken - /// state. - #[instrument(skip_all, fields(tenant_id=%tenant_id))] - pub(crate) fn spawn_load( - conf: &'static PageServerConf, - tenant_id: TenantId, - attached_conf: AttachedTenantConf, - resources: TenantSharedResources, - init_order: Option, - tenants: &'static tokio::sync::RwLock, - ctx: &RequestContext, - ) -> Arc { - span::debug_assert_current_span_has_tenant_id(); - - let broker_client = resources.broker_client; - let remote_storage = resources.remote_storage; - - let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new( - conf, tenant_id, - ))); - let tenant = Tenant::new( - TenantState::Loading, - conf, - attached_conf, - wal_redo_manager, - tenant_id, - remote_storage.clone(), - resources.deletion_queue_client.clone(), - ); - let tenant = Arc::new(tenant); - - // Do all the hard work in a background task - let tenant_clone = Arc::clone(&tenant); - - let ctx = ctx.detached_child(TaskKind::InitialLoad, DownloadBehavior::Warn); - let _ = task_mgr::spawn( - &tokio::runtime::Handle::current(), - TaskKind::InitialLoad, - Some(tenant_id), - None, - "initial tenant load", - false, - async move { - // Ideally we should use Tenant::set_broken_no_wait, but it is not supposed to be used when tenant is in loading state. - let make_broken = |t: &Tenant, err: anyhow::Error| { - error!("load failed, setting tenant state to Broken: {err:?}"); - t.state.send_modify(|state| { - assert!( - matches!(*state, TenantState::Loading | TenantState::Stopping { .. }), - "the loading task owns the tenant state until activation is complete" - ); - *state = TenantState::broken_from_reason(err.to_string()); - }); - }; - - let mut init_order = init_order; - - // take the completion because initial tenant loading will complete when all of - // these tasks complete. 
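Referring back to `clean_up_timelines` above: the purge decision is "temporary files and marks go, parseable timeline IDs go only if remote storage has never heard of them, everything else stays". A hedged sketch of that rule; the suffix checks are hypothetical stand-ins for `is_temporary`, `is_uninit_mark` and `is_delete_mark`:

```rust
use std::collections::HashSet;

/// Decide whether a local timelines/ entry should be purged, given the set of
/// timeline IDs (as hex strings) that exist in remote storage.
fn should_purge(file_name: &str, existent_timelines: &HashSet<String>) -> bool {
    // Hypothetical suffixes; the real code calls is_temporary / is_uninit_mark / is_delete_mark.
    if file_name.ends_with(".temp")
        || file_name.ends_with(".___uninit")
        || file_name.ends_with(".___deleted")
    {
        return true;
    }
    if file_name.len() == 32 && file_name.chars().all(|c| c.is_ascii_hexdigit()) {
        // Remote storage is the authority: purge only what it does not know about.
        !existent_timelines.contains(file_name)
    } else {
        // Unrecognized entry: be cautious and leave it for a human, as the code above does.
        false
    }
}
```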
- let _completion = init_order - .as_mut() - .and_then(|x| x.initial_tenant_load.take()); - let remote_load_completion = init_order - .as_mut() - .and_then(|x| x.initial_tenant_load_remote.take()); - - // Dont block pageserver startup on figuring out deletion status - let pending_deletion = { - match DeleteTenantFlow::should_resume_deletion( - conf, - remote_storage.as_ref(), - &tenant_clone, - ) - .await - { - Ok(should_resume_deletion) => should_resume_deletion, - Err(err) => { - make_broken(&tenant_clone, anyhow::anyhow!(err)); - return Ok(()); - } - } - }; - - info!("pending deletion {}", pending_deletion.is_some()); - - if let Some(deletion) = pending_deletion { - // as we are no longer loading, signal completion by dropping - // the completion while we resume deletion - drop(_completion); - drop(remote_load_completion); - // do not hold to initial_logical_size_attempt as it will prevent loading from proceeding without timeout - let _ = init_order - .as_mut() - .and_then(|x| x.initial_logical_size_attempt.take()); - - match DeleteTenantFlow::resume_from_load( - deletion, - &tenant_clone, - init_order.as_ref(), - tenants, - &ctx, - ) - .await - { - Err(err) => { - make_broken(&tenant_clone, anyhow::anyhow!(err)); - return Ok(()); - } - Ok(()) => return Ok(()), - } - } - - let background_jobs_can_start = - init_order.as_ref().map(|x| &x.background_jobs_can_start); - - match tenant_clone - .load(init_order.as_ref(), remote_load_completion, &ctx) - .await - { - Ok(()) => { - debug!("load finished"); - - tenant_clone.activate(broker_client, background_jobs_can_start, &ctx); - } - Err(err) => make_broken(&tenant_clone, err), - } - - Ok(()) - } - .instrument({ - let span = tracing::info_span!(parent: None, "load", tenant_id=%tenant_id); - span.follows_from(Span::current()); - span - }), - ); - - tenant - } - fn scan_and_sort_timelines_dir(self: Arc) -> anyhow::Result { let mut timelines_to_load: HashMap = HashMap::new(); // Note timelines_to_resume_deletion needs to be separate because it can be not sortable @@ -1167,6 +1148,7 @@ impl Tenant { self: &Arc, timeline_ids: HashSet, remote_storage: &GenericRemoteStorage, + cancel: CancellationToken, ) -> anyhow::Result> { let mut part_downloads = JoinSet::new(); for timeline_id in timeline_ids { @@ -1178,11 +1160,12 @@ impl Tenant { timeline_id, self.generation, ); + let cancel_clone = cancel.clone(); part_downloads.spawn( async move { debug!("starting index part download"); - let index_part = client.download_index_file().await; + let index_part = client.download_index_file(cancel_clone).await; debug!("finished index part download"); @@ -1200,10 +1183,25 @@ impl Tenant { } let mut timeline_preloads: HashMap = HashMap::new(); - while let Some(result) = part_downloads.join_next().await { - let preload_result = result.context("join preload task")?; - let preload = preload_result?; - timeline_preloads.insert(preload.timeline_id, preload); + + loop { + tokio::select!( + next = part_downloads.join_next() => { + match next { + Some(result) => { + let preload_result = result.context("join preload task")?; + let preload = preload_result?; + timeline_preloads.insert(preload.timeline_id, preload); + }, + None => { + break; + } + } + }, + _ = cancel.cancelled() => { + anyhow::bail!("Cancelled while waiting for remote index download") + } + ) } Ok(timeline_preloads) @@ -1214,10 +1212,9 @@ impl Tenant { /// files on disk. Used at pageserver startup. /// /// No background tasks are started as part of this routine. 
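The reworked `load_timeline_metadata` drains its `JoinSet` of index-part downloads under a cancellation token. A self-contained sketch of that pattern (here `u64` stands in for `TimelinePreload`):

```rust
use tokio::task::JoinSet;
use tokio_util::sync::CancellationToken;

/// Collect results from all spawned tasks, bailing out early on cancellation.
async fn join_all_or_cancel(
    mut tasks: JoinSet<anyhow::Result<u64>>,
    cancel: CancellationToken,
) -> anyhow::Result<Vec<u64>> {
    let mut results = Vec::new();
    loop {
        tokio::select! {
            next = tasks.join_next() => {
                match next {
                    // First `?` surfaces the JoinError, second the task's own error.
                    Some(joined) => results.push(joined??),
                    None => break, // all downloads finished
                }
            }
            _ = cancel.cancelled() => {
                anyhow::bail!("Cancelled while waiting for remote index download")
            }
        }
    }
    Ok(results)
}
```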
- async fn load( + async fn load_local( self: &Arc, - init_order: Option<&InitializationOrder>, - remote_completion: Option, + init_order: Option, ctx: &RequestContext, ) -> anyhow::Result<()> { span::debug_assert_current_span_has_tenant_id(); @@ -1242,38 +1239,10 @@ impl Tenant { // FIXME original collect_timeline_files contained one more check: // 1. "Timeline has no ancestor and no layer files" - // Load remote content for timelines in this tenant - let all_timeline_ids = scan - .sorted_timelines_to_load - .iter() - .map(|i| i.0) - .chain(scan.timelines_to_resume_deletion.iter().map(|i| i.0)) - .collect(); - let mut preload = if let Some(remote_storage) = &self.remote_storage { - Some( - self.load_timeline_metadata(all_timeline_ids, remote_storage) - .await?, - ) - } else { - None - }; - - drop(remote_completion); - - crate::failpoint_support::sleep_millis_async!("before-loading-tenant"); - // Process loadable timelines first for (timeline_id, local_metadata) in scan.sorted_timelines_to_load { - let timeline_preload = preload.as_mut().map(|p| p.remove(&timeline_id).unwrap()); if let Err(e) = self - .load_local_timeline( - timeline_id, - local_metadata, - timeline_preload, - init_order, - ctx, - false, - ) + .load_local_timeline(timeline_id, local_metadata, init_order.as_ref(), ctx, false) .await { match e { @@ -1306,14 +1275,11 @@ impl Tenant { } } Some(local_metadata) => { - let timeline_preload = - preload.as_mut().map(|p| p.remove(&timeline_id).unwrap()); if let Err(e) = self .load_local_timeline( timeline_id, local_metadata, - timeline_preload, - init_order, + init_order.as_ref(), ctx, true, ) @@ -1344,183 +1310,36 @@ impl Tenant { /// Subroutine of `load_tenant`, to load an individual timeline /// /// NB: The parent is assumed to be already loaded! - #[instrument(skip(self, local_metadata, init_order, preload, ctx))] + #[instrument(skip(self, local_metadata, init_order, ctx))] async fn load_local_timeline( self: &Arc, timeline_id: TimelineId, local_metadata: TimelineMetadata, - preload: Option, init_order: Option<&InitializationOrder>, ctx: &RequestContext, found_delete_mark: bool, ) -> Result<(), LoadLocalTimelineError> { span::debug_assert_current_span_has_tenant_id(); - let mut resources = self.build_timeline_resources(timeline_id); + let resources = self.build_timeline_resources(timeline_id); - struct RemoteStartupData { - index_part: IndexPart, - remote_metadata: TimelineMetadata, + if found_delete_mark { + // There is no remote client, we found local metadata. + // Continue cleaning up local disk. + DeleteTimelineFlow::resume_deletion( + Arc::clone(self), + timeline_id, + &local_metadata, + None, + self.deletion_queue_client.clone(), + init_order, + ) + .await + .context("resume deletion") + .map_err(LoadLocalTimelineError::ResumeDeletion)?; + return Ok(()); } - let (remote_startup_data, remote_client) = match preload { - Some(preload) => { - let TimelinePreload { - index_part, - client: remote_client, - timeline_id: _timeline_id, - } = preload; - match index_part { - Ok(index_part) => { - let index_part = match index_part { - MaybeDeletedIndexPart::IndexPart(index_part) => index_part, - MaybeDeletedIndexPart::Deleted(index_part) => { - // TODO: we won't reach here if remote storage gets de-configured after start of the deletion operation. 
- // Example: - // start deletion operation - // finishes upload of index part - // pageserver crashes - // remote storage gets de-configured - // pageserver starts - // - // We don't really anticipate remote storage to be de-configured, so, for now, this is fine. - // Also, maybe we'll remove that option entirely in the future, see https://github.com/neondatabase/neon/issues/4099. - info!("is_deleted is set on remote, resuming removal of timeline data originally done by timeline deletion handler"); - - remote_client - .init_upload_queue_stopped_to_continue_deletion(&index_part) - .context("init queue stopped") - .map_err(LoadLocalTimelineError::ResumeDeletion)?; - - DeleteTimelineFlow::resume_deletion( - Arc::clone(self), - timeline_id, - &local_metadata, - Some(remote_client), - self.deletion_queue_client.clone(), - init_order, - ) - .await - .context("resume deletion") - .map_err(LoadLocalTimelineError::ResumeDeletion)?; - - return Ok(()); - } - }; - - let remote_metadata = index_part.metadata.clone(); - ( - Some(RemoteStartupData { - index_part, - remote_metadata, - }), - Some(remote_client), - ) - } - Err(DownloadError::NotFound) => { - info!(found_delete_mark, "no index file was found on the remote, resuming deletion or cleaning unuploaded up"); - - if found_delete_mark { - // We could've resumed at a point where remote index was deleted, but metadata file wasnt. - // Cleanup: - return DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces( - self, - timeline_id, - ) - .await - .context("cleanup_remaining_timeline_fs_traces") - .map_err(LoadLocalTimelineError::ResumeDeletion); - } - - // as the remote index_part.json did not exist, this timeline is a - // not-yet-uploaded one. it should be deleted now, because the branching might - // not have been valid as it's ancestor may have been restored to earlier state - // as well. in practice, control plane will keep retrying. - // - // first ensure that the un-uploaded timeline looks like it should, as in we - // are not accidentially deleting a timeline which was ever active: - // - root timelines have metadata and one possibly partial layer - // - branched timelines have metadata - // - // if the timeline does not look like expected, fail loading of the tenant. - // cleaning the timeline up manually and reloading the tenant is possible via - // the above log message. - let path = self.conf.timeline_path(&self.tenant_id, &timeline_id); - - let span = tracing::Span::current(); - - return tokio::task::spawn_blocking({ - move || { - use std::str::FromStr; - use crate::tenant::storage_layer::LayerFileName; - - let _e = span.entered(); - let mut metadata = false; - let mut layers = 0; - let mut others = 0; - for dentry in path.read_dir_utf8()? 
{ - let dentry = dentry?; - let file_name = dentry.file_name(); - - if file_name == METADATA_FILE_NAME { - metadata = true; - continue; - } - - if LayerFileName::from_str(file_name).is_ok() - { - layers += 1; - continue; - } - - others += 1; - } - - // bootstrapped have the one image layer file, or one partial temp - // file, branched have just the metadata - if !(metadata && layers + others <= 1) { - anyhow::bail!("unexpected assumed unuploaded, never been active timeline: found metadata={}, layers={}, others={}", metadata, layers, others); - } - - let tmp_path = - path.with_file_name(format!("{timeline_id}{}", TEMP_FILE_SUFFIX)); - std::fs::rename(path, &tmp_path)?; - std::fs::remove_dir_all(&tmp_path)?; - Ok(()) - } - }) - .await - .map_err(anyhow::Error::new) - .and_then(|x| x) - .context("delete assumed unuploaded fresh timeline") - .map_err(LoadLocalTimelineError::Load); - } - Err(e) => return Err(LoadLocalTimelineError::Load(anyhow::Error::new(e))), - } - } - None => { - if found_delete_mark { - // There is no remote client, we found local metadata. - // Continue cleaning up local disk. - DeleteTimelineFlow::resume_deletion( - Arc::clone(self), - timeline_id, - &local_metadata, - None, - self.deletion_queue_client.clone(), - init_order, - ) - .await - .context("resume deletion") - .map_err(LoadLocalTimelineError::ResumeDeletion)?; - return Ok(()); - } - - (None, resources.remote_client) - } - }; - resources.remote_client = remote_client; - let ancestor = if let Some(ancestor_timeline_id) = local_metadata.ancestor_timeline() { let ancestor_timeline = self.get_timeline(ancestor_timeline_id, false) .with_context(|| anyhow::anyhow!("cannot find ancestor timeline {ancestor_timeline_id} for timeline {timeline_id}")) @@ -1530,27 +1349,11 @@ impl Tenant { None }; - let (index_part, metadata) = match remote_startup_data { - Some(RemoteStartupData { - index_part, - remote_metadata, - }) => { - // always choose the remote metadata to be crash consistent (see RFC 27) - save_metadata(self.conf, &self.tenant_id, &timeline_id, &remote_metadata) - .await - .context("save_metadata") - .map_err(LoadLocalTimelineError::Load)?; - - (Some(index_part), remote_metadata) - } - None => (None, local_metadata), - }; - self.timeline_init_and_sync( timeline_id, resources, - index_part, - metadata, + None, + local_metadata, ancestor, init_order, ctx, @@ -3387,16 +3190,10 @@ fn remove_timeline_and_uninit_mark( Ok(()) } -pub(crate) enum CreateTenantFilesMode { - Create, - Attach, -} - pub(crate) async fn create_tenant_files( conf: &'static PageServerConf, location_conf: &LocationConf, tenant_id: &TenantId, - mode: CreateTenantFilesMode, ) -> anyhow::Result { let target_tenant_directory = conf.tenant_path(tenant_id); anyhow::ensure!( @@ -3419,7 +3216,6 @@ pub(crate) async fn create_tenant_files( conf, location_conf, tenant_id, - mode, &temporary_tenant_dir, &target_tenant_directory, ) @@ -3445,28 +3241,9 @@ async fn try_create_target_tenant_dir( conf: &'static PageServerConf, location_conf: &LocationConf, tenant_id: &TenantId, - mode: CreateTenantFilesMode, temporary_tenant_dir: &Utf8Path, target_tenant_directory: &Utf8Path, ) -> Result<(), anyhow::Error> { - match mode { - CreateTenantFilesMode::Create => {} // needs no attach marker, writing tenant conf + atomic rename of dir is good enough - CreateTenantFilesMode::Attach => { - let attach_marker_path = temporary_tenant_dir.join(TENANT_ATTACHING_MARKER_FILENAME); - let file = std::fs::OpenOptions::new() - .create_new(true) - .write(true) - 
.open(&attach_marker_path) - .with_context(|| { - format!("could not create attach marker file {attach_marker_path:?}") - })?; - file.sync_all().with_context(|| { - format!("could not sync attach marker file: {attach_marker_path:?}") - })?; - // fsync of the directory in which the file resides comes later in this function - } - } - let temporary_tenant_timelines_dir = rebase_directory( &conf.timelines_path(tenant_id), target_tenant_directory, @@ -3684,6 +3461,11 @@ pub(crate) mod harness { } } + enum LoadMode { + Local, + Remote, + } + pub struct TenantHarness { pub conf: &'static PageServerConf, pub tenant_conf: TenantConf, @@ -3767,7 +3549,36 @@ pub(crate) mod harness { ) } - pub async fn try_load(&self, ctx: &RequestContext) -> anyhow::Result> { + fn remote_empty(&self) -> bool { + let tenant_path = self.conf.tenant_path(&self.tenant_id); + let remote_tenant_dir = self + .remote_fs_dir + .join(tenant_path.strip_prefix(&self.conf.workdir).unwrap()); + if std::fs::metadata(&remote_tenant_dir).is_err() { + return true; + } + + match std::fs::read_dir(remote_tenant_dir) + .unwrap() + .flatten() + .next() + { + Some(entry) => { + tracing::debug!( + "remote_empty: not empty, found file {}", + entry.file_name().to_string_lossy(), + ); + false + } + None => true, + } + } + + async fn do_try_load( + &self, + ctx: &RequestContext, + mode: LoadMode, + ) -> anyhow::Result> { let walredo_mgr = Arc::new(WalRedoManager::from(TestRedoManager)); let tenant = Arc::new(Tenant::new( @@ -3783,12 +3594,26 @@ pub(crate) mod harness { Some(self.remote_storage.clone()), self.deletion_queue.new_client(), )); - tenant - .load(None, None, ctx) - .instrument(info_span!("try_load", tenant_id=%self.tenant_id)) - .await?; - // TODO reuse Tenant::activate (needs broker) + match mode { + LoadMode::Local => { + tenant + .load_local(None, ctx) + .instrument(info_span!("try_load", tenant_id=%self.tenant_id)) + .await?; + } + LoadMode::Remote => { + let preload = tenant + .preload(&self.remote_storage, CancellationToken::new()) + .instrument(info_span!("try_load_preload", tenant_id=%self.tenant_id)) + .await?; + tenant + .attach(None, Some(preload), ctx) + .instrument(info_span!("try_load", tenant_id=%self.tenant_id)) + .await?; + } + } + tenant.state.send_replace(TenantState::Active); for timeline in tenant.timelines.lock().unwrap().values() { timeline.set_state(TimelineState::Active); @@ -3796,6 +3621,27 @@ pub(crate) mod harness { Ok(tenant) } + /// For tests that specifically want to exercise the local load path, which does + /// not use remote storage. + pub async fn try_load_local(&self, ctx: &RequestContext) -> anyhow::Result> { + self.do_try_load(ctx, LoadMode::Local).await + } + + /// The 'load' in this function is either a local load or a normal attachment, + pub async fn try_load(&self, ctx: &RequestContext) -> anyhow::Result> { + // If we have nothing in remote storage, must use load_local instead of attach: attach + // will error out if there are no timelines. + // + // See https://github.com/neondatabase/neon/issues/5456 for how we will eliminate + // this weird state of a Tenant which exists but doesn't have any timelines. 
+ let mode = match self.remote_empty() { + true => LoadMode::Local, + false => LoadMode::Remote, + }; + + self.do_try_load(ctx, mode).await + } + pub fn timeline_path(&self, timeline_id: &TimelineId) -> Utf8PathBuf { self.conf.timeline_path(&self.tenant_id, timeline_id) } @@ -4364,7 +4210,7 @@ mod tests { } #[tokio::test] - async fn corrupt_metadata() -> anyhow::Result<()> { + async fn corrupt_local_metadata() -> anyhow::Result<()> { const TEST_NAME: &str = "corrupt_metadata"; let harness = TenantHarness::create(TEST_NAME)?; let (tenant, ctx) = harness.load().await; @@ -4382,16 +4228,19 @@ mod tests { .unwrap(); drop(tenant); + // Corrupt local metadata let metadata_path = harness.timeline_path(&TIMELINE_ID).join(METADATA_FILE_NAME); - assert!(metadata_path.is_file()); - let mut metadata_bytes = std::fs::read(&metadata_path)?; assert_eq!(metadata_bytes.len(), 512); metadata_bytes[8] ^= 1; std::fs::write(metadata_path, metadata_bytes)?; - let err = harness.try_load(&ctx).await.err().expect("should fail"); + let err = harness + .try_load_local(&ctx) + .await + .err() + .expect("should fail"); // get all the stack with all .context, not only the last one let message = format!("{err:#}"); let expected = "failed to load metadata"; diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs index 989f2cd779..3e2d6ca7bb 100644 --- a/pageserver/src/tenant/delete.rs +++ b/pageserver/src/tenant/delete.rs @@ -3,10 +3,10 @@ use std::sync::Arc; use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; use pageserver_api::models::TenantState; -use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath}; +use remote_storage::{GenericRemoteStorage, RemotePath}; use tokio::sync::OwnedMutexGuard; use tokio_util::sync::CancellationToken; -use tracing::{error, info, instrument, warn, Instrument, Span}; +use tracing::{error, instrument, warn, Instrument, Span}; use utils::{ backoff, completion, crashsafe, fs_ext, @@ -25,11 +25,9 @@ use super::{ remote_timeline_client::{FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD}, span, timeline::delete::DeleteTimelineFlow, - tree_sort_timelines, DeleteTimelineError, Tenant, + tree_sort_timelines, DeleteTimelineError, Tenant, TenantPreload, }; -const SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS: u32 = 3; - #[derive(Debug, thiserror::Error)] pub(crate) enum DeleteTenantError { #[error("GetTenant {0}")] @@ -60,7 +58,7 @@ fn remote_tenant_delete_mark_path( .context("Failed to strip workdir prefix") .and_then(RemotePath::new) .context("tenant path")?; - Ok(tenant_remote_path.join(Utf8Path::new("deleted"))) + Ok(tenant_remote_path.join(Utf8Path::new("timelines/deleted"))) } async fn create_remote_delete_mark( @@ -239,32 +237,6 @@ async fn cleanup_remaining_fs_traces( Ok(()) } -pub(crate) async fn remote_delete_mark_exists( - conf: &PageServerConf, - tenant_id: &TenantId, - remote_storage: &GenericRemoteStorage, -) -> anyhow::Result { - // If remote storage is there we rely on it - let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_id).context("path")?; - - let result = backoff::retry( - || async { remote_storage.download(&remote_mark_path).await }, - |e| matches!(e, DownloadError::NotFound), - SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS, - SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS, - "fetch_tenant_deletion_mark", - // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066) - backoff::Cancel::new(CancellationToken::new(), || unreachable!()), - ) - .await; - - match result { - Ok(_) => Ok(true), - 
Err(DownloadError::NotFound) => Ok(false), - Err(e) => Err(anyhow::anyhow!(e)).context("remote_delete_mark_exists")?, - } -} - /// Orchestrates tenant shut down of all tasks, removes its in-memory structures, /// and deletes its data from both disk and s3. /// The sequence of steps: @@ -276,10 +248,9 @@ pub(crate) async fn remote_delete_mark_exists( /// 6. Remove remote mark /// 7. Cleanup remaining fs traces, tenant dir, config, timelines dir, local delete mark /// It is resumable from any step in case a crash/restart occurs. -/// There are three entrypoints to the process: +/// There are two entrypoints to the process: /// 1. [`DeleteTenantFlow::run`] this is the main one called by a management api handler. -/// 2. [`DeleteTenantFlow::resume_from_load`] is called during restarts when local or remote deletion marks are still there. -/// 3. [`DeleteTenantFlow::resume_from_attach`] is called when deletion is resumed tenant is found to be deleted during attach process. +/// 2. [`DeleteTenantFlow::resume_from_attach`] is called when deletion is resumed tenant is found to be deleted during attach process. /// Note the only other place that messes around timeline delete mark is the `Tenant::spawn_load` function. #[derive(Default)] pub enum DeleteTenantFlow { @@ -378,7 +349,7 @@ impl DeleteTenantFlow { pub(crate) async fn should_resume_deletion( conf: &'static PageServerConf, - remote_storage: Option<&GenericRemoteStorage>, + remote_mark_exists: bool, tenant: &Tenant, ) -> Result, DeleteTenantError> { let acquire = |t: &Tenant| { @@ -389,66 +360,25 @@ impl DeleteTenantFlow { ) }; - let tenant_id = tenant.tenant_id; - // Check local mark first, if its there there is no need to go to s3 to check whether remote one exists. - if conf.tenant_deleted_mark_file_path(&tenant_id).exists() { + if remote_mark_exists { return Ok(acquire(tenant)); } - let remote_storage = match remote_storage { - Some(remote_storage) => remote_storage, - None => return Ok(None), - }; - - if remote_delete_mark_exists(conf, &tenant_id, remote_storage).await? { + let tenant_id = tenant.tenant_id; + // Check local mark first, if its there there is no need to go to s3 to check whether remote one exists. + if conf.tenant_deleted_mark_file_path(&tenant_id).exists() { Ok(acquire(tenant)) } else { Ok(None) } } - pub(crate) async fn resume_from_load( - guard: DeletionGuard, - tenant: &Arc, - init_order: Option<&InitializationOrder>, - tenants: &'static tokio::sync::RwLock, - ctx: &RequestContext, - ) -> Result<(), DeleteTenantError> { - let (_, progress) = completion::channel(); - - tenant - .set_stopping(progress, true, false) - .await - .expect("cant be stopping or broken"); - - // Do not consume valuable resources during the load phase, continue deletion once init phase is complete. 
- let background_jobs_can_start = init_order.as_ref().map(|x| &x.background_jobs_can_start); - if let Some(background) = background_jobs_can_start { - info!("waiting for backgound jobs barrier"); - background.clone().wait().await; - info!("ready for backgound jobs barrier"); - } - - // Tenant may not be loadable if we fail late in cleanup_remaining_fs_traces (e g remove timelines dir) - let timelines_path = tenant.conf.timelines_path(&tenant.tenant_id); - if timelines_path.exists() { - tenant.load(init_order, None, ctx).await.context("load")?; - } - - Self::background( - guard, - tenant.conf, - tenant.remote_storage.clone(), - tenants, - tenant, - ) - .await - } - pub(crate) async fn resume_from_attach( guard: DeletionGuard, tenant: &Arc, + preload: Option, tenants: &'static tokio::sync::RwLock, + init_order: Option, ctx: &RequestContext, ) -> Result<(), DeleteTenantError> { let (_, progress) = completion::channel(); @@ -459,7 +389,7 @@ impl DeleteTenantFlow { .expect("cant be stopping or broken"); tenant - .attach(ctx, super::AttachMarkerMode::Expect) + .attach(init_order, preload, ctx) .await .context("attach")?; diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 380b610a4c..33fdc76f8d 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -26,10 +26,7 @@ use crate::deletion_queue::DeletionQueueClient; use crate::task_mgr::{self, TaskKind}; use crate::tenant::config::{AttachmentMode, LocationConf, LocationMode, TenantConfOpt}; use crate::tenant::delete::DeleteTenantFlow; -use crate::tenant::{ - create_tenant_files, AttachMarkerMode, AttachedTenantConf, CreateTenantFilesMode, Tenant, - TenantState, -}; +use crate::tenant::{create_tenant_files, AttachedTenantConf, SpawnMode, Tenant, TenantState}; use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX}; use utils::crashsafe::path_with_suffix_extension; @@ -437,14 +434,15 @@ pub async fn init_tenant_mgr( location_conf.attach_in_generation(generation); Tenant::persist_tenant_config(conf, &tenant_id, &location_conf).await?; - match schedule_local_tenant_processing( + match tenant_spawn( conf, tenant_id, &tenant_dir_path, - AttachedTenantConf::try_from(location_conf)?, resources.clone(), + AttachedTenantConf::try_from(location_conf)?, Some(init_order.clone()), &TENANTS, + SpawnMode::Normal, &ctx, ) { Ok(tenant) => { @@ -464,15 +462,18 @@ pub async fn init_tenant_mgr( Ok(()) } +/// Wrapper for Tenant::spawn that checks invariants before running, and inserts +/// a broken tenant in the map if Tenant::spawn fails. 
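The doc comment above describes the fallback behaviour that the next hunk implements: if spawning fails, the tenant is still inserted into the map, but as a broken placeholder. A stand-alone illustration of that shape, where `Tenant` and `create_broken_tenant` are simplified stand-ins mirroring the names used in the hunk:

#[derive(Debug)]
struct Tenant {
    state: &'static str,
    broken_reason: Option<String>,
}

fn spawn(fail: bool) -> Result<Tenant, String> {
    if fail {
        Err("no remote storage configured".to_string())
    } else {
        Ok(Tenant { state: "Attaching", broken_reason: None })
    }
}

// Stand-in for Tenant::create_broken_tenant: the tenant stays visible in the
// tenant map, but in a Broken state that records why spawning failed.
fn create_broken_tenant(reason: String) -> Tenant {
    Tenant { state: "Broken", broken_reason: Some(reason) }
}

fn tenant_spawn(fail: bool) -> Tenant {
    match spawn(fail) {
        Ok(tenant) => tenant,
        Err(e) => create_broken_tenant(e),
    }
}

fn main() {
    println!("{:?}", tenant_spawn(false));
    println!("{:?}", tenant_spawn(true));
}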
#[allow(clippy::too_many_arguments)] -pub(crate) fn schedule_local_tenant_processing( +pub(crate) fn tenant_spawn( conf: &'static PageServerConf, tenant_id: TenantId, tenant_path: &Utf8Path, - location_conf: AttachedTenantConf, resources: TenantSharedResources, + location_conf: AttachedTenantConf, init_order: Option, tenants: &'static tokio::sync::RwLock, + mode: SpawnMode, ctx: &RequestContext, ) -> anyhow::Result> { anyhow::ensure!( @@ -496,45 +497,24 @@ pub(crate) fn schedule_local_tenant_processing( "Cannot load tenant, ignore mark found at {tenant_ignore_mark:?}" ); - let tenant = if conf.tenant_attaching_mark_file_path(&tenant_id).exists() { - info!("tenant {tenant_id} has attaching mark file, resuming its attach operation"); - if resources.remote_storage.is_none() { - warn!("tenant {tenant_id} has attaching mark file, but pageserver has no remote storage configured"); - Tenant::create_broken_tenant( - conf, - tenant_id, - "attaching mark file present but no remote storage configured".to_string(), - ) - } else { - match Tenant::spawn_attach( - conf, - tenant_id, - resources, - location_conf, - tenants, - AttachMarkerMode::Expect, - ctx, - ) { - Ok(tenant) => tenant, - Err(e) => { - error!("Failed to spawn_attach tenant {tenant_id}, reason: {e:#}"); - Tenant::create_broken_tenant(conf, tenant_id, format!("{e:#}")) - } - } + info!("Attaching tenant {tenant_id}"); + let tenant = match Tenant::spawn( + conf, + tenant_id, + resources, + location_conf, + init_order, + tenants, + mode, + ctx, + ) { + Ok(tenant) => tenant, + Err(e) => { + error!("Failed to spawn tenant {tenant_id}, reason: {e:#}"); + Tenant::create_broken_tenant(conf, tenant_id, format!("{e:#}")) } - } else { - info!("tenant {tenant_id} is assumed to be loadable, starting load operation"); - // Start loading the tenant into memory. It will initially be in Loading state. - Tenant::spawn_load( - conf, - tenant_id, - location_conf, - resources, - init_order, - tenants, - ctx, - ) }; + Ok(tenant) } @@ -670,29 +650,41 @@ pub(crate) async fn create_tenant( ctx: &RequestContext, ) -> Result, TenantMapInsertError> { tenant_map_insert(tenant_id, || async { - let location_conf = LocationConf::attached_single(tenant_conf, generation); // We're holding the tenants lock in write mode while doing local IO. // If this section ever becomes contentious, introduce a new `TenantState::Creating` // and do the work in that state. - let tenant_directory = super::create_tenant_files(conf, &location_conf, &tenant_id, CreateTenantFilesMode::Create).await?; + super::create_tenant_files(conf, &location_conf, &tenant_id).await?; + // TODO: tenant directory remains on disk if we bail out from here on. // See https://github.com/neondatabase/neon/issues/4233 - let created_tenant = - schedule_local_tenant_processing(conf, tenant_id, &tenant_directory, - AttachedTenantConf::try_from(location_conf)?, resources, None, &TENANTS, ctx)?; + let tenant_path = conf.tenant_path(&tenant_id); + + let created_tenant = tenant_spawn( + conf, + tenant_id, + &tenant_path, + resources, + AttachedTenantConf::try_from(location_conf)?, + None, + &TENANTS, + SpawnMode::Create, + ctx, + )?; // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here. 
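The call sites above pass a new `SpawnMode`: `SpawnMode::Create` from the tenant-creation path and `SpawnMode::Normal` everywhere else. The enum itself is not part of this diff; the following is only a guess at its shape and intent, judging from those call sites, and is purely illustrative:

// Hypothetical shape of the SpawnMode passed to tenant_spawn; the real
// definition is not shown in this diff.
#[derive(Debug, Clone, Copy)]
enum SpawnMode {
    // Used by startup, explicit attach, load and location_config:
    // attach from whatever already exists in remote storage.
    Normal,
    // Used by the tenant-creation path, where the tenant was just created
    // locally and remote storage is presumably still empty.
    Create,
}

fn describe(mode: SpawnMode) -> &'static str {
    match mode {
        SpawnMode::Normal => "attach using existing remote state",
        SpawnMode::Create => "first-time creation, no remote state expected",
    }
}

fn main() {
    for mode in [SpawnMode::Normal, SpawnMode::Create] {
        println!("{mode:?}: {}", describe(mode));
    }
}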
// See https://github.com/neondatabase/neon/issues/4233 let crated_tenant_id = created_tenant.tenant_id(); anyhow::ensure!( - tenant_id == crated_tenant_id, - "loaded created tenant has unexpected tenant id (expect {tenant_id} != actual {crated_tenant_id})", - ); + tenant_id == crated_tenant_id, + "loaded created tenant has unexpected tenant id \ + (expect {tenant_id} != actual {crated_tenant_id})", + ); Ok(created_tenant) - }).await + }) + .await } #[derive(Debug, thiserror::Error)] @@ -801,9 +793,10 @@ pub(crate) async fn upsert_location( } } + let tenant_path = conf.tenant_path(&tenant_id); + let new_slot = match &new_location_config.mode { LocationMode::Secondary(_) => { - let tenant_path = conf.tenant_path(&tenant_id); // Directory doesn't need to be fsync'd because if we crash it can // safely be recreated next time this tenant location is configured. unsafe_create_dir_all(&tenant_path) @@ -833,28 +826,21 @@ pub(crate) async fn upsert_location( .await .map_err(SetNewTenantConfigError::Persist)?; - let tenant = match Tenant::spawn_attach( + let tenant = tenant_spawn( conf, tenant_id, + &tenant_path, TenantSharedResources { broker_client, remote_storage, deletion_queue_client, }, AttachedTenantConf::try_from(new_location_config)?, + None, &TENANTS, - // The LocationConf API does not use marker files, because we have Secondary - // locations where the directory's existence is not a signal that it contains - // all timelines. See https://github.com/neondatabase/neon/issues/5550 - AttachMarkerMode::Ignore, + SpawnMode::Normal, ctx, - ) { - Ok(tenant) => tenant, - Err(e) => { - error!("Failed to spawn_attach tenant {tenant_id}, reason: {e:#}"); - Tenant::create_broken_tenant(conf, tenant_id, format!("{e:#}")) - } - }; + )?; TenantSlot::Attached(tenant) } @@ -1043,7 +1029,7 @@ pub(crate) async fn load_tenant( location_conf.attach_in_generation(generation); Tenant::persist_tenant_config(conf, &tenant_id, &location_conf).await?; - let new_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_path, AttachedTenantConf::try_from(location_conf)?, resources, None, &TENANTS, ctx) + let new_tenant = tenant_spawn(conf, tenant_id, &tenant_path, resources, AttachedTenantConf::try_from(location_conf)?, None, &TENANTS, SpawnMode::Normal, ctx) .with_context(|| { format!("Failed to schedule tenant processing in path {tenant_path:?}") })?; @@ -1117,18 +1103,12 @@ pub(crate) async fn attach_tenant( ) -> Result<(), TenantMapInsertError> { tenant_map_insert(tenant_id, || async { let location_conf = LocationConf::attached_single(tenant_conf, generation); - let tenant_dir = create_tenant_files(conf, &location_conf, &tenant_id, CreateTenantFilesMode::Attach).await?; + let tenant_dir = create_tenant_files(conf, &location_conf, &tenant_id).await?; // TODO: tenant directory remains on disk if we bail out from here on. 
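The `LocationMode::Secondary` branch above creates the tenant directory with `unsafe_create_dir_all`, and the comment explains why skipping the fsync is acceptable there. A tiny self-contained sketch of the same idea using the standard library (the pageserver helper is assumed to be an async variant of this):

use std::io;
use std::path::Path;

// create_dir_all is a no-op for directories that already exist, so calling it
// on every location_config request is safe.
fn create_tenant_dir_no_sync(path: &Path) -> io::Result<()> {
    // Deliberately no fsync of the parent directory: if the process crashes
    // before the new entry is durable, the directory is simply recreated the
    // next time this tenant location is configured.
    std::fs::create_dir_all(path)
}

fn main() -> io::Result<()> {
    let dir = std::env::temp_dir().join("tenant-location-example");
    create_tenant_dir_no_sync(&dir)?;
    // Calling it a second time succeeds as well.
    create_tenant_dir_no_sync(&dir)
}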
// See https://github.com/neondatabase/neon/issues/4233 - // Without the attach marker, schedule_local_tenant_processing will treat the attached tenant as fully attached - let marker_file_exists = conf - .tenant_attaching_mark_file_path(&tenant_id) - .try_exists() - .context("check for attach marker file existence")?; - anyhow::ensure!(marker_file_exists, "create_tenant_files should have created the attach marker file"); - - let attached_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_dir, AttachedTenantConf::try_from(location_conf)?, resources, None, &TENANTS, ctx)?; + let attached_tenant = tenant_spawn(conf, tenant_id, &tenant_dir, + resources, AttachedTenantConf::try_from(location_conf)?, None, &TENANTS, SpawnMode::Normal, ctx)?; // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here. // See https://github.com/neondatabase/neon/issues/4233 diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 245649c3ad..d91970a1c0 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -168,36 +168,14 @@ //! - create `Timeline` struct and a `RemoteTimelineClient` //! - initialize the client's upload queue with its `IndexPart` //! - schedule uploads for layers that are only present locally. -//! - if the remote `IndexPart`'s metadata was newer than the metadata in -//! the local filesystem, write the remote metadata to the local filesystem //! - After the above is done for each timeline, open the tenant for business by //! transitioning it from `TenantState::Attaching` to `TenantState::Active` state. //! This starts the timelines' WAL-receivers and the tenant's GC & Compaction loops. //! -//! We keep track of the fact that a client is in `Attaching` state in a marker -//! file on the local disk. This is critical because, when we restart the pageserver, -//! we do not want to do the `List timelines` step for each tenant that has already -//! been successfully attached (for performance & cost reasons). -//! Instead, for a tenant without the attach marker file, we assume that the -//! local state is in sync or ahead of the remote state. This includes the list -//! of all of the tenant's timelines, which is particularly critical to be up-to-date: -//! if there's a timeline on the remote that the pageserver doesn't know about, -//! the GC will not consider its branch point, leading to data loss. -//! So, for a tenant with the attach marker file, we know that we do not yet have -//! persisted all the remote timeline's metadata files locally. To exclude the -//! risk above, we re-run the procedure for such tenants -//! //! # Operating Without Remote Storage //! //! If no remote storage configuration is provided, the [`RemoteTimelineClient`] is //! not created and the uploads are skipped. -//! Theoretically, it should be ok to remove and re-add remote storage configuration to -//! the pageserver config at any time, since it doesn't make a difference to -//! [`Timeline::load_layer_map`]. -//! Of course, the remote timeline dir must not change while we have de-configured -//! remote storage, i.e., the pageserver must remain the owner of the given prefix -//! in remote storage. -//! But note that we don't test any of this right now. //! //! [`Tenant::timeline_init_and_sync`]: super::Tenant::timeline_init_and_sync //! 
[`Timeline::load_layer_map`]: super::Timeline::load_layer_map @@ -468,7 +446,10 @@ impl RemoteTimelineClient { // /// Download index file - pub async fn download_index_file(&self) -> Result { + pub async fn download_index_file( + &self, + cancel: CancellationToken, + ) -> Result { let _unfinished_gauge_guard = self.metrics.call_begin( &RemoteOpFileKind::Index, &RemoteOpKind::Download, @@ -482,6 +463,7 @@ impl RemoteTimelineClient { &self.tenant_id, &self.timeline_id, self.generation, + cancel, ) .measure_remote_op( self.tenant_id, @@ -1725,7 +1707,11 @@ mod tests { let client = timeline.remote_client.as_ref().unwrap(); // Download back the index.json, and check that the list of files is correct - let initial_index_part = match client.download_index_file().await.unwrap() { + let initial_index_part = match client + .download_index_file(CancellationToken::new()) + .await + .unwrap() + { MaybeDeletedIndexPart::IndexPart(index_part) => index_part, MaybeDeletedIndexPart::Deleted(_) => panic!("unexpectedly got deleted index part"), }; @@ -1814,7 +1800,11 @@ mod tests { } // Download back the index.json, and check that the list of files is correct - let index_part = match client.download_index_file().await.unwrap() { + let index_part = match client + .download_index_file(CancellationToken::new()) + .await + .unwrap() + { MaybeDeletedIndexPart::IndexPart(index_part) => index_part, MaybeDeletedIndexPart::Deleted(_) => panic!("unexpectedly got deleted index part"), }; @@ -2013,7 +2003,7 @@ mod tests { let client = test_state.build_client(get_generation); let download_r = client - .download_index_file() + .download_index_file(CancellationToken::new()) .await .expect("download should always succeed"); assert!(matches!(download_r, MaybeDeletedIndexPart::IndexPart(_))); diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index ca6e3293d6..6039b01ab8 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -18,8 +18,8 @@ use crate::config::PageServerConf; use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path}; use crate::tenant::storage_layer::LayerFileName; use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id; -use crate::tenant::{Generation, TENANT_DELETED_MARKER_FILE_NAME}; -use remote_storage::{DownloadError, GenericRemoteStorage}; +use crate::tenant::Generation; +use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode}; use utils::crashsafe::path_with_suffix_extension; use utils::id::{TenantId, TimelineId}; @@ -170,53 +170,43 @@ pub fn is_temp_download_file(path: &Utf8Path) -> bool { pub async fn list_remote_timelines( storage: &GenericRemoteStorage, tenant_id: TenantId, -) -> anyhow::Result> { + cancel: CancellationToken, +) -> anyhow::Result<(HashSet, HashSet)> { let remote_path = remote_timelines_path(&tenant_id); fail::fail_point!("storage-sync-list-remote-timelines", |_| { anyhow::bail!("storage-sync-list-remote-timelines"); }); - let timelines = download_retry( - || storage.list_prefixes(Some(&remote_path)), - &format!("list prefixes for {tenant_id}"), + let listing = download_retry_forever( + || storage.list(Some(&remote_path), ListingMode::WithDelimiter), + &format!("list timelines for {tenant_id}"), + cancel, ) .await?; - if timelines.is_empty() { - anyhow::bail!("no timelines found on the remote storage") - } - let mut timeline_ids = HashSet::new(); + let mut 
other_prefixes = HashSet::new(); - for timeline_remote_storage_key in timelines { - if timeline_remote_storage_key.object_name() == Some(TENANT_DELETED_MARKER_FILE_NAME) { - // A `deleted` key within `timelines/` is a marker file, not a timeline. Ignore it. - // This code will be removed in https://github.com/neondatabase/neon/pull/5580 - continue; - } - + for timeline_remote_storage_key in listing.prefixes { let object_name = timeline_remote_storage_key.object_name().ok_or_else(|| { anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_id}") })?; - let timeline_id: TimelineId = object_name - .parse() - .with_context(|| format!("parse object name into timeline id '{object_name}'"))?; - - // list_prefixes is assumed to return unique names. Ensure this here. - // NB: it's safer to bail out than warn-log this because the pageserver - // needs to absolutely know about _all_ timelines that exist, so that - // GC knows all the branchpoints. If we skipped over a timeline instead, - // GC could delete a layer that's still needed by that timeline. - anyhow::ensure!( - !timeline_ids.contains(&timeline_id), - "list_prefixes contains duplicate timeline id {timeline_id}" - ); - timeline_ids.insert(timeline_id); + match object_name.parse::() { + Ok(t) => timeline_ids.insert(t), + Err(_) => other_prefixes.insert(object_name.to_string()), + }; } - Ok(timeline_ids) + for key in listing.keys { + let object_name = key + .object_name() + .ok_or_else(|| anyhow::anyhow!("object name for key {key}"))?; + other_prefixes.insert(object_name.to_string()); + } + + Ok((timeline_ids, other_prefixes)) } async fn do_download_index_part( @@ -224,10 +214,11 @@ async fn do_download_index_part( tenant_id: &TenantId, timeline_id: &TimelineId, index_generation: Generation, + cancel: CancellationToken, ) -> Result { let remote_path = remote_index_path(tenant_id, timeline_id, index_generation); - let index_part_bytes = download_retry( + let index_part_bytes = download_retry_forever( || async { let mut index_part_download = storage.download(&remote_path).await?; @@ -242,6 +233,7 @@ async fn do_download_index_part( Ok(index_part_bytes) }, &format!("download {remote_path:?}"), + cancel, ) .await?; @@ -263,19 +255,28 @@ pub(super) async fn download_index_part( tenant_id: &TenantId, timeline_id: &TimelineId, my_generation: Generation, + cancel: CancellationToken, ) -> Result { debug_assert_current_span_has_tenant_and_timeline_id(); if my_generation.is_none() { // Operating without generations: just fetch the generation-less path - return do_download_index_part(storage, tenant_id, timeline_id, my_generation).await; + return do_download_index_part(storage, tenant_id, timeline_id, my_generation, cancel) + .await; } // Stale case: If we were intentionally attached in a stale generation, there may already be a remote // index in our generation. // // This is an optimization to avoid doing the listing for the general case below. - let res = do_download_index_part(storage, tenant_id, timeline_id, my_generation).await; + let res = do_download_index_part( + storage, + tenant_id, + timeline_id, + my_generation, + cancel.clone(), + ) + .await; match res { Ok(index_part) => { tracing::debug!( @@ -295,8 +296,14 @@ pub(super) async fn download_index_part( // we want to find the most recent index from a previous generation. // // This is an optimization to avoid doing the listing for the general case below. 
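With the `Listing`-based API, `list_remote_timelines` now keeps everything it sees: prefixes that parse as timeline IDs go into one set, and all other entries (for example the `deleted` tenant-deletion mark) are returned to the caller instead of causing an error. A stand-alone sketch of that partitioning step, with a stand-in `TimelineId` newtype (the real ID is 16 bytes rendered as 32 hex characters, parsed the same way here):

use std::collections::HashSet;
use std::str::FromStr;

#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
struct TimelineId(u128);

impl FromStr for TimelineId {
    type Err = std::num::ParseIntError;
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        u128::from_str_radix(s, 16).map(TimelineId)
    }
}

fn split_listing(names: Vec<String>) -> (HashSet<TimelineId>, HashSet<String>) {
    let mut timeline_ids = HashSet::new();
    let mut other_prefixes = HashSet::new();
    for name in names {
        match name.parse::<TimelineId>() {
            // Anything that parses is treated as a timeline.
            Ok(id) => {
                timeline_ids.insert(id);
            }
            // Everything else is reported back, e.g. a deletion mark.
            Err(_) => {
                other_prefixes.insert(name);
            }
        }
    }
    (timeline_ids, other_prefixes)
}

fn main() {
    let (ids, other) = split_listing(vec![
        "3b3f3bf6a5b2f9c4a6e1d2c3b4a5968f".to_string(),
        "deleted".to_string(),
    ]);
    assert_eq!(ids.len(), 1);
    assert!(other.contains("deleted"));
}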
- let res = - do_download_index_part(storage, tenant_id, timeline_id, my_generation.previous()).await; + let res = do_download_index_part( + storage, + tenant_id, + timeline_id, + my_generation.previous(), + cancel.clone(), + ) + .await; match res { Ok(index_part) => { tracing::debug!("Found index_part from previous generation"); @@ -340,13 +347,14 @@ pub(super) async fn download_index_part( match max_previous_generation { Some(g) => { tracing::debug!("Found index_part in generation {g:?}"); - do_download_index_part(storage, tenant_id, timeline_id, g).await + do_download_index_part(storage, tenant_id, timeline_id, g, cancel).await } None => { // Migration from legacy pre-generation state: we have a generation but no prior // attached pageservers did. Try to load from a no-generation path. tracing::info!("No index_part.json* found"); - do_download_index_part(storage, tenant_id, timeline_id, Generation::none()).await + do_download_index_part(storage, tenant_id, timeline_id, Generation::none(), cancel) + .await } } } @@ -376,3 +384,23 @@ where ) .await } + +async fn download_retry_forever( + op: O, + description: &str, + cancel: CancellationToken, +) -> Result +where + O: FnMut() -> F, + F: Future>, +{ + backoff::retry( + op, + |e| matches!(e, DownloadError::BadInput(_) | DownloadError::NotFound), + FAILED_DOWNLOAD_WARN_THRESHOLD, + u32::MAX, + description, + backoff::Cancel::new(cancel, || DownloadError::Cancelled), + ) + .await +} diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 7d55388f44..8cead5ebf2 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -294,6 +294,7 @@ async fn cleanup_remaining_timeline_fs_traces( // Remove delete mark tokio::fs::remove_file(conf.timeline_delete_mark_file_path(tenant_id, timeline_id)) .await + .or_else(fs_ext::ignore_not_found) .context("remove delete mark") } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 9184f0c43a..d1d19fa542 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -60,6 +60,7 @@ from fixtures.utils import ( allure_attach_from_dir, get_self_dir, subprocess_capture, + wait_until, ) """ @@ -1679,6 +1680,40 @@ class NeonPageserver(PgProtocol): self.running = False return self + def restart(self, immediate: bool = False): + """ + High level wrapper for restart: restarts the process, and waits for + tenant state to stabilize. + """ + self.stop(immediate=immediate) + self.start() + self.quiesce_tenants() + + def quiesce_tenants(self): + """ + Wait for all tenants to enter a stable state (Active or Broken) + + Call this after restarting the pageserver, or after attaching a tenant, + to ensure that it is ready for use. 
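Further down, in `timeline/delete.rs`, removing the delete mark now tolerates the file already being gone via `fs_ext::ignore_not_found`. A minimal sketch of what such a helper typically looks like (the pageserver's own implementation is not shown in this diff):

use std::io;

// Turn "the file was already gone" into success so that cleanup steps can be
// retried safely after a crash.
fn ignore_not_found(e: io::Error) -> io::Result<()> {
    if e.kind() == io::ErrorKind::NotFound {
        Ok(())
    } else {
        Err(e)
    }
}

fn main() -> io::Result<()> {
    let path = std::env::temp_dir().join("delete-mark-that-may-not-exist");
    // The first removal may or may not find the file; either outcome is fine.
    std::fs::remove_file(&path).or_else(ignore_not_found)?;
    // Repeating the removal is also fine, which is what makes the cleanup idempotent.
    std::fs::remove_file(&path).or_else(ignore_not_found)
}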
+ """ + + stable_states = {"Active", "Broken"} + + client = self.http_client() + + def complete(): + log.info("Checking tenants...") + tenants = client.tenant_list() + log.info(f"Tenant list: {tenants}...") + any_unstable = any((t["state"]["slug"] not in stable_states) for t in tenants) + if any_unstable: + for t in tenants: + log.info(f"Waiting for tenant {t['id']} in state {t['state']['slug']}") + log.info(f"any_unstable={any_unstable}") + assert not any_unstable + + wait_until(20, 0.5, complete) + def __enter__(self) -> "NeonPageserver": return self diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py index 32b1466c90..c4f743204e 100644 --- a/test_runner/regress/test_branching.py +++ b/test_runner/regress/test_branching.py @@ -333,16 +333,30 @@ def test_non_uploaded_root_timeline_is_deleted_after_restart(neon_env_builder: N env = neon_env_builder.init_configs() env.start() - env.pageserver.allowed_errors.append( - ".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*" + env.pageserver.allowed_errors.extend( + [ + ".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*", + ".*Failed to load index_part from remote storage.*", + # On a fast restart, there may be an initdb still running in a basebackup...__temp directory + ".*Failed to purge.*Directory not empty.*", + ] ) ps_http = env.pageserver.http_client() # pause all uploads - ps_http.configure_failpoints(("before-upload-index-pausable", "pause")) ps_http.tenant_create(env.initial_tenant) + # Create a timeline whose creation will succeed. The tenant will need at least one + # timeline to be loadable. + success_timeline = TimelineId.generate() + log.info(f"Creating timeline {success_timeline}") + ps_http.timeline_create(env.pg_version, env.initial_tenant, success_timeline, timeout=60) + + # Create a timeline whose upload to remote storage will be blocked + ps_http.configure_failpoints(("before-upload-index-pausable", "pause")) + def start_creating_timeline(): + log.info(f"Creating (expect failure) timeline {env.initial_timeline}") with pytest.raises(RequestException): ps_http.timeline_create( env.pg_version, env.initial_tenant, env.initial_timeline, timeout=60 @@ -366,6 +380,9 @@ def test_non_uploaded_root_timeline_is_deleted_after_restart(neon_env_builder: N with pytest.raises(PageserverApiException, match="not found"): ps_http.timeline_detail(env.initial_tenant, env.initial_timeline) + # The one successfully created timeline should still be there. + assert len(ps_http.timeline_list(tenant_id=env.initial_tenant)) == 1 + def test_non_uploaded_branch_is_deleted_after_restart(neon_env_builder: NeonEnvBuilder): """ diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index 48accdb43d..b1b47b3f2c 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -15,7 +15,7 @@ from fixtures.types import TenantId, TimelineId # Test restarting page server, while safekeeper and compute node keep # running. 
-def test_broken_timeline(neon_env_builder: NeonEnvBuilder): +def test_local_corruption(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() env.pageserver.allowed_errors.extend( @@ -69,24 +69,19 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder): env.pageserver.start() - # Tenant 0 should still work + # Un-damaged tenant works pg0.start() assert pg0.safe_psql("SELECT COUNT(*) FROM t")[0][0] == 100 - # But all others are broken - - # First timeline would not get loaded into pageserver due to corrupt metadata file - with pytest.raises( - Exception, match=f"Tenant {tenant1} will not become active. Current state: Broken" - ) as err: - pg1.start() - log.info( - f"As expected, compute startup failed eagerly for timeline with corrupt metadata: {err}" - ) + # Tenant with corrupt local metadata works: remote storage is authoritative for metadata + pg1.start() + assert pg1.safe_psql("SELECT COUNT(*) FROM t")[0][0] == 100 # Second timeline will fail during basebackup, because the local layer file is corrupt. # It will fail when we try to read (and reconstruct) a page from it, ergo the error message. # (We don't check layer file contents on startup, when loading the timeline) + # + # This will change when we implement checksums for layers with pytest.raises(Exception, match="layer loading failed:") as err: pg2.start() log.info( @@ -133,8 +128,7 @@ def test_timeline_init_break_before_checkpoint(neon_env_builder: NeonEnvBuilder) _ = env.neon_cli.create_timeline("test_timeline_init_break_before_checkpoint", tenant_id) # Restart the page server - env.pageserver.stop(immediate=True) - env.pageserver.start() + env.pageserver.restart(immediate=True) # Creating the timeline didn't finish. The other timelines on tenant should still be present and work normally. new_tenant_timelines = env.neon_cli.list_timelines(tenant_id) diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py index b49a1a40dd..4c51155236 100644 --- a/test_runner/regress/test_pageserver_restart.py +++ b/test_runner/regress/test_pageserver_restart.py @@ -62,14 +62,14 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder, generations: bool) tenant_load_delay_ms = 5000 env.pageserver.stop() env.pageserver.start( - extra_env_vars={"FAILPOINTS": f"before-loading-tenant=return({tenant_load_delay_ms})"} + extra_env_vars={"FAILPOINTS": f"before-attaching-tenant=return({tenant_load_delay_ms})"} ) - # Check that it's in Loading state + # Check that it's in Attaching state client = env.pageserver.http_client() tenant_status = client.tenant_status(env.initial_tenant) log.info("Tenant status : %s", tenant_status) - assert tenant_status["state"]["slug"] == "Loading" + assert tenant_status["state"]["slug"] == "Attaching" # Try to read. This waits until the loading finishes, and then return normally. 
cur.execute("SELECT count(*) FROM foo") diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index 7a0b2694b8..ae77197088 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -241,8 +241,7 @@ def test_delete_tenant_exercise_crash_safety_failpoints( assert reason.endswith(f"failpoint: {failpoint}"), reason if check is Check.RETRY_WITH_RESTART: - env.pageserver.stop() - env.pageserver.start() + env.pageserver.restart() if failpoint in ( "tenant-delete-before-shutdown", diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index e92a906fab..11e8a80e1d 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -66,10 +66,6 @@ def test_tenant_reattach( env.pageserver.allowed_errors.append( f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*" ) - # Thats because of UnreliableWrapper's injected failures - env.pageserver.allowed_errors.append( - f".*failed to fetch tenant deletion mark at tenants/({tenant_id}|{env.initial_tenant})/deleted attempt 1.*" - ) with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: with endpoint.cursor() as cur: @@ -116,7 +112,7 @@ def test_tenant_reattach( assert query_scalar(cur, "SELECT count(*) FROM t") == 100000 # Check that we had to retry the downloads - assert env.pageserver.log_contains(".*list prefixes.*failed, will retry.*") + assert env.pageserver.log_contains(".*list timelines.*failed, will retry.*") assert env.pageserver.log_contains(".*download.*failed, will retry.*") @@ -643,47 +639,6 @@ def test_ignored_tenant_download_missing_layers(neon_env_builder: NeonEnvBuilder ensure_test_data(data_id, data_secret, endpoint) -# Tests that it's possible to `load` broken tenants: -# * `ignore` a tenant -# * removes its `metadata` file locally -# * `load` the same tenant -# * ensure that it's status is `Broken` -def test_ignored_tenant_stays_broken_without_metadata(neon_env_builder: NeonEnvBuilder): - neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) - env = neon_env_builder.init_start() - pageserver_http = env.pageserver.http_client() - env.endpoints.create_start("main") - - tenant_id = env.initial_tenant - timeline_id = env.initial_timeline - - # Attempts to connect from compute to pageserver while the tenant is - # temporarily detached produces these errors in the pageserver log. - env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*") - env.pageserver.allowed_errors.append( - f".*Tenant {tenant_id} will not become active\\. Current state: (Broken|Stopping).*" - ) - - # ignore the tenant and remove its metadata - pageserver_http.tenant_ignore(tenant_id) - timeline_dir = env.pageserver.timeline_dir(tenant_id, timeline_id) - metadata_removed = False - for dir_entry in timeline_dir.iterdir(): - if dir_entry.name == "metadata": - # Looks like a layer file. 
Remove it - dir_entry.unlink() - metadata_removed = True - assert metadata_removed, f"Failed to find metadata file in {timeline_dir}" - - env.pageserver.allowed_errors.append( - f".*{tenant_id}.*: load failed.*: failed to load metadata.*" - ) - - # now, load it from the local files and expect it to be broken due to inability to load tenant files into memory - pageserver_http.tenant_load(tenant_id=tenant_id) - wait_until_tenant_state(pageserver_http, tenant_id, "Broken", 5) - - # Tests that attach is never working on a tenant, ignored or not, as long as it's not absent locally # Similarly, tests that it's not possible to schedule a `load` for tenat that's not ignored. def test_load_attach_negatives(neon_env_builder: NeonEnvBuilder): @@ -778,7 +733,8 @@ def test_ignore_while_attaching( tenants_before_ignore ), "Only ignored tenant should be missing" - # But can load it from local files, that will restore attach. + # Calling load will bring the tenant back online + pageserver_http.configure_failpoints([("attach-before-activate", "off")]) pageserver_http.tenant_load(tenant_id) wait_until_tenant_state(pageserver_http, tenant_id, "Active", 5) diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 40dff194aa..090d586721 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -1,5 +1,4 @@ import os -import shutil import time from contextlib import closing from datetime import datetime @@ -20,7 +19,7 @@ from fixtures.neon_fixtures import ( ) from fixtures.pageserver.utils import timeline_delete_wait_completed from fixtures.remote_storage import RemoteStorageKind, available_remote_storages -from fixtures.types import Lsn, TenantId, TimelineId +from fixtures.types import Lsn, TenantId from fixtures.utils import wait_until from prometheus_client.samples import Sample @@ -298,13 +297,8 @@ def test_pageserver_with_empty_tenants( client = env.pageserver.http_client() - tenant_with_empty_timelines = TenantId.generate() - client.tenant_create(tenant_with_empty_timelines) - temp_timelines = client.timeline_list(tenant_with_empty_timelines) - for temp_timeline in temp_timelines: - timeline_delete_wait_completed( - client, tenant_with_empty_timelines, TimelineId(temp_timeline["timeline_id"]) - ) + tenant_with_empty_timelines = env.initial_tenant + timeline_delete_wait_completed(client, tenant_with_empty_timelines, env.initial_timeline) files_in_timelines_dir = sum( 1 for _p in Path.iterdir(env.pageserver.timeline_dir(tenant_with_empty_timelines)) @@ -317,34 +311,19 @@ def test_pageserver_with_empty_tenants( env.endpoints.stop_all() env.pageserver.stop() - tenant_without_timelines_dir = env.initial_tenant - shutil.rmtree(env.pageserver.timeline_dir(tenant_without_timelines_dir)) - env.pageserver.start() client = env.pageserver.http_client() - def not_loading(): + def not_attaching(): tenants = client.tenant_list() - assert len(tenants) == 2 - assert all(t["state"]["slug"] != "Loading" for t in tenants) + assert len(tenants) == 1 + assert all(t["state"]["slug"] != "Attaching" for t in tenants) - wait_until(10, 0.2, not_loading) + wait_until(10, 0.2, not_attaching) tenants = client.tenant_list() - [broken_tenant] = [t for t in tenants if t["id"] == str(tenant_without_timelines_dir)] - assert ( - broken_tenant["state"]["slug"] == "Broken" - ), f"Tenant {tenant_without_timelines_dir} without timelines dir should be broken" - - broken_tenant_status = client.tenant_status(tenant_without_timelines_dir) - assert ( - 
broken_tenant_status["state"]["slug"] == "Broken" - ), f"Tenant {tenant_without_timelines_dir} without timelines dir should be broken" - - assert env.pageserver.log_contains(".*load failed, setting tenant state to Broken:.*") - [loaded_tenant] = [t for t in tenants if t["id"] == str(tenant_with_empty_timelines)] assert ( loaded_tenant["state"]["slug"] == "Active" @@ -358,9 +337,6 @@ def test_pageserver_with_empty_tenants( time.sleep(1) # to allow metrics propagation ps_metrics = client.get_metrics() - broken_tenants_metric_filter = { - "tenant_id": str(tenant_without_timelines_dir), - } active_tenants_metric_filter = { "state": "Active", } @@ -374,13 +350,3 @@ def test_pageserver_with_empty_tenants( assert ( tenant_active_count == 1 ), f"Tenant {tenant_with_empty_timelines} should have metric as active" - - tenant_broken_count = int( - ps_metrics.query_one( - "pageserver_broken_tenants_count", filter=broken_tenants_metric_filter - ).value - ) - - assert ( - tenant_broken_count == 1 - ), f"Tenant {tenant_without_timelines_dir} should have metric as broken" diff --git a/test_runner/regress/test_threshold_based_eviction.py b/test_runner/regress/test_threshold_based_eviction.py index 12866accc7..27d5cce5f2 100644 --- a/test_runner/regress/test_threshold_based_eviction.py +++ b/test_runner/regress/test_threshold_based_eviction.py @@ -70,8 +70,7 @@ def test_threshold_based_eviction( } # restart because changing tenant config is not instant - env.pageserver.stop() - env.pageserver.start() + env.pageserver.restart() assert ps_http.tenant_config(tenant_id).effective_config["eviction_policy"] == { "kind": "LayerAccessThreshold", diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index c412809a3a..2e1fcd38fe 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -277,13 +277,6 @@ def test_delete_timeline_exercise_crash_safety_failpoints( if failpoint == "timeline-delete-after-index-delete": m = ps_http.get_metrics() - assert ( - m.query_one( - "remote_storage_s3_request_seconds_count", - filter={"request_type": "get_object", "result": "err"}, - ).value - == 2 # One is missing tenant deletion mark, second is missing index part - ) assert ( m.query_one( "remote_storage_s3_request_seconds_count",