From 0af66a60030282775217ad9df4f306f5c0d7bb8b Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 7 May 2024 18:03:12 +0100 Subject: [PATCH] pageserver: include generation number in local layer paths (#7609) ## Problem In https://github.com/neondatabase/neon/pull/7531, we would like to be able to rewrite layers safely. One option is to make `Layer` able to rewrite files in place safely (e.g. by blocking evictions/deletions for an old Layer while a new one is created), but that's relatively fragile. It's more robust in general if we simply never overwrite the same local file: we can do that by putting the generation number in the filename. ## Summary of changes - Add `local_layer_path` (counterpart to `remote_layer_path`) and convert all locations that manually constructed a local layer path by joining LayerFileName to timeline path - In the layer upload path, construct remote paths with `remote_layer_path` rather than trying to build them out of a local path. - During startup, carry the full path to layer files through `init::reconcile`, and pass it into `Layer::for_resident` - Add a test to make sure we handle upgrades properly. - Comment out the generation part of `local_layer_path`, since we need to maintain forward compatibility for one release. A tiny followup PR will enable it afterwards. We could make this a bit simpler if we bulk renamed existing layers on startup instead of carrying literal paths through init, but that is operationally risky on existing servers with millions of layer files. We can always do a renaming change in future if it becomes annoying, but for the moment it's kind of nice to have a structure that enables us to change local path names again in future quite easily. We should rename `LayerFileName` to `LayerName` or somesuch, to make it more obvious that it's not a literal filename: this was already a bit confusing where that type is used in remote paths. That will be a followup, to avoid polluting this PR's diff. 
--- pageserver/src/disk_usage_eviction_task.rs | 7 +- pageserver/src/http/routes.rs | 10 +- pageserver/src/metrics.rs | 2 + .../src/tenant/remote_timeline_client.rs | 111 +++++++++++------- .../tenant/remote_timeline_client/download.rs | 9 +- .../tenant/remote_timeline_client/upload.rs | 52 +++----- pageserver/src/tenant/secondary.rs | 16 ++- pageserver/src/tenant/secondary/downloader.rs | 49 +++++--- .../src/tenant/storage_layer/filename.rs | 71 ++++++++++- pageserver/src/tenant/storage_layer/layer.rs | 58 ++++++++- pageserver/src/tenant/timeline.rs | 48 +++++--- pageserver/src/tenant/timeline/init.rs | 38 ++++-- test_runner/fixtures/neon_fixtures.py | 33 +++++- test_runner/fixtures/pageserver/types.py | 59 ++++------ test_runner/regress/test_duplicate_layers.py | 24 ++-- test_runner/regress/test_layer_eviction.py | 53 +++++---- .../regress/test_pageserver_generations.py | 48 ++++++++ .../regress/test_pageserver_secondary.py | 64 ++++------ test_runner/regress/test_remote_storage.py | 16 +-- .../test_tenants_with_remote_storage.py | 8 +- .../regress/test_timeline_detach_ancestor.py | 2 +- 21 files changed, 507 insertions(+), 271 deletions(-) diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 6248424cee..400930245b 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -540,7 +540,12 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl( js.spawn(async move { layer .secondary_tenant - .evict_layer(tenant_manager.get_conf(), layer.timeline_id, layer.name) + .evict_layer( + tenant_manager.get_conf(), + layer.timeline_id, + layer.name, + layer.metadata, + ) .await; Ok(file_size) }); diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index ea4c7f1e3b..83b7b8a45e 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -63,6 +63,7 @@ use crate::tenant::remote_timeline_client::list_remote_timelines; use 
crate::tenant::secondary::SecondaryController; use crate::tenant::size::ModelInputs; use crate::tenant::storage_layer::LayerAccessStatsReset; +use crate::tenant::storage_layer::LayerFileName; use crate::tenant::timeline::CompactFlags; use crate::tenant::timeline::Timeline; use crate::tenant::SpawnMode; @@ -1228,13 +1229,15 @@ async fn layer_download_handler( let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let layer_file_name = get_request_param(&request, "layer_file_name")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let layer_name = LayerFileName::from_str(layer_file_name) + .map_err(|s| ApiError::BadRequest(anyhow::anyhow!(s)))?; let state = get_state(&request); let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) .await?; let downloaded = timeline - .download_layer(layer_file_name) + .download_layer(&layer_name) .await .map_err(ApiError::InternalServerError)?; @@ -1258,11 +1261,14 @@ async fn evict_timeline_layer_handler( let layer_file_name = get_request_param(&request, "layer_file_name")?; let state = get_state(&request); + let layer_name = LayerFileName::from_str(layer_file_name) + .map_err(|s| ApiError::BadRequest(anyhow::anyhow!(s)))?; + let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) .await?; let evicted = timeline - .evict_layer(layer_file_name) + .evict_layer(&layer_name) .await .map_err(ApiError::InternalServerError)?; diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 40712e4895..256f2f334c 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -2929,6 +2929,8 @@ pub fn preinitialize_metrics() { &WALRECEIVER_CANDIDATES_REMOVED, &tokio_epoll_uring::THREAD_LOCAL_LAUNCH_FAILURES, &tokio_epoll_uring::THREAD_LOCAL_LAUNCH_SUCCESSES, + &REMOTE_ONDEMAND_DOWNLOADED_LAYERS, + &REMOTE_ONDEMAND_DOWNLOADED_BYTES, ] .into_iter() .for_each(|c| { diff --git 
a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 255449c049..356a0dc51c 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -1140,15 +1140,21 @@ impl RemoteTimelineClient { uploaded: &ResidentLayer, cancel: &CancellationToken, ) -> anyhow::Result<()> { + let remote_path = remote_layer_path( + &self.tenant_shard_id.tenant_id, + &self.timeline_id, + self.tenant_shard_id.to_index(), + &uploaded.layer_desc().filename(), + uploaded.metadata().generation, + ); + backoff::retry( || async { - let m = uploaded.metadata(); upload::upload_timeline_layer( - self.conf, &self.storage_impl, uploaded.local_path(), - &uploaded.metadata(), - m.generation, + &remote_path, + uploaded.metadata().file_size(), cancel, ) .await @@ -1173,15 +1179,30 @@ impl RemoteTimelineClient { adopted_as: &Layer, cancel: &CancellationToken, ) -> anyhow::Result<()> { + let source_remote_path = remote_layer_path( + &self.tenant_shard_id.tenant_id, + &adopted + .get_timeline_id() + .expect("Source timeline should be alive"), + self.tenant_shard_id.to_index(), + &adopted.layer_desc().filename(), + adopted.metadata().generation, + ); + + let target_remote_path = remote_layer_path( + &self.tenant_shard_id.tenant_id, + &self.timeline_id, + self.tenant_shard_id.to_index(), + &adopted_as.layer_desc().filename(), + adopted_as.metadata().generation, + ); + backoff::retry( || async { upload::copy_timeline_layer( - self.conf, &self.storage_impl, - adopted.local_path(), - &adopted.metadata(), - adopted_as.local_path(), - &adopted_as.metadata(), + &source_remote_path, + &target_remote_path, cancel, ) .await @@ -1496,13 +1517,25 @@ impl RemoteTimelineClient { let upload_result: anyhow::Result<()> = match &task.op { UploadOp::UploadLayer(ref layer, ref layer_metadata) => { - let path = layer.local_path(); + let local_path = layer.local_path(); + + // We should only be uploading layers created by 
this `Tenant`'s lifetime, so + // the metadata in the upload should always match our current generation. + assert_eq!(layer_metadata.generation, self.generation); + + let remote_path = remote_layer_path( + &self.tenant_shard_id.tenant_id, + &self.timeline_id, + layer_metadata.shard, + &layer.layer_desc().filename(), + layer_metadata.generation, + ); + upload::upload_timeline_layer( - self.conf, &self.storage_impl, - path, - layer_metadata, - self.generation, + local_path, + &remote_path, + layer_metadata.file_size(), &self.cancel, ) .measure_remote_op( @@ -1931,29 +1964,6 @@ pub fn parse_remote_index_path(path: RemotePath) -> Option { } } -/// Files on the remote storage are stored with paths, relative to the workdir. -/// That path includes in itself both tenant and timeline ids, allowing to have a unique remote storage path. -/// -/// Errors if the path provided does not start from pageserver's workdir. -pub(crate) fn remote_path( - conf: &PageServerConf, - local_path: &Utf8Path, - generation: Generation, -) -> anyhow::Result { - let stripped = local_path - .strip_prefix(&conf.workdir) - .context("Failed to strip workdir prefix")?; - - let suffixed = format!("{0}{1}", stripped, generation.get_suffix()); - - RemotePath::new(Utf8Path::new(&suffixed)).with_context(|| { - format!( - "to resolve remote part of path {:?} for base {:?}", - local_path, conf.workdir - ) - }) -} - #[cfg(test)] mod tests { use super::*; @@ -1961,6 +1971,7 @@ mod tests { context::RequestContext, tenant::{ harness::{TenantHarness, TIMELINE_ID}, + storage_layer::layer::local_layer_path, Tenant, Timeline, }, DEFAULT_PG_VERSION, @@ -2143,11 +2154,20 @@ mod tests { ] .into_iter() .map(|(name, contents): (LayerFileName, Vec)| { - std::fs::write(timeline_path.join(name.file_name()), &contents).unwrap(); + + let local_path = local_layer_path( + harness.conf, + &timeline.tenant_shard_id, + &timeline.timeline_id, + &name, + &generation, + ); + std::fs::write(&local_path, &contents).unwrap(); 
Layer::for_resident( harness.conf, &timeline, + local_path, name, LayerFileMetadata::new(contents.len() as u64, generation, shard), ) @@ -2284,19 +2304,22 @@ mod tests { .. } = TestSetup::new("metrics").await.unwrap(); let client = timeline.remote_client.as_ref().unwrap(); - let timeline_path = harness.timeline_path(&TIMELINE_ID); let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(); + let local_path = local_layer_path( + harness.conf, + &timeline.tenant_shard_id, + &timeline.timeline_id, + &layer_file_name_1, + &harness.generation, + ); let content_1 = dummy_contents("foo"); - std::fs::write( - timeline_path.join(layer_file_name_1.file_name()), - &content_1, - ) - .unwrap(); + std::fs::write(&local_path, &content_1).unwrap(); let layer_file_1 = Layer::for_resident( harness.conf, &timeline, + local_path, layer_file_name_1.clone(), LayerFileMetadata::new(content_1.len() as u64, harness.generation, harness.shard), ); diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index b038f264f5..c86b22d481 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -21,6 +21,7 @@ use crate::config::PageServerConf; use crate::context::RequestContext; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path}; +use crate::tenant::storage_layer::layer::local_layer_path; use crate::tenant::storage_layer::LayerFileName; use crate::tenant::Generation; use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}; @@ -55,7 +56,13 @@ pub async fn download_layer_file<'a>( debug_assert_current_span_has_tenant_and_timeline_id(); let timeline_path = conf.timeline_path(&tenant_shard_id, &timeline_id); - let local_path = 
timeline_path.join(layer_file_name.file_name()); + let local_path = local_layer_path( + conf, + &tenant_shard_id, + &timeline_id, + layer_file_name, + &layer_metadata.generation, + ); let remote_path = remote_layer_path( &tenant_shard_id.tenant_id, diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs index a988369b6a..caa843316f 100644 --- a/pageserver/src/tenant/remote_timeline_client/upload.rs +++ b/pageserver/src/tenant/remote_timeline_client/upload.rs @@ -12,18 +12,13 @@ use tokio_util::sync::CancellationToken; use utils::backoff; use super::Generation; -use crate::{ - config::PageServerConf, - tenant::remote_timeline_client::{ - index::IndexPart, remote_index_path, remote_initdb_archive_path, - remote_initdb_preserved_archive_path, remote_path, - }, +use crate::tenant::remote_timeline_client::{ + index::IndexPart, remote_index_path, remote_initdb_archive_path, + remote_initdb_preserved_archive_path, }; -use remote_storage::{GenericRemoteStorage, TimeTravelError}; +use remote_storage::{GenericRemoteStorage, RemotePath, TimeTravelError}; use utils::id::{TenantId, TimelineId}; -use super::index::LayerFileMetadata; - use tracing::info; /// Serializes and uploads the given index part data to the remote storage. @@ -65,11 +60,10 @@ pub(crate) async fn upload_index_part<'a>( /// /// On an error, bumps the retries count and reschedules the entire task. 
pub(super) async fn upload_timeline_layer<'a>( - conf: &'static PageServerConf, storage: &'a GenericRemoteStorage, - source_path: &'a Utf8Path, - known_metadata: &'a LayerFileMetadata, - generation: Generation, + local_path: &'a Utf8Path, + remote_path: &'a RemotePath, + metadata_size: u64, cancel: &CancellationToken, ) -> anyhow::Result<()> { fail_point!("before-upload-layer", |_| { @@ -78,8 +72,7 @@ pub(super) async fn upload_timeline_layer<'a>( pausable_failpoint!("before-upload-layer-pausable"); - let storage_path = remote_path(conf, source_path, generation)?; - let source_file_res = fs::File::open(&source_path).await; + let source_file_res = fs::File::open(&local_path).await; let source_file = match source_file_res { Ok(source_file) => source_file, Err(e) if e.kind() == ErrorKind::NotFound => { @@ -90,43 +83,37 @@ pub(super) async fn upload_timeline_layer<'a>( // it has been written to disk yet. // // This is tested against `test_compaction_delete_before_upload` - info!(path = %source_path, "File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more."); + info!(path = %local_path, "File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more."); return Ok(()); } - Err(e) => { - Err(e).with_context(|| format!("open a source file for layer {source_path:?}"))? - } + Err(e) => Err(e).with_context(|| format!("open a source file for layer {local_path:?}"))?, }; let fs_size = source_file .metadata() .await - .with_context(|| format!("get the source file metadata for layer {source_path:?}"))? + .with_context(|| format!("get the source file metadata for layer {local_path:?}"))? 
.len(); - let metadata_size = known_metadata.file_size(); if metadata_size != fs_size { - bail!("File {source_path:?} has its current FS size {fs_size} diferent from initially determined {metadata_size}"); + bail!("File {local_path:?} has its current FS size {fs_size} diferent from initially determined {metadata_size}"); } let fs_size = usize::try_from(fs_size) - .with_context(|| format!("convert {source_path:?} size {fs_size} usize"))?; + .with_context(|| format!("convert {local_path:?} size {fs_size} usize"))?; let reader = tokio_util::io::ReaderStream::with_capacity(source_file, super::BUFFER_SIZE); storage - .upload(reader, fs_size, &storage_path, None, cancel) + .upload(reader, fs_size, remote_path, None, cancel) .await - .with_context(|| format!("upload layer from local path '{source_path}'")) + .with_context(|| format!("upload layer from local path '{local_path}'")) } pub(super) async fn copy_timeline_layer( - conf: &'static PageServerConf, storage: &GenericRemoteStorage, - source_path: &Utf8Path, - source_metadata: &LayerFileMetadata, - target_path: &Utf8Path, - target_metadata: &LayerFileMetadata, + source_path: &RemotePath, + target_path: &RemotePath, cancel: &CancellationToken, ) -> anyhow::Result<()> { fail_point!("before-copy-layer", |_| { @@ -135,11 +122,8 @@ pub(super) async fn copy_timeline_layer( pausable_failpoint!("before-copy-layer-pausable"); - let source_path = remote_path(conf, source_path, source_metadata.generation)?; - let target_path = remote_path(conf, target_path, target_metadata.generation)?; - storage - .copy_object(&source_path, &target_path, cancel) + .copy_object(source_path, target_path, cancel) .await .with_context(|| format!("copy layer {source_path} to {target_path}")) } diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs index 5c46df268a..0bb25f0ace 100644 --- a/pageserver/src/tenant/secondary.rs +++ b/pageserver/src/tenant/secondary.rs @@ -21,8 +21,9 @@ use self::{ use super::{ 
config::{SecondaryLocationConfig, TenantConfOpt}, mgr::TenantManager, + remote_timeline_client::LayerFileMetadata, span::debug_assert_current_span_has_tenant_id, - storage_layer::LayerFileName, + storage_layer::{layer::local_layer_path, LayerFileName}, }; use pageserver_api::{ @@ -182,6 +183,7 @@ impl SecondaryTenant { conf: &PageServerConf, timeline_id: TimelineId, name: LayerFileName, + metadata: LayerFileMetadata, ) { debug_assert_current_span_has_tenant_id(); @@ -195,9 +197,13 @@ impl SecondaryTenant { let now = SystemTime::now(); - let path = conf - .timeline_path(&self.tenant_shard_id, &timeline_id) - .join(name.file_name()); + let local_path = local_layer_path( + conf, + &self.tenant_shard_id, + &timeline_id, + &name, + &metadata.generation, + ); let this = self.clone(); @@ -208,7 +214,7 @@ impl SecondaryTenant { // it, the secondary downloader could have seen an updated heatmap that // resulted in a layer being deleted. // Other local I/O errors are process-fatal: these should never happen. 
- let deleted = std::fs::remove_file(path); + let deleted = std::fs::remove_file(local_path); let not_found = deleted .as_ref() diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index fb8907b5a8..092630e74d 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -22,7 +22,7 @@ use crate::{ FAILED_REMOTE_OP_RETRIES, }, span::debug_assert_current_span_has_tenant_id, - storage_layer::LayerFileName, + storage_layer::{layer::local_layer_path, LayerFileName}, tasks::{warn_when_period_overrun, BackgroundLoopKind}, }, virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}, @@ -621,12 +621,12 @@ impl<'a> TenantDownloader<'a> { let layers_in_heatmap = heatmap_timeline .layers .iter() - .map(|l| &l.name) + .map(|l| (&l.name, l.metadata.generation)) .collect::>(); let layers_on_disk = timeline_state .on_disk_layers .iter() - .map(|l| l.0) + .map(|l| (l.0, l.1.metadata.generation)) .collect::>(); let mut layer_count = layers_on_disk.len(); @@ -637,16 +637,24 @@ impl<'a> TenantDownloader<'a> { .sum(); // Remove on-disk layers that are no longer present in heatmap - for layer in layers_on_disk.difference(&layers_in_heatmap) { + for (layer_file_name, generation) in layers_on_disk.difference(&layers_in_heatmap) { layer_count -= 1; layer_byte_count -= timeline_state .on_disk_layers - .get(layer) + .get(layer_file_name) .unwrap() .metadata .file_size(); - delete_layers.push((*timeline_id, (*layer).clone())); + let local_path = local_layer_path( + self.conf, + self.secondary_state.get_tenant_shard_id(), + timeline_id, + layer_file_name, + generation, + ); + + delete_layers.push((*timeline_id, (*layer_file_name).clone(), local_path)); } progress.bytes_downloaded += layer_byte_count; @@ -661,11 +669,7 @@ impl<'a> TenantDownloader<'a> { } // Execute accumulated deletions - for (timeline_id, layer_name) in delete_layers { - let timeline_path = self - .conf - 
.timeline_path(self.secondary_state.get_tenant_shard_id(), &timeline_id); - let local_path = timeline_path.join(layer_name.to_string()); + for (timeline_id, layer_name, local_path) in delete_layers { tracing::info!(timeline_id=%timeline_id, "Removing secondary local layer {layer_name} because it's absent in heatmap",); tokio::fs::remove_file(&local_path) @@ -754,9 +758,6 @@ impl<'a> TenantDownloader<'a> { ) -> Result<(), UpdateError> { debug_assert_current_span_has_tenant_and_timeline_id(); let tenant_shard_id = self.secondary_state.get_tenant_shard_id(); - let timeline_path = self - .conf - .timeline_path(tenant_shard_id, &timeline.timeline_id); // Accumulate updates to the state let mut touched = Vec::new(); @@ -806,10 +807,14 @@ impl<'a> TenantDownloader<'a> { if cfg!(debug_assertions) { // Debug for https://github.com/neondatabase/neon/issues/6966: check that the files we think // are already present on disk are really there. - let local_path = self - .conf - .timeline_path(tenant_shard_id, &timeline.timeline_id) - .join(layer.name.file_name()); + let local_path = local_layer_path( + self.conf, + tenant_shard_id, + &timeline.timeline_id, + &layer.name, + &layer.metadata.generation, + ); + match tokio::fs::metadata(&local_path).await { Ok(meta) => { tracing::debug!( @@ -903,7 +908,13 @@ impl<'a> TenantDownloader<'a> { }; if downloaded_bytes != layer.metadata.file_size { - let local_path = timeline_path.join(layer.name.to_string()); + let local_path = local_layer_path( + self.conf, + tenant_shard_id, + &timeline.timeline_id, + &layer.name, + &layer.metadata.generation, + ); tracing::warn!( "Downloaded layer {} with unexpected size {} != {}. Removing download.", diff --git a/pageserver/src/tenant/storage_layer/filename.rs b/pageserver/src/tenant/storage_layer/filename.rs index a98be0842b..fff66a9d07 100644 --- a/pageserver/src/tenant/storage_layer/filename.rs +++ b/pageserver/src/tenant/storage_layer/filename.rs @@ -2,11 +2,13 @@ //! 
Helper functions for dealing with filenames of the image and delta layer files. //! use crate::repository::Key; +use std::borrow::Cow; use std::cmp::Ordering; use std::fmt; use std::ops::Range; use std::str::FromStr; +use regex::Regex; use utils::lsn::Lsn; use super::PersistentLayerDesc; @@ -74,10 +76,19 @@ impl DeltaFileName { let key_end_str = key_parts.next()?; let lsn_start_str = lsn_parts.next()?; let lsn_end_str = lsn_parts.next()?; + if parts.next().is_some() || key_parts.next().is_some() || key_parts.next().is_some() { return None; } + if key_start_str.len() != 36 + || key_end_str.len() != 36 + || lsn_start_str.len() != 16 + || lsn_end_str.len() != 16 + { + return None; + } + let key_start = Key::from_hex(key_start_str).ok()?; let key_end = Key::from_hex(key_end_str).ok()?; @@ -182,6 +193,10 @@ impl ImageFileName { return None; } + if key_start_str.len() != 36 || key_end_str.len() != 36 || lsn_str.len() != 16 { + return None; + } + let key_start = Key::from_hex(key_start_str).ok()?; let key_end = Key::from_hex(key_end_str).ok()?; @@ -259,9 +274,22 @@ impl From for LayerFileName { impl FromStr for LayerFileName { type Err = String; + /// Conversion from either a physical layer filename, or the string-ization of + /// Self. When loading a physical layer filename, we drop any extra information + /// not needed to build Self. 
fn from_str(value: &str) -> Result { - let delta = DeltaFileName::parse_str(value); - let image = ImageFileName::parse_str(value); + let gen_suffix_regex = Regex::new("^(?.+)-(?[0-9a-f]{8})$").unwrap(); + let file_name: Cow = match gen_suffix_regex.captures(value) { + Some(captures) => captures + .name("base") + .expect("Non-optional group") + .as_str() + .into(), + None => value.into(), + }; + + let delta = DeltaFileName::parse_str(&file_name); + let image = ImageFileName::parse_str(&file_name); let ok = match (delta, image) { (None, None) => { return Err(format!( @@ -315,3 +343,42 @@ impl<'de> serde::de::Visitor<'de> for LayerFileNameVisitor { v.parse().map_err(|e| E::custom(e)) } } + +#[cfg(test)] +mod test { + use super::*; + #[test] + fn image_layer_parse() -> anyhow::Result<()> { + let expected = LayerFileName::Image(ImageFileName { + key_range: Key::from_i128(0) + ..Key::from_hex("000000067F00000001000004DF0000000006").unwrap(), + lsn: Lsn::from_hex("00000000014FED58").unwrap(), + }); + let parsed = LayerFileName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-00000001").map_err(|s| anyhow::anyhow!(s))?; + assert_eq!(parsed, expected,); + + // Omitting generation suffix is valid + let parsed = LayerFileName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58").map_err(|s| anyhow::anyhow!(s))?; + assert_eq!(parsed, expected,); + + Ok(()) + } + + #[test] + fn delta_layer_parse() -> anyhow::Result<()> { + let expected = LayerFileName::Delta(DeltaFileName { + key_range: Key::from_i128(0) + ..Key::from_hex("000000067F00000001000004DF0000000006").unwrap(), + lsn_range: Lsn::from_hex("00000000014FED58").unwrap() + ..Lsn::from_hex("000000000154C481").unwrap(), + }); + let parsed = LayerFileName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481-00000001").map_err(|s| anyhow::anyhow!(s))?; + 
assert_eq!(parsed, expected); + + // Omitting generation suffix is valid + let parsed = LayerFileName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481").map_err(|s| anyhow::anyhow!(s))?; + assert_eq!(parsed, expected); + + Ok(()) + } +} diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 27faa507ca..b5e69db7f4 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -4,12 +4,13 @@ use pageserver_api::keyspace::KeySpace; use pageserver_api::models::{ HistoricLayerInfo, LayerAccessKind, LayerResidenceEventReason, LayerResidenceStatus, }; -use pageserver_api::shard::ShardIndex; +use pageserver_api::shard::{ShardIndex, TenantShardId}; use std::ops::Range; use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; use std::sync::{Arc, Weak}; use std::time::{Duration, SystemTime}; use tracing::Instrument; +use utils::id::TimelineId; use utils::lsn::Lsn; use utils::sync::heavier_once_cell; @@ -123,6 +124,25 @@ impl PartialEq for Layer { } } +pub(crate) fn local_layer_path( + conf: &PageServerConf, + tenant_shard_id: &TenantShardId, + timeline_id: &TimelineId, + layer_file_name: &LayerFileName, + _generation: &Generation, +) -> Utf8PathBuf { + let timeline_path = conf.timeline_path(tenant_shard_id, timeline_id); + + timeline_path.join(layer_file_name.file_name()) + + // TOOD: include generation in the name in now+1 releases. + // timeline_path.join(format!( + // "{}{}", + // layer_file_name.file_name(), + // generation.get_suffix() + // )) +} + impl Layer { /// Creates a layer value for a file we know to not be resident. 
pub(crate) fn for_evicted( @@ -131,6 +151,14 @@ impl Layer { file_name: LayerFileName, metadata: LayerFileMetadata, ) -> Self { + let local_path = local_layer_path( + conf, + &timeline.tenant_shard_id, + &timeline.timeline_id, + &file_name, + &metadata.generation, + ); + let desc = PersistentLayerDesc::from_filename( timeline.tenant_shard_id, timeline.timeline_id, @@ -143,6 +171,7 @@ impl Layer { let owner = Layer(Arc::new(LayerInner::new( conf, timeline, + local_path, access_stats, desc, None, @@ -159,6 +188,7 @@ impl Layer { pub(crate) fn for_resident( conf: &'static PageServerConf, timeline: &Arc, + local_path: Utf8PathBuf, file_name: LayerFileName, metadata: LayerFileMetadata, ) -> ResidentLayer { @@ -184,6 +214,7 @@ impl Layer { LayerInner::new( conf, timeline, + local_path, access_stats, desc, Some(inner), @@ -225,9 +256,19 @@ impl Layer { LayerResidenceStatus::Resident, LayerResidenceEventReason::LayerCreate, ); + + let local_path = local_layer_path( + conf, + &timeline.tenant_shard_id, + &timeline.timeline_id, + &desc.filename(), + &timeline.generation, + ); + LayerInner::new( conf, timeline, + local_path, access_stats, desc, Some(inner), @@ -410,6 +451,13 @@ impl Layer { self.0.metadata() } + pub(crate) fn get_timeline_id(&self) -> Option { + self.0 + .timeline + .upgrade() + .map(|timeline| timeline.timeline_id) + } + /// Traditional debug dumping facility #[allow(unused)] pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> anyhow::Result<()> { @@ -709,19 +757,17 @@ impl Drop for LayerInner { } impl LayerInner { + #[allow(clippy::too_many_arguments)] fn new( conf: &'static PageServerConf, timeline: &Arc, + local_path: Utf8PathBuf, access_stats: LayerAccessStats, desc: PersistentLayerDesc, downloaded: Option>, generation: Generation, shard: ShardIndex, ) -> Self { - let path = conf - .timeline_path(&timeline.tenant_shard_id, &timeline.timeline_id) - .join(desc.filename().to_string()); - let (inner, version, init_status) = if let 
Some(inner) = downloaded { let version = inner.version; let resident = ResidentOrWantedEvicted::Resident(inner); @@ -737,7 +783,7 @@ impl LayerInner { LayerInner { conf, debug_str: { format!("timelines/{}/{}", timeline.timeline_id, desc.filename()).into() }, - path, + path: local_path, desc, timeline: Arc::downgrade(timeline), have_remote_client: timeline.remote_client.is_some(), diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 7213ff8f75..d6d012c70c 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -60,6 +60,7 @@ use std::{ ops::ControlFlow, }; +use crate::tenant::storage_layer::layer::local_layer_path; use crate::tenant::{ layer_map::{LayerMap, SearchResult}, metadata::TimelineMetadata, @@ -1904,7 +1905,7 @@ impl Timeline { #[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))] pub(crate) async fn download_layer( &self, - layer_file_name: &str, + layer_file_name: &LayerFileName, ) -> anyhow::Result> { let Some(layer) = self.find_layer(layer_file_name).await else { return Ok(None); @@ -1922,7 +1923,10 @@ impl Timeline { /// Evict just one layer. /// /// Returns `Ok(None)` in the case where the layer could not be found by its `layer_file_name`. 
- pub(crate) async fn evict_layer(&self, layer_file_name: &str) -> anyhow::Result> { + pub(crate) async fn evict_layer( + &self, + layer_file_name: &LayerFileName, + ) -> anyhow::Result> { let _gate = self .gate .enter() @@ -2413,8 +2417,8 @@ impl Timeline { for discovered in discovered { let (name, kind) = match discovered { - Discovered::Layer(file_name, file_size) => { - discovered_layers.push((file_name, file_size)); + Discovered::Layer(layer_file_name, local_path, file_size) => { + discovered_layers.push((layer_file_name, local_path, file_size)); continue; } Discovered::Metadata => { @@ -2459,7 +2463,7 @@ impl Timeline { let mut needs_cleanup = Vec::new(); let mut total_physical_size = 0; - for (name, decision) in decided { + for (name, local_path, decision) in decided { let decision = match decision { Ok(UseRemote { local, remote }) => { // Remote is authoritative, but we may still choose to retain @@ -2469,26 +2473,23 @@ impl Timeline { // the correct generation. UseLocal(remote) } else { - path.push(name.file_name()); - init::cleanup_local_file_for_remote(&path, &local, &remote)?; - path.pop(); + let local_path = local_path.as_ref().expect("Locally found layer must have path"); + init::cleanup_local_file_for_remote(local_path, &local, &remote)?; UseRemote { local, remote } } } Ok(decision) => decision, Err(DismissedLayer::Future { local }) => { if local.is_some() { - path.push(name.file_name()); - init::cleanup_future_layer(&path, &name, disk_consistent_lsn)?; - path.pop(); + let local_path = local_path.expect("Locally found layer must have path"); + init::cleanup_future_layer(&local_path, &name, disk_consistent_lsn)?; } needs_cleanup.push(name); continue; } Err(DismissedLayer::LocalOnly(local)) => { - path.push(name.file_name()); - init::cleanup_local_only_file(&path, &name, &local)?; - path.pop(); + let local_path = local_path.expect("Locally found layer must have path"); + init::cleanup_local_only_file(&local_path, &name, &local)?; // this file never 
existed remotely, we will have to do rework continue; } @@ -2504,7 +2505,18 @@ impl Timeline { let layer = match decision { UseLocal(m) => { total_physical_size += m.file_size(); - Layer::for_resident(conf, &this, name, m).drop_eviction_guard() + + let local_path = local_path.unwrap_or_else(|| { + local_layer_path( + conf, + &this.tenant_shard_id, + &this.timeline_id, + &name, + &m.generation, + ) + }); + + Layer::for_resident(conf, &this, local_path, name, m).drop_eviction_guard() } Evicted(remote) | UseRemote { remote, .. } => { Layer::for_evicted(conf, &this, name, remote) @@ -2985,11 +2997,11 @@ impl Timeline { } } - async fn find_layer(&self, layer_file_name: &str) -> Option { + async fn find_layer(&self, layer_name: &LayerFileName) -> Option { let guard = self.layers.read().await; for historic_layer in guard.layer_map().iter_historic_layers() { - let historic_layer_name = historic_layer.filename().file_name(); - if layer_file_name == historic_layer_name { + let historic_layer_name = historic_layer.filename(); + if layer_name == &historic_layer_name { return Some(guard.get_from_desc(&historic_layer)); } } diff --git a/pageserver/src/tenant/timeline/init.rs b/pageserver/src/tenant/timeline/init.rs index 916ebfc6d9..9c33981807 100644 --- a/pageserver/src/tenant/timeline/init.rs +++ b/pageserver/src/tenant/timeline/init.rs @@ -12,7 +12,7 @@ use crate::{ METADATA_FILE_NAME, }; use anyhow::Context; -use camino::Utf8Path; +use camino::{Utf8Path, Utf8PathBuf}; use pageserver_api::shard::ShardIndex; use std::{collections::HashMap, str::FromStr}; use utils::lsn::Lsn; @@ -20,7 +20,7 @@ use utils::lsn::Lsn; /// Identified files in the timeline directory. 
pub(super) enum Discovered { /// The only one we care about - Layer(LayerFileName, u64), + Layer(LayerFileName, Utf8PathBuf, u64), /// Old ephmeral files from previous launches, should be removed Ephemeral(String), /// Old temporary timeline files, unsure what these really are, should be removed @@ -46,7 +46,7 @@ pub(super) fn scan_timeline_dir(path: &Utf8Path) -> anyhow::Result { let file_size = direntry.metadata()?.len(); - Discovered::Layer(file_name, file_size) + Discovered::Layer(file_name, direntry.path().to_owned(), file_size) } Err(_) => { if file_name == METADATA_FILE_NAME { @@ -104,26 +104,38 @@ pub(super) enum DismissedLayer { /// Merges local discoveries and remote [`IndexPart`] to a collection of decisions. pub(super) fn reconcile( - discovered: Vec<(LayerFileName, u64)>, + discovered: Vec<(LayerFileName, Utf8PathBuf, u64)>, index_part: Option<&IndexPart>, disk_consistent_lsn: Lsn, generation: Generation, shard: ShardIndex, -) -> Vec<(LayerFileName, Result)> { +) -> Vec<( + LayerFileName, + Option, + Result, +)> { use Decision::*; - // name => (local, remote) - type Collected = HashMap, Option)>; + // name => (local_path, local_metadata, remote_metadata) + type Collected = HashMap< + LayerFileName, + ( + Option, + Option, + Option, + ), + >; let mut discovered = discovered .into_iter() - .map(|(name, file_size)| { + .map(|(layer_name, local_path, file_size)| { ( - name, + layer_name, // The generation and shard here will be corrected to match IndexPart in the merge below, unless // it is not in IndexPart, in which case using our current generation makes sense // because it will be uploaded in this generation. 
( + Some(local_path), Some(LayerFileMetadata::new(file_size, generation, shard)), None, ), @@ -140,15 +152,15 @@ pub(super) fn reconcile( .map(|(name, metadata)| (name, LayerFileMetadata::from(metadata))) .for_each(|(name, metadata)| { if let Some(existing) = discovered.get_mut(name) { - existing.1 = Some(metadata); + existing.2 = Some(metadata); } else { - discovered.insert(name.to_owned(), (None, Some(metadata))); + discovered.insert(name.to_owned(), (None, None, Some(metadata))); } }); discovered .into_iter() - .map(|(name, (local, remote))| { + .map(|(name, (local_path, local, remote))| { let decision = if name.is_in_future(disk_consistent_lsn) { Err(DismissedLayer::Future { local }) } else { @@ -165,7 +177,7 @@ pub(super) fn reconcile( } }; - (name, decision) + (name, local_path, decision) }) .collect::>() } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index fc66822eb9..30cec4c726 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -54,7 +54,7 @@ from fixtures.pageserver.allowed_errors import ( DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS, ) from fixtures.pageserver.http import PageserverHttpClient -from fixtures.pageserver.types import IndexPartDump +from fixtures.pageserver.types import IndexPartDump, LayerFileName, parse_layer_file_name from fixtures.pageserver.utils import ( wait_for_last_record_lsn, wait_for_upload, @@ -2652,6 +2652,37 @@ class NeonPageserver(PgProtocol, LogUtils): tenant_id, generation=self.env.storage_controller.attach_hook_issue(tenant_id, self.id) ) + def list_layers(self, tenant_id: TenantId, timeline_id: TimelineId) -> list[Path]: + """ + Inspect local storage on a pageserver to discover which layer files are present. + + :return: list of relative paths to layers, from the timeline root. 
+ """ + timeline_path = self.timeline_dir(tenant_id, timeline_id) + + def relative(p: Path) -> Path: + return p.relative_to(timeline_path) + + return sorted( + list( + map( + relative, + filter( + lambda path: path.name != "metadata" + and "ephemeral" not in path.name + and "temp" not in path.name, + timeline_path.glob("*"), + ), + ) + ) + ) + + def layer_exists( + self, tenant_id: TenantId, timeline_id: TimelineId, layer_name: LayerFileName + ) -> bool: + layers = self.list_layers(tenant_id, timeline_id) + return layer_name in [parse_layer_file_name(p.name) for p in layers] + class PgBin: """A helper class for executing postgres binaries""" diff --git a/test_runner/fixtures/pageserver/types.py b/test_runner/fixtures/pageserver/types.py index 72fa30a2f2..fd018cb778 100644 --- a/test_runner/fixtures/pageserver/types.py +++ b/test_runner/fixtures/pageserver/types.py @@ -1,3 +1,4 @@ +import re from dataclasses import dataclass from typing import Any, Dict, Tuple, Union @@ -47,46 +48,36 @@ class InvalidFileName(Exception): pass +IMAGE_LAYER_FILE_NAME = re.compile("^([A-F0-9]{36})-([A-F0-9]{36})__([A-F0-9]{16})(-[a-f0-9]{8})?$") + + def parse_image_layer(f_name: str) -> Tuple[int, int, int]: """Parse an image layer file name. 
Return key start, key end, and snapshot lsn""" - parts = f_name.split("__") - if len(parts) != 2: - raise InvalidFileName(f"expecting two parts separated by '__', got: {parts}") - key_parts = parts[0].split("-") - if len(key_parts) != 2: - raise InvalidFileName( - f"expecting two key parts separated by '--' in parts[0], got: {key_parts}" - ) - try: - return int(key_parts[0], 16), int(key_parts[1], 16), int(parts[1], 16) - except ValueError as e: - raise InvalidFileName(f"conversion error: {f_name}") from e + + match = IMAGE_LAYER_FILE_NAME.match(f_name) + if match is None: + raise InvalidFileName(f"'{f_name}' is not an image layer filename") + + return int(match.group(1), 16), int(match.group(2), 16), int(match.group(3), 16) + + +DELTA_LAYER_FILE_NAME = re.compile( + "^([A-F0-9]{36})-([A-F0-9]{36})__([A-F0-9]{16})-([A-F0-9]{16})(-[a-f0-9]{8})?$" +) def parse_delta_layer(f_name: str) -> Tuple[int, int, int, int]: """Parse a delta layer file name. Return key start, key end, lsn start, and lsn end""" - parts = f_name.split("__") - if len(parts) != 2: - raise InvalidFileName(f"expecting two parts separated by '__', got: {parts}") - key_parts = parts[0].split("-") - if len(key_parts) != 2: - raise InvalidFileName( - f"expecting two key parts separated by '--' in parts[0], got: {key_parts}" - ) - lsn_parts = parts[1].split("-") - if len(lsn_parts) != 2: - raise InvalidFileName( - f"expecting two lsn parts separated by '--' in parts[1], got: {lsn_parts}" - ) - try: - return ( - int(key_parts[0], 16), - int(key_parts[1], 16), - int(lsn_parts[0], 16), - int(lsn_parts[1], 16), - ) - except ValueError as e: - raise InvalidFileName(f"conversion error: {f_name}") from e + match = DELTA_LAYER_FILE_NAME.match(f_name) + if match is None: + raise InvalidFileName(f"'{f_name}' is not an delta layer filename") + + return ( + int(match.group(1), 16), + int(match.group(2), 16), + int(match.group(3), 16), + int(match.group(4), 16), + ) def parse_layer_file_name(file_name: str) -> 
LayerFileName: diff --git a/test_runner/regress/test_duplicate_layers.py b/test_runner/regress/test_duplicate_layers.py index cb4fa43be7..7471338ce5 100644 --- a/test_runner/regress/test_duplicate_layers.py +++ b/test_runner/regress/test_duplicate_layers.py @@ -2,6 +2,7 @@ import time import pytest from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, wait_for_last_flush_lsn +from fixtures.pageserver.types import parse_layer_file_name from fixtures.pageserver.utils import ( wait_for_last_record_lsn, wait_for_upload_queue_empty, @@ -86,14 +87,7 @@ def test_actually_duplicated_l1(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) # path = env.remote_storage.timeline_path(tenant_id, timeline_id) l1_found = None - for path in env.pageserver.timeline_dir(tenant_id, timeline_id).iterdir(): - if path.name == "metadata" or path.name.startswith("ephemeral-"): - continue - - if len(path.suffixes) > 0: - # temp files - continue - + for path in env.pageserver.list_layers(tenant_id, timeline_id): [key_range, lsn_range] = path.name.split("__", maxsplit=1) if "-" not in lsn_range: @@ -108,19 +102,21 @@ def test_actually_duplicated_l1(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) if l1_found is not None: raise RuntimeError(f"found multiple L1: {l1_found.name} and {path.name}") - l1_found = path + l1_found = parse_layer_file_name(path.name) assert l1_found is not None, "failed to find L1 locally" uploaded = env.pageserver_remote_storage.remote_layer_path( - tenant_id, timeline_id, l1_found.name + tenant_id, timeline_id, l1_found.to_str() ) assert not uploaded.exists(), "to-be-overwritten should not yet be uploaded" env.pageserver.start() wait_until_tenant_active(pageserver_http, tenant_id) - assert not l1_found.exists(), "partial compaction result should had been removed during startup" + assert not env.pageserver.layer_exists( + tenant_id, timeline_id, l1_found + ), "partial compaction result should had been removed during startup" # wait for us to catch up again 
wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, lsn) @@ -130,18 +126,18 @@ def test_actually_duplicated_l1(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) # give time for log flush time.sleep(1) - message = f".*duplicated L1 layer layer={l1_found.name}" + message = f".*duplicated L1 layer layer={l1_found}" found_msg = env.pageserver.log_contains(message) # resident or evicted, it should not be overwritten, however it should had been non-existing at startup assert ( found_msg is None ), "layer should had been removed during startup, did it live on as evicted?" - assert l1_found.exists(), "the L1 reappears" + assert env.pageserver.layer_exists(tenant_id, timeline_id, l1_found), "the L1 reappears" wait_for_upload_queue_empty(pageserver_http, tenant_id, timeline_id) uploaded = env.pageserver_remote_storage.remote_layer_path( - tenant_id, timeline_id, l1_found.name + tenant_id, timeline_id, l1_found.to_str() ) assert uploaded.exists(), "the L1 is uploaded" diff --git a/test_runner/regress/test_layer_eviction.py b/test_runner/regress/test_layer_eviction.py index fefb30bbdd..5c967fd72e 100644 --- a/test_runner/regress/test_layer_eviction.py +++ b/test_runner/regress/test_layer_eviction.py @@ -7,6 +7,7 @@ from fixtures.neon_fixtures import ( flush_ep_to_pageserver, wait_for_last_flush_lsn, ) +from fixtures.pageserver.types import parse_layer_file_name from fixtures.pageserver.utils import wait_for_upload from fixtures.remote_storage import RemoteStorageKind @@ -57,9 +58,9 @@ def test_basic_eviction( for sk in env.safekeepers: sk.stop() - timeline_path = env.pageserver.timeline_dir(tenant_id, timeline_id) - initial_local_layers = sorted( - list(filter(lambda path: path.name != "metadata", timeline_path.glob("*"))) + initial_local_layers = dict( + (parse_layer_file_name(path.name), path) + for path in env.pageserver.list_layers(tenant_id, timeline_id) ) assert ( len(initial_local_layers) > 1 @@ -73,6 +74,7 @@ def test_basic_eviction( assert 
len(initial_local_layers) == len( initial_layer_map_info.historic_layers ), "Should have the same layers in memory and on disk" + for returned_layer in initial_layer_map_info.historic_layers: assert ( returned_layer.kind == "Delta" @@ -81,27 +83,29 @@ def test_basic_eviction( not returned_layer.remote ), f"All created layers should be present locally, but got {returned_layer}" - local_layers = list( - filter(lambda layer: layer.name == returned_layer.layer_file_name, initial_local_layers) + returned_layer_name = parse_layer_file_name(returned_layer.layer_file_name) + assert ( + returned_layer_name in initial_local_layers + ), f"Did not find returned layer {returned_layer_name} in local layers {list(initial_local_layers.keys())}" + + local_layer_path = ( + env.pageserver.timeline_dir(tenant_id, timeline_id) + / initial_local_layers[returned_layer_name] ) assert ( - len(local_layers) == 1 - ), f"Did not find returned layer {returned_layer} in local layers {initial_local_layers}" - local_layer = local_layers[0] - assert ( - returned_layer.layer_file_size == local_layer.stat().st_size - ), f"Returned layer {returned_layer} has a different file size than local layer {local_layer}" + returned_layer.layer_file_size == local_layer_path.stat().st_size + ), f"Returned layer {returned_layer} has a different file size than local layer {local_layer_path}" # Detach all layers, ensre they are not in the local FS, but are still dumped as part of the layer map - for local_layer in initial_local_layers: + for local_layer_name, local_layer_path in initial_local_layers.items(): client.evict_layer( - tenant_id=tenant_id, timeline_id=timeline_id, layer_name=local_layer.name + tenant_id=tenant_id, timeline_id=timeline_id, layer_name=local_layer_path.name ) - assert not any( - new_local_layer.name == local_layer.name for new_local_layer in timeline_path.glob("*") - ), f"Did not expect to find {local_layer} layer after evicting" + assert not env.pageserver.layer_exists( + tenant_id, 
timeline_id, local_layer_name + ), f"Did not expect to find {local_layer_name} layer after evicting" - empty_layers = list(filter(lambda path: path.name != "metadata", timeline_path.glob("*"))) + empty_layers = env.pageserver.list_layers(tenant_id, timeline_id) assert not empty_layers, f"After evicting all layers, timeline {tenant_id}/{timeline_id} should have no layers locally, but got: {empty_layers}" evicted_layer_map_info = client.layer_map_info(tenant_id=tenant_id, timeline_id=timeline_id) @@ -118,15 +122,15 @@ def test_basic_eviction( assert ( returned_layer.remote ), f"All layers should be evicted and not present locally, but got {returned_layer}" - assert any( - local_layer.name == returned_layer.layer_file_name - for local_layer in initial_local_layers + returned_layer_name = parse_layer_file_name(returned_layer.layer_file_name) + assert ( + returned_layer_name in initial_local_layers ), f"Did not find returned layer {returned_layer} in local layers {initial_local_layers}" # redownload all evicted layers and ensure the initial state is restored - for local_layer in initial_local_layers: + for local_layer_name, _local_layer_path in initial_local_layers.items(): client.download_layer( - tenant_id=tenant_id, timeline_id=timeline_id, layer_name=local_layer.name + tenant_id=tenant_id, timeline_id=timeline_id, layer_name=local_layer_name.to_str() ) client.timeline_download_remote_layers( tenant_id, @@ -137,8 +141,9 @@ def test_basic_eviction( at_least_one_download=False, ) - redownloaded_layers = sorted( - list(filter(lambda path: path.name != "metadata", timeline_path.glob("*"))) + redownloaded_layers = dict( + (parse_layer_file_name(path.name), path) + for path in env.pageserver.list_layers(tenant_id, timeline_id) ) assert ( redownloaded_layers == initial_local_layers diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index f957bea156..adcf7de8d4 100644 --- 
a/test_runner/regress/test_pageserver_generations.py
+++ b/test_runner/regress/test_pageserver_generations.py
@@ -10,6 +10,7 @@ of the pageserver are:
 """
 
 import enum
+import os
 import re
 import time
 from typing import Optional
@@ -700,3 +701,50 @@ def test_multi_attach(
 
     # All data we wrote while multi-attached remains readable
     workload.validate(pageservers[2].id)
+
+
+@pytest.mark.skip(reason="To be enabled after release with new local path style")
+def test_upgrade_generationless_local_file_paths(
+    neon_env_builder: NeonEnvBuilder,
+):
+    """
+    Test pageserver behavior when starting up with local layer paths without
+    generation numbers: it should accept these layer files, and avoid doing
+    a delete/download cycle on them.
+    """
+    env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    workload = Workload(env, tenant_id, timeline_id)
+    workload.init()
+    workload.write_rows(1000)
+
+    env.pageserver.stop()
+
+    # Rename the local paths to legacy format, to simulate what
+    # we would see when upgrading
+    timeline_dir = env.pageserver.timeline_dir(tenant_id, timeline_id)
+    files_renamed = 0
+    for filename in os.listdir(timeline_dir):
+        path = os.path.join(timeline_dir, filename)
+        log.info(f"Found file {path}")
+        if path.endswith("-00000001"):
+            new_path = path[:-9]
+            os.rename(path, new_path)
+            log.info(f"Renamed {path} -> {new_path}")
+            files_renamed += 1
+
+    assert files_renamed > 0
+
+    env.pageserver.start()
+
+    workload.validate()
+
+    # Assert that there were no on-demand downloads
+    assert (
+        env.pageserver.http_client().get_metric_value(
+            "pageserver_remote_ondemand_downloaded_layers_total"
+        )
+        == 0
+    )
diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py
index 8f194e5dda..c40bb962f2 100644
--- a/test_runner/regress/test_pageserver_secondary.py
+++ b/test_runner/regress/test_pageserver_secondary.py
@@ -2,12 +2,12 @@
import json import os import random import time -from pathlib import Path from typing import Any, Dict, Optional import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver, S3Scrubber +from fixtures.pageserver.types import parse_layer_file_name from fixtures.pageserver.utils import ( assert_prefix_empty, poll_for_remote_storage_iterations, @@ -51,9 +51,13 @@ def evict_random_layers( if "ephemeral" in layer.name or "temp_download" in layer.name: continue + layer_name = parse_layer_file_name(layer.name) + if rng.choice([True, False]): - log.info(f"Evicting layer {tenant_id}/{timeline_id} {layer.name}") - client.evict_layer(tenant_id=tenant_id, timeline_id=timeline_id, layer_name=layer.name) + log.info(f"Evicting layer {tenant_id}/{timeline_id} {layer_name.to_str()}") + client.evict_layer( + tenant_id=tenant_id, timeline_id=timeline_id, layer_name=layer_name.to_str() + ) @pytest.mark.parametrize("seed", [1, 2, 3]) @@ -402,32 +406,6 @@ def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder): validate_heatmap(heatmap_second) -def list_layers(pageserver, tenant_id: TenantId, timeline_id: TimelineId) -> list[Path]: - """ - Inspect local storage on a pageserver to discover which layer files are present. - - :return: list of relative paths to layers, from the timeline root. 
- """ - timeline_path = pageserver.timeline_dir(tenant_id, timeline_id) - - def relative(p: Path) -> Path: - return p.relative_to(timeline_path) - - return sorted( - list( - map( - relative, - filter( - lambda path: path.name != "metadata" - and "ephemeral" not in path.name - and "temp" not in path.name, - timeline_path.glob("*"), - ), - ) - ) - ) - - def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): """ Test the overall data flow in secondary mode: @@ -482,8 +460,8 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): ps_secondary.http_client().tenant_secondary_download(tenant_id) - assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers( - ps_secondary, tenant_id, timeline_id + assert ps_attached.list_layers(tenant_id, timeline_id) == ps_secondary.list_layers( + tenant_id, timeline_id ) # Make changes on attached pageserver, check secondary downloads them @@ -500,8 +478,8 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): ps_secondary.http_client().tenant_secondary_download(tenant_id) try: - assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers( - ps_secondary, tenant_id, timeline_id + assert ps_attached.list_layers(tenant_id, timeline_id) == ps_secondary.list_layers( + tenant_id, timeline_id ) except: # Do a full listing of the secondary location on errors, to help debug of @@ -523,8 +501,8 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): # ================================================================== try: log.info("Evicting a layer...") - layer_to_evict = list_layers(ps_attached, tenant_id, timeline_id)[0] - some_other_layer = list_layers(ps_attached, tenant_id, timeline_id)[1] + layer_to_evict = ps_attached.list_layers(tenant_id, timeline_id)[0] + some_other_layer = ps_attached.list_layers(tenant_id, timeline_id)[1] log.info(f"Victim layer: {layer_to_evict.name}") ps_attached.http_client().evict_layer( tenant_id, timeline_id, layer_name=layer_to_evict.name @@ 
-537,13 +515,13 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): layer["name"] for layer in heatmap_after_eviction["timelines"][0]["layers"] ) assert layer_to_evict.name not in heatmap_layers - assert some_other_layer.name in heatmap_layers + assert parse_layer_file_name(some_other_layer.name).to_str() in heatmap_layers ps_secondary.http_client().tenant_secondary_download(tenant_id) - assert layer_to_evict not in list_layers(ps_attached, tenant_id, timeline_id) - assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers( - ps_secondary, tenant_id, timeline_id + assert layer_to_evict not in ps_attached.list_layers(tenant_id, timeline_id) + assert ps_attached.list_layers(tenant_id, timeline_id) == ps_secondary.list_layers( + tenant_id, timeline_id ) except: # On assertion failures, log some details to help with debugging @@ -630,7 +608,7 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder): for timeline_id in timelines: log.info(f"Checking for secondary timeline {timeline_id} on node {ps_secondary.id}") # One or more layers should be present for all timelines - assert list_layers(ps_secondary, tenant_id, timeline_id) + assert ps_secondary.list_layers(tenant_id, timeline_id) # Delete the second timeline: this should be reflected later on the secondary env.storage_controller.pageserver_api().timeline_delete(tenant_id, timelines[1]) @@ -645,10 +623,10 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder): ps_secondary = next(p for p in env.pageservers if p != ps_attached) # This one was not deleted - assert list_layers(ps_secondary, tenant_id, timelines[0]) + assert ps_secondary.list_layers(tenant_id, timelines[0]) # This one was deleted - assert not list_layers(ps_secondary, tenant_id, timelines[1]) + assert not ps_secondary.list_layers(tenant_id, timelines[1]) t_end = time.time() @@ -708,7 +686,7 @@ def test_slow_secondary_downloads(neon_env_builder: NeonEnvBuilder, via_controll 
ps_attached.http_client().timeline_checkpoint(tenant_id, timeline_id) # Expect lots of layers - assert len(list_layers(ps_attached, tenant_id, timeline_id)) > 10 + assert len(ps_attached.list_layers(tenant_id, timeline_id)) > 10 # Simulate large data by making layer downloads artifically slow for ps in env.pageservers: diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index ad4b4a42f1..70c025c225 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -12,6 +12,7 @@ from fixtures.neon_fixtures import ( wait_for_last_flush_lsn, ) from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient +from fixtures.pageserver.types import parse_layer_file_name from fixtures.pageserver.utils import ( timeline_delete_wait_completed, wait_for_last_record_lsn, @@ -829,8 +830,9 @@ def test_compaction_waits_for_upload( assert len(upload_stuck_layers) > 0 for name in upload_stuck_layers: - path = env.pageserver.timeline_dir(tenant_id, timeline_id) / name - assert path.exists(), "while uploads are stuck the layers should be present on disk" + assert env.pageserver.layer_exists( + tenant_id, timeline_id, parse_layer_file_name(name) + ), "while uploads are stuck the layers should be present on disk" # now this will do the L0 => L1 compaction and want to remove # upload_stuck_layers and the original initdb L0 @@ -838,8 +840,9 @@ def test_compaction_waits_for_upload( # as uploads are paused, the upload_stuck_layers should still be with us for name in upload_stuck_layers: - path = env.pageserver.timeline_dir(tenant_id, timeline_id) / name - assert path.exists(), "uploads are stuck still over compaction" + assert env.pageserver.layer_exists( + tenant_id, timeline_id, parse_layer_file_name(name) + ), "uploads are stuck still over compaction" compacted_layers = client.layer_map_info(tenant_id, timeline_id).historic_by_name() overlap = 
compacted_layers.intersection(upload_stuck_layers) @@ -873,9 +876,8 @@ def test_compaction_waits_for_upload( wait_until(10, 1, until_layer_deletes_completed) for name in upload_stuck_layers: - path = env.pageserver.timeline_dir(tenant_id, timeline_id) / name - assert ( - not path.exists() + assert not env.pageserver.layer_exists( + tenant_id, timeline_id, parse_layer_file_name(name) ), "l0 should now be removed because of L0 => L1 compaction and completed uploads" # We should not have hit the error handling path in uploads where a uploaded file is gone diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index d16978d02a..a1e96928bf 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -18,6 +18,7 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, last_flush_lsn_upload, ) +from fixtures.pageserver.types import parse_layer_file_name from fixtures.pageserver.utils import ( assert_tenant_state, wait_for_last_record_lsn, @@ -246,7 +247,10 @@ def test_tenant_redownloads_truncated_file_on_startup( # ensure the same size is found from the index_part.json index_part = env.pageserver_remote_storage.index_content(tenant_id, timeline_id) - assert index_part["layer_metadata"][path.name]["file_size"] == expected_size + assert ( + index_part["layer_metadata"][parse_layer_file_name(path.name).to_str()]["file_size"] + == expected_size + ) ## Start the pageserver. It will notice that the file size doesn't match, and ## rename away the local file. It will be re-downloaded when it's needed. 
@@ -276,7 +280,7 @@ def test_tenant_redownloads_truncated_file_on_startup( # the remote side of local_layer_truncated remote_layer_path = env.pageserver_remote_storage.remote_layer_path( - tenant_id, timeline_id, path.name + tenant_id, timeline_id, parse_layer_file_name(path.name).to_str() ) # if the upload ever was ongoing, this check would be racy, but at least one diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index bc983c36ee..5abb3e28e4 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -63,7 +63,7 @@ def test_ancestor_detach_branched_from( env.pageserver.allowed_errors.extend( [ - ".*initial size calculation failed: downloading failed, possibly for shutdown" + ".*initial size calculation failed: downloading failed, possibly for shutdown", ".*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", ] )