diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs
index ebeb8bbb20..7f25e49570 100644
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -535,17 +535,11 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl(
             }
             EvictionLayer::Secondary(layer) => {
                 let file_size = layer.metadata.file_size();
-                let tenant_manager = tenant_manager.clone();
 
                 js.spawn(async move {
                     layer
                         .secondary_tenant
-                        .evict_layer(
-                            tenant_manager.get_conf(),
-                            layer.timeline_id,
-                            layer.name,
-                            layer.metadata,
-                        )
+                        .evict_layer(layer.timeline_id, layer.name)
                         .await;
                     Ok(file_size)
                 });
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index c5462dac43..07d6af696c 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -518,6 +518,7 @@ impl RemoteTimelineClient {
         &self,
         layer_file_name: &LayerName,
         layer_metadata: &LayerFileMetadata,
+        local_path: &Utf8Path,
         cancel: &CancellationToken,
         ctx: &RequestContext,
     ) -> anyhow::Result<u64> {
@@ -536,6 +537,7 @@ impl RemoteTimelineClient {
                 self.timeline_id,
                 layer_file_name,
                 layer_metadata,
+                local_path,
                 cancel,
                 ctx,
             )
diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs
index f3c9e64533..70c5cae05e 100644
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -21,7 +21,6 @@ use crate::config::PageServerConf;
 use crate::context::RequestContext;
 use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path};
-use crate::tenant::storage_layer::layer::local_layer_path;
 use crate::tenant::storage_layer::LayerName;
 use crate::tenant::Generation;
 use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile};
@@ -50,19 +49,13 @@ pub async fn download_layer_file<'a>(
     timeline_id: TimelineId,
     layer_file_name: &'a LayerName,
     layer_metadata: &'a LayerFileMetadata,
+    local_path: &Utf8Path,
     cancel: &CancellationToken,
     ctx: &RequestContext,
 ) -> Result<u64, DownloadError> {
     debug_assert_current_span_has_tenant_and_timeline_id();
 
     let timeline_path = conf.timeline_path(&tenant_shard_id, &timeline_id);
-    let local_path = local_layer_path(
-        conf,
-        &tenant_shard_id,
-        &timeline_id,
-        layer_file_name,
-        &layer_metadata.generation,
-    );
 
     let remote_path = remote_layer_path(
         &tenant_shard_id.tenant_id,
@@ -82,7 +75,7 @@ pub async fn download_layer_file<'a>(
     // For more context about durable_rename check this email from postgres mailing list:
     // https://www.postgresql.org/message-id/56583BDD.9060302@2ndquadrant.com
     // If pageserver crashes the temp file will be deleted on startup and re-downloaded.
-    let temp_file_path = path_with_suffix_extension(&local_path, TEMP_DOWNLOAD_EXTENSION);
+    let temp_file_path = path_with_suffix_extension(local_path, TEMP_DOWNLOAD_EXTENSION);
 
     let bytes_amount = download_retry(
         || async { download_object(storage, &remote_path, &temp_file_path, cancel, ctx).await },
diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs
index 7075044baf..252b6eb11b 100644
--- a/pageserver/src/tenant/secondary.rs
+++ b/pageserver/src/tenant/secondary.rs
@@ -6,11 +6,9 @@ mod scheduler;
 use std::{sync::Arc, time::SystemTime};
 
 use crate::{
-    config::PageServerConf,
     context::RequestContext,
     disk_usage_eviction_task::DiskUsageEvictionInfo,
     task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
-    virtual_file::MaybeFatalIo,
 };
 
 use self::{
@@ -21,9 +19,8 @@ use self::{
 use super::{
     config::{SecondaryLocationConfig, TenantConfOpt},
     mgr::TenantManager,
-    remote_timeline_client::LayerFileMetadata,
     span::debug_assert_current_span_has_tenant_id,
-    storage_layer::{layer::local_layer_path, LayerName},
+    storage_layer::LayerName,
 };
 
 use pageserver_api::{
@@ -178,13 +175,7 @@ impl SecondaryTenant {
 
     /// Cancellation safe, but on cancellation the eviction will go through
     #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline_id, name=%name))]
-    pub(crate) async fn evict_layer(
-        self: &Arc<Self>,
-        conf: &PageServerConf,
-        timeline_id: TimelineId,
-        name: LayerName,
-        metadata: LayerFileMetadata,
-    ) {
+    pub(crate) async fn evict_layer(self: &Arc<Self>, timeline_id: TimelineId, name: LayerName) {
         debug_assert_current_span_has_tenant_id();
 
         let guard = match self.gate.enter() {
@@ -197,41 +188,11 @@ impl SecondaryTenant {
 
         let now = SystemTime::now();
 
-        let local_path = local_layer_path(
-            conf,
-            &self.tenant_shard_id,
-            &timeline_id,
-            &name,
-            &metadata.generation,
-        );
-
         let this = self.clone();
 
         // spawn it to be cancellation safe
         tokio::task::spawn_blocking(move || {
             let _guard = guard;
 
-            // We tolerate ENOENT, because between planning eviction and executing
-            // it, the secondary downloader could have seen an updated heatmap that
-            // resulted in a layer being deleted.
-            // Other local I/O errors are process-fatal: these should never happen.
-            let deleted = std::fs::remove_file(local_path);
-
-            let not_found = deleted
-                .as_ref()
-                .is_err_and(|x| x.kind() == std::io::ErrorKind::NotFound);
-
-            let deleted = if not_found {
-                false
-            } else {
-                deleted
-                    .map(|()| true)
-                    .fatal_err("Deleting layer during eviction")
-            };
-
-            if !deleted {
-                // skip updating accounting and putting perhaps later timestamp
-                return;
-            }
             // Update the timeline's state. This does not have to be synchronized with
             // the download process, because:
@@ -250,8 +211,15 @@ impl SecondaryTenant {
             // of the cache.
             let mut detail = this.detail.lock().unwrap();
             if let Some(timeline_detail) = detail.timelines.get_mut(&timeline_id) {
-                timeline_detail.on_disk_layers.remove(&name);
-                timeline_detail.evicted_at.insert(name, now);
+                let removed = timeline_detail.on_disk_layers.remove(&name);
+
+                // We might race with removal of the same layer during downloads, if it was removed
+                // from the heatmap. If we see that the OnDiskState is gone, then no need to
+                // do a physical deletion or store in evicted_at.
+                if let Some(removed) = removed {
+                    removed.remove_blocking();
+                    timeline_detail.evicted_at.insert(name, now);
+                }
             }
         })
         .await
diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs
index 46a3d7e81f..8f27220771 100644
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -111,6 +111,7 @@ struct SecondaryDownloader {
 pub(super) struct OnDiskState {
     metadata: LayerFileMetadata,
     access_time: SystemTime,
+    local_path: Utf8PathBuf,
 }
 
 impl OnDiskState {
@@ -121,12 +122,26 @@ impl OnDiskState {
         _ame: LayerName,
         metadata: LayerFileMetadata,
         access_time: SystemTime,
+        local_path: Utf8PathBuf,
     ) -> Self {
         Self {
             metadata,
             access_time,
+            local_path,
         }
     }
+
+    // This is infallible, because all errors are either acceptable (ENOENT), or totally
+    // unexpected (fatal).
+    pub(super) fn remove_blocking(&self) {
+        // We tolerate ENOENT, because between planning eviction and executing
+        // it, the secondary downloader could have seen an updated heatmap that
+        // resulted in a layer being deleted.
+        // Other local I/O errors are process-fatal: these should never happen.
+        std::fs::remove_file(&self.local_path)
+            .or_else(fs_ext::ignore_not_found)
+            .fatal_err("Deleting secondary layer")
+    }
 }
 
 #[derive(Debug, Clone, Default)]
@@ -816,20 +831,12 @@ impl<'a> TenantDownloader<'a> {
         if cfg!(debug_assertions) {
             // Debug for https://github.com/neondatabase/neon/issues/6966: check that the files we think
             // are already present on disk are really there.
-            let local_path = local_layer_path(
-                self.conf,
-                tenant_shard_id,
-                &timeline.timeline_id,
-                &layer.name,
-                &layer.metadata.generation,
-            );
-
-            match tokio::fs::metadata(&local_path).await {
+            match tokio::fs::metadata(&on_disk.local_path).await {
                 Ok(meta) => {
                     tracing::debug!(
                         "Layer {} present at {}, size {}",
                         layer.name,
-                        local_path,
+                        on_disk.local_path,
                         meta.len(),
                     );
                 }
@@ -837,7 +844,7 @@ impl<'a> TenantDownloader<'a> {
                     tracing::warn!(
                         "Layer {} not found at {} ({})",
                         layer.name,
-                        local_path,
+                        on_disk.local_path,
                         e
                     );
                     debug_assert!(false);
@@ -926,6 +933,13 @@ impl<'a> TenantDownloader<'a> {
                     v.get_mut().access_time = t.access_time;
                 }
                 Entry::Vacant(e) => {
+                    let local_path = local_layer_path(
+                        self.conf,
+                        tenant_shard_id,
+                        &timeline.timeline_id,
+                        &t.name,
+                        &t.metadata.generation,
+                    );
                     e.insert(OnDiskState::new(
                         self.conf,
                         tenant_shard_id,
@@ -933,6 +947,7 @@ impl<'a> TenantDownloader<'a> {
                         t.name,
                         LayerFileMetadata::from(&t.metadata),
                         t.access_time,
+                        local_path,
                     ));
                 }
             }
@@ -955,6 +970,14 @@ impl<'a> TenantDownloader<'a> {
             &self.secondary_state.cancel
         );
 
+        let local_path = local_layer_path(
+            self.conf,
+            tenant_shard_id,
+            timeline_id,
+            &layer.name,
+            &layer.metadata.generation,
+        );
+
         // Note: no backoff::retry wrapper here because download_layer_file does its own retries internally
         let downloaded_bytes = match download_layer_file(
             self.conf,
@@ -963,6 +986,7 @@ impl<'a> TenantDownloader<'a> {
             *timeline_id,
             &layer.name,
             &LayerFileMetadata::from(&layer.metadata),
+            &local_path,
             &self.secondary_state.cancel,
             ctx,
         )
@@ -1116,6 +1140,7 @@ async fn init_timeline_state(
                         name,
                         LayerFileMetadata::from(&remote_meta.metadata),
                         remote_meta.access_time,
+                        file_path,
                     ),
                 );
             }
diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index e8c712c4c6..97b349f635 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -1108,6 +1108,7 @@ impl LayerInner {
             .download_layer_file(
                 &self.desc.layer_name(),
                 &self.metadata(),
+                &self.path,
                 &timeline.cancel,
                 ctx,
             )
diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py
index 9b97254410..0235cf6d20 100644
--- a/test_runner/regress/test_pageserver_generations.py
+++ b/test_runner/regress/test_pageserver_generations.py
@@ -25,6 +25,7 @@ from fixtures.neon_fixtures import (
     S3Scrubber,
     generate_uploads_and_deletions,
 )
+from fixtures.pageserver.common_types import parse_layer_file_name
 from fixtures.pageserver.http import PageserverApiException
 from fixtures.pageserver.utils import (
     assert_tenant_state,
@@ -632,39 +633,86 @@ def test_upgrade_generationless_local_file_paths(
     generation numbers: it should accept these layer files, and avoid doing
     a delete/download cycle on them.
     """
-    env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
-    tenant_id = env.initial_tenant
-    timeline_id = env.initial_timeline
+    neon_env_builder.num_pageservers = 2
+    env = neon_env_builder.init_configs()
+    env.start()
+
+    tenant_id = TenantId.generate()
+    timeline_id = TimelineId.generate()
+    env.neon_cli.create_tenant(
+        tenant_id, timeline_id, conf=TENANT_CONF, placement_policy='{"Attached":1}'
+    )
 
     workload = Workload(env, tenant_id, timeline_id)
     workload.init()
     workload.write_rows(1000)
 
-    env.pageserver.stop()
+    attached_pageserver = env.get_tenant_pageserver(tenant_id)
+    secondary_pageserver = list([ps for ps in env.pageservers if ps.id != attached_pageserver.id])[
+        0
+    ]
+
+    attached_pageserver.http_client().tenant_heatmap_upload(tenant_id)
+    secondary_pageserver.http_client().tenant_secondary_download(tenant_id)
 
     # Rename the local paths to legacy format, to simulate what
-    # we would see when upgrading
-    timeline_dir = env.pageserver.timeline_dir(tenant_id, timeline_id)
-    files_renamed = 0
-    for filename in os.listdir(timeline_dir):
-        path = os.path.join(timeline_dir, filename)
-        log.info(f"Found file {path}")
-        if path.endswith("-v1-00000001"):
-            new_path = path[:-12]
-            os.rename(path, new_path)
-            log.info(f"Renamed {path} -> {new_path}")
-            files_renamed += 1
+    # we would see when upgrading. Do this on both attached and secondary locations, as we will
+    # test the behavior of both.
+    for pageserver in env.pageservers:
+        pageserver.stop()
+        timeline_dir = pageserver.timeline_dir(tenant_id, timeline_id)
+        files_renamed = 0
+        for filename in os.listdir(timeline_dir):
+            path = os.path.join(timeline_dir, filename)
+            log.info(f"Found file {path}")
+            if path.endswith("-v1-00000001"):
+                new_path = path[:-12]
+                os.rename(path, new_path)
+                log.info(f"Renamed {path} -> {new_path}")
+                files_renamed += 1
 
-    assert files_renamed > 0
+        assert files_renamed > 0
 
-    env.pageserver.start()
+        pageserver.start()
 
     workload.validate()
 
     # Assert that there were no on-demand downloads
     assert (
-        env.pageserver.http_client().get_metric_value(
+        attached_pageserver.http_client().get_metric_value(
             "pageserver_remote_ondemand_downloaded_layers_total"
         )
         == 0
     )
+
+    # Do a secondary download and ensure there were no layer downloads
+    secondary_pageserver.http_client().tenant_secondary_download(tenant_id)
+    assert (
+        secondary_pageserver.http_client().get_metric_value(
+            "pageserver_secondary_download_layer_total"
+        )
+        == 0
+    )
+
+    # Check that when we evict and promote one of the legacy-named layers, everything works as
+    # expected
+    local_layers = list(
+        (
+            parse_layer_file_name(path.name),
+            os.path.join(attached_pageserver.timeline_dir(tenant_id, timeline_id), path),
+        )
+        for path in attached_pageserver.list_layers(tenant_id, timeline_id)
+    )
+    (victim_layer_name, victim_path) = local_layers[0]
+    assert os.path.exists(victim_path)
+
+    attached_pageserver.http_client().evict_layer(
+        tenant_id, timeline_id, victim_layer_name.to_str()
+    )
+    assert not os.path.exists(victim_path)
+
+    attached_pageserver.http_client().download_layer(
+        tenant_id, timeline_id, victim_layer_name.to_str()
+    )
+    # We should download into the same local path we started with
+    assert os.path.exists(victim_path)
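
Reviewer note: the core of the secondary-eviction change above is an ordering guarantee. Removing the entry from `on_disk_layers` is the decision point, and only the caller that actually held the `OnDiskState` unlinks the file and records `evicted_at`, so eviction cannot race with a heatmap-driven removal into a double delete or a stale timestamp. Below is a minimal standalone Rust sketch of that pattern, not the pageserver code: a plain `Mutex<HashMap>` stands in for `SecondaryDetail`, and `panic!` stands in for `fatal_err`.

// Standalone sketch (illustrative only): eviction vs. concurrent removal.
use std::collections::HashMap;
use std::io::ErrorKind;
use std::path::PathBuf;
use std::sync::Mutex;
use std::time::SystemTime;

struct OnDiskState {
    local_path: PathBuf,
}

impl OnDiskState {
    // Tolerate ENOENT: between planning an eviction and executing it, the
    // downloader may already have deleted the file after a heatmap update.
    // Any other I/O error is unexpected; the real code treats it as fatal.
    fn remove_blocking(&self) {
        match std::fs::remove_file(&self.local_path) {
            Ok(()) => {}
            Err(e) if e.kind() == ErrorKind::NotFound => {}
            Err(e) => panic!("deleting secondary layer: {e}"),
        }
    }
}

#[derive(Default)]
struct TimelineDetail {
    on_disk_layers: HashMap<String, OnDiskState>,
    evicted_at: HashMap<String, SystemTime>,
}

fn evict_layer(detail: &Mutex<TimelineDetail>, name: &str) {
    let mut detail = detail.lock().unwrap();
    // Removing the map entry decides the race: only the caller that held
    // the OnDiskState unlinks the file and records the eviction time. If a
    // concurrent removal won, do neither.
    if let Some(removed) = detail.on_disk_layers.remove(name) {
        removed.remove_blocking();
        detail
            .evicted_at
            .insert(name.to_string(), SystemTime::now());
    }
}

fn main() {
    let detail = Mutex::new(TimelineDetail::default());
    // Evicting a layer whose entry is already gone is a no-op: no unlink,
    // no evicted_at timestamp.
    evict_layer(&detail, "000000000000-aaaa");
    assert!(detail.lock().unwrap().evicted_at.is_empty());
}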