diff --git a/pageserver/src/tenant/secondary/heatmap_uploader.rs b/pageserver/src/tenant/secondary/heatmap_uploader.rs index c5e5e04945..d72c337369 100644 --- a/pageserver/src/tenant/secondary/heatmap_uploader.rs +++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs @@ -9,13 +9,14 @@ use crate::{ metrics::SECONDARY_MODE, tenant::{ config::AttachmentMode, - mgr::GetTenantError, - mgr::TenantManager, + mgr::{GetTenantError, TenantManager}, remote_timeline_client::remote_heatmap_path, span::debug_assert_current_span_has_tenant_id, tasks::{warn_when_period_overrun, BackgroundLoopKind}, Tenant, }, + virtual_file::VirtualFile, + TEMP_FILE_SUFFIX, }; use futures::Future; @@ -32,7 +33,10 @@ use super::{ }; use tokio_util::sync::CancellationToken; use tracing::{info_span, instrument, Instrument}; -use utils::{backoff, completion::Barrier, yielding_loop::yielding_loop}; +use utils::{ + backoff, completion::Barrier, crashsafe::path_with_suffix_extension, + yielding_loop::yielding_loop, +}; pub(super) async fn heatmap_uploader_task( tenant_manager: Arc, @@ -461,6 +465,18 @@ async fn upload_tenant_heatmap( } } + // After a successful upload persist the fresh heatmap to disk. + // When restarting, the tenant will read the heatmap from disk + // and additively generate a new heatmap (see [`Timeline::generate_heatmap`]). + // If the heatmap is stale, the additive generation can lead to keeping previously + // evicted timelines on the secondarie's disk. + let tenant_shard_id = tenant.get_tenant_shard_id(); + let heatmap_path = tenant.conf.tenant_heatmap_path(tenant_shard_id); + let temp_path = path_with_suffix_extension(&heatmap_path, TEMP_FILE_SUFFIX); + if let Err(err) = VirtualFile::crashsafe_overwrite(heatmap_path, temp_path, bytes).await { + tracing::warn!("Non fatal IO error writing to disk after heatmap upload: {err}"); + } + tracing::info!("Successfully uploaded {size} byte heatmap to {path}"); Ok(UploadHeatmapOutcome::Uploaded(LastUploadState { diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 8909f7f249..7c4991ffab 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2766,6 +2766,11 @@ class NeonPageserver(PgProtocol, LogUtils): log.error(f"Failed to decode LocationConf, raw content ({len(bytes)} bytes): {bytes}") raise + def heatmap_content(self, tenant_shard_id: TenantId | TenantShardId) -> Any: + path = self.tenant_dir(tenant_shard_id) / "heatmap-v1.json" + with open(path) as f: + return json.load(f) + def tenant_create( self, tenant_id: TenantId, diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 1292682f9e..590093d23c 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -443,7 +443,7 @@ def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder): workload.write_rows(256, env.pageservers[0].id) env.pageserver.http_client().tenant_heatmap_upload(tenant_id) - def validate_heatmap(heatmap): + def validate_heatmap(heatmap, on_disk_heatmap): assert len(heatmap["timelines"]) == 1 assert heatmap["timelines"][0]["timeline_id"] == str(timeline_id) assert len(heatmap["timelines"][0]["layers"]) > 0 @@ -452,10 +452,13 @@ def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder): # Each layer appears at most once assert len(set(layer["name"] for layer in layers)) == len(layers) + assert heatmap == on_disk_heatmap + # Download and inspect the heatmap that the pageserver uploaded heatmap_first = env.pageserver_remote_storage.heatmap_content(tenant_id) + heatmap_first_on_disk = env.pageserver.heatmap_content(tenant_id) log.info(f"Read back heatmap: {heatmap_first}") - validate_heatmap(heatmap_first) + validate_heatmap(heatmap_first, heatmap_first_on_disk) # Do some more I/O to generate more layers workload.churn_rows(64, env.pageservers[0].id) @@ -463,9 +466,10 @@ def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder): # Ensure that another heatmap upload includes the new layers heatmap_second = env.pageserver_remote_storage.heatmap_content(tenant_id) + heatmap_second_on_disk = env.pageserver.heatmap_content(tenant_id) log.info(f"Read back heatmap: {heatmap_second}") assert heatmap_second != heatmap_first - validate_heatmap(heatmap_second) + validate_heatmap(heatmap_second, heatmap_second_on_disk) def list_elegible_layers(