From 637ad4a6380000ad5af17726deccea6bc963efab Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 18 Apr 2024 13:16:03 +0100 Subject: [PATCH] pageserver: fix secondary download scheduling (#7396) ## Problem Some tenants were observed to stop doing downloads after some time ## Summary of changes - Fix a rogue `<` that was incorrectly scheduling work when `now` was _before_ the scheduling target, rather than after. This usually resulted in too-frequent execution, but could also result in never executing, if the current time has advanced ahead of `next_download` at the time we call `schedule()`. - Fix in-memory list of timelines not being amended after timeline deletion: the resulted in repeated harmless logs about the timeline being removed, and redundant calls to remove_dir_all for the timeline path. - Add a log at startup to make it easier to see a particular tenant starting in secondary mode (this is for parity with the logging that exists when spawning an attached tenant). Previously searching on tenant ID didn't provide a clear signal as to how the tenant was started during pageserver start. - Add a test that exercises secondary downloads using the background scheduling, whereas existing tests were using the API hook to invoke download directly. --- pageserver/src/metrics.rs | 6 +- pageserver/src/tenant/mgr.rs | 19 ++-- pageserver/src/tenant/secondary/downloader.rs | 11 ++- .../regress/test_pageserver_secondary.py | 86 +++++++++++++++++++ 4 files changed, 112 insertions(+), 10 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index be61a755ff..e6db95082b 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1518,7 +1518,8 @@ pub(crate) struct SecondaryModeMetrics { pub(crate) download_heatmap: IntCounter, pub(crate) download_layer: IntCounter, } -pub(crate) static SECONDARY_MODE: Lazy = Lazy::new(|| SecondaryModeMetrics { +pub(crate) static SECONDARY_MODE: Lazy = Lazy::new(|| { + SecondaryModeMetrics { upload_heatmap: register_int_counter!( "pageserver_secondary_upload_heatmap", "Number of heatmaps written to remote storage by attached tenants" @@ -1536,7 +1537,7 @@ pub(crate) static SECONDARY_MODE: Lazy = Lazy::new(|| Seco .expect("failed to define a metric"), download_heatmap: register_int_counter!( "pageserver_secondary_download_heatmap", - "Number of downloads of heatmaps by secondary mode locations" + "Number of downloads of heatmaps by secondary mode locations, including when it hasn't changed" ) .expect("failed to define a metric"), download_layer: register_int_counter!( @@ -1544,6 +1545,7 @@ pub(crate) static SECONDARY_MODE: Lazy = Lazy::new(|| Seco "Number of downloads of layers by secondary mode locations" ) .expect("failed to define a metric"), +} }); #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 73967f2949..2c9476ba0a 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -678,12 +678,19 @@ pub async fn init_tenant_mgr( } } } - LocationMode::Secondary(secondary_conf) => TenantSlot::Secondary(SecondaryTenant::new( - tenant_shard_id, - shard_identity, - location_conf.tenant_conf, - &secondary_conf, - )), + LocationMode::Secondary(secondary_conf) => { + info!( + tenant_id = %tenant_shard_id.tenant_id, + shard_id = %tenant_shard_id.shard_slug(), + "Starting secondary tenant" + ); + TenantSlot::Secondary(SecondaryTenant::new( + tenant_shard_id, + shard_identity, + location_conf.tenant_conf, + &secondary_conf, + )) + } }; tenants.insert(tenant_shard_id, slot); diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 5b29c126d1..67f866cb7b 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -312,7 +312,7 @@ impl JobGenerator next_download { Some(PendingDownload { secondary_state: secondary_tenant, last_download, @@ -647,6 +647,12 @@ impl<'a> TenantDownloader<'a> { progress.bytes_downloaded += layer_byte_count; progress.layers_downloaded += layer_count; } + + for delete_timeline in &delete_timelines { + // We haven't removed from disk yet, but optimistically remove from in-memory state: if removal + // from disk fails that will be a fatal error. + detail.timelines.remove(delete_timeline); + } } // Execute accumulated deletions @@ -710,13 +716,14 @@ impl<'a> TenantDownloader<'a> { .await .map_err(UpdateError::from)?; + SECONDARY_MODE.download_heatmap.inc(); + if Some(&download.etag) == prev_etag { Ok(HeatMapDownload::Unmodified) } else { let mut heatmap_bytes = Vec::new(); let mut body = tokio_util::io::StreamReader::new(download.download_stream); let _size = tokio::io::copy_buf(&mut body, &mut heatmap_bytes).await?; - SECONDARY_MODE.download_heatmap.inc(); Ok(HeatMapDownload::Modified(HeatMapModified { etag: download.etag, last_modified: download.last_modified, diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 345abdc072..8f194e5dda 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -1,6 +1,7 @@ import json import os import random +import time from pathlib import Path from typing import Any, Dict, Optional @@ -582,6 +583,91 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): ) +def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder): + """ + Slow test that runs in realtime, checks that the background scheduling of secondary + downloads happens as expected. + """ + neon_env_builder.num_pageservers = 2 + env = neon_env_builder.init_configs() + env.start() + + # Create this many tenants, each with two timelines + tenant_count = 4 + tenant_timelines = {} + + # This mirrors a constant in `downloader.rs` + freshen_interval_secs = 60 + + for _i in range(0, tenant_count): + tenant_id = TenantId.generate() + timeline_a = TimelineId.generate() + timeline_b = TimelineId.generate() + env.neon_cli.create_tenant( + tenant_id, + timeline_a, + placement_policy='{"Attached":1}', + # Run with a low heatmap period so that we can avoid having to do synthetic API calls + # to trigger the upload promptly. + conf={"heatmap_period": "1s"}, + ) + env.neon_cli.create_timeline("main2", tenant_id, timeline_b) + + tenant_timelines[tenant_id] = [timeline_a, timeline_b] + + t_start = time.time() + + # Wait long enough that the background downloads should happen; we expect all the inital layers + # of all the initial timelines to show up on the secondary location of each tenant. + time.sleep(freshen_interval_secs * 1.5) + + for tenant_id, timelines in tenant_timelines.items(): + attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"] + ps_attached = env.get_pageserver(attached_to_id) + # We only have two: the other one must be secondary + ps_secondary = next(p for p in env.pageservers if p != ps_attached) + + for timeline_id in timelines: + log.info(f"Checking for secondary timeline {timeline_id} on node {ps_secondary.id}") + # One or more layers should be present for all timelines + assert list_layers(ps_secondary, tenant_id, timeline_id) + + # Delete the second timeline: this should be reflected later on the secondary + env.storage_controller.pageserver_api().timeline_delete(tenant_id, timelines[1]) + + # Wait long enough for the secondary locations to see the deletion + time.sleep(freshen_interval_secs * 1.5) + + for tenant_id, timelines in tenant_timelines.items(): + attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"] + ps_attached = env.get_pageserver(attached_to_id) + # We only have two: the other one must be secondary + ps_secondary = next(p for p in env.pageservers if p != ps_attached) + + # This one was not deleted + assert list_layers(ps_secondary, tenant_id, timelines[0]) + + # This one was deleted + assert not list_layers(ps_secondary, tenant_id, timelines[1]) + + t_end = time.time() + + # Measure how many heatmap downloads we did in total: this checks that we succeeded with + # proper scheduling, and not some bug that just runs downloads in a loop. + total_heatmap_downloads = 0 + for ps in env.pageservers: + v = ps.http_client().get_metric_value("pageserver_secondary_download_heatmap_total") + assert v is not None + total_heatmap_downloads += int(v) + + download_rate = (total_heatmap_downloads / tenant_count) / (t_end - t_start) + + expect_download_rate = 1.0 / freshen_interval_secs + log.info(f"Download rate: {download_rate * 60}/min vs expected {expect_download_rate * 60}/min") + + assert download_rate < expect_download_rate * 2 + + @pytest.mark.skipif(os.environ.get("BUILD_TYPE") == "debug", reason="only run with release build") @pytest.mark.parametrize("via_controller", [True, False]) def test_slow_secondary_downloads(neon_env_builder: NeonEnvBuilder, via_controller: bool):