From f7ec7668a2a452f23f197ee2670a4d799be20ed4 Mon Sep 17 00:00:00 2001 From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com> Date: Thu, 5 Jun 2025 16:04:37 +0400 Subject: [PATCH] pageserver, tests: prepare test_basebackup_cache for --timelines-onto-safekeepers (#12143) ## Problem - `test_basebackup_cache` fails in https://github.com/neondatabase/neon/pull/11712 because after the timelines on safekeepers are managed by storage controller, they do contain proper start_lsn and the compute_ctl tool sends the first basebackup request with this LSN. - `Failed to prepare basebackup` log messages during timeline initialization, because the timeline is not yet in the global timeline map. - Relates to https://github.com/neondatabase/cloud/issues/29353 ## Summary of changes - Account for `timeline_onto_safekeepers` storcon's option in the test. - Do not trigger basebackup prepare during the timeline initialization. --- pageserver/src/tenant/timeline.rs | 7 ++++++ test_runner/regress/test_basebackup.py | 33 +++++++++++++++++++------- 2 files changed, 31 insertions(+), 9 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 6798606141..3522af2de0 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2506,6 +2506,13 @@ impl Timeline { // Preparing basebackup doesn't make sense for shards other than shard zero. return; } + if !self.is_active() { + // May happen during initial timeline creation. + // Such timeline is not in the global timeline map yet, + // so basebackup cache will not be able to find it. + // TODO(diko): We can prepare such timelines in finish_creation(). + return; + } let res = self .basebackup_prepare_sender diff --git a/test_runner/regress/test_basebackup.py b/test_runner/regress/test_basebackup.py index b083c394c7..2d42be4051 100644 --- a/test_runner/regress/test_basebackup.py +++ b/test_runner/regress/test_basebackup.py @@ -26,6 +26,10 @@ def test_basebackup_cache(neon_env_builder: NeonEnvBuilder): ps = env.pageserver ps_http = ps.http_client() + storcon_managed_timelines = (env.storage_controller_config or {}).get( + "timelines_onto_safekeepers", False + ) + # 1. Check that we always hit the cache after compute restart. for i in range(3): ep.start() @@ -33,15 +37,26 @@ def test_basebackup_cache(neon_env_builder: NeonEnvBuilder): def check_metrics(i=i): metrics = ps_http.get_metrics() - # Never miss. - # The first time compute_ctl sends `get_basebackup` with lsn=None, we do not cache such requests. - # All other requests should be a hit - assert ( - metrics.query_one( - "pageserver_basebackup_cache_read_total", {"result": "miss"} - ).value - == 0 - ) + if storcon_managed_timelines: + # We do not cache the initial basebackup yet, + # so the first compute startup should be a miss. + assert ( + metrics.query_one( + "pageserver_basebackup_cache_read_total", {"result": "miss"} + ).value + == 1 + ) + else: + # If the timeline is not initialized on safekeeprs, + # the compute_ctl sends `get_basebackup` with lsn=None for the first startup. + # We do not use cache for such requests, so it's niether a hit nor a miss. + assert ( + metrics.query_one( + "pageserver_basebackup_cache_read_total", {"result": "miss"} + ).value + == 0 + ) + # All but the first requests are hits. assert ( metrics.query_one("pageserver_basebackup_cache_read_total", {"result": "hit"}).value