From 12b79f710ecfb07c68637be9ffa58e5b8c8deeed Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 17 Oct 2023 13:29:18 +0100 Subject: [PATCH] pageserver: tolerate missing index_parts in remote storage --- pageserver/src/tenant.rs | 19 +++++++++++++++++-- test_runner/regress/test_branching.py | 21 ++++++++++++++++++--- 2 files changed, 35 insertions(+), 5 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 4e60f907d2..39464f4b3e 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -766,8 +766,19 @@ impl Tenant { let mut remote_index_and_client = HashMap::new(); let mut timeline_ancestors = HashMap::new(); for (timeline_id, preload) in preload { - let index_part = preload.index_part?; - debug!("successfully downloaded index part for timeline {timeline_id}"); + let index_part = match preload.index_part { + Ok(i) => { + debug!("successfully downloaded index part for timeline {timeline_id}"); + i + } + Err(e) => { + // Timeline creation is not atomic: we might upload a layer but no index_part. We expect + // that the creation will be retried by the control plane and eventually result in + // a valid loadable state. + warn!(%timeline_id, "Failed to load index_part from remote storage, failed creation? ({e})"); + continue; + } + }; match index_part { MaybeDeletedIndexPart::IndexPart(index_part) => { timeline_ancestors.insert(timeline_id, index_part.metadata.clone()); @@ -783,6 +794,10 @@ impl Tenant { } } + if timeline_ancestors.is_empty() { + anyhow::bail!("no valid timelines found on the remote storage") + } + // For every timeline, download the metadata file, scan the local directory, // and build a layer map that contains an entry for each remote and local // layer file. diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py index 32b1466c90..581b05cca2 100644 --- a/test_runner/regress/test_branching.py +++ b/test_runner/regress/test_branching.py @@ -333,16 +333,28 @@ def test_non_uploaded_root_timeline_is_deleted_after_restart(neon_env_builder: N env = neon_env_builder.init_configs() env.start() - env.pageserver.allowed_errors.append( - ".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*" + env.pageserver.allowed_errors.extend( + [ + ".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*", + ".*Failed to load index_part from remote storage.*", + ] ) ps_http = env.pageserver.http_client() # pause all uploads - ps_http.configure_failpoints(("before-upload-index-pausable", "pause")) ps_http.tenant_create(env.initial_tenant) + # Create a timeline whose creation will succeed. The tenant will need at least one + # timeline to be loadable. + success_timeline = TimelineId.generate() + log.info(f"Creating timeline {success_timeline}") + ps_http.timeline_create(env.pg_version, env.initial_tenant, success_timeline, timeout=60) + + # Create a timeline whose upload to remote storage will be blocked + ps_http.configure_failpoints(("before-upload-index-pausable", "pause")) + def start_creating_timeline(): + log.info(f"Creating (expect failure) timeline {env.initial_timeline}") with pytest.raises(RequestException): ps_http.timeline_create( env.pg_version, env.initial_tenant, env.initial_timeline, timeout=60 @@ -366,6 +378,9 @@ def test_non_uploaded_root_timeline_is_deleted_after_restart(neon_env_builder: N with pytest.raises(PageserverApiException, match="not found"): ps_http.timeline_detail(env.initial_tenant, env.initial_timeline) + # The one successfully created timeline should still be there. + assert len(ps_http.timeline_list(tenant_id=env.initial_tenant)) == 1 + def test_non_uploaded_branch_is_deleted_after_restart(neon_env_builder: NeonEnvBuilder): """