diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index bf063664b4..12610b1c89 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -796,20 +796,31 @@ impl Tenant { let mut timeline_ancestors = HashMap::new(); let mut existent_timelines = HashSet::new(); for (timeline_id, preload) in preload.timelines { - // In this context a timeline "exists" if it has any content in remote storage: this will - // be our cue to not delete any corresponding local directory - existent_timelines.insert(timeline_id); - let index_part = match preload.index_part { Ok(i) => { debug!("remote index part exists for timeline {timeline_id}"); + // We found index_part on the remote, this is the standard case. + existent_timelines.insert(timeline_id); i } + Err(DownloadError::NotFound) => { + // There is no index_part on the remote. We only get here + // if there is some prefix for the timeline in the remote storage. + // This can e.g. be the initdb.tar.zst archive, maybe a + // remnant from a prior incomplete creation or deletion attempt. + // Delete the local directory as the deciding criterion for a + // timeline's existence is presence of index_part. + info!(%timeline_id, "index_part not found on remote"); + continue; + } Err(e) => { - // Timeline creation is not atomic: we might upload a layer but no index_part. We expect - // that the creation will be retried by the control plane and eventually result in - // a valid loadable state. + // Some (possibly ephemeral) error happened during index_part download. + // Pretend the timeline exists to not delete the timeline directory, + // as it might be a temporary issue and we don't want to re-download + // everything after it resolves. warn!(%timeline_id, "Failed to load index_part from remote storage, failed creation? ({e})"); + + existent_timelines.insert(timeline_id); continue; } }; diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index 4911fc09d6..84a322039a 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -114,7 +114,6 @@ def test_timeline_init_break_before_checkpoint(neon_env_builder: NeonEnvBuilder) [ ".*Failed to process timeline dir contents.*Timeline has no ancestor and no layer files.*", ".*Timeline got dropped without initializing, cleaning its files.*", - ".*Failed to load index_part from remote storage, failed creation?.*", ] ) @@ -144,8 +143,13 @@ def test_timeline_init_break_before_checkpoint(neon_env_builder: NeonEnvBuilder) ), "pageserver should clean its temp timeline files on timeline creation failure" -def test_timeline_init_break_before_checkpoint_recreate(neon_env_builder: NeonEnvBuilder): - env = neon_env_builder.init_start() +# The "exit" case is for a reproducer of issue 6007: an unclean shutdown where we can't do local fs cleanups +@pytest.mark.parametrize("exit_or_return", ["return", "exit"]) +def test_timeline_init_break_before_checkpoint_recreate( + neon_env_builder: NeonEnvBuilder, exit_or_return: str +): + env = neon_env_builder.init_configs() + env.start() pageserver_http = env.pageserver.http_client() env.pageserver.allowed_errors.extend( @@ -156,6 +160,7 @@ def test_timeline_init_break_before_checkpoint_recreate(neon_env_builder: NeonEn ] ) + pageserver_http.tenant_create(env.initial_tenant) tenant_id = env.initial_tenant timelines_dir = env.pageserver.timeline_dir(tenant_id) @@ -166,13 +171,17 @@ def test_timeline_init_break_before_checkpoint_recreate(neon_env_builder: NeonEn timeline_id = TimelineId("1080243c1f76fe3c5147266663c9860b") # Introduce failpoint during timeline init (some intermediate files are on disk), before it's checkpointed. - pageserver_http.configure_failpoints(("before-checkpoint-new-timeline", "return")) - with pytest.raises(Exception, match="before-checkpoint-new-timeline"): - _ = env.neon_cli.create_timeline( - "test_timeline_init_break_before_checkpoint", tenant_id, timeline_id - ) + failpoint = "before-checkpoint-new-timeline" + pattern = failpoint + if exit_or_return == "exit": + # in reality a read error happens, but there are automatic retries which now fail because pageserver is dead + pattern = "Connection aborted." - # Restart the page server + pageserver_http.configure_failpoints((failpoint, exit_or_return)) + with pytest.raises(Exception, match=pattern): + _ = pageserver_http.timeline_create(env.pg_version, tenant_id, timeline_id) + + # Restart the page server (with the failpoint disabled) env.pageserver.restart(immediate=True) # Creating the timeline didn't finish. The other timelines on tenant should still be present and work normally. @@ -186,11 +195,9 @@ def test_timeline_init_break_before_checkpoint_recreate(neon_env_builder: NeonEn timeline_dirs == initial_timeline_dirs ), "pageserver should clean its temp timeline files on timeline creation failure" - # Disable the failpoint again - pageserver_http.configure_failpoints(("before-checkpoint-new-timeline", "off")) # creating the branch should have worked now - new_timeline_id = env.neon_cli.create_timeline( - "test_timeline_init_break_before_checkpoint", tenant_id, timeline_id + new_timeline_id = TimelineId( + pageserver_http.timeline_create(env.pg_version, tenant_id, timeline_id)["timeline_id"] ) assert timeline_id == new_timeline_id diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index 9bafa60d18..89c474286a 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -197,7 +197,6 @@ def test_delete_tenant_exercise_crash_safety_failpoints( # So by ignoring these instead of waiting for empty upload queue # we execute more distinct code paths. '.*stopping left-over name="remote upload".*', - ".*Failed to load index_part from remote storage, failed creation?.*", ] ) diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index a31f410f94..c81be41530 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -293,7 +293,6 @@ def test_pageserver_with_empty_tenants( env.pageserver.allowed_errors.extend( [ ".*marking .* as locally complete, while it doesnt exist in remote index.*", - ".*Failed to load index_part from remote storage, failed creation?.*", ".*load failed.*list timelines directory.*", ] ) diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 51ad971cdb..b1a2755394 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -230,9 +230,6 @@ def test_delete_timeline_exercise_crash_safety_failpoints( env.pageserver.allowed_errors.append(".*Timeline dir entry become invalid.*") # In one of the branches we poll for tenant to become active. Polls can generate this log message: env.pageserver.allowed_errors.append(f".*Tenant {env.initial_tenant} is not active*") - env.pageserver.allowed_errors.append( - ".*Failed to load index_part from remote storage, failed creation?.*" - ) ps_http.configure_failpoints((failpoint, "return")) @@ -311,9 +308,10 @@ def test_delete_timeline_exercise_crash_safety_failpoints( ) timeline_dir = env.pageserver.timeline_dir(env.initial_tenant, timeline_id) - if failpoint != "timeline-delete-after-index-delete": - # Check local is empty - assert (not timeline_dir.exists()) or len(os.listdir(timeline_dir)) == 0 + + # Check local is empty + assert (not timeline_dir.exists()) or len(os.listdir(timeline_dir)) == 0 + # Check no delete mark present assert not (timeline_dir.parent / f"{timeline_id}.___deleted").exists()