Clean up local state if index_part.json request gives 404 (#6009)

If `index_part.json` is (verifiably) not present on remote storage, we
should regard the timeline as inexistent. This lets `clean_up_timelines`
purge the partial local disk state, which is important in the case of
incomplete creations leaving behind state that hinders retries. For
incomplete deletions, we also want the timeline's local disk content be
gone completely.

The PR removes the allowed warnings added by #5390 and #5912, as we now
are only supposed to issue info level messages. It also adds a
reproducer for #6007, by parametrizing the
`test_timeline_init_break_before_checkpoint_recreate` test added by
#5390. If one reverts the .rs changes, the "cannot create its uninit
mark file" log line occurs once one comments out the failing checks for
the local disk state being actually empty.

Closes #6007

---------

Co-authored-by: Joonas Koivunen <joonas@neon.tech>
This commit is contained in:
Arpad Müller
2023-12-01 11:58:06 +01:00
committed by GitHub
parent f784e59b12
commit 1ce1c82d78
5 changed files with 42 additions and 28 deletions

View File

@@ -796,20 +796,31 @@ impl Tenant {
let mut timeline_ancestors = HashMap::new();
let mut existent_timelines = HashSet::new();
for (timeline_id, preload) in preload.timelines {
// In this context a timeline "exists" if it has any content in remote storage: this will
// be our cue to not delete any corresponding local directory
existent_timelines.insert(timeline_id);
let index_part = match preload.index_part {
Ok(i) => {
debug!("remote index part exists for timeline {timeline_id}");
// We found index_part on the remote, this is the standard case.
existent_timelines.insert(timeline_id);
i
}
Err(DownloadError::NotFound) => {
// There is no index_part on the remote. We only get here
// if there is some prefix for the timeline in the remote storage.
// This can e.g. be the initdb.tar.zst archive, maybe a
// remnant from a prior incomplete creation or deletion attempt.
// Delete the local directory as the deciding criterion for a
// timeline's existence is presence of index_part.
info!(%timeline_id, "index_part not found on remote");
continue;
}
Err(e) => {
// Timeline creation is not atomic: we might upload a layer but no index_part. We expect
// that the creation will be retried by the control plane and eventually result in
// a valid loadable state.
// Some (possibly ephemeral) error happened during index_part download.
// Pretend the timeline exists to not delete the timeline directory,
// as it might be a temporary issue and we don't want to re-download
// everything after it resolves.
warn!(%timeline_id, "Failed to load index_part from remote storage, failed creation? ({e})");
existent_timelines.insert(timeline_id);
continue;
}
};

View File

@@ -114,7 +114,6 @@ def test_timeline_init_break_before_checkpoint(neon_env_builder: NeonEnvBuilder)
[
".*Failed to process timeline dir contents.*Timeline has no ancestor and no layer files.*",
".*Timeline got dropped without initializing, cleaning its files.*",
".*Failed to load index_part from remote storage, failed creation?.*",
]
)
@@ -144,8 +143,13 @@ def test_timeline_init_break_before_checkpoint(neon_env_builder: NeonEnvBuilder)
), "pageserver should clean its temp timeline files on timeline creation failure"
def test_timeline_init_break_before_checkpoint_recreate(neon_env_builder: NeonEnvBuilder):
env = neon_env_builder.init_start()
# The "exit" case is for a reproducer of issue 6007: an unclean shutdown where we can't do local fs cleanups
@pytest.mark.parametrize("exit_or_return", ["return", "exit"])
def test_timeline_init_break_before_checkpoint_recreate(
neon_env_builder: NeonEnvBuilder, exit_or_return: str
):
env = neon_env_builder.init_configs()
env.start()
pageserver_http = env.pageserver.http_client()
env.pageserver.allowed_errors.extend(
@@ -156,6 +160,7 @@ def test_timeline_init_break_before_checkpoint_recreate(neon_env_builder: NeonEn
]
)
pageserver_http.tenant_create(env.initial_tenant)
tenant_id = env.initial_tenant
timelines_dir = env.pageserver.timeline_dir(tenant_id)
@@ -166,13 +171,17 @@ def test_timeline_init_break_before_checkpoint_recreate(neon_env_builder: NeonEn
timeline_id = TimelineId("1080243c1f76fe3c5147266663c9860b")
# Introduce failpoint during timeline init (some intermediate files are on disk), before it's checkpointed.
pageserver_http.configure_failpoints(("before-checkpoint-new-timeline", "return"))
with pytest.raises(Exception, match="before-checkpoint-new-timeline"):
_ = env.neon_cli.create_timeline(
"test_timeline_init_break_before_checkpoint", tenant_id, timeline_id
)
failpoint = "before-checkpoint-new-timeline"
pattern = failpoint
if exit_or_return == "exit":
# in reality a read error happens, but there are automatic retries which now fail because pageserver is dead
pattern = "Connection aborted."
# Restart the page server
pageserver_http.configure_failpoints((failpoint, exit_or_return))
with pytest.raises(Exception, match=pattern):
_ = pageserver_http.timeline_create(env.pg_version, tenant_id, timeline_id)
# Restart the page server (with the failpoint disabled)
env.pageserver.restart(immediate=True)
# Creating the timeline didn't finish. The other timelines on tenant should still be present and work normally.
@@ -186,11 +195,9 @@ def test_timeline_init_break_before_checkpoint_recreate(neon_env_builder: NeonEn
timeline_dirs == initial_timeline_dirs
), "pageserver should clean its temp timeline files on timeline creation failure"
# Disable the failpoint again
pageserver_http.configure_failpoints(("before-checkpoint-new-timeline", "off"))
# creating the branch should have worked now
new_timeline_id = env.neon_cli.create_timeline(
"test_timeline_init_break_before_checkpoint", tenant_id, timeline_id
new_timeline_id = TimelineId(
pageserver_http.timeline_create(env.pg_version, tenant_id, timeline_id)["timeline_id"]
)
assert timeline_id == new_timeline_id

View File

@@ -197,7 +197,6 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
# So by ignoring these instead of waiting for empty upload queue
# we execute more distinct code paths.
'.*stopping left-over name="remote upload".*',
".*Failed to load index_part from remote storage, failed creation?.*",
]
)

View File

@@ -293,7 +293,6 @@ def test_pageserver_with_empty_tenants(
env.pageserver.allowed_errors.extend(
[
".*marking .* as locally complete, while it doesnt exist in remote index.*",
".*Failed to load index_part from remote storage, failed creation?.*",
".*load failed.*list timelines directory.*",
]
)

View File

@@ -230,9 +230,6 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
env.pageserver.allowed_errors.append(".*Timeline dir entry become invalid.*")
# In one of the branches we poll for tenant to become active. Polls can generate this log message:
env.pageserver.allowed_errors.append(f".*Tenant {env.initial_tenant} is not active*")
env.pageserver.allowed_errors.append(
".*Failed to load index_part from remote storage, failed creation?.*"
)
ps_http.configure_failpoints((failpoint, "return"))
@@ -311,9 +308,10 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
)
timeline_dir = env.pageserver.timeline_dir(env.initial_tenant, timeline_id)
if failpoint != "timeline-delete-after-index-delete":
# Check local is empty
assert (not timeline_dir.exists()) or len(os.listdir(timeline_dir)) == 0
# Check local is empty
assert (not timeline_dir.exists()) or len(os.listdir(timeline_dir)) == 0
# Check no delete mark present
assert not (timeline_dir.parent / f"{timeline_id}.___deleted").exists()