pageserver: tolerate missing index_parts in remote storage

This commit is contained in:
John Spray
2023-10-17 13:29:18 +01:00
parent dd2136bd09
commit 12b79f710e
2 changed files with 35 additions and 5 deletions

View File

@@ -766,8 +766,19 @@ impl Tenant {
let mut remote_index_and_client = HashMap::new();
let mut timeline_ancestors = HashMap::new();
for (timeline_id, preload) in preload {
let index_part = preload.index_part?;
debug!("successfully downloaded index part for timeline {timeline_id}");
let index_part = match preload.index_part {
Ok(i) => {
debug!("successfully downloaded index part for timeline {timeline_id}");
i
}
Err(e) => {
// Timeline creation is not atomic: layers may have been uploaded without an index_part.
// Skip this timeline instead of failing the whole tenant load; we expect the control plane
// to retry the creation, which should eventually result in a valid loadable state.
warn!(%timeline_id, "Failed to load index_part from remote storage, failed creation? ({e})");
continue;
}
};
match index_part {
MaybeDeletedIndexPart::IndexPart(index_part) => {
timeline_ancestors.insert(timeline_id, index_part.metadata.clone());
@@ -783,6 +794,10 @@ impl Tenant {
}
}
if timeline_ancestors.is_empty() {
anyhow::bail!("no valid timelines found on the remote storage")
}
// For every timeline, download the metadata file, scan the local directory,
// and build a layer map that contains an entry for each remote and local
// layer file.

View File

@@ -333,16 +333,28 @@ def test_non_uploaded_root_timeline_is_deleted_after_restart(neon_env_builder: N
env = neon_env_builder.init_configs()
env.start()
env.pageserver.allowed_errors.append(
".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*"
env.pageserver.allowed_errors.extend(
[
".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*",
".*Failed to load index_part from remote storage.*",
]
)
ps_http = env.pageserver.http_client()
# pause all uploads
ps_http.configure_failpoints(("before-upload-index-pausable", "pause"))
ps_http.tenant_create(env.initial_tenant)
# Create a timeline whose creation will succeed. The tenant will need at least one
# timeline to be loadable.
success_timeline = TimelineId.generate()
log.info(f"Creating timeline {success_timeline}")
ps_http.timeline_create(env.pg_version, env.initial_tenant, success_timeline, timeout=60)
# Create a timeline whose upload to remote storage will be blocked
ps_http.configure_failpoints(("before-upload-index-pausable", "pause"))
def start_creating_timeline():
log.info(f"Creating (expect failure) timeline {env.initial_timeline}")
with pytest.raises(RequestException):
ps_http.timeline_create(
env.pg_version, env.initial_tenant, env.initial_timeline, timeout=60
@@ -366,6 +378,9 @@ def test_non_uploaded_root_timeline_is_deleted_after_restart(neon_env_builder: N
with pytest.raises(PageserverApiException, match="not found"):
ps_http.timeline_detail(env.initial_tenant, env.initial_timeline)
# The one successfully created timeline should still be there.
assert len(ps_http.timeline_list(tenant_id=env.initial_tenant)) == 1
def test_non_uploaded_branch_is_deleted_after_restart(neon_env_builder: NeonEnvBuilder):
"""