crash-safe and resumable tenant attach

This change introduces a marker file

  $repo/tenants/$tenant_id/attaching

that is present while a tenant is in the Attaching state.

When the pageserver restarts, we use this marker file to resume the tenant attach operation.
Before this change, a crash during tenant attach would result in one of
the following:
1. a crash upon restart due to a missing metadata file (IIRC)
2. a "successful" load of the tenant with only a subset of its timelines
This commit is contained in:
Christian Schwarz
2022-11-14 13:08:45 -05:00
committed by Dmitry Rodionov
parent c4c4558736
commit bb6dbd2f43
9 changed files with 175 additions and 24 deletions

View File

@@ -63,9 +63,11 @@ def test_remote_storage_backup_and_restore(
)
env.pageserver.allowed_errors.append(".*No timelines to attach received.*")
env.pageserver.allowed_errors.append(".*Tenant download is already in progress.*")
env.pageserver.allowed_errors.append(".*Failed to get local tenant state.*")
env.pageserver.allowed_errors.append(".*No metadata file found in the timeline directory.*")
# FIXME retry downloads without throwing errors
env.pageserver.allowed_errors.append(".*failed to load remote timeline.*")
# we have a bunch of pytest.raises for this below
env.pageserver.allowed_errors.append(".*tenant already exists.*")
pageserver_http = env.pageserver.http_client()
pg = env.postgres.create_start("main")
@@ -118,19 +120,22 @@ def test_remote_storage_backup_and_restore(
time.sleep(10)
# assert cannot attach timeline that is scheduled for download
with pytest.raises(Exception, match="Conflict: Tenant download is already in progress"):
# FIXME implement layer download retries
with pytest.raises(Exception, match="tenant already exists, current state: Broken"):
client.tenant_attach(tenant_id)
tenant_status = client.tenant_status(tenant_id)
log.info("Tenant status with active failpoint: %s", tenant_status)
assert tenant_status["has_in_progress_downloads"] is True
# FIXME implement layer download retries
# assert tenant_status["has_in_progress_downloads"] is True
# trigger temporary download files removal
env.pageserver.stop()
env.pageserver.start()
client.tenant_attach(tenant_id)
# ensure that an initiated attach operation survives pageserver restart
with pytest.raises(Exception, match="tenant already exists"):
client.tenant_attach(tenant_id)
log.info("waiting for timeline redownload")
wait_until(
number_of_iterations=20,