mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-25 17:10:38 +00:00
crash-safe and resumable tenant attach
This change introduces a marker file $repo/tenants/$tenant_id/attaching that is present while a tenant is in Attaching state. When pageserver restarts, we use it to resume the tenant attach operation. Before this change, a crash during tenant attach would result in one of the following: 1. crash upon restart due to missing metadata file (IIRC) 2. "successful" loading of the tenant with a subset of timelines
This commit is contained in:
committed by
Dmitry Rodionov
parent
c4c4558736
commit
bb6dbd2f43
@@ -63,9 +63,11 @@ def test_remote_storage_backup_and_restore(
|
||||
)
|
||||
env.pageserver.allowed_errors.append(".*No timelines to attach received.*")
|
||||
|
||||
env.pageserver.allowed_errors.append(".*Tenant download is already in progress.*")
|
||||
env.pageserver.allowed_errors.append(".*Failed to get local tenant state.*")
|
||||
env.pageserver.allowed_errors.append(".*No metadata file found in the timeline directory.*")
|
||||
# FIXME retry downloads without throwing errors
|
||||
env.pageserver.allowed_errors.append(".*failed to load remote timeline.*")
|
||||
# we have a bunch of pytest.raises for this below
|
||||
env.pageserver.allowed_errors.append(".*tenant already exists.*")
|
||||
|
||||
pageserver_http = env.pageserver.http_client()
|
||||
pg = env.postgres.create_start("main")
|
||||
@@ -118,19 +120,22 @@ def test_remote_storage_backup_and_restore(
|
||||
time.sleep(10)
|
||||
|
||||
# assert cannot attach timeline that is scheduled for download
|
||||
with pytest.raises(Exception, match="Conflict: Tenant download is already in progress"):
|
||||
# FIXME implement layer download retries
|
||||
with pytest.raises(Exception, match="tenant already exists, current state: Broken"):
|
||||
client.tenant_attach(tenant_id)
|
||||
|
||||
tenant_status = client.tenant_status(tenant_id)
|
||||
log.info("Tenant status with active failpoint: %s", tenant_status)
|
||||
assert tenant_status["has_in_progress_downloads"] is True
|
||||
# FIXME implement layer download retries
|
||||
# assert tenant_status["has_in_progress_downloads"] is True
|
||||
|
||||
# trigger temporary download files removal
|
||||
env.pageserver.stop()
|
||||
env.pageserver.start()
|
||||
|
||||
client.tenant_attach(tenant_id)
|
||||
|
||||
# ensure that an initiated attach operation survives pageserver restart
|
||||
with pytest.raises(Exception, match="tenant already exists"):
|
||||
client.tenant_attach(tenant_id)
|
||||
log.info("waiting for timeline redownload")
|
||||
wait_until(
|
||||
number_of_iterations=20,
|
||||
|
||||
Reference in New Issue
Block a user