mirror of
https://github.com/neondatabase/neon.git
synced 2026-06-02 04:50:38 +00:00
Ensure all temporary and empty directories and files are cleansed on pageserver startup
This commit is contained in:
committed by
Kirill Bulatov
parent
d3f83eda52
commit
c9e7c2f014
@@ -32,33 +32,34 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder):
|
||||
|
||||
# Leave the first timeline alone, but corrupt the others in different ways
|
||||
(tenant0, timeline0, pg0) = tenant_timelines[0]
|
||||
log.info(f"Timeline {tenant0}/{timeline0} is left intact")
|
||||
|
||||
# Corrupt metadata file on timeline 1
|
||||
(tenant1, timeline1, pg1) = tenant_timelines[1]
|
||||
metadata_path = "{}/tenants/{}/timelines/{}/metadata".format(env.repo_dir, tenant1, timeline1)
|
||||
print(f"overwriting metadata file at {metadata_path}")
|
||||
metadata_path = f"{env.repo_dir}/tenants/{tenant1}/timelines/{timeline1}/metadata"
|
||||
f = open(metadata_path, "w")
|
||||
f.write("overwritten with garbage!")
|
||||
f.close()
|
||||
log.info(f"Timeline {tenant1}/{timeline1} got its metadata spoiled")
|
||||
|
||||
# Missing layer files file on timeline 2. (This would actually work
|
||||
# if we had Cloud Storage enabled in this test.)
|
||||
(tenant2, timeline2, pg2) = tenant_timelines[2]
|
||||
timeline_path = "{}/tenants/{}/timelines/{}/".format(env.repo_dir, tenant2, timeline2)
|
||||
timeline_path = f"{env.repo_dir}/tenants/{tenant2}/timelines/{timeline2}/"
|
||||
for filename in os.listdir(timeline_path):
|
||||
if filename.startswith("00000"):
|
||||
# Looks like a layer file. Remove it
|
||||
os.remove(f"{timeline_path}/{filename}")
|
||||
log.info(
|
||||
f"Timeline {tenant2}/{timeline2} got its layer files removed (no remote storage enabled)"
|
||||
)
|
||||
|
||||
# Corrupt layer files file on timeline 3
|
||||
(tenant3, timeline3, pg3) = tenant_timelines[3]
|
||||
timeline_path = "{}/tenants/{}/timelines/{}/".format(env.repo_dir, tenant3, timeline3)
|
||||
timeline_path = f"{env.repo_dir}/tenants/{tenant3}/timelines/{timeline3}/"
|
||||
for filename in os.listdir(timeline_path):
|
||||
if filename.startswith("00000"):
|
||||
# Looks like a layer file. Corrupt it
|
||||
f = open(f"{timeline_path}/{filename}", "w")
|
||||
f.write("overwritten with garbage!")
|
||||
f.close()
|
||||
log.info(f"Timeline {tenant3}/{timeline3} got its layer files spoiled")
|
||||
|
||||
env.pageserver.start()
|
||||
|
||||
@@ -69,20 +70,28 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder):
|
||||
# But all others are broken
|
||||
|
||||
# First timeline would not get loaded into pageserver due to corrupt metadata file
|
||||
(_tenant, _timeline, pg) = tenant_timelines[1]
|
||||
with pytest.raises(
|
||||
Exception, match=f"Could not get timeline {timeline1} in tenant {tenant1}"
|
||||
) as err:
|
||||
pg.start()
|
||||
pg1.start()
|
||||
log.info(f"compute startup failed eagerly for timeline with corrupt metadata: {err}")
|
||||
|
||||
# Second timeline has no ancestors, only the metadata file and no layer files
|
||||
# We don't have the remote storage enabled, which means timeline is in an incorrect state,
|
||||
# it's not loaded at all
|
||||
with pytest.raises(
|
||||
Exception, match=f"Could not get timeline {timeline2} in tenant {tenant2}"
|
||||
) as err:
|
||||
pg2.start()
|
||||
log.info(f"compute startup failed eagerly for timeline with corrupt metadata: {err}")
|
||||
|
||||
# Yet other timelines will fail when their layers will be queried during basebackup: we don't check layer file contents on startup, when loading the timeline
|
||||
for n in range(2, 4):
|
||||
(_tenant, _timeline, pg) = tenant_timelines[n]
|
||||
for n in range(3, 4):
|
||||
(bad_tenant, bad_timeline, pg) = tenant_timelines[n]
|
||||
with pytest.raises(Exception, match="extracting base backup failed") as err:
|
||||
pg.start()
|
||||
log.info(
|
||||
f"compute startup failed lazily for timeline with corrupt layers, during basebackup preparation: {err}"
|
||||
f"compute startup failed lazily for timeline {bad_tenant}/{bad_timeline} with corrupt layers, during basebackup preparation: {err}"
|
||||
)
|
||||
|
||||
|
||||
@@ -107,6 +116,8 @@ def test_fix_broken_timelines_on_startup(neon_simple_env: NeonEnv):
|
||||
|
||||
tenant_id, _ = env.neon_cli.create_tenant()
|
||||
|
||||
old_tenant_timelines = env.neon_cli.list_timelines(tenant_id)
|
||||
|
||||
# Introduce failpoint when creating a new timeline
|
||||
env.pageserver.safe_psql("failpoints before-checkpoint-new-timeline=return")
|
||||
with pytest.raises(Exception, match="before-checkpoint-new-timeline"):
|
||||
@@ -116,6 +127,8 @@ def test_fix_broken_timelines_on_startup(neon_simple_env: NeonEnv):
|
||||
env.neon_cli.pageserver_stop(immediate=True)
|
||||
env.neon_cli.pageserver_start()
|
||||
|
||||
# Check that tenant with "broken" timeline is not loaded.
|
||||
with pytest.raises(Exception, match=f"Failed to get repo for tenant {tenant_id}"):
|
||||
env.neon_cli.list_timelines(tenant_id)
|
||||
# Creating the timeline didn't finish. The other timelines on tenant should still be present and work normally.
|
||||
new_tenant_timelines = env.neon_cli.list_timelines(tenant_id)
|
||||
assert (
|
||||
new_tenant_timelines == old_tenant_timelines
|
||||
), f"Pageserver after restart should ignore non-initialized timelines for tenant {tenant_id}"
|
||||
|
||||
Reference in New Issue
Block a user