mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-30 19:40:39 +00:00
Part of https://github.com/neondatabase/neon/pull/2239 Regular, from scratch, timeline creation involves initdb to be run in a separate directory, data from this directory to be imported into pageserver and, finally, timeline-related background tasks to start. This PR ensures we don't leave behind any directories that are not marked as temporary and that pageserver removes such directories on restart, allowing timeline creation to be retried with the same IDs, if needed. It would be good to later rewrite the logic to use a temporary directory, similar what tenant creation does. Yet currently it's harder than this change, so not done.
171 lines
7.1 KiB
Python
171 lines
7.1 KiB
Python
import concurrent.futures
|
|
import os
|
|
from typing import List, Tuple
|
|
|
|
import pytest
|
|
from fixtures.log_helper import log
|
|
from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres
|
|
from fixtures.types import TenantId, TimelineId
|
|
|
|
|
|
# Test restarting page server, while safekeeper and compute node keep
|
|
# running.
|
|
def test_broken_timeline(neon_env_builder: NeonEnvBuilder):
|
|
# One safekeeper is enough for this test.
|
|
neon_env_builder.num_safekeepers = 3
|
|
env = neon_env_builder.init_start()
|
|
|
|
tenant_timelines: List[Tuple[TenantId, TimelineId, Postgres]] = []
|
|
|
|
for n in range(4):
|
|
tenant_id, timeline_id = env.neon_cli.create_tenant()
|
|
|
|
pg = env.postgres.create_start("main", tenant_id=tenant_id)
|
|
with pg.cursor() as cur:
|
|
cur.execute("CREATE TABLE t(key int primary key, value text)")
|
|
cur.execute("INSERT INTO t SELECT generate_series(1,100), 'payload'")
|
|
pg.stop()
|
|
tenant_timelines.append((tenant_id, timeline_id, pg))
|
|
|
|
# Stop the pageserver
|
|
env.pageserver.stop()
|
|
|
|
# Leave the first timeline alone, but corrupt the others in different ways
|
|
(tenant0, timeline0, pg0) = tenant_timelines[0]
|
|
log.info(f"Timeline {tenant0}/{timeline0} is left intact")
|
|
|
|
(tenant1, timeline1, pg1) = tenant_timelines[1]
|
|
metadata_path = f"{env.repo_dir}/tenants/{tenant1}/timelines/{timeline1}/metadata"
|
|
f = open(metadata_path, "w")
|
|
f.write("overwritten with garbage!")
|
|
f.close()
|
|
log.info(f"Timeline {tenant1}/{timeline1} got its metadata spoiled")
|
|
|
|
(tenant2, timeline2, pg2) = tenant_timelines[2]
|
|
timeline_path = f"{env.repo_dir}/tenants/{tenant2}/timelines/{timeline2}/"
|
|
for filename in os.listdir(timeline_path):
|
|
if filename.startswith("00000"):
|
|
# Looks like a layer file. Remove it
|
|
os.remove(f"{timeline_path}/{filename}")
|
|
log.info(
|
|
f"Timeline {tenant2}/{timeline2} got its layer files removed (no remote storage enabled)"
|
|
)
|
|
|
|
(tenant3, timeline3, pg3) = tenant_timelines[3]
|
|
timeline_path = f"{env.repo_dir}/tenants/{tenant3}/timelines/{timeline3}/"
|
|
for filename in os.listdir(timeline_path):
|
|
if filename.startswith("00000"):
|
|
# Looks like a layer file. Corrupt it
|
|
f = open(f"{timeline_path}/{filename}", "w")
|
|
f.write("overwritten with garbage!")
|
|
f.close()
|
|
log.info(f"Timeline {tenant3}/{timeline3} got its layer files spoiled")
|
|
|
|
env.pageserver.start()
|
|
|
|
# Tenant 0 should still work
|
|
pg0.start()
|
|
assert pg0.safe_psql("SELECT COUNT(*) FROM t")[0][0] == 100
|
|
|
|
# But all others are broken
|
|
|
|
# First timeline would not get loaded into pageserver due to corrupt metadata file
|
|
with pytest.raises(
|
|
Exception, match=f"Timeline {timeline1} was not found for tenant {tenant1}"
|
|
) as err:
|
|
pg1.start()
|
|
log.info(f"compute startup failed eagerly for timeline with corrupt metadata: {err}")
|
|
|
|
# Second timeline has no ancestors, only the metadata file and no layer files
|
|
# We don't have the remote storage enabled, which means timeline is in an incorrect state,
|
|
# it's not loaded at all
|
|
with pytest.raises(
|
|
Exception, match=f"Timeline {timeline2} was not found for tenant {tenant2}"
|
|
) as err:
|
|
pg2.start()
|
|
log.info(f"compute startup failed eagerly for timeline with corrupt metadata: {err}")
|
|
|
|
# Yet other timelines will fail when their layers will be queried during basebackup: we don't check layer file contents on startup, when loading the timeline
|
|
for n in range(3, 4):
|
|
(bad_tenant, bad_timeline, pg) = tenant_timelines[n]
|
|
with pytest.raises(Exception, match="extracting base backup failed") as err:
|
|
pg.start()
|
|
log.info(
|
|
f"compute startup failed lazily for timeline {bad_tenant}/{bad_timeline} with corrupt layers, during basebackup preparation: {err}"
|
|
)
|
|
|
|
|
|
def test_create_multiple_timelines_parallel(neon_simple_env: NeonEnv):
|
|
env = neon_simple_env
|
|
|
|
tenant_id, _ = env.neon_cli.create_tenant()
|
|
|
|
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
|
|
futures = [
|
|
executor.submit(
|
|
env.neon_cli.create_timeline, f"test-create-multiple-timelines-{i}", tenant_id
|
|
)
|
|
for i in range(4)
|
|
]
|
|
for future in futures:
|
|
future.result()
|
|
|
|
|
|
def test_timeline_init_break_before_checkpoint(neon_simple_env: NeonEnv):
|
|
env = neon_simple_env
|
|
pageserver_http = env.pageserver.http_client()
|
|
|
|
tenant_id, _ = env.neon_cli.create_tenant()
|
|
|
|
timelines_dir = env.repo_dir / "tenants" / str(tenant_id) / "timelines"
|
|
old_tenant_timelines = env.neon_cli.list_timelines(tenant_id)
|
|
initial_timeline_dirs = [d for d in timelines_dir.iterdir()]
|
|
|
|
# Introduce failpoint during timeline init (some intermediate files are on disk), before it's checkpointed.
|
|
pageserver_http.configure_failpoints(("before-checkpoint-new-timeline", "return"))
|
|
with pytest.raises(Exception, match="before-checkpoint-new-timeline"):
|
|
_ = env.neon_cli.create_timeline("test_timeline_init_break_before_checkpoint", tenant_id)
|
|
|
|
# Restart the page server
|
|
env.neon_cli.pageserver_stop(immediate=True)
|
|
env.neon_cli.pageserver_start()
|
|
|
|
# Creating the timeline didn't finish. The other timelines on tenant should still be present and work normally.
|
|
new_tenant_timelines = env.neon_cli.list_timelines(tenant_id)
|
|
assert (
|
|
new_tenant_timelines == old_tenant_timelines
|
|
), f"Pageserver after restart should ignore non-initialized timelines for tenant {tenant_id}"
|
|
|
|
timeline_dirs = [d for d in timelines_dir.iterdir()]
|
|
assert (
|
|
timeline_dirs == initial_timeline_dirs
|
|
), "pageserver should clean its temp timeline files on timeline creation failure"
|
|
|
|
|
|
def test_timeline_create_break_after_uninit_mark(neon_simple_env: NeonEnv):
|
|
env = neon_simple_env
|
|
pageserver_http = env.pageserver.http_client()
|
|
|
|
tenant_id, _ = env.neon_cli.create_tenant()
|
|
|
|
timelines_dir = env.repo_dir / "tenants" / str(tenant_id) / "timelines"
|
|
old_tenant_timelines = env.neon_cli.list_timelines(tenant_id)
|
|
initial_timeline_dirs = [d for d in timelines_dir.iterdir()]
|
|
|
|
# Introduce failpoint when creating a new timeline uninit mark, before any other files were created
|
|
pageserver_http.configure_failpoints(("after-timeline-uninit-mark-creation", "return"))
|
|
with pytest.raises(Exception, match="after-timeline-uninit-mark-creation"):
|
|
_ = env.neon_cli.create_timeline("test_timeline_create_break_after_uninit_mark", tenant_id)
|
|
|
|
# Creating the timeline didn't finish. The other timelines on tenant should still be present and work normally.
|
|
# "New" timeline is not present in the list, allowing pageserver to retry the same request
|
|
new_tenant_timelines = env.neon_cli.list_timelines(tenant_id)
|
|
assert (
|
|
new_tenant_timelines == old_tenant_timelines
|
|
), f"Pageserver after restart should ignore non-initialized timelines for tenant {tenant_id}"
|
|
|
|
timeline_dirs = [d for d in timelines_dir.iterdir()]
|
|
assert (
|
|
timeline_dirs == initial_timeline_dirs
|
|
), "pageserver should clean its temp timeline files on timeline creation failure"
|