Files
neon/test_runner/regress/test_broken_timeline.py
Tristan Partin 5bd8e2363a Enable all pyupgrade checks in ruff
This will help to keep us from using deprecated Python features going
forward.

Signed-off-by: Tristan Partin <tristan@neon.tech>
2024-10-08 14:32:26 -05:00

222 lines
9.0 KiB
Python

from __future__ import annotations
import concurrent.futures
import os
import pytest
from fixtures.common_types import TenantId, TimelineId
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
Endpoint,
NeonEnv,
NeonEnvBuilder,
wait_for_last_flush_lsn,
)
from fixtures.pg_version import PgVersion
def test_local_corruption(neon_env_builder: NeonEnvBuilder):
    """
    Check how the pageserver behaves when local layer files of a timeline get corrupted.

    Three tenants are created and filled with a small table, and their endpoints are
    stopped. The pageserver is then stopped, every file whose name starts with "00000"
    in the third tenant's timeline directory is replaced by same-sized blank content,
    and the pageserver is started again. The untouched first tenant must still answer
    queries, while starting the endpoint of the damaged timeline is expected to fail.
    """
    env = neon_env_builder.init_start()

    env.pageserver.allowed_errors.extend(
        [
            ".*get_values_reconstruct_data for layer .*",
            ".*could not find data for key.*",
            ".*is not active. Current state: Broken.*",
            ".*will not become active. Current state: Broken.*",
            ".*failed to load metadata.*",
            ".*load failed.*load local timeline.*",
            ".*: layer load failed, assuming permanent failure:.*",
        ]
    )

    tenant_timelines: list[tuple[TenantId, TimelineId, Endpoint]] = []

    for _ in range(3):
        tenant_id, timeline_id = env.create_tenant()

        endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
        with endpoint.cursor() as cur:
            cur.execute("CREATE TABLE t(key int primary key, value text)")
            cur.execute("INSERT INTO t SELECT generate_series(1,100), 'payload'")
            wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
        endpoint.stop()
        tenant_timelines.append((tenant_id, timeline_id, endpoint))

    # Stop the pageserver -- this has to be not immediate or we need to wait for uploads
    env.pageserver.stop()

    # Leave the first timeline alone, but corrupt the layer files of the third one.
    # (The second tenant is created but not used any further in this test.)
    (tenant0, timeline0, pg0) = tenant_timelines[0]
    log.info(f"Timeline {tenant0}/{timeline0} is left intact")

    (tenant1, timeline1, pg1) = tenant_timelines[2]
    timeline_path = f"{env.pageserver.workdir}/tenants/{tenant1}/timelines/{timeline1}/"
    for filename in os.listdir(timeline_path):
        if filename.startswith("00000"):
            # Looks like a layer file. Corrupt it
            # os.path.join avoids the double slash that plain concatenation would
            # produce, since timeline_path already ends with a separator.
            p = os.path.join(timeline_path, filename)
            size = os.path.getsize(p)
            with open(p, "wb") as f:
                # Drop the real contents, then grow the file back to its original size
                # so that only the contents, not the length, differ.
                f.truncate(0)
                f.truncate(size)
    log.info(f"Timeline {tenant1}/{timeline1} got its local layer files spoiled")

    env.pageserver.start()

    # Un-damaged tenant works
    pg0.start()
    assert pg0.safe_psql("SELECT COUNT(*) FROM t")[0][0] == 100

    # The corrupted timeline will fail during basebackup, because the local layer file is corrupt.
    # It will fail when we try to read (and reconstruct) a page from it, ergo the error message.
    # (We don't check layer file contents on startup, when loading the timeline)
    #
    # This will change when we implement checksums for layers
    with pytest.raises(Exception, match="get_values_reconstruct_data for layer ") as err:
        pg1.start()
    log.info(
        f"As expected, compute startup failed for timeline {tenant1}/{timeline1} with corrupt layers: {err}"
    )
def test_create_multiple_timelines_parallel(neon_simple_env: NeonEnv):
    """
    Create four timelines for one freshly created tenant from concurrent threads.

    The test passes when every `create_timeline` call completes without raising.
    """
    env = neon_simple_env

    tenant_id, _ = env.create_tenant()

    branch_names = [f"test-create-multiple-timelines-{i}" for i in range(4)]

    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as pool:
        # Submit everything first so the creations can overlap...
        pending = [pool.submit(env.create_timeline, name, tenant_id) for name in branch_names]
        # ...then collect the results; result() re-raises any exception from the worker.
        for task in pending:
            task.result()
def test_timeline_init_break_before_checkpoint(neon_env_builder: NeonEnvBuilder):
    """
    Check that a timeline creation interrupted before its checkpoint leaves no traces.

    The "before-checkpoint-new-timeline" failpoint makes `timeline_create` fail. After an
    immediate pageserver restart, both the tenant's timeline list and the contents of its
    timelines directory must be the same as before the failed attempt.
    """
    env = neon_env_builder.init_start()
    pageserver_http = env.pageserver.http_client()

    env.pageserver.allowed_errors.extend(
        [
            ".*Failed to process timeline dir contents.*Timeline has no ancestor and no layer files.*",
            ".*Timeline got dropped without initializing, cleaning its files.*",
        ]
    )

    tenant_id = env.initial_tenant

    timelines_dir = env.pageserver.timeline_dir(tenant_id)
    old_tenant_timelines = env.neon_cli.timeline_list(tenant_id)
    # iterdir() yields entries in arbitrary order; sort both snapshots so that the
    # comparison below depends only on the set of entries, not on listing order.
    initial_timeline_dirs = sorted(timelines_dir.iterdir())

    # Introduce failpoint during timeline init (some intermediate files are on disk), before it's checkpointed.
    pageserver_http.configure_failpoints(("before-checkpoint-new-timeline", "return"))
    with pytest.raises(Exception, match="before-checkpoint-new-timeline"):
        _ = pageserver_http.timeline_create(PgVersion.NOT_SET, tenant_id, TimelineId.generate())

    # Restart the page server
    env.pageserver.restart(immediate=True)

    # Creating the timeline didn't finish. The other timelines on tenant should still be present and work normally.
    new_tenant_timelines = env.neon_cli.timeline_list(tenant_id)
    assert (
        new_tenant_timelines == old_tenant_timelines
    ), f"Pageserver after restart should ignore non-initialized timelines for tenant {tenant_id}"

    timeline_dirs = sorted(timelines_dir.iterdir())
    assert (
        timeline_dirs == initial_timeline_dirs
    ), "pageserver should clean its temp timeline files on timeline creation failure"
# The "exit" case is for a reproducer of issue 6007: an unclean shutdown where we can't do local fs cleanups
@pytest.mark.parametrize("exit_or_return", ["return", "exit"])
def test_timeline_init_break_before_checkpoint_recreate(
    neon_env_builder: NeonEnvBuilder, exit_or_return: str
):
    """
    Check that a timeline whose creation was interrupted can be created again with the same ID.

    The "before-checkpoint-new-timeline" failpoint is configured with the action given by
    `exit_or_return`, so the first `timeline_create` call fails. After an immediate
    pageserver restart the tenant's timeline list and timelines directory must be
    unchanged, and a second `timeline_create` with the same fixed timeline ID must succeed.
    """
    env = neon_env_builder.init_configs()
    env.start()
    pageserver_http = env.pageserver.http_client()

    env.pageserver.allowed_errors.extend(
        [
            ".*Failed to process timeline dir contents.*Timeline has no ancestor and no layer files.*",
            ".*Timeline got dropped without initializing, cleaning its files.*",
            ".*Failed to load index_part from remote storage, failed creation?.*",
        ]
    )

    env.create_tenant(env.initial_tenant)
    tenant_id = env.initial_tenant

    timelines_dir = env.pageserver.timeline_dir(tenant_id)
    old_tenant_timelines = env.neon_cli.timeline_list(tenant_id)
    # iterdir() yields entries in arbitrary order; sort both snapshots so that the
    # comparison below depends only on the set of entries, not on listing order.
    initial_timeline_dirs = sorted(timelines_dir.iterdir())

    # Some fixed timeline ID (like control plane does)
    timeline_id = TimelineId("1080243c1f76fe3c5147266663c9860b")

    # Introduce failpoint during timeline init (some intermediate files are on disk), before it's checkpointed.
    failpoint = "before-checkpoint-new-timeline"
    pattern = failpoint
    if exit_or_return == "exit":
        # in reality a read error happens, but there are automatic retries which now fail because pageserver is dead
        pattern = "Connection aborted."

    pageserver_http.configure_failpoints((failpoint, exit_or_return))
    with pytest.raises(Exception, match=pattern):
        _ = pageserver_http.timeline_create(env.pg_version, tenant_id, timeline_id)

    # Restart the page server (with the failpoint disabled)
    env.pageserver.restart(immediate=True)

    # Creating the timeline didn't finish. The other timelines on tenant should still be present and work normally.
    new_tenant_timelines = env.neon_cli.timeline_list(tenant_id)
    assert (
        new_tenant_timelines == old_tenant_timelines
    ), f"Pageserver after restart should ignore non-initialized timelines for tenant {tenant_id}"

    timeline_dirs = sorted(timelines_dir.iterdir())
    assert (
        timeline_dirs == initial_timeline_dirs
    ), "pageserver should clean its temp timeline files on timeline creation failure"

    # creating the branch should have worked now
    new_timeline_id = TimelineId(
        pageserver_http.timeline_create(env.pg_version, tenant_id, timeline_id)["timeline_id"]
    )

    assert timeline_id == new_timeline_id
def test_timeline_create_break_after_dir_creation(neon_env_builder: NeonEnvBuilder):
    """
    Check that a timeline creation failing right after its directory is created is cleaned up.

    The "after-timeline-dir-creation" failpoint makes `timeline_create` fail. Without
    restarting the pageserver, the tenant's timeline list and the contents of its
    timelines directory must be the same as before the failed attempt.
    """
    env = neon_env_builder.init_start()
    pageserver_http = env.pageserver.http_client()

    tenant_id = env.initial_tenant

    timelines_dir = env.pageserver.timeline_dir(tenant_id)
    old_tenant_timelines = env.neon_cli.timeline_list(tenant_id)
    # iterdir() yields entries in arbitrary order; sort both snapshots so that the
    # comparison below depends only on the set of entries, not on listing order.
    initial_timeline_dirs = sorted(timelines_dir.iterdir())

    # Introduce failpoint when creating a new timeline, right after creating its directory
    pageserver_http.configure_failpoints(("after-timeline-dir-creation", "return"))
    with pytest.raises(Exception, match="after-timeline-dir-creation"):
        _ = pageserver_http.timeline_create(PgVersion.NOT_SET, tenant_id, TimelineId.generate())

    # Creating the timeline didn't finish. The other timelines on tenant should still be present and work normally.
    # "New" timeline is not present in the list, allowing pageserver to retry the same request
    new_tenant_timelines = env.neon_cli.timeline_list(tenant_id)
    # No restart happens in this test, so the message must not claim one.
    assert (
        new_tenant_timelines == old_tenant_timelines
    ), f"Pageserver should not list non-initialized timelines for tenant {tenant_id}"

    timeline_dirs = sorted(timelines_dir.iterdir())
    assert (
        timeline_dirs == initial_timeline_dirs
    ), "pageserver should clean its temp timeline files on timeline creation failure"