Files
neon/test_runner/batch_others/test_broken_timeline.py
Dmitry Rodionov 092a9b74d3 use only s3 in boto3-stubs and update mypy
Newer version of mypy fixes buggy error when trying to update only boto3 stubs.
However it brings new checks and starts to yell when we index into
cursor.fetchone without checking for None first. So this introduces a wrapper
to simplify querying for scalar values. I tried to use cursor_factory connection
argument but without success. There can be a better way to do that,
but this looks the simplest
2022-08-01 18:28:49 +03:00

115 lines
4.2 KiB
Python

from typing import List, Tuple
from uuid import UUID
import pytest
import concurrent.futures
from contextlib import closing
from fixtures.neon_fixtures import NeonEnvBuilder, NeonEnv, Postgres
from fixtures.log_helper import log
import os
from fixtures.utils import query_scalar
# Test restarting page server, while safekeeper and compute node keep
# running.
def test_broken_timeline(neon_env_builder: NeonEnvBuilder):
# One safekeeper is enough for this test.
neon_env_builder.num_safekeepers = 3
env = neon_env_builder.init_start()
tenant_timelines: List[Tuple[str, str, Postgres]] = []
for n in range(4):
tenant_id_uuid, timeline_id_uuid = env.neon_cli.create_tenant()
tenant_id = tenant_id_uuid.hex
timeline_id = timeline_id_uuid.hex
pg = env.postgres.create_start(f'main', tenant_id=tenant_id_uuid)
with pg.cursor() as cur:
cur.execute("CREATE TABLE t(key int primary key, value text)")
cur.execute("INSERT INTO t SELECT generate_series(1,100), 'payload'")
timeline_id = query_scalar(cur, "SHOW neon.timeline_id")
pg.stop()
tenant_timelines.append((tenant_id, timeline_id, pg))
# Stop the pageserver
env.pageserver.stop()
# Leave the first timeline alone, but corrupt the others in different ways
(tenant0, timeline0, pg0) = tenant_timelines[0]
# Corrupt metadata file on timeline 1
(tenant1, timeline1, pg1) = tenant_timelines[1]
metadata_path = "{}/tenants/{}/timelines/{}/metadata".format(env.repo_dir, tenant1, timeline1)
print(f'overwriting metadata file at {metadata_path}')
f = open(metadata_path, "w")
f.write("overwritten with garbage!")
f.close()
# Missing layer files file on timeline 2. (This would actually work
# if we had Cloud Storage enabled in this test.)
(tenant2, timeline2, pg2) = tenant_timelines[2]
timeline_path = "{}/tenants/{}/timelines/{}/".format(env.repo_dir, tenant2, timeline2)
for filename in os.listdir(timeline_path):
if filename.startswith('00000'):
# Looks like a layer file. Remove it
os.remove(f'{timeline_path}/{filename}')
# Corrupt layer files file on timeline 3
(tenant3, timeline3, pg3) = tenant_timelines[3]
timeline_path = "{}/tenants/{}/timelines/{}/".format(env.repo_dir, tenant3, timeline3)
for filename in os.listdir(timeline_path):
if filename.startswith('00000'):
# Looks like a layer file. Corrupt it
f = open(f'{timeline_path}/{filename}', "w")
f.write("overwritten with garbage!")
f.close()
env.pageserver.start()
# Tenant 0 should still work
pg0.start()
assert pg0.safe_psql("SELECT COUNT(*) FROM t")[0][0] == 100
# But all others are broken
for n in range(1, 4):
(tenant, timeline, pg) = tenant_timelines[n]
with pytest.raises(Exception, match="Cannot load local timeline") as err:
pg.start()
log.info(f'compute startup failed as expected: {err}')
def test_create_multiple_timelines_parallel(neon_simple_env: NeonEnv):
env = neon_simple_env
tenant_id, _ = env.neon_cli.create_tenant()
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
futures = [
executor.submit(env.neon_cli.create_timeline,
f"test-create-multiple-timelines-{i}",
tenant_id) for i in range(4)
]
for future in futures:
future.result()
def test_fix_broken_timelines_on_startup(neon_simple_env: NeonEnv):
env = neon_simple_env
tenant_id, _ = env.neon_cli.create_tenant()
# Introduce failpoint when creating a new timeline
env.pageserver.safe_psql(f"failpoints before-checkpoint-new-timeline=return")
with pytest.raises(Exception, match="before-checkpoint-new-timeline"):
_ = env.neon_cli.create_timeline("test_fix_broken_timelines", tenant_id)
# Restart the page server
env.neon_cli.pageserver_stop(immediate=True)
env.neon_cli.pageserver_start()
# Check that tenant with "broken" timeline is not loaded.
with pytest.raises(Exception, match=f"Failed to get repo for tenant {tenant_id.hex}"):
env.neon_cli.list_timelines(tenant_id)