Files
neon/test_runner/batch_others/test_recovery.py
Heikki Linnakangas 51ea9c3053 Don't swallow panics when the pageserver is built with failpoints.
It's very confusing, and because you don't get a stack trace and error
message in the logs, it makes debugging very hard. However, the
'test_pageserver_recovery' test relied on that behavior. To support that,
add a new "exit" action to the pageserver 'failpoints' command, so that
you can explicitly request to exit the process when a failpoint is hit.
2022-05-16 09:58:58 +03:00

65 lines
2.3 KiB
Python

import os
import time
import psycopg2.extras
import json
from ast import Assert
from contextlib import closing
from fixtures.zenith_fixtures import ZenithEnvBuilder
from fixtures.log_helper import log
#
# Test pageserver recovery after crash
#
def test_pageserver_recovery(zenith_env_builder: ZenithEnvBuilder):
    """Check that table data is still readable after the pageserver restarts.

    Steps:
      1. Start an environment with one safekeeper and a reduced
         ``checkpoint_distance`` so the checkpointer runs more often.
      2. Require the pageserver to report the ``failpoints`` feature.
      3. Fill table ``foo`` with 100000 rows on a dedicated branch.
      4. Arm the ``checkpoint-before-sync`` / ``checkpoint-after-sync``
         failpoints and keep updating the table until a statement fails.
      5. Stop and start the pageserver, then assert the row count is
         still 100000.
    """
    branch_name = "test_pageserver_recovery"

    zenith_env_builder.num_safekeepers = 1
    # Override default checkpointer settings to run it more often
    zenith_env_builder.pageserver_config_override = "tenant_config={checkpoint_distance = 1048576}"
    env = zenith_env_builder.init()

    # The scenario only makes sense when failpoints are compiled in.
    enabled = env.zenith_cli.pageserver_enabled_features()
    assert "failpoints" in enabled["features"], "Build pageserver with --features=failpoints option to run this test"
    zenith_env_builder.start()

    # Work on a dedicated branch forked from "main".
    env.zenith_cli.create_branch(branch_name, "main")
    pg = env.postgres.create_start(branch_name)
    log.info("postgres is running on 'test_pageserver_recovery' branch")

    # NOTE(review): the returned value is never read below; the call is kept
    # only to preserve the original call sequence -- confirm it can be dropped.
    connstr = pg.connstr()

    with closing(pg.connect()) as pg_conn, pg_conn.cursor() as pg_cur:
        with closing(env.pageserver.connect()) as ps_conn, ps_conn.cursor(
                cursor_factory=psycopg2.extras.DictCursor) as ps_cur:
            # Create and populate the test table.
            pg_cur.execute("CREATE TABLE foo(x bigint)")
            pg_cur.execute("INSERT INTO foo VALUES (generate_series(1,100000))")

            # Give the checkpointer some time to create image layers.
            time.sleep(2)

            # Arm the failpoints: a sleep before the checkpoint sync and the
            # "exit" action after it (sleep argument presumably in
            # milliseconds -- TODO confirm).
            ps_cur.execute(
                "failpoints checkpoint-before-sync=sleep(2000);checkpoint-after-sync=exit")

            # Keep issuing updates until one of them fails, which is taken
            # as the sign that the pageserver has gone away.
            try:
                while True:
                    pg_cur.execute("update foo set x=x+1")
            except Exception as err:
                log.info(f"Expected server crash {err}")

    log.info("Wait before server restart")
    env.pageserver.stop()
    env.pageserver.start()

    # Reconnect and verify that no rows were lost across the restart.
    with closing(pg.connect()) as pg_conn, pg_conn.cursor() as pg_cur:
        pg_cur.execute("select count(*) from foo")
        row = pg_cur.fetchone()
        assert row == (100000, )