Files
neon/test_runner/regress/test_recovery.py
Heikki Linnakangas 46d30bf054 Check for errors in pageserver log after each test.
If there are any unexpected ERRORs or WARNs in pageserver.log after a test
finishes, fail the test. This requires whitelisting the errors that *are*
expected in each test, and there are also a few common errors that are
printed by most tests, which are whitelisted in the fixture itself.

With this, we don't need the special abort() call in testing mode when
compaction or GC fails. Those failures will print ERRORs to the logs,
which will be picked up by this new mechanism.

A bunch of errors are currently whitelisted that we probably shouldn't
be emitting in the first place, but fixing those is out of scope for this
commit, so I just left FIXME comments on them.
2022-11-15 18:47:28 +02:00

63 lines
2.2 KiB
Python

import time
from contextlib import closing
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnvBuilder
#
# Test pageserver recovery after crash
#
def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder):
    """Check that data survives a pageserver crash and restart.

    The test loads a table, arms failpoints on the pageserver, keeps
    issuing updates until one of them fails, restarts the pageserver and
    finally verifies that the table still holds the expected row count.
    """
    branch_name = "test_pageserver_recovery"

    # Use a small checkpoint distance so the checkpointer runs more often.
    neon_env_builder.pageserver_config_override = "tenant_config={checkpoint_distance = 1048576}"
    env = neon_env_builder.init()
    env.pageserver.is_testing_enabled_or_skip()
    neon_env_builder.start()

    # An abrupt pageserver restart is expected to produce these warnings.
    for expected_warning in (
        ".*found future delta layer.*",
        ".*found future image layer.*",
    ):
        env.pageserver.allowed_errors.append(expected_warning)

    # Dedicated branch and compute node for this test.
    env.neon_cli.create_branch(branch_name, "main")
    endpoint = env.postgres.create_start(branch_name)
    log.info("postgres is running on 'test_pageserver_recovery' branch")

    with closing(endpoint.connect()) as pg_conn, pg_conn.cursor() as cursor:
        with env.pageserver.http_client() as ps_http:
            # Populate the test table.
            cursor.execute("CREATE TABLE foo(x bigint)")
            cursor.execute("INSERT INTO foo VALUES (generate_series(1,100000))")

            # Give the checkpointer some time to create image layers.
            time.sleep(2)

            # Arm the failpoints.
            failpoints = [
                ("flush-frozen-before-sync", "sleep(2000)"),
                ("checkpoint-after-sync", "exit"),
            ]
            ps_http.configure_failpoints(failpoints)

            # Keep updating until the pageserver crashes; the resulting
            # failure is the expected way out of this loop.
            try:
                while True:
                    cursor.execute("update foo set x=x+1")
            except Exception as crash_err:
                log.info(f"Expected server crash {crash_err}")

    log.info("Wait before server restart")
    env.pageserver.stop()
    env.pageserver.start()

    # The updates never add or remove rows, so the count must be unchanged.
    with closing(endpoint.connect()) as pg_conn, pg_conn.cursor() as cursor:
        cursor.execute("select count(*) from foo")
        row = cursor.fetchone()
        assert row == (100000,)