neon/test_runner/regress/test_recovery.py
Kirill Bulatov 8712e1899e Move initial timeline creation into pytest (#3270)
For every Python test, we start the storage first and expect that when
the test later starts a compute, it will work without explicitly
creating a tenant and timeline or specifying their IDs.

For that, we had the concept of a "default" branch created first at the
control plane level, but that is not needed at all: only the Python
tests use it, so let them create the initial timeline during set-up.

Before, the control plane started and stopped the pageserver to create
the timeline; now the Python harness runs an extra tenant creation
request during test env init (a rough sketch of what this means for a
test follows the commit info below).

I had to adjust the metrics test: it turned out to register the metrics
from the default tenant only after an extra pageserver restart. The new
model does not send the metrics before the collection time arrives,
which was 30s before.
2023-01-05 17:48:27 +02:00
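
A minimal sketch of what the new harness-side set-up means for a test.
The fixture names mirror the ones used in test_recovery.py below; the
exact calls made inside NeonEnvBuilder.init_start() and the "main"
branch name are assumptions, not the fixture implementation:

# Hypothetical illustration: init_start() is assumed to issue the
# initial tenant/timeline creation request itself, so a test can start
# a compute without supplying any IDs.
from contextlib import closing

from fixtures.neon_fixtures import NeonEnvBuilder


def test_compute_without_explicit_ids(neon_env_builder: NeonEnvBuilder):
    env = neon_env_builder.init_start()  # initial tenant and timeline created here

    # "main" is assumed to be the branch created during set-up.
    pg = env.postgres.create_start("main")
    with closing(pg.connect()) as conn:
        with conn.cursor() as cur:
            cur.execute("SELECT 1")
            assert cur.fetchone() == (1,)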


import time
from contextlib import closing

from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnvBuilder


#
# Test pageserver recovery after crash
#
def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder):
    # Override default checkpointer settings to run it more often
    neon_env_builder.pageserver_config_override = "tenant_config={checkpoint_distance = 1048576}"

    env = neon_env_builder.init_start()
    env.pageserver.is_testing_enabled_or_skip()

    # These warnings are expected, when the pageserver is restarted abruptly
    env.pageserver.allowed_errors.append(".*found future delta layer.*")
    env.pageserver.allowed_errors.append(".*found future image layer.*")

    # Create a branch for us
    env.neon_cli.create_branch("test_pageserver_recovery", "main")

    pg = env.postgres.create_start("test_pageserver_recovery")
    log.info("postgres is running on 'test_pageserver_recovery' branch")

    with closing(pg.connect()) as conn:
        with conn.cursor() as cur:
            with env.pageserver.http_client() as pageserver_http:
                # Create and initialize test table
                cur.execute("CREATE TABLE foo(x bigint)")
                cur.execute("INSERT INTO foo VALUES (generate_series(1,100000))")

                # Sleep for some time to let checkpoint create image layers
                time.sleep(2)

                # Configure failpoints
                pageserver_http.configure_failpoints(
                    [
                        ("flush-frozen-before-sync", "sleep(2000)"),
                        ("checkpoint-after-sync", "exit"),
                    ]
                )

                # Do some updates until pageserver is crashed
                try:
                    while True:
                        cur.execute("update foo set x=x+1")
                except Exception as err:
                    log.info(f"Expected server crash {err}")

    log.info("Wait before server restart")
    env.pageserver.stop()
    env.pageserver.start()

    with closing(pg.connect()) as conn:
        with conn.cursor() as cur:
            cur.execute("select count(*) from foo")
            assert cur.fetchone() == (100000,)
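
For context on the failpoint actions used above: the pageserver's
failpoints appear to follow the Rust fail-crate action syntax, so
"sleep(2000)" pauses the code path for two seconds and "exit"
terminates the process when the failpoint is hit. As a hedged usage
sketch (the "off" action is assumed from that syntax), a test that
wants the fault injection only for part of its run could clear the
failpoints again through the same client:

# Hypothetical usage sketch, not part of the original test: clear the
# previously configured failpoints via the same HTTP client.
with env.pageserver.http_client() as pageserver_http:
    pageserver_http.configure_failpoints(
        [
            ("flush-frozen-before-sync", "off"),
            ("checkpoint-after-sync", "off"),
        ]
    )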