mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-15 09:22:55 +00:00
For every Python test, we start the storage first and expect that later in the test, when we start a compute, it will work without specific timeline and tenant creation or their IDs specified. For that, we have a concept of a "default" branch that was created on the control plane level first, but that's not needed at all, given that it's only Python tests that need it: let them create the initial timeline during set-up. Before, the control plane started and stopped the pageserver for timeline creation; now the Python harness runs an extra tenant creation request on test env init. I had to adjust the metrics test: it turns out it registered the metrics from the default tenant after an extra pageserver restart. The new model does not send the metrics before the collection time happens, and that was 30s before.
61 lines
2.2 KiB
Python
61 lines
2.2 KiB
Python
import time
|
|
from contextlib import closing
|
|
|
|
from fixtures.log_helper import log
|
|
from fixtures.neon_fixtures import NeonEnvBuilder
|
|
|
|
|
|
#
|
|
# Test pageserver recovery after crash
|
|
#
|
|
def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder):
    """Check that data survives a pageserver crash and restart.

    The test fills a table with 100000 rows, arms failpoints on the
    pageserver, keeps updating the table until a statement fails, restarts
    the pageserver, and finally verifies that the row count is unchanged.
    """
    # A small checkpoint distance makes the checkpointer run more often.
    neon_env_builder.pageserver_config_override = "tenant_config={checkpoint_distance = 1048576}"

    env = neon_env_builder.init_start()
    env.pageserver.is_testing_enabled_or_skip()

    # An abrupt pageserver restart is expected to produce these warnings.
    for expected_warning in (
        ".*found future delta layer.*",
        ".*found future image layer.*",
    ):
        env.pageserver.allowed_errors.append(expected_warning)

    # Dedicated branch for this test, forked from "main".
    env.neon_cli.create_branch("test_pageserver_recovery", "main")

    endpoint = env.postgres.create_start("test_pageserver_recovery")
    log.info("postgres is running on 'test_pageserver_recovery' branch")

    with closing(endpoint.connect()) as connection, connection.cursor() as cursor:
        with env.pageserver.http_client() as ps_http:
            # Populate the table the final check counts.
            cursor.execute("CREATE TABLE foo(x bigint)")
            cursor.execute("INSERT INTO foo VALUES (generate_series(1,100000))")

            # Give the checkpointer some time to create image layers.
            time.sleep(2)

            # Arm the failpoints before generating more writes.
            failpoints = [
                ("flush-frozen-before-sync", "sleep(2000)"),
                ("checkpoint-after-sync", "exit"),
            ]
            ps_http.configure_failpoints(failpoints)

            # Keep updating until a statement fails, which is how the
            # pageserver crash shows up on this connection.
            try:
                while True:
                    cursor.execute("update foo set x=x+1")
            except Exception as err:
                log.info(f"Expected server crash {err}")

    log.info("Wait before server restart")
    env.pageserver.stop()
    env.pageserver.start()

    # Reconnect on a fresh connection and verify that no rows were lost.
    with closing(endpoint.connect()) as connection, connection.cursor() as cursor:
        cursor.execute("select count(*) from foo")
        row = cursor.fetchone()
        assert row == (100000,)
|