Files
neon/test_runner/regress/test_recovery.py
Christian Schwarz 6d951e69d6 test_suite: patch, don't replace, the tenant_config field, where appropriate (#7771)
Before this PR, the changed tests would overwrite the entire
`tenant_config` because `pageserver_config_override` is merged
non-recursively into the `ps_cfg`.

This meant they would override the
`PAGESERVER_DEFAULT_TENANT_CONFIG_COMPACTION_ALGORITHM`, impacting our
matrix build for `compaction_algorithm=Tiered|Legacy` in
https://github.com/neondatabase/neon/pull/7748.

I found the tests fixed in this PR using the
`NEON_PAGESERVER_PANIC_ON_UNSPECIFIED_COMPACTION_ALGORITHM` env var that
I added in #7748. Therefore, I think this is an exhaustive fix. This is
better than just searching the code base for `tenant_config`, which is
what I had sketched in #7747.

refs #7749
2024-05-17 12:24:02 +02:00

65 lines
2.2 KiB
Python

import time
from contextlib import closing
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnvBuilder
#
# Test pageserver recovery after crash
#
def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder):
# Override default checkpointer settings to run it more often
env = neon_env_builder.init_start(
initial_tenant_conf={
"checkpoint_distance": "1048576",
}
)
env.pageserver.is_testing_enabled_or_skip()
# We expect the pageserver to exit, which will cause storage storage controller
# requests to fail and warn.
env.storage_controller.allowed_errors.append(".*management API still failed.*")
env.storage_controller.allowed_errors.append(
".*Reconcile error.*error sending request for url.*"
)
# Create a branch for us
env.neon_cli.create_branch("test_pageserver_recovery", "main")
endpoint = env.endpoints.create_start("test_pageserver_recovery")
with closing(endpoint.connect()) as conn:
with conn.cursor() as cur:
with env.pageserver.http_client() as pageserver_http:
# Create and initialize test table
cur.execute("CREATE TABLE foo(x bigint)")
cur.execute("INSERT INTO foo VALUES (generate_series(1,100000))")
# Sleep for some time to let checkpoint create image layers
time.sleep(2)
# Configure failpoints
pageserver_http.configure_failpoints(
[
("flush-frozen-pausable", "sleep(2000)"),
("flush-frozen-exit", "exit"),
]
)
# Do some updates until pageserver is crashed
try:
while True:
cur.execute("update foo set x=x+1")
except Exception as err:
log.info(f"Expected server crash {err}")
log.info("Wait before server restart")
env.pageserver.stop()
env.pageserver.start()
with closing(endpoint.connect()) as conn:
with conn.cursor() as cur:
cur.execute("select count(*) from foo")
assert cur.fetchone() == (100000,)