mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-08 05:52:55 +00:00
Before this PR, the changed tests would overwrite the entire `tenant_config` because `pageserver_config_override` is merged non-recursively into the `ps_cfg`. This meant they would override the `PAGESERVER_DEFAULT_TENANT_CONFIG_COMPACTION_ALGORITHM`, impacting our matrix build for `compaction_algorithm=Tiered|Legacy` in https://github.com/neondatabase/neon/pull/7748. I found the tests fixed in this PR using the `NEON_PAGESERVER_PANIC_ON_UNSPECIFIED_COMPACTION_ALGORITHM` env var that I added in #7748. Therefore, I think this is an exhaustive fix. This is better than just searching the code base for `tenant_config`, which is what I had sketched in #7747. refs #7749
65 lines
2.2 KiB
Python
65 lines
2.2 KiB
Python
import time
|
|
from contextlib import closing
|
|
|
|
from fixtures.log_helper import log
|
|
from fixtures.neon_fixtures import NeonEnvBuilder
|
|
|
|
|
|
#
|
|
# Test pageserver recovery after crash
|
|
#
|
|
def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder):
|
|
# Override default checkpointer settings to run it more often
|
|
env = neon_env_builder.init_start(
|
|
initial_tenant_conf={
|
|
"checkpoint_distance": "1048576",
|
|
}
|
|
)
|
|
env.pageserver.is_testing_enabled_or_skip()
|
|
|
|
# We expect the pageserver to exit, which will cause storage storage controller
|
|
# requests to fail and warn.
|
|
env.storage_controller.allowed_errors.append(".*management API still failed.*")
|
|
env.storage_controller.allowed_errors.append(
|
|
".*Reconcile error.*error sending request for url.*"
|
|
)
|
|
|
|
# Create a branch for us
|
|
env.neon_cli.create_branch("test_pageserver_recovery", "main")
|
|
|
|
endpoint = env.endpoints.create_start("test_pageserver_recovery")
|
|
|
|
with closing(endpoint.connect()) as conn:
|
|
with conn.cursor() as cur:
|
|
with env.pageserver.http_client() as pageserver_http:
|
|
# Create and initialize test table
|
|
cur.execute("CREATE TABLE foo(x bigint)")
|
|
cur.execute("INSERT INTO foo VALUES (generate_series(1,100000))")
|
|
|
|
# Sleep for some time to let checkpoint create image layers
|
|
time.sleep(2)
|
|
|
|
# Configure failpoints
|
|
pageserver_http.configure_failpoints(
|
|
[
|
|
("flush-frozen-pausable", "sleep(2000)"),
|
|
("flush-frozen-exit", "exit"),
|
|
]
|
|
)
|
|
|
|
# Do some updates until pageserver is crashed
|
|
try:
|
|
while True:
|
|
cur.execute("update foo set x=x+1")
|
|
except Exception as err:
|
|
log.info(f"Expected server crash {err}")
|
|
|
|
log.info("Wait before server restart")
|
|
env.pageserver.stop()
|
|
env.pageserver.start()
|
|
|
|
with closing(endpoint.connect()) as conn:
|
|
with conn.cursor() as cur:
|
|
cur.execute("select count(*) from foo")
|
|
assert cur.fetchone() == (100000,)
|