mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-08 05:52:55 +00:00
pageserver: implement emergency mode for operating without control plane (#5469)
## Problem Pageservers with `control_plane_api` configured require a control plane to start up: in an incident this might be a problem. ## Summary of changes Note to reviewers: most of the code churn in mgr.rs is the refactor commit that enables the later emergency mode commit: you may want to review commits separately. - Add `control_plane_emergency_mode` configuration property - Refactor init_tenant_mgr to separate loading configurations from the main loop where we construct Tenant, so that the generations fetch can peek at the configs in emergency mode. - During startup, in emergency mode, attach any tenants that were attached on their last run, using the same generation number. Closes: #5381 Closes: https://github.com/neondatabase/neon/issues/5492
This commit is contained in:
@@ -81,7 +81,7 @@ def generate_uploads_and_deletions(
|
||||
f"""
|
||||
INSERT INTO foo (id, val)
|
||||
SELECT g, '{data}'
|
||||
FROM generate_series(1, 20000) g
|
||||
FROM generate_series(1, 200) g
|
||||
ON CONFLICT (id) DO UPDATE
|
||||
SET val = EXCLUDED.val
|
||||
""",
|
||||
@@ -378,3 +378,73 @@ def test_deletion_queue_recovery(
|
||||
|
||||
assert get_deletion_queue_unexpected_errors(ps_http) == 0
|
||||
assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0
|
||||
|
||||
|
||||
def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
|
||||
neon_env_builder.enable_generations = True
|
||||
neon_env_builder.enable_pageserver_remote_storage(
|
||||
RemoteStorageKind.MOCK_S3,
|
||||
)
|
||||
env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
|
||||
|
||||
ps_http = env.pageserver.http_client()
|
||||
|
||||
generate_uploads_and_deletions(env)
|
||||
|
||||
env.pageserver.allowed_errors.extend(
|
||||
[
|
||||
# When the pageserver can't reach the control plane, it will complain
|
||||
".*calling control plane generation validation API failed.*",
|
||||
# Emergency mode is a big deal, we log errors whenever it is used.
|
||||
".*Emergency mode!.*",
|
||||
]
|
||||
)
|
||||
|
||||
# Simulate a major incident: the control plane goes offline
|
||||
assert env.attachment_service is not None
|
||||
env.attachment_service.stop()
|
||||
|
||||
# Remember how many validations had happened before the control plane went offline
|
||||
validated = get_deletion_queue_validated(ps_http)
|
||||
|
||||
generate_uploads_and_deletions(env, init=False)
|
||||
|
||||
# The running pageserver should stop progressing deletions
|
||||
time.sleep(10)
|
||||
assert get_deletion_queue_validated(ps_http) == validated
|
||||
|
||||
# Restart the pageserver: ordinarily we would _avoid_ doing this during such an
|
||||
# incident, but it might be unavoidable: if so, we want to be able to start up
|
||||
# and serve clients.
|
||||
env.pageserver.stop() # Non-immediate: implicitly checking that shutdown doesn't hang waiting for CP
|
||||
env.pageserver.start(
|
||||
overrides=("--pageserver-config-override=control_plane_emergency_mode=true",)
|
||||
)
|
||||
|
||||
# The pageserver should provide service to clients
|
||||
generate_uploads_and_deletions(env, init=False)
|
||||
|
||||
# The pageserver should neither validate nor execute any deletions, it should have
|
||||
# loaded the DeletionLists from before though
|
||||
time.sleep(10)
|
||||
assert get_deletion_queue_depth(ps_http) > 0
|
||||
assert get_deletion_queue_validated(ps_http) == 0
|
||||
assert get_deletion_queue_executed(ps_http) == 0
|
||||
|
||||
# When the control plane comes back up, normal service should resume
|
||||
env.attachment_service.start()
|
||||
|
||||
ps_http.deletion_queue_flush(execute=True)
|
||||
assert get_deletion_queue_depth(ps_http) == 0
|
||||
assert get_deletion_queue_validated(ps_http) > 0
|
||||
assert get_deletion_queue_executed(ps_http) > 0
|
||||
|
||||
# The pageserver should work fine when subsequently restarted in non-emergency mode
|
||||
env.pageserver.stop() # Non-immediate: implicitly checking that shutdown doesn't hang waiting for CP
|
||||
env.pageserver.start()
|
||||
|
||||
generate_uploads_and_deletions(env, init=False)
|
||||
ps_http.deletion_queue_flush(execute=True)
|
||||
assert get_deletion_queue_depth(ps_http) == 0
|
||||
assert get_deletion_queue_validated(ps_http) > 0
|
||||
assert get_deletion_queue_executed(ps_http) > 0
|
||||
|
||||
Reference in New Issue
Block a user