mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-03 19:42:55 +00:00
NB: effectively a no-op in the neon env since the handling is config gated in storcon ## Problem When a pageserver suffers from a local disk/node failure and restarts, the storage controller will receive a re-attach call and return all the tenants the pageserver is supposed to attach, but the pageserver will not act on any tenants that it doesn't know about locally. As a result, the pageserver will not rehydrate any tenants from remote storage if it restarted following a local disk loss, while the storage controller still thinks that the pageserver has all the tenants attached. This leaves the system in a bad state, and the symptom is that PG's pageserver connections will fail with "tenant not found" errors. ## Summary of changes Made a slight change to the storage controller's `re_attach` API: * The pageserver will set an additional bit `empty_local_disk` in the reattach request, indicating whether it has started with an empty disk or does not know about any tenants. * Upon receiving the reattach request, if this `empty_local_disk` bit is set, the storage controller will go ahead and clear all observed locations referencing the pageserver. The reconciler will then discover the discrepancy between the intended state and observed state of the tenant and take care of the situation. To facilitate rollouts this extra behavior in the `re_attach` API is guarded by the `handle_ps_local_disk_loss` command line flag of the storage controller. --------- Co-authored-by: William Huang <william.huang@databricks.com>
48 lines
2.0 KiB
Python
48 lines
2.0 KiB
Python
import shutil
|
|
|
|
from fixtures.neon_fixtures import NeonEnvBuilder
|
|
from fixtures.utils import query_scalar
|
|
|
|
|
|
def test_hcc_handling_ps_data_loss(
    neon_env_builder: NeonEnvBuilder,
):
    """
    Verify automatic recovery after a pageserver loses its local data.

    The pageserver is killed, its tenant directory is deleted, and it is started again. In its
    "reattach" request the pageserver tells the storage controller that it came up without any
    local tenant data; the storage controller uses that to detect the data loss and reconfigure
    the pageserver, so the tenant is rehydrated from remote storage with no manual intervention.
    """
    env = neon_env_builder.init_configs()

    # Start each service explicitly; the storage controller is started with
    # `handle_ps_local_disk_loss` enabled.
    env.broker.start()
    env.storage_controller.start(handle_ps_local_disk_loss=True)
    env.pageserver.start()
    for safekeeper in env.safekeepers:
        safekeeper.start()

    # Create a new tenant with four shards.
    tenant_id, _ = env.create_tenant(shard_count=4)

    # Write some data through a compute endpoint, then shut the endpoint down.
    endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
    with endpoint.cursor() as cursor:
        cursor.execute("SELECT pg_logical_emit_message(false, 'neon-test', 'between inserts')")
        cursor.execute("CREATE DATABASE testdb")

    with endpoint.cursor(dbname="testdb") as cursor:
        cursor.execute("CREATE TABLE tbl_one_hundred_rows AS SELECT generate_series(1,100)")
    endpoint.stop()

    # Simulate a pageserver that comes back with the same ID but without any of its local disk
    # data: kill it, delete the `tenants/` directory, and start it again.
    env.pageserver.stop(immediate=True)
    shutil.rmtree(env.pageserver.tenant_dir())
    env.pageserver.start()

    # The endpoint must be able to start and read the table back after the pageserver restart.
    # This indirectly checks that the pageserver automatically rehydrated the lost tenant data
    # from remote storage.
    endpoint.start()
    with endpoint.cursor(dbname="testdb") as cursor:
        row_count = query_scalar(cursor, "SELECT count(*) FROM tbl_one_hundred_rows")
        assert row_count == 100
|