mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-23 08:00:37 +00:00
storcon: do not update observed state on node activation (#11155)
## Problem When a node becomes active, we query its locations and update the observed state in-place. This can race with the observed state updates done when processing reconcile results. ## Summary of changes The argument for this reconciliation step is that it reduces the need for background reconciliations. I don't think this is actually true anymore. There are two cases. 1. Restart of node after drain. Usually the node does not go through the offline state here, so observed locations were not marked as none. In any case, there should be a handful of shards max on the node since we've just drained it. 2. Node comes back online after failure or network partition. When the node is marked offline, we reschedule everything away from it. When it later becomes active, the previous observed location is extraneous and requires a reconciliation anyway. Closes https://github.com/neondatabase/neon/issues/11148
This commit is contained in:
@@ -1749,18 +1749,23 @@ def test_storage_controller_re_attach(neon_env_builder: NeonEnvBuilder):
|
||||
# Restart the failed pageserver
|
||||
victim_ps.start()
|
||||
|
||||
env.storage_controller.reconcile_until_idle()
|
||||
|
||||
# We expect that the re-attach call correctly tipped off the pageserver that its locations
|
||||
# are all secondaries now.
|
||||
locations = victim_ps.http_client().tenant_list_locations()["tenant_shards"]
|
||||
assert len(locations) == 2
|
||||
assert all(loc[1]["mode"] == "Secondary" for loc in locations)
|
||||
|
||||
# We expect that this situation resulted from the re_attach call, and not any explicit
|
||||
# Reconciler runs: assert that the reconciliation count has not gone up since we restarted.
|
||||
# We expect that this situation resulted from background reconciliations
|
||||
# Reconciler runs: assert that the reconciliation count has gone up by exactly
|
||||
# one for each shard
|
||||
reconciles_after_restart = env.storage_controller.get_metric_value(
|
||||
"storage_controller_reconcile_complete_total", filter={"status": "ok"}
|
||||
)
|
||||
assert reconciles_after_restart == reconciles_before_restart
|
||||
|
||||
assert reconciles_before_restart is not None
|
||||
assert reconciles_after_restart == reconciles_before_restart + 2
|
||||
|
||||
|
||||
def test_storage_controller_shard_scheduling_policy(neon_env_builder: NeonEnvBuilder):
|
||||
|
||||
@@ -436,7 +436,7 @@ def test_single_branch_get_tenant_size_grows(
|
||||
# when our tenant is configured with a tiny pitr interval, dropping a table should
|
||||
# cause synthetic size to go down immediately
|
||||
tenant_config["pitr_interval"] = "0s"
|
||||
env.pageserver.http_client().set_tenant_config(tenant_id, tenant_config)
|
||||
env.storage_controller.pageserver_api().set_tenant_config(tenant_id, tenant_config)
|
||||
(current_lsn, size) = get_current_consistent_size(
|
||||
env, endpoint, size_debug_file, http_client, tenant_id, timeline_id
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user