mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-31 20:10:38 +00:00
fix(walproposer): Do not restart on safekeepers reordering (#8840)
## Problem Currently, we compare `neon.safekeepers` values as is, so we unnecessarily restart walproposer even if safekeepers set didn't change. This leads to errors like: ```log FATAL: [WP] restarting walproposer to change safekeeper list from safekeeper-8.us-east-2.aws.neon.tech:6401,safekeeper-11.us-east-2.aws.neon.tech:6401,safekeeper-10.us-east-2.aws.neon.tech:6401 to safekeeper-11.us-east-2.aws.neon.tech:6401,safekeeper-8.us-east-2.aws.neon.tech:6401,safekeeper-10.us-east-2.aws.neon.tech:6401 ``` ## Summary of changes Split the GUC into the list of individual safekeepers and properly compare. We could've done that somewhere on the upper level, e.g., control plane, but I think it's still better when the actual config consumer is smarter and doesn't rely on upper levels.
This commit is contained in:
@@ -220,6 +220,64 @@ nwp_register_gucs(void)
|
||||
NULL, NULL, NULL);
|
||||
}
|
||||
|
||||
|
||||
static int
|
||||
split_safekeepers_list(char *safekeepers_list, char *safekeepers[])
|
||||
{
|
||||
int n_safekeepers = 0;
|
||||
char *curr_sk = safekeepers_list;
|
||||
|
||||
for (char *coma = safekeepers_list; coma != NULL && *coma != '\0'; curr_sk = coma)
|
||||
{
|
||||
if (++n_safekeepers >= MAX_SAFEKEEPERS) {
|
||||
wpg_log(FATAL, "too many safekeepers");
|
||||
}
|
||||
|
||||
coma = strchr(coma, ',');
|
||||
safekeepers[n_safekeepers-1] = curr_sk;
|
||||
|
||||
if (coma != NULL) {
|
||||
*coma++ = '\0';
|
||||
}
|
||||
}
|
||||
|
||||
return n_safekeepers;
|
||||
}
|
||||
|
||||
/*
|
||||
* Accept two coma-separated strings with list of safekeeper host:port addresses.
|
||||
* Split them into arrays and return false if two sets do not match, ignoring the order.
|
||||
*/
|
||||
static bool
|
||||
safekeepers_cmp(char *old, char *new)
|
||||
{
|
||||
char *safekeepers_old[MAX_SAFEKEEPERS];
|
||||
char *safekeepers_new[MAX_SAFEKEEPERS];
|
||||
int len_old = 0;
|
||||
int len_new = 0;
|
||||
|
||||
len_old = split_safekeepers_list(old, safekeepers_old);
|
||||
len_new = split_safekeepers_list(new, safekeepers_new);
|
||||
|
||||
if (len_old != len_new)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
qsort(&safekeepers_old, len_old, sizeof(char *), pg_qsort_strcmp);
|
||||
qsort(&safekeepers_new, len_new, sizeof(char *), pg_qsort_strcmp);
|
||||
|
||||
for (int i = 0; i < len_new; i++)
|
||||
{
|
||||
if (strcmp(safekeepers_old[i], safekeepers_new[i]) != 0)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* GUC assign_hook for neon.safekeepers. Restarts walproposer through FATAL if
|
||||
* the list changed.
|
||||
@@ -235,19 +293,26 @@ assign_neon_safekeepers(const char *newval, void *extra)
|
||||
wpg_log(FATAL, "neon.safekeepers is empty");
|
||||
}
|
||||
|
||||
/* Copy values because we will modify them in split_safekeepers_list() */
|
||||
char *newval_copy = pstrdup(newval);
|
||||
char *oldval = pstrdup(wal_acceptors_list);
|
||||
|
||||
/*
|
||||
* TODO: restarting through FATAL is stupid and introduces 1s delay before
|
||||
* next bgw start. We should refactor walproposer to allow graceful exit and
|
||||
* thus remove this delay.
|
||||
* XXX: If you change anything here, sync with test_safekeepers_reconfigure_reorder.
|
||||
*/
|
||||
if (strcmp(wal_acceptors_list, newval) != 0)
|
||||
if (!safekeepers_cmp(oldval, newval_copy))
|
||||
{
|
||||
wpg_log(FATAL, "restarting walproposer to change safekeeper list from %s to %s",
|
||||
wal_acceptors_list, newval);
|
||||
}
|
||||
pfree(newval_copy);
|
||||
pfree(oldval);
|
||||
}
|
||||
|
||||
/* Check if we need to suspend inserts because of lagging replication. */
|
||||
/* Check if we need to suspend inserts because of lagging replication. */
|
||||
static uint64
|
||||
backpressure_lag_impl(void)
|
||||
{
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import os
|
||||
from contextlib import closing
|
||||
|
||||
from fixtures.neon_fixtures import NeonEnv
|
||||
from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder
|
||||
|
||||
|
||||
#
|
||||
@@ -28,3 +29,45 @@ def test_config(neon_simple_env: NeonEnv):
|
||||
|
||||
# check that config change was applied
|
||||
assert cur.fetchone() == ("debug1",)
|
||||
|
||||
|
||||
#
|
||||
# Test that reordering of safekeepers does not restart walproposer
|
||||
#
|
||||
def test_safekeepers_reconfigure_reorder(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
):
|
||||
neon_env_builder.num_safekeepers = 3
|
||||
env = neon_env_builder.init_start()
|
||||
env.neon_cli.create_branch("test_safekeepers_reconfigure_reorder")
|
||||
|
||||
endpoint = env.endpoints.create_start("test_safekeepers_reconfigure_reorder")
|
||||
|
||||
old_sks = ""
|
||||
with closing(endpoint.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("SHOW neon.safekeepers")
|
||||
res = cur.fetchone()
|
||||
assert res is not None, "neon.safekeepers GUC is set"
|
||||
old_sks = res[0]
|
||||
|
||||
# Reorder safekeepers
|
||||
safekeepers = endpoint.active_safekeepers
|
||||
safekeepers = safekeepers[1:] + safekeepers[:1]
|
||||
|
||||
endpoint.reconfigure(safekeepers=safekeepers)
|
||||
|
||||
with closing(endpoint.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("SHOW neon.safekeepers")
|
||||
res = cur.fetchone()
|
||||
assert res is not None, "neon.safekeepers GUC is set"
|
||||
new_sks = res[0]
|
||||
|
||||
assert new_sks != old_sks, "GUC changes were applied"
|
||||
|
||||
log_path = os.path.join(endpoint.endpoint_path(), "compute.log")
|
||||
with open(log_path, "r") as log_file:
|
||||
logs = log_file.read()
|
||||
# Check that walproposer was not restarted
|
||||
assert "restarting walproposer" not in logs
|
||||
|
||||
Reference in New Issue
Block a user