fix(walproposer): Do not restart on safekeepers reordering (#8840)

## Problem

Currently, we compare `neon.safekeepers` values as is, so we
unnecessarily restart walproposer even if safekeepers set didn't change.
This leads to errors like:
```log
FATAL:  [WP] restarting walproposer to change safekeeper list
from safekeeper-8.us-east-2.aws.neon.tech:6401,safekeeper-11.us-east-2.aws.neon.tech:6401,safekeeper-10.us-east-2.aws.neon.tech:6401
to safekeeper-11.us-east-2.aws.neon.tech:6401,safekeeper-8.us-east-2.aws.neon.tech:6401,safekeeper-10.us-east-2.aws.neon.tech:6401
```

## Summary of changes

Split the GUC into the list of individual safekeepers and properly
compare. We could've done that somewhere on the upper level, e.g.,
control plane, but I think it's still better when the actual config
consumer is smarter and doesn't rely on upper levels.
This commit is contained in:
Alexey Kondratov
2024-08-27 15:49:47 +02:00
committed by GitHub
parent 52cb33770b
commit 9b9f90c562
2 changed files with 111 additions and 3 deletions

View File

@@ -220,6 +220,64 @@ nwp_register_gucs(void)
NULL, NULL, NULL);
}
static int
split_safekeepers_list(char *safekeepers_list, char *safekeepers[])
{
int n_safekeepers = 0;
char *curr_sk = safekeepers_list;
for (char *coma = safekeepers_list; coma != NULL && *coma != '\0'; curr_sk = coma)
{
if (++n_safekeepers >= MAX_SAFEKEEPERS) {
wpg_log(FATAL, "too many safekeepers");
}
coma = strchr(coma, ',');
safekeepers[n_safekeepers-1] = curr_sk;
if (coma != NULL) {
*coma++ = '\0';
}
}
return n_safekeepers;
}
/*
* Accept two coma-separated strings with list of safekeeper host:port addresses.
* Split them into arrays and return false if two sets do not match, ignoring the order.
*/
static bool
safekeepers_cmp(char *old, char *new)
{
char *safekeepers_old[MAX_SAFEKEEPERS];
char *safekeepers_new[MAX_SAFEKEEPERS];
int len_old = 0;
int len_new = 0;
len_old = split_safekeepers_list(old, safekeepers_old);
len_new = split_safekeepers_list(new, safekeepers_new);
if (len_old != len_new)
{
return false;
}
qsort(&safekeepers_old, len_old, sizeof(char *), pg_qsort_strcmp);
qsort(&safekeepers_new, len_new, sizeof(char *), pg_qsort_strcmp);
for (int i = 0; i < len_new; i++)
{
if (strcmp(safekeepers_old[i], safekeepers_new[i]) != 0)
{
return false;
}
}
return true;
}
/*
* GUC assign_hook for neon.safekeepers. Restarts walproposer through FATAL if
* the list changed.
@@ -235,19 +293,26 @@ assign_neon_safekeepers(const char *newval, void *extra)
wpg_log(FATAL, "neon.safekeepers is empty");
}
/* Copy values because we will modify them in split_safekeepers_list() */
char *newval_copy = pstrdup(newval);
char *oldval = pstrdup(wal_acceptors_list);
/*
* TODO: restarting through FATAL is stupid and introduces 1s delay before
* next bgw start. We should refactor walproposer to allow graceful exit and
* thus remove this delay.
* XXX: If you change anything here, sync with test_safekeepers_reconfigure_reorder.
*/
if (strcmp(wal_acceptors_list, newval) != 0)
if (!safekeepers_cmp(oldval, newval_copy))
{
wpg_log(FATAL, "restarting walproposer to change safekeeper list from %s to %s",
wal_acceptors_list, newval);
}
pfree(newval_copy);
pfree(oldval);
}
/* Check if we need to suspend inserts because of lagging replication. */
/* Check if we need to suspend inserts because of lagging replication. */
static uint64
backpressure_lag_impl(void)
{

View File

@@ -1,6 +1,7 @@
import os
from contextlib import closing
from fixtures.neon_fixtures import NeonEnv
from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder
#
@@ -28,3 +29,45 @@ def test_config(neon_simple_env: NeonEnv):
# check that config change was applied
assert cur.fetchone() == ("debug1",)
#
# Test that reordering of safekeepers does not restart walproposer
#
def test_safekeepers_reconfigure_reorder(
neon_env_builder: NeonEnvBuilder,
):
neon_env_builder.num_safekeepers = 3
env = neon_env_builder.init_start()
env.neon_cli.create_branch("test_safekeepers_reconfigure_reorder")
endpoint = env.endpoints.create_start("test_safekeepers_reconfigure_reorder")
old_sks = ""
with closing(endpoint.connect()) as conn:
with conn.cursor() as cur:
cur.execute("SHOW neon.safekeepers")
res = cur.fetchone()
assert res is not None, "neon.safekeepers GUC is set"
old_sks = res[0]
# Reorder safekeepers
safekeepers = endpoint.active_safekeepers
safekeepers = safekeepers[1:] + safekeepers[:1]
endpoint.reconfigure(safekeepers=safekeepers)
with closing(endpoint.connect()) as conn:
with conn.cursor() as cur:
cur.execute("SHOW neon.safekeepers")
res = cur.fetchone()
assert res is not None, "neon.safekeepers GUC is set"
new_sks = res[0]
assert new_sks != old_sks, "GUC changes were applied"
log_path = os.path.join(endpoint.endpoint_path(), "compute.log")
with open(log_path, "r") as log_file:
logs = log_file.read()
# Check that walproposer was not restarted
assert "restarting walproposer" not in logs