From 9b9f90c562d8b78628b66e1441fea7036ed66fc8 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Tue, 27 Aug 2024 15:49:47 +0200 Subject: [PATCH] fix(walproposer): Do not restart on safekeepers reordering (#8840) ## Problem Currently, we compare `neon.safekeepers` values as is, so we unnecessarily restart walproposer even if safekeepers set didn't change. This leads to errors like: ```log FATAL: [WP] restarting walproposer to change safekeeper list from safekeeper-8.us-east-2.aws.neon.tech:6401,safekeeper-11.us-east-2.aws.neon.tech:6401,safekeeper-10.us-east-2.aws.neon.tech:6401 to safekeeper-11.us-east-2.aws.neon.tech:6401,safekeeper-8.us-east-2.aws.neon.tech:6401,safekeeper-10.us-east-2.aws.neon.tech:6401 ``` ## Summary of changes Split the GUC into the list of individual safekeepers and properly compare. We could've done that somewhere on the upper level, e.g., control plane, but I think it's still better when the actual config consumer is smarter and doesn't rely on upper levels. --- pgxn/neon/walproposer_pg.c | 69 +++++++++++++++++++++++++++++- test_runner/regress/test_config.py | 45 ++++++++++++++++++- 2 files changed, 111 insertions(+), 3 deletions(-) diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index f3ddc64061..65ef588ba5 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -220,6 +220,64 @@ nwp_register_gucs(void) NULL, NULL, NULL); } + +static int +split_safekeepers_list(char *safekeepers_list, char *safekeepers[]) +{ + int n_safekeepers = 0; + char *curr_sk = safekeepers_list; + + for (char *coma = safekeepers_list; coma != NULL && *coma != '\0'; curr_sk = coma) + { + if (++n_safekeepers >= MAX_SAFEKEEPERS) { + wpg_log(FATAL, "too many safekeepers"); + } + + coma = strchr(coma, ','); + safekeepers[n_safekeepers-1] = curr_sk; + + if (coma != NULL) { + *coma++ = '\0'; + } + } + + return n_safekeepers; +} + +/* + * Accept two coma-separated strings with list of safekeeper host:port addresses. + * Split them into arrays and return false if two sets do not match, ignoring the order. + */ +static bool +safekeepers_cmp(char *old, char *new) +{ + char *safekeepers_old[MAX_SAFEKEEPERS]; + char *safekeepers_new[MAX_SAFEKEEPERS]; + int len_old = 0; + int len_new = 0; + + len_old = split_safekeepers_list(old, safekeepers_old); + len_new = split_safekeepers_list(new, safekeepers_new); + + if (len_old != len_new) + { + return false; + } + + qsort(&safekeepers_old, len_old, sizeof(char *), pg_qsort_strcmp); + qsort(&safekeepers_new, len_new, sizeof(char *), pg_qsort_strcmp); + + for (int i = 0; i < len_new; i++) + { + if (strcmp(safekeepers_old[i], safekeepers_new[i]) != 0) + { + return false; + } + } + + return true; +} + /* * GUC assign_hook for neon.safekeepers. Restarts walproposer through FATAL if * the list changed. @@ -235,19 +293,26 @@ assign_neon_safekeepers(const char *newval, void *extra) wpg_log(FATAL, "neon.safekeepers is empty"); } + /* Copy values because we will modify them in split_safekeepers_list() */ + char *newval_copy = pstrdup(newval); + char *oldval = pstrdup(wal_acceptors_list); + /* * TODO: restarting through FATAL is stupid and introduces 1s delay before * next bgw start. We should refactor walproposer to allow graceful exit and * thus remove this delay. + * XXX: If you change anything here, sync with test_safekeepers_reconfigure_reorder. */ - if (strcmp(wal_acceptors_list, newval) != 0) + if (!safekeepers_cmp(oldval, newval_copy)) { wpg_log(FATAL, "restarting walproposer to change safekeeper list from %s to %s", wal_acceptors_list, newval); } + pfree(newval_copy); + pfree(oldval); } -/* Check if we need to suspend inserts because of lagging replication. */ +/* Check if we need to suspend inserts because of lagging replication. */ static uint64 backpressure_lag_impl(void) { diff --git a/test_runner/regress/test_config.py b/test_runner/regress/test_config.py index 4bb7df1e6a..2ef28eb94b 100644 --- a/test_runner/regress/test_config.py +++ b/test_runner/regress/test_config.py @@ -1,6 +1,7 @@ +import os from contextlib import closing -from fixtures.neon_fixtures import NeonEnv +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder # @@ -28,3 +29,45 @@ def test_config(neon_simple_env: NeonEnv): # check that config change was applied assert cur.fetchone() == ("debug1",) + + +# +# Test that reordering of safekeepers does not restart walproposer +# +def test_safekeepers_reconfigure_reorder( + neon_env_builder: NeonEnvBuilder, +): + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() + env.neon_cli.create_branch("test_safekeepers_reconfigure_reorder") + + endpoint = env.endpoints.create_start("test_safekeepers_reconfigure_reorder") + + old_sks = "" + with closing(endpoint.connect()) as conn: + with conn.cursor() as cur: + cur.execute("SHOW neon.safekeepers") + res = cur.fetchone() + assert res is not None, "neon.safekeepers GUC is set" + old_sks = res[0] + + # Reorder safekeepers + safekeepers = endpoint.active_safekeepers + safekeepers = safekeepers[1:] + safekeepers[:1] + + endpoint.reconfigure(safekeepers=safekeepers) + + with closing(endpoint.connect()) as conn: + with conn.cursor() as cur: + cur.execute("SHOW neon.safekeepers") + res = cur.fetchone() + assert res is not None, "neon.safekeepers GUC is set" + new_sks = res[0] + + assert new_sks != old_sks, "GUC changes were applied" + + log_path = os.path.join(endpoint.endpoint_path(), "compute.log") + with open(log_path, "r") as log_file: + logs = log_file.read() + # Check that walproposer was not restarted + assert "restarting walproposer" not in logs