From f54c3b96e08f78e56d1f82f8305994e9cc8f867f Mon Sep 17 00:00:00 2001
From: Arseny Sher
Date: Tue, 21 May 2024 14:42:25 +0300
Subject: [PATCH] Fix bugs in hot standby feedback propagation and add test for it.

Co-authored-by: Konstantin Knizhnik
---
 pgxn/neon/walproposer_pg.c              | 19 +++---
 safekeeper/src/send_wal.rs              |  9 ++-
 test_runner/regress/test_hot_standby.py | 88 +++++++++++++++++++++++++
 3 files changed, 104 insertions(+), 12 deletions(-)

diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c
index e5ef93b456..492a46fd54 100644
--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
@@ -1852,34 +1852,30 @@ static void
 CombineHotStanbyFeedbacks(HotStandbyFeedback *hs, WalProposer *wp)
 {
     hs->ts = 0;
-    hs->xmin.value = ~0; /* largest unsigned value */
-    hs->catalog_xmin.value = ~0; /* largest unsigned value */
+    hs->xmin = InvalidFullTransactionId;
+    hs->catalog_xmin = InvalidFullTransactionId;
 
     for (int i = 0; i < wp->n_safekeepers; i++)
     {
-        if (wp->safekeeper[i].appendResponse.hs.ts != 0)
+
+        if (wp->safekeeper[i].state == SS_ACTIVE)
         {
             HotStandbyFeedback *skhs = &wp->safekeeper[i].appendResponse.hs;
 
             if (FullTransactionIdIsNormal(skhs->xmin)
-                && FullTransactionIdPrecedes(skhs->xmin, hs->xmin))
+                && (!FullTransactionIdIsValid(hs->xmin) || FullTransactionIdPrecedes(skhs->xmin, hs->xmin)))
             {
                 hs->xmin = skhs->xmin;
                 hs->ts = skhs->ts;
             }
             if (FullTransactionIdIsNormal(skhs->catalog_xmin)
-                && FullTransactionIdPrecedes(skhs->catalog_xmin, hs->xmin))
+                && (!FullTransactionIdIsValid(hs->catalog_xmin) || FullTransactionIdPrecedes(skhs->catalog_xmin, hs->catalog_xmin)))
             {
                 hs->catalog_xmin = skhs->catalog_xmin;
                 hs->ts = skhs->ts;
             }
         }
     }
-
-    if (hs->xmin.value == ~0)
-        hs->xmin = InvalidFullTransactionId;
-    if (hs->catalog_xmin.value == ~0)
-        hs->catalog_xmin = InvalidFullTransactionId;
 }
 
 /*
@@ -1946,9 +1942,10 @@ walprop_pg_process_safekeeper_feedback(WalProposer *wp, Safekeeper *sk)
     }
 
     CombineHotStanbyFeedbacks(&hsFeedback, wp);
-    if (hsFeedback.ts != 0 && memcmp(&hsFeedback, &agg_hs_feedback, sizeof hsFeedback) != 0)
+    if (memcmp(&hsFeedback, &agg_hs_feedback, sizeof hsFeedback) != 0)
     {
         agg_hs_feedback = hsFeedback;
+        elog(DEBUG2, "ProcessStandbyHSFeedback(xmin=%u, catalog_xmin=%u)", XidFromFullTransactionId(hsFeedback.xmin), XidFromFullTransactionId(hsFeedback.catalog_xmin));
         ProcessStandbyHSFeedback(hsFeedback.ts,
                                  XidFromFullTransactionId(hsFeedback.xmin),
                                  EpochFromFullTransactionId(hsFeedback.xmin),
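
The CombineHotStanbyFeedbacks change above does two things: it stops using ~0 as a "no feedback yet" sentinel (starting from InvalidFullTransactionId and narrowing it only with normal xids reported by safekeepers in SS_ACTIVE state), and it fixes the copy-paste bug where catalog_xmin was compared against hs->xmin instead of hs->catalog_xmin. A minimal Python sketch of the resulting combining rule follows; it is an illustration only, with made-up names (combine_xmins, INVALID_FULL_XID, FIRST_NORMAL_XID), not the patch's C code.

# Illustration only: the xmin-combining rule, with invented helper names.
INVALID_FULL_XID = 0   # stands in for InvalidFullTransactionId
FIRST_NORMAL_XID = 3   # xids 0..2 are reserved in Postgres


def is_normal(full_xid: int) -> bool:
    # Mirrors FullTransactionIdIsNormal: the low 32 bits must be a normal xid.
    return (full_xid & 0xFFFF_FFFF) >= FIRST_NORMAL_XID


def combine_xmins(active_xmins: list[int]) -> int:
    """Oldest normal xmin among safekeepers in SS_ACTIVE state, or invalid."""
    combined = INVALID_FULL_XID
    for xmin in active_xmins:
        if is_normal(xmin) and (combined == INVALID_FULL_XID or xmin < combined):
            combined = xmin
    return combined


assert combine_xmins([]) == INVALID_FULL_XID   # no active feedback -> invalid, not ~0
assert combine_xmins([0, 1000, 250]) == 250    # the oldest normal xmin wins
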
diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs
index 4edd09a318..5a9745e1c9 100644
--- a/safekeeper/src/send_wal.rs
+++ b/safekeeper/src/send_wal.rs
@@ -756,8 +756,15 @@ impl ReplyReader {
             match msg.first().cloned() {
                 Some(HOT_STANDBY_FEEDBACK_TAG_BYTE) => {
                     // Note: deserializing is on m[1..] because we skip the tag byte.
-                    let hs_feedback = HotStandbyFeedback::des(&msg[1..])
+                    let mut hs_feedback = HotStandbyFeedback::des(&msg[1..])
                         .context("failed to deserialize HotStandbyFeedback")?;
+                    // Note: walreceiver.c serializes xmin/catalog_xmin as two
+                    // big-endian 32-bit words (pq_sendint32(xmin) followed by
+                    // pq_sendint32(xmin_epoch)), i.e. low-order word first, so
+                    // deserializing them as one big-endian u64 swaps the halves.
+                    hs_feedback.xmin = (hs_feedback.xmin >> 32) | (hs_feedback.xmin << 32);
+                    hs_feedback.catalog_xmin =
+                        (hs_feedback.catalog_xmin >> 32) | (hs_feedback.catalog_xmin << 32);
                     self.ws_guard
                         .walsenders
                         .record_hs_feedback(self.ws_guard.id, &hs_feedback);
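
Why the halves are swapped: per the comment above, walreceiver.c sends xmin first and xmin_epoch second, each as a big-endian 32-bit word. Assuming, as the swap implies, that HotStandbyFeedback::des reads those eight bytes as a single big-endian 64-bit value, the decoded number comes out as (xmin << 32) | epoch instead of the intended FullTransactionId (epoch << 32) | xmin, and exchanging the 32-bit halves restores it. A self-contained Python sketch of that, with example values only:

import struct

xmin, epoch = 0x000004D2, 0x00000001  # example values only

# What walreceiver.c puts on the wire: two big-endian 32-bit words, xmin first.
wire = struct.pack(">II", xmin, epoch)

# Decoding them as one big-endian 64-bit value yields the halves in the wrong order.
(as_u64,) = struct.unpack(">Q", wire)
assert as_u64 == (xmin << 32) | epoch

# The swap applied in send_wal.rs restores the intended (epoch << 32) | xmin.
swapped = ((as_u64 >> 32) | (as_u64 << 32)) & 0xFFFF_FFFF_FFFF_FFFF
assert swapped == (epoch << 32) | xmin
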
diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py
index 31f436cb4c..244d482c18 100644
--- a/test_runner/regress/test_hot_standby.py
+++ b/test_runner/regress/test_hot_standby.py
@@ -204,3 +204,91 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool):
             log_replica_lag(primary, secondary)
             res = s_cur.fetchone()
             assert res[0] == 10000
+
+
+def run_pgbench(connstr: str, pg_bin: PgBin):
+    log.info(f"Start a pgbench workload on pg {connstr}")
+    # s10 is about 150MB of data. In debug mode init takes about 15s on SSD.
+    pg_bin.run_capture(["pgbench", "-i", "-s10", connstr])
+    log.info("pgbench init done")
+    pg_bin.run_capture(["pgbench", "-T60", connstr])
+
+
+# Assert that pgbench_accounts and its index are created.
+def pgbench_accounts_initialized(ep):
+    ep.safe_psql_scalar("select 'pgbench_accounts_pkey'::regclass")
+
+
+# Test that hot_standby_feedback works in neon (it is forwarded through
+# safekeepers). That is, ensure queries on the standby don't fail during load
+# on the primary under the following conditions:
+# - pgbench bombards the primary with updates.
+# - On the secondary we run a long select over the updated table.
+# - Set a small max_standby_streaming_delay: hs feedback should prevent conflicts,
+#   so apply doesn't need to wait.
+# - Do aggressive vacuum on the primary, which still shouldn't create conflicts.
+#   Actually this appears to be redundant due to microvacuum existence.
+#
+# Without hs feedback enabled we'd see 'User query might have needed to see row
+# versions that must be removed.' errors.
+def test_hot_standby_feedback(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
+    env = neon_env_builder.init_start()
+    aggressive_vacuum_conf = [
+        "log_autovacuum_min_duration = 0",
+        "autovacuum_naptime = 10s",
+        "autovacuum_vacuum_threshold = 25",
+        "autovacuum_vacuum_scale_factor = 0.1",
+        "autovacuum_vacuum_cost_delay = -1",
+    ]
+    with env.endpoints.create_start(
+        branch_name="main", endpoint_id="primary", config_lines=aggressive_vacuum_conf
+    ) as primary:
+        # It would be better to use the stricter max_standby_streaming_delay=0s here,
+        # but then it sometimes fails with 'User was holding shared buffer pin for too long.'.
+        with env.endpoints.new_replica_start(
+            origin=primary,
+            endpoint_id="secondary",
+            config_lines=[
+                "max_standby_streaming_delay=2s",
+                "neon.protocol_version=2",
+                "hot_standby_feedback=true",
+            ],
+        ) as secondary:
+            log.info(
+                f"primary connstr is {primary.connstr()}, secondary connstr {secondary.connstr()}"
+            )
+            t = threading.Thread(target=run_pgbench, args=(primary.connstr(), pg_bin))
+            t.start()
+            # Wait until pgbench_accounts is created + filled on the replica *and*
+            # its index is created. Otherwise index creation would conflict with
+            # read queries and hs feedback won't save us.
+            wait_until(60, 1.0, partial(pgbench_accounts_initialized, secondary))
+
+            # The test should fail if hs feedback is disabled anyway, but cross
+            # check that walproposer sets some xmin.
+            def xmin_is_not_null():
+                slot_xmin = primary.safe_psql_scalar(
+                    "select xmin from pg_replication_slots where slot_name = 'wal_proposer_slot'",
+                    log_query=False,
+                )
+                log.info(f"xmin is {slot_xmin}")
+                assert int(slot_xmin) > 0
+
+            wait_until(10, 1.0, xmin_is_not_null)
+            for _ in range(1, 5):
+                # in debug mode each select takes about 5-7s
+                balance = secondary.safe_psql_scalar("select sum(abalance) from pgbench_accounts")
+                log.info(f"balance={balance}")
+                log_replica_lag(primary, secondary)
+            t.join()
+
+        # Check that xmin is reset when the standby is gone.
+        def xmin_is_null():
+            slot_xmin = primary.safe_psql_scalar(
+                "select xmin from pg_replication_slots where slot_name = 'wal_proposer_slot'",
+                log_query=False,
+            )
+            log.info(f"xmin is {slot_xmin}")
+            assert slot_xmin is None
+
+        wait_until(10, 1.0, xmin_is_null)
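
One small possible cleanup for a follow-up, sketch only and not part of the patch: the two waits in the test poll the same pg_replication_slots query, so a shared helper could carry it, reusing the test's own safe_psql_scalar and wait_until calls. The names slot_xmin, xmin_is_set and xmin_is_cleared below are made up for the sketch.

def slot_xmin(primary):
    # xmin of the slot the walproposer keeps on the primary; None once no
    # standby feedback is being held anymore.
    return primary.safe_psql_scalar(
        "select xmin from pg_replication_slots where slot_name = 'wal_proposer_slot'",
        log_query=False,
    )


def xmin_is_set():
    assert int(slot_xmin(primary)) > 0


def xmin_is_cleared():
    assert slot_xmin(primary) is None


wait_until(10, 1.0, xmin_is_set)      # while the replica is running
wait_until(10, 1.0, xmin_is_cleared)  # after the replica has been stopped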