diff --git a/libs/walproposer/src/api_bindings.rs b/libs/walproposer/src/api_bindings.rs index d660602149..4d6cbae9a9 100644 --- a/libs/walproposer/src/api_bindings.rs +++ b/libs/walproposer/src/api_bindings.rs @@ -439,6 +439,7 @@ pub fn empty_shmem() -> crate::bindings::WalproposerShmemState { currentClusterSize: crate::bindings::pg_atomic_uint64 { value: 0 }, shard_ps_feedback: [empty_feedback; 128], num_shards: 0, + replica_promote: false, min_ps_feedback: empty_feedback, } } diff --git a/pgxn/neon/neon_pgversioncompat.c b/pgxn/neon/neon_pgversioncompat.c index 7c404fb5a9..6f57b618da 100644 --- a/pgxn/neon/neon_pgversioncompat.c +++ b/pgxn/neon/neon_pgversioncompat.c @@ -5,6 +5,7 @@ #include "funcapi.h" #include "miscadmin.h" +#include "access/xlog.h" #include "utils/tuplestore.h" #include "neon_pgversioncompat.h" @@ -41,5 +42,12 @@ InitMaterializedSRF(FunctionCallInfo fcinfo, bits32 flags) rsinfo->setDesc = stored_tupdesc; MemoryContextSwitchTo(old_context); } + +TimeLineID GetWALInsertionTimeLine(void) +{ + return ThisTimeLineID + 1; +} + + #endif diff --git a/pgxn/neon/neon_pgversioncompat.h b/pgxn/neon/neon_pgversioncompat.h index bf91a02b45..787bd552f8 100644 --- a/pgxn/neon/neon_pgversioncompat.h +++ b/pgxn/neon/neon_pgversioncompat.h @@ -162,6 +162,7 @@ InitBufferTag(BufferTag *tag, const RelFileNode *rnode, #if PG_MAJORVERSION_NUM < 15 extern void InitMaterializedSRF(FunctionCallInfo fcinfo, bits32 flags); +extern TimeLineID GetWALInsertionTimeLine(void); #endif #endif /* NEON_PGVERSIONCOMPAT_H */ diff --git a/pgxn/neon/neon_walreader.c b/pgxn/neon/neon_walreader.c index d5e3a38dbb..0a1f6d9c72 100644 --- a/pgxn/neon/neon_walreader.c +++ b/pgxn/neon/neon_walreader.c @@ -69,6 +69,7 @@ struct NeonWALReader WALSegmentContext segcxt; WALOpenSegment seg; int wre_errno; + TimeLineID local_active_tlid; /* Explains failure to read, static for simplicity. 
*/ char err_msg[NEON_WALREADER_ERR_MSG_LEN]; @@ -106,7 +107,7 @@ struct NeonWALReader /* palloc and initialize NeonWALReader */ NeonWALReader * -NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, char *log_prefix) +NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, char *log_prefix, TimeLineID tlid) { NeonWALReader *reader; @@ -118,6 +119,7 @@ NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, char *log_ MemoryContextAllocZero(TopMemoryContext, sizeof(NeonWALReader)); reader->available_lsn = available_lsn; + reader->local_active_tlid = tlid; reader->seg.ws_file = -1; reader->seg.ws_segno = 0; reader->seg.ws_tli = 0; @@ -577,6 +579,17 @@ NeonWALReaderIsRemConnEstablished(NeonWALReader *state) return state->rem_state == RS_ESTABLISHED; } +/* + * Returns the reader's locally active timeline ID, i.e. the timeline on + * which local WAL reads are performed. Set once from the tlid argument at + * allocation time and never changed afterwards. + */ +TimeLineID +NeonWALReaderLocalActiveTimeLineID(NeonWALReader *state) +{ + return state->local_active_tlid; +} + /* * Returns events user should wait on connection socket or 0 if remote * connection is not active. 
diff --git a/pgxn/neon/neon_walreader.h b/pgxn/neon/neon_walreader.h index 3e41825069..722bc10537 100644 --- a/pgxn/neon/neon_walreader.h +++ b/pgxn/neon/neon_walreader.h @@ -19,9 +19,10 @@ typedef enum NEON_WALREAD_ERROR, } NeonWALReadResult; -extern NeonWALReader *NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, char *log_prefix); +extern NeonWALReader *NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, char *log_prefix, TimeLineID tlid); extern void NeonWALReaderFree(NeonWALReader *state); extern void NeonWALReaderResetRemote(NeonWALReader *state); +extern TimeLineID NeonWALReaderLocalActiveTimeLineID(NeonWALReader *state); extern NeonWALReadResult NeonWALRead(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli); extern pgsocket NeonWALReaderSocket(NeonWALReader *state); extern uint32 NeonWALReaderEvents(NeonWALReader *state); diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index c63edb1398..91d39345e2 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -98,6 +98,7 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api) wp = palloc0(sizeof(WalProposer)); wp->config = config; wp->api = api; + wp->localTimeLineID = config->pgTimeline; wp->state = WPS_COLLECTING_TERMS; wp->mconf.generation = INVALID_GENERATION; wp->mconf.members.len = 0; @@ -1384,7 +1385,7 @@ ProcessPropStartPos(WalProposer *wp) * we must bail out, as clog and other non rel data is inconsistent. 
*/ walprop_shared = wp->api.get_shmem_state(wp); - if (!wp->config->syncSafekeepers) + if (!wp->config->syncSafekeepers && !walprop_shared->replica_promote) { /* * Basebackup LSN always points to the beginning of the record (not diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index cca20e746b..08087e5a55 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -391,6 +391,7 @@ typedef struct WalproposerShmemState /* last feedback from each shard */ PageserverFeedback shard_ps_feedback[MAX_SHARDS]; int num_shards; + bool replica_promote; /* aggregated feedback with min LSNs across shards */ PageserverFeedback min_ps_feedback; @@ -806,6 +807,9 @@ typedef struct WalProposer /* Safekeepers walproposer is connecting to. */ Safekeeper safekeeper[MAX_SAFEKEEPERS]; + /* Current local TimeLineId in use */ + TimeLineID localTimeLineID; + /* WAL has been generated up to this point */ XLogRecPtr availableLsn; diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index c90702a282..3d6a92ad79 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -35,6 +35,7 @@ #include "storage/proc.h" #include "storage/ipc.h" #include "storage/lwlock.h" +#include "storage/pg_shmem.h" #include "storage/shmem.h" #include "storage/spin.h" #include "tcop/tcopprot.h" @@ -159,12 +160,19 @@ WalProposerMain(Datum main_arg) { WalProposer *wp; + if (*wal_acceptors_list == '\0') + { + wpg_log(WARNING, "Safekeepers list is empty"); + return; + } + init_walprop_config(false); walprop_pg_init_bgworker(); am_walproposer = true; walprop_pg_load_libpqwalreceiver(); wp = WalProposerCreate(&walprop_config, walprop_pg); + wp->localTimeLineID = GetWALInsertionTimeLine(); wp->last_reconnect_attempt = walprop_pg_get_current_timestamp(wp); walprop_pg_init_walsender(); @@ -350,6 +358,9 @@ assign_neon_safekeepers(const char *newval, void *extra) char *newval_copy; char *oldval; + if (newval && *newval != '\0' && UsedShmemSegAddr && walprop_shared && 
RecoveryInProgress()) + walprop_shared->replica_promote = true; + if (!am_walproposer) return; @@ -540,16 +551,15 @@ BackpressureThrottlingTime(void) /* * Register a background worker proposing WAL to wal acceptors. + * We start walproposer bgworker even for replicas in order to support possible replica promotion. + * When pg_promote() function is called, then walproposer bgworker registered with BgWorkerStart_RecoveryFinished + * is automatically launched when promotion is completed. */ static void walprop_register_bgworker(void) { BackgroundWorker bgw; - /* If no wal acceptors are specified, don't start the background worker. */ - if (*wal_acceptors_list == '\0') - return; - memset(&bgw, 0, sizeof(bgw)); bgw.bgw_flags = BGWORKER_SHMEM_ACCESS; bgw.bgw_start_time = BgWorkerStart_RecoveryFinished; @@ -1326,9 +1336,7 @@ StartProposerReplication(WalProposer *wp, StartReplicationCmd *cmd) #if PG_VERSION_NUM < 150000 if (ThisTimeLineID == 0) - ereport(ERROR, - (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("IDENTIFY_SYSTEM has not been run before START_REPLICATION"))); + ThisTimeLineID = 1; #endif /* @@ -1542,7 +1550,7 @@ walprop_pg_wal_reader_allocate(Safekeeper *sk) snprintf(log_prefix, sizeof(log_prefix), WP_LOG_PREFIX "sk %s:%s nwr: ", sk->host, sk->port); Assert(!sk->xlogreader); - sk->xlogreader = NeonWALReaderAllocate(wal_segment_size, sk->wp->propTermStartLsn, log_prefix); + sk->xlogreader = NeonWALReaderAllocate(wal_segment_size, sk->wp->propTermStartLsn, log_prefix, sk->wp->localTimeLineID); if (sk->xlogreader == NULL) wpg_log(FATAL, "failed to allocate xlog reader"); } @@ -1556,7 +1564,7 @@ walprop_pg_wal_read(Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count, buf, startptr, count, - walprop_pg_get_timeline_id()); + sk->wp->localTimeLineID); if (res == NEON_WALREAD_SUCCESS) { diff --git a/pgxn/neon/walsender_hooks.c b/pgxn/neon/walsender_hooks.c index 81198d6c8d..534bf1c19b 100644 --- a/pgxn/neon/walsender_hooks.c +++ 
b/pgxn/neon/walsender_hooks.c @@ -111,7 +111,7 @@ NeonWALPageRead( readBuf, targetPagePtr, count, - walprop_pg_get_timeline_id()); + NeonWALReaderLocalActiveTimeLineID(wal_reader)); if (res == NEON_WALREAD_SUCCESS) { @@ -202,7 +202,7 @@ NeonOnDemandXLogReaderRoutines(XLogReaderRoutine *xlr) { elog(ERROR, "unable to start walsender when basebackupLsn is 0"); } - wal_reader = NeonWALReaderAllocate(wal_segment_size, basebackupLsn, "[walsender] "); + wal_reader = NeonWALReaderAllocate(wal_segment_size, basebackupLsn, "[walsender] ", 1); } xlr->page_read = NeonWALPageRead; xlr->segment_open = NeonWALReadSegmentOpen; diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 19d12da5e3..f9337bed89 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -4711,7 +4711,7 @@ class EndpointFactory: origin: Endpoint, endpoint_id: str | None = None, config_lines: list[str] | None = None, - ): + ) -> Endpoint: branch_name = origin.branch_name assert origin in self.endpoints assert branch_name is not None @@ -4730,7 +4730,7 @@ class EndpointFactory: origin: Endpoint, endpoint_id: str | None = None, config_lines: list[str] | None = None, - ): + ) -> Endpoint: branch_name = origin.branch_name assert origin in self.endpoints assert branch_name is not None diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py index 4044f25b37..1ff61ce8dc 100644 --- a/test_runner/regress/test_hot_standby.py +++ b/test_runner/regress/test_hot_standby.py @@ -74,8 +74,9 @@ def test_hot_standby(neon_simple_env: NeonEnv): for query in queries: with s_con.cursor() as secondary_cursor: secondary_cursor.execute(query) - response = secondary_cursor.fetchone() - assert response is not None + res = secondary_cursor.fetchone() + assert res is not None + response = res assert response == responses[query] # Check for corrupted WAL messages which might otherwise go unnoticed if @@ -164,7 +165,7 @@ 
def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool): s_cur.execute("SELECT COUNT(*) FROM test") res = s_cur.fetchone() - assert res[0] == 10000 + assert res == (10000,) # Clear the cache in the standby, so that when we # re-execute the query, it will make GetPage @@ -195,7 +196,7 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool): s_cur.execute("SELECT COUNT(*) FROM test") log_replica_lag(primary, secondary) res = s_cur.fetchone() - assert res[0] == 10000 + assert res == (10000,) def run_pgbench(connstr: str, pg_bin: PgBin): diff --git a/test_runner/regress/test_replica_promotes.py b/test_runner/regress/test_replica_promotes.py new file mode 100644 index 0000000000..e378d37635 --- /dev/null +++ b/test_runner/regress/test_replica_promotes.py @@ -0,0 +1,133 @@ +""" +File with secondary->primary promotion testing. + +This far, only contains a test that we don't break and that the data is persisted. +""" + +import psycopg2 +from fixtures.log_helper import log +from fixtures.neon_fixtures import Endpoint, NeonEnv, wait_replica_caughtup +from fixtures.pg_version import PgVersion +from pytest import raises + + +def test_replica_promotes(neon_simple_env: NeonEnv, pg_version: PgVersion): + """ + Test that a replica safely promotes, and can commit data updates which + show up when the primary boots up after the promoted secondary endpoint + shut down. + """ + + # Initialize the primary, a test table, and a helper function to create lots + # of subtransactions. 
+ env: NeonEnv = neon_simple_env + primary: Endpoint = env.endpoints.create_start(branch_name="main", endpoint_id="primary") + secondary: Endpoint = env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary") + + with primary.connect() as primary_conn: + primary_cur = primary_conn.cursor() + primary_cur.execute( + "create table t(pk bigint GENERATED ALWAYS AS IDENTITY, payload integer)" + ) + primary_cur.execute("INSERT INTO t(payload) SELECT generate_series(1, 100)") + primary_cur.execute( + """ + SELECT pg_current_wal_insert_lsn(), + pg_current_wal_lsn(), + pg_current_wal_flush_lsn() + """ + ) + log.info(f"Primary: Current LSN after workload is {primary_cur.fetchone()}") + primary_cur.execute("show neon.safekeepers") + safekeepers = primary_cur.fetchall()[0][0] + + wait_replica_caughtup(primary, secondary) + + with secondary.connect() as secondary_conn: + secondary_cur = secondary_conn.cursor() + secondary_cur.execute("select count(*) from t") + + assert secondary_cur.fetchone() == (100,) + + with raises(psycopg2.Error): + secondary_cur.execute("INSERT INTO t (payload) SELECT generate_series(101, 200)") + secondary_conn.commit() + + secondary_conn.rollback() + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (100,) + + primary.stop_and_destroy(mode="immediate") + + # Reconnect to the secondary to make sure we get a read-write connection + promo_conn = secondary.connect() + promo_cur = promo_conn.cursor() + promo_cur.execute(f"alter system set neon.safekeepers='{safekeepers}'") + promo_cur.execute("select pg_reload_conf()") + + promo_cur.execute("SELECT * FROM pg_promote()") + assert promo_cur.fetchone() == (True,) + promo_cur.execute( + """ + SELECT pg_current_wal_insert_lsn(), + pg_current_wal_lsn(), + pg_current_wal_flush_lsn() + """ + ) + log.info(f"Secondary: LSN after promotion is {promo_cur.fetchone()}") + + # Reconnect to the secondary to make sure we get a read-write connection + with secondary.connect() as 
new_primary_conn: + new_primary_cur = new_primary_conn.cursor() + new_primary_cur.execute("select count(*) from t") + assert new_primary_cur.fetchone() == (100,) + + new_primary_cur.execute( + "INSERT INTO t (payload) SELECT generate_series(101, 200) RETURNING payload" + ) + assert new_primary_cur.fetchall() == [(it,) for it in range(101, 201)] + + new_primary_cur = new_primary_conn.cursor() + new_primary_cur.execute("select payload from t") + assert new_primary_cur.fetchall() == [(it,) for it in range(1, 201)] + + new_primary_cur.execute("select count(*) from t") + assert new_primary_cur.fetchone() == (200,) + new_primary_cur.execute( + """ + SELECT pg_current_wal_insert_lsn(), + pg_current_wal_lsn(), + pg_current_wal_flush_lsn() + """ + ) + log.info(f"Secondary: LSN after workload is {new_primary_cur.fetchone()}") + + with secondary.connect() as second_viewpoint_conn: + new_primary_cur = second_viewpoint_conn.cursor() + new_primary_cur.execute("select payload from t") + assert new_primary_cur.fetchall() == [(it,) for it in range(1, 201)] + + # wait_for_last_flush_lsn(env, secondary, env.initial_tenant, env.initial_timeline) + + secondary.stop_and_destroy() + + primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary") + + with primary.connect() as new_primary: + new_primary_cur = new_primary.cursor() + new_primary_cur.execute( + """ + SELECT pg_current_wal_insert_lsn(), + pg_current_wal_lsn(), + pg_current_wal_flush_lsn() + """ + ) + log.info(f"New primary: Boot LSN is {new_primary_cur.fetchone()}") + + new_primary_cur.execute("select count(*) from t") + assert new_primary_cur.fetchone() == (200,) + new_primary_cur.execute("INSERT INTO t (payload) SELECT generate_series(201, 300)") + new_primary_cur.execute("select count(*) from t") + assert new_primary_cur.fetchone() == (300,) + + primary.stop(mode="immediate") diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 55c0d45abe..6770bc2513 160000 --- a/vendor/postgres-v14 +++ 
b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 55c0d45abe6467c02084c2192bca117eda6ce1e7 +Subproject commit 6770bc251301ef40c66f7ecb731741dc435b5051 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index de7640f55d..8c3249f36c 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit de7640f55da07512834d5cc40c4b3fb376b5f04f +Subproject commit 8c3249f36c7df6ac0efb8ee9f1baf4aa1b83e5c9 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 0bf96bd6d7..7a4c0eacae 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 0bf96bd6d70301a0b43b0b3457bb3cf8fb43c198 +Subproject commit 7a4c0eacaeb9b97416542fa19103061c166460b1 diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index 8be779fd3a..db424d42d7 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit 8be779fd3ab9e87206da96a7e4842ef1abf04f44 +Subproject commit db424d42d748f8ad91ac00e28db2c7f2efa42f7f diff --git a/vendor/revisions.json b/vendor/revisions.json index 3e999760f4..12d5499ddb 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,18 +1,18 @@ { "v17": [ "17.5", - "8be779fd3ab9e87206da96a7e4842ef1abf04f44" + "db424d42d748f8ad91ac00e28db2c7f2efa42f7f" ], "v16": [ "16.9", - "0bf96bd6d70301a0b43b0b3457bb3cf8fb43c198" + "7a4c0eacaeb9b97416542fa19103061c166460b1" ], "v15": [ "15.13", - "de7640f55da07512834d5cc40c4b3fb376b5f04f" + "8c3249f36c7df6ac0efb8ee9f1baf4aa1b83e5c9" ], "v14": [ "14.18", - "55c0d45abe6467c02084c2192bca117eda6ce1e7" + "6770bc251301ef40c66f7ecb731741dc435b5051" ] }