diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile index 53917d8bc4..e88901ed78 100644 --- a/pgxn/neon/Makefile +++ b/pgxn/neon/Makefile @@ -7,12 +7,12 @@ OBJS = \ extension_server.o \ file_cache.o \ libpagestore.o \ - libpqwalproposer.o \ neon.o \ + neon_utils.o \ pagestore_smgr.o \ relsize_cache.o \ walproposer.o \ - walproposer_utils.o \ + walproposer_pg.o \ control_plane_connector.o PG_CPPFLAGS = -I$(libpq_srcdir) diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index c89de11594..ca24ec7586 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -30,7 +30,7 @@ #include "neon.h" #include "walproposer.h" -#include "walproposer_utils.h" +#include "neon_utils.h" #define PageStoreTrace DEBUG5 diff --git a/pgxn/neon/libpqwalproposer.c b/pgxn/neon/libpqwalproposer.c deleted file mode 100644 index ce9a1475d3..0000000000 --- a/pgxn/neon/libpqwalproposer.c +++ /dev/null @@ -1,424 +0,0 @@ -#include "postgres.h" - -#include "libpq-fe.h" -#include "neon.h" -#include "walproposer.h" - -/* Header in walproposer.h -- Wrapper struct to abstract away the libpq connection */ -struct WalProposerConn -{ - PGconn *pg_conn; - bool is_nonblocking; /* whether the connection is non-blocking */ - char *recvbuf; /* last received data from - * walprop_async_read */ -}; - -/* Helper function */ -static bool -ensure_nonblocking_status(WalProposerConn *conn, bool is_nonblocking) -{ - /* If we're already correctly blocking or nonblocking, all good */ - if (is_nonblocking == conn->is_nonblocking) - return true; - - /* Otherwise, set it appropriately */ - if (PQsetnonblocking(conn->pg_conn, is_nonblocking) == -1) - return false; - - conn->is_nonblocking = is_nonblocking; - return true; -} - -/* Exported function definitions */ -char * -walprop_error_message(WalProposerConn *conn) -{ - return PQerrorMessage(conn->pg_conn); -} - -WalProposerConnStatusType -walprop_status(WalProposerConn *conn) -{ - switch (PQstatus(conn->pg_conn)) - { - case CONNECTION_OK: - 
return WP_CONNECTION_OK; - case CONNECTION_BAD: - return WP_CONNECTION_BAD; - default: - return WP_CONNECTION_IN_PROGRESS; - } -} - -WalProposerConn * -walprop_connect_start(char *conninfo, char *password) -{ - WalProposerConn *conn; - PGconn *pg_conn; - const char *keywords[3]; - const char *values[3]; - int n; - - /* - * Connect using the given connection string. If the - * NEON_AUTH_TOKEN environment variable was set, use that as - * the password. - * - * The connection options are parsed in the order they're given, so - * when we set the password before the connection string, the - * connection string can override the password from the env variable. - * Seems useful, although we don't currently use that capability - * anywhere. - */ - n = 0; - if (password) - { - keywords[n] = "password"; - values[n] = password; - n++; - } - keywords[n] = "dbname"; - values[n] = conninfo; - n++; - keywords[n] = NULL; - values[n] = NULL; - n++; - pg_conn = PQconnectStartParams(keywords, values, 1); - - /* - * Allocation of a PQconn can fail, and will return NULL. We want to fully - * replicate the behavior of PQconnectStart here. - */ - if (!pg_conn) - return NULL; - - /* - * And in theory this allocation can fail as well, but it's incredibly - * unlikely if we just successfully allocated a PGconn. - * - * palloc will exit on failure though, so there's not much we could do if - * it *did* fail. 
- */ - conn = palloc(sizeof(WalProposerConn)); - conn->pg_conn = pg_conn; - conn->is_nonblocking = false; /* connections always start in blocking - * mode */ - conn->recvbuf = NULL; - return conn; -} - -WalProposerConnectPollStatusType -walprop_connect_poll(WalProposerConn *conn) -{ - WalProposerConnectPollStatusType return_val; - - switch (PQconnectPoll(conn->pg_conn)) - { - case PGRES_POLLING_FAILED: - return_val = WP_CONN_POLLING_FAILED; - break; - case PGRES_POLLING_READING: - return_val = WP_CONN_POLLING_READING; - break; - case PGRES_POLLING_WRITING: - return_val = WP_CONN_POLLING_WRITING; - break; - case PGRES_POLLING_OK: - return_val = WP_CONN_POLLING_OK; - break; - - /* - * There's a comment at its source about this constant being - * unused. We'll expect it's never returned. - */ - case PGRES_POLLING_ACTIVE: - elog(FATAL, "Unexpected PGRES_POLLING_ACTIVE returned from PQconnectPoll"); - - /* - * This return is never actually reached, but it's here to make - * the compiler happy - */ - return WP_CONN_POLLING_FAILED; - - default: - Assert(false); - return_val = WP_CONN_POLLING_FAILED; /* keep the compiler quiet */ - } - - return return_val; -} - -bool -walprop_send_query(WalProposerConn *conn, char *query) -{ - /* - * We need to be in blocking mode for sending the query to run without - * requiring a call to PQflush - */ - if (!ensure_nonblocking_status(conn, false)) - return false; - - /* PQsendQuery returns 1 on success, 0 on failure */ - if (!PQsendQuery(conn->pg_conn, query)) - return false; - - return true; -} - -WalProposerExecStatusType -walprop_get_query_result(WalProposerConn *conn) -{ - PGresult *result; - WalProposerExecStatusType return_val; - - /* Marker variable if we need to log an unexpected success result */ - char *unexpected_success = NULL; - - /* Consume any input that we might be missing */ - if (!PQconsumeInput(conn->pg_conn)) - return WP_EXEC_FAILED; - - if (PQisBusy(conn->pg_conn)) - return WP_EXEC_NEEDS_INPUT; - - - result = 
PQgetResult(conn->pg_conn); - - /* - * PQgetResult returns NULL only if getting the result was successful & - * there's no more of the result to get. - */ - if (!result) - { - elog(WARNING, "[libpqwalproposer] Unexpected successful end of command results"); - return WP_EXEC_UNEXPECTED_SUCCESS; - } - - /* Helper macro to reduce boilerplate */ -#define UNEXPECTED_SUCCESS(msg) \ - return_val = WP_EXEC_UNEXPECTED_SUCCESS; \ - unexpected_success = msg; \ - break; - - - switch (PQresultStatus(result)) - { - /* "true" success case */ - case PGRES_COPY_BOTH: - return_val = WP_EXEC_SUCCESS_COPYBOTH; - break; - - /* Unexpected success case */ - case PGRES_EMPTY_QUERY: - UNEXPECTED_SUCCESS("empty query return"); - case PGRES_COMMAND_OK: - UNEXPECTED_SUCCESS("data-less command end"); - case PGRES_TUPLES_OK: - UNEXPECTED_SUCCESS("tuples return"); - case PGRES_COPY_OUT: - UNEXPECTED_SUCCESS("'Copy Out' response"); - case PGRES_COPY_IN: - UNEXPECTED_SUCCESS("'Copy In' response"); - case PGRES_SINGLE_TUPLE: - UNEXPECTED_SUCCESS("single tuple return"); - case PGRES_PIPELINE_SYNC: - UNEXPECTED_SUCCESS("pipeline sync point"); - - /* Failure cases */ - case PGRES_BAD_RESPONSE: - case PGRES_NONFATAL_ERROR: - case PGRES_FATAL_ERROR: - case PGRES_PIPELINE_ABORTED: - return_val = WP_EXEC_FAILED; - break; - - default: - Assert(false); - return_val = WP_EXEC_FAILED; /* keep the compiler quiet */ - } - - if (unexpected_success) - elog(WARNING, "[libpqwalproposer] Unexpected successful %s", unexpected_success); - - return return_val; -} - -pgsocket -walprop_socket(WalProposerConn *conn) -{ - return PQsocket(conn->pg_conn); -} - -int -walprop_flush(WalProposerConn *conn) -{ - return (PQflush(conn->pg_conn)); -} - -void -walprop_finish(WalProposerConn *conn) -{ - if (conn->recvbuf != NULL) - PQfreemem(conn->recvbuf); - PQfinish(conn->pg_conn); - pfree(conn); -} - -/* - * Receive a message from the safekeeper. - * - * On success, the data is placed in *buf. 
It is valid until the next call - * to this function. - */ -PGAsyncReadResult -walprop_async_read(WalProposerConn *conn, char **buf, int *amount) -{ - int result; - - if (conn->recvbuf != NULL) - { - PQfreemem(conn->recvbuf); - conn->recvbuf = NULL; - } - - /* Call PQconsumeInput so that we have the data we need */ - if (!PQconsumeInput(conn->pg_conn)) - { - *amount = 0; - *buf = NULL; - return PG_ASYNC_READ_FAIL; - } - - /* - * The docs for PQgetCopyData list the return values as: 0 if the copy is - * still in progress, but no "complete row" is available -1 if the copy is - * done -2 if an error occurred (> 0) if it was successful; that value is - * the amount transferred. - * - * The protocol we use between walproposer and safekeeper means that we - * *usually* wouldn't expect to see that the copy is done, but this can - * sometimes be triggered by the server returning an ErrorResponse (which - * also happens to have the effect that the copy is done). - */ - switch (result = PQgetCopyData(conn->pg_conn, &conn->recvbuf, true)) - { - case 0: - *amount = 0; - *buf = NULL; - return PG_ASYNC_READ_TRY_AGAIN; - case -1: - { - /* - * If we get -1, it's probably because of a server error; the - * safekeeper won't normally send a CopyDone message. 
- * - * We can check PQgetResult to make sure that the server - * failed; it'll always result in PGRES_FATAL_ERROR - */ - ExecStatusType status = PQresultStatus(PQgetResult(conn->pg_conn)); - - if (status != PGRES_FATAL_ERROR) - elog(FATAL, "unexpected result status %d after failed PQgetCopyData", status); - - /* - * If there was actually an error, it'll be properly reported - * by calls to PQerrorMessage -- we don't have to do anything - * else - */ - *amount = 0; - *buf = NULL; - return PG_ASYNC_READ_FAIL; - } - case -2: - *amount = 0; - *buf = NULL; - return PG_ASYNC_READ_FAIL; - default: - /* Positive values indicate the size of the returned result */ - *amount = result; - *buf = conn->recvbuf; - return PG_ASYNC_READ_SUCCESS; - } -} - -PGAsyncWriteResult -walprop_async_write(WalProposerConn *conn, void const *buf, size_t size) -{ - int result; - - /* If we aren't in non-blocking mode, switch to it. */ - if (!ensure_nonblocking_status(conn, true)) - return PG_ASYNC_WRITE_FAIL; - - /* - * The docs for PQputcopyData list the return values as: 1 if the data was - * queued, 0 if it was not queued because of full buffers, or -1 if an - * error occurred - */ - result = PQputCopyData(conn->pg_conn, buf, size); - - /* - * We won't get a result of zero because walproposer always empties the - * connection's buffers before sending more - */ - Assert(result != 0); - - switch (result) - { - case 1: - /* good -- continue */ - break; - case -1: - return PG_ASYNC_WRITE_FAIL; - default: - elog(FATAL, "invalid return %d from PQputCopyData", result); - } - - /* - * After queueing the data, we still need to flush to get it to send. This - * might take multiple tries, but we don't want to wait around until it's - * done. 
- * - * PQflush has the following returns (directly quoting the docs): 0 if - * sucessful, 1 if it was unable to send all the data in the send queue - * yet -1 if it failed for some reason - */ - switch (result = PQflush(conn->pg_conn)) - { - case 0: - return PG_ASYNC_WRITE_SUCCESS; - case 1: - return PG_ASYNC_WRITE_TRY_FLUSH; - case -1: - return PG_ASYNC_WRITE_FAIL; - default: - elog(FATAL, "invalid return %d from PQflush", result); - } -} - -/* - * This function is very similar to walprop_async_write. For more - * information, refer to the comments there. - */ -bool -walprop_blocking_write(WalProposerConn *conn, void const *buf, size_t size) -{ - int result; - - /* If we are in non-blocking mode, switch out of it. */ - if (!ensure_nonblocking_status(conn, false)) - return false; - - if ((result = PQputCopyData(conn->pg_conn, buf, size)) == -1) - return false; - - Assert(result == 1); - - /* Because the connection is non-blocking, flushing returns 0 or -1 */ - - if ((result = PQflush(conn->pg_conn)) == -1) - return false; - - Assert(result == 0); - return true; -} diff --git a/pgxn/neon/neon.h b/pgxn/neon/neon.h index 2610da4311..3300c67456 100644 --- a/pgxn/neon/neon.h +++ b/pgxn/neon/neon.h @@ -18,6 +18,10 @@ extern char *neon_auth_token; extern char *neon_timeline; extern char *neon_tenant; +extern char *wal_acceptors_list; +extern int wal_acceptor_reconnect_timeout; +extern int wal_acceptor_connection_timeout; + extern void pg_init_libpagestore(void); extern void pg_init_walproposer(void); @@ -30,4 +34,10 @@ extern void pg_init_extension_server(void); extern bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id); extern bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id); +extern uint64 BackpressureThrottlingTime(void); +extern void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn); + +extern void PGDLLEXPORT WalProposerSync(int argc, char *argv[]); +extern void 
PGDLLEXPORT WalProposerMain(Datum main_arg); + #endif /* NEON_H */ diff --git a/pgxn/neon/neon_utils.c b/pgxn/neon/neon_utils.c new file mode 100644 index 0000000000..06faea7490 --- /dev/null +++ b/pgxn/neon/neon_utils.c @@ -0,0 +1,116 @@ +#include "postgres.h" + +#include "access/timeline.h" +#include "access/xlogutils.h" +#include "common/logging.h" +#include "common/ip.h" +#include "funcapi.h" +#include "libpq/libpq.h" +#include "libpq/pqformat.h" +#include "miscadmin.h" +#include "postmaster/interrupt.h" +#include "replication/slot.h" +#include "replication/walsender_private.h" + +#include "storage/ipc.h" +#include "utils/builtins.h" +#include "utils/ps_status.h" + +#include "libpq-fe.h" +#include +#include + +#if PG_VERSION_NUM >= 150000 +#include "access/xlogutils.h" +#include "access/xlogrecovery.h" +#endif +#if PG_MAJORVERSION_NUM >= 16 +#include "utils/guc.h" +#endif + +/* + * Convert a character which represents a hexadecimal digit to an integer. + * + * Returns -1 if the character is not a hexadecimal digit. + */ +int +HexDecodeChar(char c) +{ + if (c >= '0' && c <= '9') + return c - '0'; + if (c >= 'a' && c <= 'f') + return c - 'a' + 10; + if (c >= 'A' && c <= 'F') + return c - 'A' + 10; + + return -1; +} + +/* + * Decode a hex string into a byte string, 2 hex chars per byte. + * + * Returns false if invalid characters are encountered; otherwise true. 
+ */ +bool +HexDecodeString(uint8 *result, char *input, int nbytes) +{ + int i; + + for (i = 0; i < nbytes; ++i) + { + int n1 = HexDecodeChar(input[i * 2]); + int n2 = HexDecodeChar(input[i * 2 + 1]); + + if (n1 < 0 || n2 < 0) + return false; + result[i] = n1 * 16 + n2; + } + + return true; +} + +/* -------------------------------- + * pq_getmsgint32_le - get a binary 4-byte int from a message buffer in native (LE) order + * -------------------------------- + */ +uint32 +pq_getmsgint32_le(StringInfo msg) +{ + uint32 n32; + + pq_copymsgbytes(msg, (char *) &n32, sizeof(n32)); + + return n32; +} + +/* -------------------------------- + * pq_getmsgint64_le - get a binary 8-byte int from a message buffer in native (LE) order + * -------------------------------- + */ +uint64 +pq_getmsgint64_le(StringInfo msg) +{ + uint64 n64; + + pq_copymsgbytes(msg, (char *) &n64, sizeof(n64)); + + return n64; +} + +/* append a binary [u]int32 to a StringInfo buffer in native (LE) order */ +void +pq_sendint32_le(StringInfo buf, uint32 i) +{ + enlargeStringInfo(buf, sizeof(uint32)); + memcpy(buf->data + buf->len, &i, sizeof(uint32)); + buf->len += sizeof(uint32); +} + +/* append a binary [u]int64 to a StringInfo buffer in native (LE) order */ +void +pq_sendint64_le(StringInfo buf, uint64 i) +{ + enlargeStringInfo(buf, sizeof(uint64)); + memcpy(buf->data + buf->len, &i, sizeof(uint64)); + buf->len += sizeof(uint64); +} diff --git a/pgxn/neon/neon_utils.h b/pgxn/neon/neon_utils.h new file mode 100644 index 0000000000..e3fafc8d0f --- /dev/null +++ b/pgxn/neon/neon_utils.h @@ -0,0 +1,12 @@ +#ifndef __NEON_UTILS_H__ +#define __NEON_UTILS_H__ + +#include "postgres.h" + +bool HexDecodeString(uint8 *result, char *input, int nbytes); +uint32 pq_getmsgint32_le(StringInfo msg); +uint64 pq_getmsgint64_le(StringInfo msg); +void pq_sendint32_le(StringInfo buf, uint32 i); +void pq_sendint64_le(StringInfo buf, uint64 i); + +#endif /* __NEON_UTILS_H__ */ diff --git a/pgxn/neon/walproposer.c 
b/pgxn/neon/walproposer.c index a9342bd984..c1fd5e3ef3 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -7,9 +7,9 @@ * * We have two ways of launching WalProposer: * - * 1. As a background worker which will run physical WalSender with - * am_wal_proposer flag set to true. WalSender in turn would handle WAL - * reading part and call WalProposer when ready to scatter WAL. + * 1. As a background worker which will pretend to be a physical WalSender. + * WalProposer will receive notifications about newly available WAL and + * will immediately broadcast it to live safekeepers. * * 2. As a standalone utility by running `postgres --sync-safekeepers`. That * is needed to create LSN from which it is safe to start postgres. More @@ -29,107 +29,25 @@ * safekeepers, learn start LSN of future epoch and run basebackup' * won't work. * + * Both ways are implemented in walproposer_pg.c. This file contains the + * generic part of walproposer, which is used in both cases but can also + * be used as an independent library. 
+ * *------------------------------------------------------------------------- */ #include "postgres.h" - -#include -#include -#include -#include "access/xact.h" -#include "access/xlogdefs.h" -#include "access/xlogutils.h" -#include "access/xloginsert.h" -#if PG_VERSION_NUM >= 150000 -#include "access/xlogrecovery.h" -#endif -#include "storage/fd.h" -#include "storage/latch.h" -#include "miscadmin.h" -#include "pgstat.h" -#include "access/xlog.h" #include "libpq/pqformat.h" -#include "replication/slot.h" -#include "replication/walreceiver.h" -#if PG_VERSION_NUM >= 160000 -#include "replication/walsender_private.h" -#endif -#include "postmaster/bgworker.h" -#include "postmaster/interrupt.h" -#include "postmaster/postmaster.h" -#include "storage/pmsignal.h" -#include "storage/proc.h" -#include "storage/ipc.h" -#include "storage/lwlock.h" -#include "storage/shmem.h" -#include "storage/spin.h" -#include "tcop/tcopprot.h" -#include "utils/builtins.h" -#include "utils/guc.h" -#include "utils/memutils.h" -#include "utils/ps_status.h" -#include "utils/timestamp.h" - #include "neon.h" #include "walproposer.h" -#include "walproposer_utils.h" - -static bool syncSafekeepers = false; - -char *wal_acceptors_list = ""; -int wal_acceptor_reconnect_timeout = 1000; -int wal_acceptor_connection_timeout = 10000; -bool am_wal_proposer = false; - -#define WAL_PROPOSER_SLOT_NAME "wal_proposer_slot" - -static int n_safekeepers = 0; -static int quorum = 0; -static Safekeeper safekeeper[MAX_SAFEKEEPERS]; -static XLogRecPtr availableLsn; /* WAL has been generated up to this point */ -static XLogRecPtr lastSentCommitLsn; /* last commitLsn broadcast to* - * safekeepers */ -static ProposerGreeting greetRequest; -static VoteRequest voteRequest; /* Vote request for safekeeper */ -static WaitEventSet *waitEvents; -static AppendResponse quorumFeedback; -/* - * Minimal LSN which may be needed for recovery of some safekeeper, - * record-aligned (first record which might not yet received by someone). 
- */ -static XLogRecPtr truncateLsn; - -/* - * Term of the proposer. We want our term to be highest and unique, - * so we collect terms from safekeepers quorum, choose max and +1. - * After that our term is fixed and must not change. If we observe - * that some safekeeper has higher term, it means that we have another - * running compute, so we must stop immediately. - */ -static term_t propTerm; -static TermHistory propTermHistory; /* term history of the proposer */ -static XLogRecPtr propEpochStartLsn; /* epoch start lsn of the proposer */ -static term_t donorEpoch; /* Most advanced acceptor epoch */ -static int donor; /* Most advanced acceptor */ -static XLogRecPtr timelineStartLsn; /* timeline globally starts at this LSN */ -static int n_votes = 0; -static int n_connected = 0; -static TimestampTz last_reconnect_attempt; - -static WalproposerShmemState * walprop_shared; +#include "neon_utils.h" /* Prototypes for private functions */ -static void WalProposerRegister(void); -static void WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId); -static void WalProposerStart(void); -static void WalProposerLoop(void); -static void InitEventSet(void); -static void UpdateEventSet(Safekeeper *sk, uint32 events); +static void WalProposerLoop(WalProposer *wp); static void HackyRemoveWalProposerEvent(Safekeeper *to_remove); static void ShutdownConnection(Safekeeper *sk); static void ResetConnection(Safekeeper *sk); -static long TimeToReconnect(TimestampTz now); -static void ReconnectSafekeepers(void); +static long TimeToReconnect(WalProposer *wp, TimestampTz now); +static void ReconnectSafekeepers(WalProposer *wp); static void AdvancePollState(Safekeeper *sk, uint32 events); static void HandleConnectionEvent(Safekeeper *sk); static void SendStartWALPush(Safekeeper *sk); @@ -138,403 +56,44 @@ static void SendProposerGreeting(Safekeeper *sk); static void RecvAcceptorGreeting(Safekeeper *sk); static void SendVoteRequest(Safekeeper *sk); static void RecvVoteResponse(Safekeeper 
*sk); -static void HandleElectedProposer(void); -static term_t GetHighestTerm(TermHistory * th); +static void HandleElectedProposer(WalProposer *wp); +static term_t GetHighestTerm(TermHistory *th); static term_t GetEpoch(Safekeeper *sk); -static void DetermineEpochStartLsn(void); -static bool WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos); +static void DetermineEpochStartLsn(WalProposer *wp); static void SendProposerElected(Safekeeper *sk); -static void WalProposerStartStreaming(XLogRecPtr startpos); static void StartStreaming(Safekeeper *sk); static void SendMessageToNode(Safekeeper *sk); -static void BroadcastAppendRequest(void); +static void BroadcastAppendRequest(WalProposer *wp); static void HandleActiveState(Safekeeper *sk, uint32 events); static bool SendAppendRequests(Safekeeper *sk); static bool RecvAppendResponses(Safekeeper *sk); -static void CombineHotStanbyFeedbacks(HotStandbyFeedback * hs); -static XLogRecPtr CalculateMinFlushLsn(void); -static XLogRecPtr GetAcknowledgedByQuorumWALPosition(void); -static void HandleSafekeeperResponse(void); +static XLogRecPtr CalculateMinFlushLsn(WalProposer *wp); +static XLogRecPtr GetAcknowledgedByQuorumWALPosition(WalProposer *wp); +static void HandleSafekeeperResponse(WalProposer *wp); static bool AsyncRead(Safekeeper *sk, char **buf, int *buf_size); -static bool AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage * anymsg); +static bool AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg); static bool BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState success_state); static bool AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_state); static bool AsyncFlush(Safekeeper *sk); +static int CompareLsn(const void *a, const void *b); +static char *FormatSafekeeperState(SafekeeperState state); +static void AssertEventsOkForState(uint32 events, Safekeeper *sk); +static uint32 
SafekeeperStateDesiredEvents(SafekeeperState state); +static char *FormatEvents(uint32 events); -static void nwp_shmem_startup_hook(void); -static void nwp_register_gucs(void); -static void nwp_prepare_shmem(void); -static uint64 backpressure_lag_impl(void); -static bool backpressure_throttling_impl(void); - -static process_interrupts_callback_t PrevProcessInterruptsCallback; -static shmem_startup_hook_type prev_shmem_startup_hook_type; -#if PG_VERSION_NUM >= 150000 -static shmem_request_hook_type prev_shmem_request_hook = NULL; -static void walproposer_shmem_request(void); -#endif - -void -pg_init_walproposer(void) -{ - if (!process_shared_preload_libraries_in_progress) - return; - - nwp_register_gucs(); - - nwp_prepare_shmem(); - - delay_backend_us = &backpressure_lag_impl; - PrevProcessInterruptsCallback = ProcessInterruptsCallback; - ProcessInterruptsCallback = backpressure_throttling_impl; - - WalProposerRegister(); -} - -/* - * Entry point for `postgres --sync-safekeepers`. - */ -PGDLLEXPORT void -WalProposerSync(int argc, char *argv[]) -{ - struct stat stat_buf; - - syncSafekeepers = true; -#if PG_VERSION_NUM < 150000 - ThisTimeLineID = 1; -#endif - - /* - * Initialize postmaster_alive_fds as WaitEventSet checks them. 
- * - * Copied from InitPostmasterDeathWatchHandle() - */ - if (pipe(postmaster_alive_fds) < 0) - ereport(FATAL, - (errcode_for_file_access(), - errmsg_internal("could not create pipe to monitor postmaster death: %m"))); - if (fcntl(postmaster_alive_fds[POSTMASTER_FD_WATCH], F_SETFL, O_NONBLOCK) == -1) - ereport(FATAL, - (errcode_for_socket_access(), - errmsg_internal("could not set postmaster death monitoring pipe to nonblocking mode: %m"))); - - ChangeToDataDir(); - - /* Create pg_wal directory, if it doesn't exist */ - if (stat(XLOGDIR, &stat_buf) != 0) - { - ereport(LOG, (errmsg("creating missing WAL directory \"%s\"", XLOGDIR))); - if (MakePGDirectory(XLOGDIR) < 0) - { - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not create directory \"%s\": %m", - XLOGDIR))); - exit(1); - } - } - - WalProposerInit(0, 0); - - BackgroundWorkerUnblockSignals(); - - WalProposerStart(); -} - -static void -nwp_register_gucs(void) -{ - DefineCustomStringVariable( - "neon.safekeepers", - "List of Neon WAL acceptors (host:port)", - NULL, /* long_desc */ - &wal_acceptors_list, /* valueAddr */ - "", /* bootValue */ - PGC_POSTMASTER, - GUC_LIST_INPUT, /* extensions can't use* - * GUC_LIST_QUOTE */ - NULL, NULL, NULL); - - DefineCustomIntVariable( - "neon.safekeeper_reconnect_timeout", - "Walproposer reconnects to offline safekeepers once in this interval.", - NULL, - &wal_acceptor_reconnect_timeout, - 1000, 0, INT_MAX, /* default, min, max */ - PGC_SIGHUP, /* context */ - GUC_UNIT_MS, /* flags */ - NULL, NULL, NULL); - - DefineCustomIntVariable( - "neon.safekeeper_connect_timeout", - "Connection or connection attempt to safekeeper is terminated if no message is received (or connection attempt doesn't finish) within this period.", - NULL, - &wal_acceptor_connection_timeout, - 10000, 0, INT_MAX, - PGC_SIGHUP, - GUC_UNIT_MS, - NULL, NULL, NULL); -} - -/* shmem handling */ - -static void -nwp_prepare_shmem(void) -{ -#if PG_VERSION_NUM >= 150000 - prev_shmem_request_hook = 
shmem_request_hook; - shmem_request_hook = walproposer_shmem_request; -#else - RequestAddinShmemSpace(WalproposerShmemSize()); -#endif - prev_shmem_startup_hook_type = shmem_startup_hook; - shmem_startup_hook = nwp_shmem_startup_hook; -} - -#if PG_VERSION_NUM >= 150000 -/* - * shmem_request hook: request additional shared resources. We'll allocate or - * attach to the shared resources in nwp_shmem_startup_hook(). - */ -static void -walproposer_shmem_request(void) -{ - if (prev_shmem_request_hook) - prev_shmem_request_hook(); - - RequestAddinShmemSpace(WalproposerShmemSize()); -} -#endif - -static void -nwp_shmem_startup_hook(void) -{ - if (prev_shmem_startup_hook_type) - prev_shmem_startup_hook_type(); - - WalproposerShmemInit(); -} - -/* - * WAL proposer bgworker entry point. - */ -PGDLLEXPORT void -WalProposerMain(Datum main_arg) -{ -#if PG_VERSION_NUM >= 150000 - TimeLineID tli; -#endif - - /* Establish signal handlers. */ - pqsignal(SIGUSR1, procsignal_sigusr1_handler); - pqsignal(SIGHUP, SignalHandlerForConfigReload); - pqsignal(SIGTERM, die); - - BackgroundWorkerUnblockSignals(); - -#if PG_VERSION_NUM >= 150000 - /* FIXME pass proper tli to WalProposerInit ? 
*/ - GetXLogReplayRecPtr(&tli); - WalProposerInit(GetFlushRecPtr(NULL), GetSystemIdentifier()); -#else - GetXLogReplayRecPtr(&ThisTimeLineID); - WalProposerInit(GetFlushRecPtr(), GetSystemIdentifier()); -#endif - - last_reconnect_attempt = GetCurrentTimestamp(); - - application_name = (char *) "walproposer"; /* for - * synchronous_standby_names */ - am_wal_proposer = true; - am_walsender = true; - InitWalSender(); - InitProcessPhase2(); - - /* Create replication slot for WAL proposer if not exists */ - if (SearchNamedReplicationSlot(WAL_PROPOSER_SLOT_NAME, false) == NULL) - { - ReplicationSlotCreate(WAL_PROPOSER_SLOT_NAME, false, RS_PERSISTENT, false); - ReplicationSlotReserveWal(); - /* Write this slot to disk */ - ReplicationSlotMarkDirty(); - ReplicationSlotSave(); - ReplicationSlotRelease(); - } - - WalProposerStart(); -} - -/* - * Create new AppendRequest message and start sending it. This function is - * called from walsender every time the new WAL is available. - */ -void -WalProposerBroadcast(XLogRecPtr startpos, XLogRecPtr endpos) -{ - Assert(startpos == availableLsn && endpos >= availableLsn); - availableLsn = endpos; - BroadcastAppendRequest(); -} - -/* - * Advance the WAL proposer state machine, waiting each time for events to occur. - * Will exit only when latch is set, i.e. new WAL should be pushed from walsender - * to walproposer. 
- */ -void -WalProposerPoll(void) -{ - while (true) - { - Safekeeper *sk = NULL; - bool wait_timeout = false; - bool late_cv_trigger = false; - WaitEvent event = {0}; - int rc = 0; - TimestampTz now = GetCurrentTimestamp(); - long timeout = TimeToReconnect(now); - -#if PG_MAJORVERSION_NUM >= 16 - if (WalSndCtl != NULL) - ConditionVariablePrepareToSleep(&WalSndCtl->wal_flush_cv); -#endif - - /* - * Wait for a wait event to happen, or timeout: - * - Safekeeper socket can become available for READ or WRITE - * - Our latch got set, because - * * PG15-: We got woken up by a process triggering the WalSender - * * PG16+: WalSndCtl->wal_flush_cv was triggered - */ - rc = WaitEventSetWait(waitEvents, timeout, - &event, 1, WAIT_EVENT_WAL_SENDER_MAIN); -#if PG_MAJORVERSION_NUM >= 16 - if (WalSndCtl != NULL) - late_cv_trigger = ConditionVariableCancelSleep(); -#endif - - /* - * If wait is terminated by latch set (walsenders' latch is set on - * each wal flush), then exit loop. (no need for pm death check due to - * WL_EXIT_ON_PM_DEATH) - */ - if ((rc == 1 && event.events & WL_LATCH_SET) || late_cv_trigger) - { - /* Reset our latch */ - ResetLatch(MyLatch); - - break; - } - - /* - * If the event contains something that one of our safekeeper states - * was waiting for, we'll advance its state. - */ - if (rc == 1 && (event.events & (WL_SOCKET_MASK))) - { - sk = (Safekeeper *) event.user_data; - AdvancePollState(sk, event.events); - } - - /* - * If the timeout expired, attempt to reconnect to any safekeepers - * that we dropped - */ - ReconnectSafekeepers(); - - if (rc == 0) /* timeout expired */ - { - wait_timeout = true; - - /* - * Ensure flushrecptr is set to a recent value. This fixes a case - * where we've not been notified of new WAL records when we were - * planning on consuming them. 
- */ - if (!syncSafekeepers) { - XLogRecPtr flushed; - -#if PG_MAJORVERSION_NUM < 15 - flushed = GetFlushRecPtr(); -#else - flushed = GetFlushRecPtr(NULL); -#endif - if (flushed > availableLsn) - break; - } - } - - now = GetCurrentTimestamp(); - if (rc == 0 || TimeToReconnect(now) <= 0) /* timeout expired: poll state */ - { - TimestampTz now; - - /* - * If no WAL was generated during timeout (and we have already - * collected the quorum), then send pool message - */ - if (availableLsn != InvalidXLogRecPtr) - { - BroadcastAppendRequest(); - } - - /* - * Abandon connection attempts which take too long. - */ - now = GetCurrentTimestamp(); - for (int i = 0; i < n_safekeepers; i++) - { - Safekeeper *sk = &safekeeper[i]; - - if (TimestampDifferenceExceeds(sk->latestMsgReceivedAt, now, - wal_acceptor_connection_timeout)) - { - elog(WARNING, "terminating connection to safekeeper '%s:%s' in '%s' state: no messages received during the last %dms or connection attempt took longer than that", - sk->host, sk->port, FormatSafekeeperState(sk->state), wal_acceptor_connection_timeout); - ShutdownConnection(sk); - } - } - } - } -} - -/* - * Register a background worker proposing WAL to wal acceptors. 
- */ -static void -WalProposerRegister(void) -{ - BackgroundWorker bgw; - - if (*wal_acceptors_list == '\0') - return; - - memset(&bgw, 0, sizeof(bgw)); - bgw.bgw_flags = BGWORKER_SHMEM_ACCESS; - bgw.bgw_start_time = BgWorkerStart_RecoveryFinished; - snprintf(bgw.bgw_library_name, BGW_MAXLEN, "neon"); - snprintf(bgw.bgw_function_name, BGW_MAXLEN, "WalProposerMain"); - snprintf(bgw.bgw_name, BGW_MAXLEN, "WAL proposer"); - snprintf(bgw.bgw_type, BGW_MAXLEN, "WAL proposer"); - bgw.bgw_restart_time = 5; - bgw.bgw_notify_pid = 0; - bgw.bgw_main_arg = (Datum) 0; - - RegisterBackgroundWorker(&bgw); -} - -static void -WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId) +WalProposer * +WalProposerCreate(WalProposerConfig *config, walproposer_api api) { char *host; char *sep; char *port; + WalProposer *wp; - load_file("libpqwalreceiver", false); - if (WalReceiverFunctions == NULL) - elog(ERROR, "libpqwalreceiver didn't initialize correctly"); + wp = palloc0(sizeof(WalProposer)); + wp->config = config; + wp->api = api; - for (host = wal_acceptors_list; host != NULL && *host != '\0'; host = sep) + for (host = wp->config->safekeepers_list; host != NULL && *host != '\0'; host = sep) { port = strchr(host, ':'); if (port == NULL) @@ -545,118 +104,186 @@ WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId) sep = strchr(port, ','); if (sep != NULL) *sep++ = '\0'; - if (n_safekeepers + 1 >= MAX_SAFEKEEPERS) + if (wp->n_safekeepers + 1 >= MAX_SAFEKEEPERS) { elog(FATAL, "Too many safekeepers"); } - safekeeper[n_safekeepers].host = host; - safekeeper[n_safekeepers].port = port; - safekeeper[n_safekeepers].state = SS_OFFLINE; - safekeeper[n_safekeepers].conn = NULL; + wp->safekeeper[wp->n_safekeepers].host = host; + wp->safekeeper[wp->n_safekeepers].port = port; + wp->safekeeper[wp->n_safekeepers].state = SS_OFFLINE; + wp->safekeeper[wp->n_safekeepers].conn = NULL; + wp->safekeeper[wp->n_safekeepers].wp = wp; { - Safekeeper *sk = &safekeeper[n_safekeepers]; - int written = 0; + 
Safekeeper *sk = &wp->safekeeper[wp->n_safekeepers]; + int written = 0; written = snprintf((char *) &sk->conninfo, MAXCONNINFO, "host=%s port=%s dbname=replication options='-c timeline_id=%s tenant_id=%s'", - sk->host, sk->port, neon_timeline, neon_tenant); + sk->host, sk->port, wp->config->neon_timeline, wp->config->neon_tenant); if (written > MAXCONNINFO || written < 0) elog(FATAL, "could not create connection string for safekeeper %s:%s", sk->host, sk->port); } - initStringInfo(&safekeeper[n_safekeepers].outbuf); - safekeeper[n_safekeepers].xlogreader = XLogReaderAllocate(wal_segment_size, NULL, XL_ROUTINE(.segment_open = wal_segment_open,.segment_close = wal_segment_close), NULL); - if (safekeeper[n_safekeepers].xlogreader == NULL) + initStringInfo(&wp->safekeeper[wp->n_safekeepers].outbuf); + wp->safekeeper[wp->n_safekeepers].xlogreader = wp->api.wal_reader_allocate(); + if (wp->safekeeper[wp->n_safekeepers].xlogreader == NULL) elog(FATAL, "Failed to allocate xlog reader"); - safekeeper[n_safekeepers].flushWrite = false; - safekeeper[n_safekeepers].startStreamingAt = InvalidXLogRecPtr; - safekeeper[n_safekeepers].streamingAt = InvalidXLogRecPtr; - n_safekeepers += 1; + wp->safekeeper[wp->n_safekeepers].flushWrite = false; + wp->safekeeper[wp->n_safekeepers].startStreamingAt = InvalidXLogRecPtr; + wp->safekeeper[wp->n_safekeepers].streamingAt = InvalidXLogRecPtr; + wp->n_safekeepers += 1; } - if (n_safekeepers < 1) + if (wp->n_safekeepers < 1) { elog(FATAL, "Safekeepers addresses are not specified"); } - quorum = n_safekeepers / 2 + 1; + wp->quorum = wp->n_safekeepers / 2 + 1; /* Fill the greeting package */ - greetRequest.tag = 'g'; - greetRequest.protocolVersion = SK_PROTOCOL_VERSION; - greetRequest.pgVersion = PG_VERSION_NUM; - pg_strong_random(&greetRequest.proposerId, sizeof(greetRequest.proposerId)); - greetRequest.systemId = systemId; - if (!neon_timeline) + wp->greetRequest.tag = 'g'; + wp->greetRequest.protocolVersion = SK_PROTOCOL_VERSION; + 
wp->greetRequest.pgVersion = PG_VERSION_NUM; + wp->api.strong_random(&wp->greetRequest.proposerId, sizeof(wp->greetRequest.proposerId)); + wp->greetRequest.systemId = wp->config->systemId; + if (!wp->config->neon_timeline) elog(FATAL, "neon.timeline_id is not provided"); - if (*neon_timeline != '\0' && - !HexDecodeString(greetRequest.timeline_id, neon_timeline, 16)) - elog(FATAL, "Could not parse neon.timeline_id, %s", neon_timeline); - if (!neon_tenant) + if (*wp->config->neon_timeline != '\0' && + !HexDecodeString(wp->greetRequest.timeline_id, wp->config->neon_timeline, 16)) + elog(FATAL, "Could not parse neon.timeline_id, %s", wp->config->neon_timeline); + if (!wp->config->neon_tenant) elog(FATAL, "neon.tenant_id is not provided"); - if (*neon_tenant != '\0' && - !HexDecodeString(greetRequest.tenant_id, neon_tenant, 16)) - elog(FATAL, "Could not parse neon.tenant_id, %s", neon_tenant); + if (*wp->config->neon_tenant != '\0' && + !HexDecodeString(wp->greetRequest.tenant_id, wp->config->neon_tenant, 16)) + elog(FATAL, "Could not parse neon.tenant_id, %s", wp->config->neon_tenant); -#if PG_VERSION_NUM >= 150000 - /* FIXME don't use hardcoded timeline id */ - greetRequest.timeline = 1; -#else - greetRequest.timeline = ThisTimeLineID; -#endif - greetRequest.walSegSize = wal_segment_size; + wp->greetRequest.timeline = wp->api.get_timeline_id(); + wp->greetRequest.walSegSize = wp->config->wal_segment_size; - InitEventSet(); -} + wp->api.init_event_set(wp->n_safekeepers); -static void -WalProposerStart(void) -{ - - /* Initiate connections to all safekeeper nodes */ - for (int i = 0; i < n_safekeepers; i++) - { - ResetConnection(&safekeeper[i]); - } - - WalProposerLoop(); -} - -static void -WalProposerLoop(void) -{ - while (true) - WalProposerPoll(); -} - -/* Initializes the internal event set, provided that it is currently null */ -static void -InitEventSet(void) -{ - if (waitEvents) - elog(FATAL, "double-initialization of event set"); - - waitEvents = 
CreateWaitEventSet(TopMemoryContext, 2 + n_safekeepers); - AddWaitEventToSet(waitEvents, WL_LATCH_SET, PGINVALID_SOCKET, - MyLatch, NULL); - AddWaitEventToSet(waitEvents, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, - NULL, NULL); + return wp; } /* - * Updates the events we're already waiting on for the safekeeper, setting it to - * the provided `events` - * - * This function is called any time the safekeeper's state switches to one where - * it has to wait to continue. This includes the full body of AdvancePollState - * and calls to IO helper functions. + * Create new AppendRequest message and start sending it. This function is + * called from walsender every time the new WAL is available. */ -static void -UpdateEventSet(Safekeeper *sk, uint32 events) +void +WalProposerBroadcast(WalProposer *wp, XLogRecPtr startpos, XLogRecPtr endpos) { - /* eventPos = -1 when we don't have an event */ - Assert(sk->eventPos != -1); + Assert(startpos == wp->availableLsn && endpos >= wp->availableLsn); + wp->availableLsn = endpos; + BroadcastAppendRequest(wp); +} - ModifyWaitEvent(waitEvents, sk->eventPos, events, NULL); +/* + * Advance the WAL proposer state machine, waiting each time for events to occur. + * Will exit only when latch is set, i.e. new WAL should be pushed from walsender + * to walproposer. + */ +void +WalProposerPoll(WalProposer *wp) +{ + while (true) + { + Safekeeper *sk = NULL; + int rc = 0; + uint32 events = 0; + TimestampTz now = wp->api.get_current_timestamp(); + long timeout = TimeToReconnect(wp, now); + + rc = wp->api.wait_event_set(timeout, &sk, &events); + + /* Exit loop if latch is set (we got new WAL) */ + if ((rc == 1 && events & WL_LATCH_SET)) + break; + + /* + * If the event contains something that one of our safekeeper states + * was waiting for, we'll advance its state. 
+ */ + if (rc == 1 && (events & WL_SOCKET_MASK)) + { + Assert(sk != NULL); + AdvancePollState(sk, events); + } + + /* + * If the timeout expired, attempt to reconnect to any safekeepers + * that we dropped + */ + ReconnectSafekeepers(wp); + + if (rc == 0) /* timeout expired */ + { + /* + * Ensure flushrecptr is set to a recent value. This fixes a case + * where we've not been notified of new WAL records when we were + * planning on consuming them. + */ + if (!wp->config->syncSafekeepers) + { + XLogRecPtr flushed = wp->api.get_flush_rec_ptr(); + + if (flushed > wp->availableLsn) + break; + } + } + + now = wp->api.get_current_timestamp(); + /* timeout expired: poll state */ + if (rc == 0 || TimeToReconnect(wp, now) <= 0) + { + TimestampTz now; + + /* + * If no WAL was generated during timeout (and we have already + * collected the quorum), then send empty keepalive message + */ + if (wp->availableLsn != InvalidXLogRecPtr) + { + BroadcastAppendRequest(wp); + } + + /* + * Abandon connection attempts which take too long. 
+ */ + now = wp->api.get_current_timestamp(); + for (int i = 0; i < wp->n_safekeepers; i++) + { + Safekeeper *sk = &wp->safekeeper[i]; + + if (TimestampDifferenceExceeds(sk->latestMsgReceivedAt, now, + wp->config->safekeeper_connection_timeout)) + { + elog(WARNING, "terminating connection to safekeeper '%s:%s' in '%s' state: no messages received during the last %dms or connection attempt took longer than that", + sk->host, sk->port, FormatSafekeeperState(sk->state), wp->config->safekeeper_connection_timeout); + ShutdownConnection(sk); + } + } + } + } +} + +void +WalProposerStart(WalProposer *wp) +{ + + /* Initiate connections to all safekeeper nodes */ + for (int i = 0; i < wp->n_safekeepers; i++) + { + ResetConnection(&wp->safekeeper[i]); + } + + WalProposerLoop(wp); +} + +static void +WalProposerLoop(WalProposer *wp) +{ + while (true) + WalProposerPoll(wp); } /* @@ -667,24 +294,22 @@ UpdateEventSet(Safekeeper *sk, uint32 events) static void HackyRemoveWalProposerEvent(Safekeeper *to_remove) { + WalProposer *wp = to_remove->wp; + /* Remove the existing event set */ - if (waitEvents) - { - FreeWaitEventSet(waitEvents); - waitEvents = NULL; - } + wp->api.free_event_set(); /* Re-initialize it without adding any safekeeper events */ - InitEventSet(); + wp->api.init_event_set(wp->n_safekeepers); /* * loop through the existing safekeepers. If they aren't the one we're * removing, and if they have a socket we can use, re-add the applicable * events. 
*/ - for (int i = 0; i < n_safekeepers; i++) + for (int i = 0; i < wp->n_safekeepers; i++) { uint32 desired_events = WL_NO_EVENTS; - Safekeeper *sk = &safekeeper[i]; + Safekeeper *sk = &wp->safekeeper[i]; sk->eventPos = -1; @@ -695,7 +320,8 @@ HackyRemoveWalProposerEvent(Safekeeper *to_remove) if (sk->conn != NULL) { desired_events = SafekeeperStateDesiredEvents(sk->state); - sk->eventPos = AddWaitEventToSet(waitEvents, desired_events, walprop_socket(sk->conn), NULL, sk); + /* will set sk->eventPos */ + wp->api.add_safekeeper_event_set(sk, desired_events); } } } @@ -705,7 +331,7 @@ static void ShutdownConnection(Safekeeper *sk) { if (sk->conn) - walprop_finish(sk->conn); + sk->wp->api.conn_finish(sk->conn); sk->conn = NULL; sk->state = SS_OFFLINE; sk->flushWrite = false; @@ -727,7 +353,7 @@ ShutdownConnection(Safekeeper *sk) static void ResetConnection(Safekeeper *sk) { - pgsocket sock; /* socket of the new connection */ + WalProposer *wp = sk->wp; if (sk->state != SS_OFFLINE) { @@ -737,7 +363,7 @@ ResetConnection(Safekeeper *sk) /* * Try to establish new connection */ - sk->conn = walprop_connect_start((char *) &sk->conninfo, neon_auth_token); + sk->conn = wp->api.conn_connect_start((char *) &sk->conninfo); /* * "If the result is null, then libpq has been unable to allocate a new @@ -751,7 +377,7 @@ ResetConnection(Safekeeper *sk) * PQconnectPoll. Before we do that though, we need to check that it * didn't immediately fail. 
*/ - if (walprop_status(sk->conn) == WP_CONNECTION_BAD) + if (wp->api.conn_status(sk->conn) == WP_CONNECTION_BAD) { /*--- * According to libpq docs: @@ -763,13 +389,13 @@ ResetConnection(Safekeeper *sk) * https://www.postgresql.org/docs/devel/libpq-connect.html#LIBPQ-PQCONNECTSTARTPARAMS */ elog(WARNING, "Immediate failure to connect with node '%s:%s':\n\terror: %s", - sk->host, sk->port, walprop_error_message(sk->conn)); + sk->host, sk->port, wp->api.conn_error_message(sk->conn)); /* * Even though the connection failed, we still need to clean up the * object */ - walprop_finish(sk->conn); + wp->api.conn_finish(sk->conn); sk->conn = NULL; return; } @@ -790,10 +416,9 @@ ResetConnection(Safekeeper *sk) elog(LOG, "connecting with node %s:%s", sk->host, sk->port); sk->state = SS_CONNECTING_WRITE; - sk->latestMsgReceivedAt = GetCurrentTimestamp(); + sk->latestMsgReceivedAt = wp->api.get_current_timestamp(); - sock = walprop_socket(sk->conn); - sk->eventPos = AddWaitEventToSet(waitEvents, WL_SOCKET_WRITEABLE, sock, NULL, sk); + wp->api.add_safekeeper_event_set(sk, WL_SOCKET_WRITEABLE); return; } @@ -803,16 +428,16 @@ ResetConnection(Safekeeper *sk) * (do we actually need this?). 
*/ static long -TimeToReconnect(TimestampTz now) +TimeToReconnect(WalProposer *wp, TimestampTz now) { TimestampTz passed; TimestampTz till_reconnect; - if (wal_acceptor_reconnect_timeout <= 0) + if (wp->config->safekeeper_reconnect_timeout <= 0) return -1; - passed = now - last_reconnect_attempt; - till_reconnect = wal_acceptor_reconnect_timeout * 1000 - passed; + passed = now - wp->last_reconnect_attempt; + till_reconnect = wp->config->safekeeper_reconnect_timeout * 1000 - passed; if (till_reconnect <= 0) return 0; return (long) (till_reconnect / 1000); @@ -820,17 +445,17 @@ TimeToReconnect(TimestampTz now) /* If the timeout has expired, attempt to reconnect to all offline safekeepers */ static void -ReconnectSafekeepers(void) +ReconnectSafekeepers(WalProposer *wp) { - TimestampTz now = GetCurrentTimestamp(); + TimestampTz now = wp->api.get_current_timestamp(); - if (TimeToReconnect(now) == 0) + if (TimeToReconnect(wp, now) == 0) { - last_reconnect_attempt = now; - for (int i = 0; i < n_safekeepers; i++) + wp->last_reconnect_attempt = now; + for (int i = 0; i < wp->n_safekeepers; i++) { - if (safekeeper[i].state == SS_OFFLINE) - ResetConnection(&safekeeper[i]); + if (wp->safekeeper[i].state == SS_OFFLINE) + ResetConnection(&wp->safekeeper[i]); } } } @@ -938,7 +563,8 @@ AdvancePollState(Safekeeper *sk, uint32 events) static void HandleConnectionEvent(Safekeeper *sk) { - WalProposerConnectPollStatusType result = walprop_connect_poll(sk->conn); + WalProposer *wp = sk->wp; + WalProposerConnectPollStatusType result = wp->api.conn_connect_poll(sk->conn); /* The new set of events we'll wait on, after updating */ uint32 new_events = WL_NO_EVENTS; @@ -948,7 +574,8 @@ HandleConnectionEvent(Safekeeper *sk) case WP_CONN_POLLING_OK: elog(LOG, "connected with node %s:%s", sk->host, sk->port); - sk->latestMsgReceivedAt = GetCurrentTimestamp(); + sk->latestMsgReceivedAt = wp->api.get_current_timestamp(); + /* * We have to pick some event to update event set. 
We'll * eventually need the socket to be readable, so we go with that. @@ -970,7 +597,7 @@ HandleConnectionEvent(Safekeeper *sk) case WP_CONN_POLLING_FAILED: elog(WARNING, "failed to connect to node '%s:%s': %s", - sk->host, sk->port, walprop_error_message(sk->conn)); + sk->host, sk->port, wp->api.conn_error_message(sk->conn)); /* * If connecting failed, we don't want to restart the connection @@ -987,7 +614,7 @@ HandleConnectionEvent(Safekeeper *sk) * old event and re-register an event on the new socket. */ HackyRemoveWalProposerEvent(sk); - sk->eventPos = AddWaitEventToSet(waitEvents, new_events, walprop_socket(sk->conn), NULL, sk); + wp->api.add_safekeeper_event_set(sk, new_events); /* If we successfully connected, send START_WAL_PUSH query */ if (result == WP_CONN_POLLING_OK) @@ -1002,21 +629,25 @@ HandleConnectionEvent(Safekeeper *sk) static void SendStartWALPush(Safekeeper *sk) { - if (!walprop_send_query(sk->conn, "START_WAL_PUSH")) + WalProposer *wp = sk->wp; + + if (!wp->api.conn_send_query(sk->conn, "START_WAL_PUSH")) { elog(WARNING, "Failed to send 'START_WAL_PUSH' query to safekeeper %s:%s: %s", - sk->host, sk->port, walprop_error_message(sk->conn)); + sk->host, sk->port, wp->api.conn_error_message(sk->conn)); ShutdownConnection(sk); return; } sk->state = SS_WAIT_EXEC_RESULT; - UpdateEventSet(sk, WL_SOCKET_READABLE); + wp->api.update_event_set(sk, WL_SOCKET_READABLE); } static void RecvStartWALPushResult(Safekeeper *sk) { - switch (walprop_get_query_result(sk->conn)) + WalProposer *wp = sk->wp; + + switch (wp->api.conn_get_query_result(sk->conn)) { /* * Successful result, move on to starting the handshake @@ -1040,7 +671,7 @@ RecvStartWALPushResult(Safekeeper *sk) case WP_EXEC_FAILED: elog(WARNING, "Failed to send query to safekeeper %s:%s: %s", - sk->host, sk->port, walprop_error_message(sk->conn)); + sk->host, sk->port, wp->api.conn_error_message(sk->conn)); ShutdownConnection(sk); return; @@ -1069,19 +700,21 @@ SendProposerGreeting(Safekeeper *sk) * 
On failure, logging & resetting the connection is handled. We just need * to handle the control flow. */ - BlockingWrite(sk, &greetRequest, sizeof(greetRequest), SS_HANDSHAKE_RECV); + BlockingWrite(sk, &sk->wp->greetRequest, sizeof(sk->wp->greetRequest), SS_HANDSHAKE_RECV); } static void RecvAcceptorGreeting(Safekeeper *sk) { + WalProposer *wp = sk->wp; + /* * If our reading doesn't immediately succeed, any necessary error * handling or state setting is taken care of. We can leave any other work * until later. */ sk->greetResponse.apm.tag = 'g'; - if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) & sk->greetResponse)) + if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->greetResponse)) return; elog(LOG, "received AcceptorGreeting from safekeeper %s:%s", sk->host, sk->port); @@ -1089,37 +722,37 @@ RecvAcceptorGreeting(Safekeeper *sk) /* Protocol is all good, move to voting. */ sk->state = SS_VOTING; - /* + /* * Note: it would be better to track the counter on per safekeeper basis, - * but at worst walproposer would restart with 'term rejected', so leave as - * is for now. + * but at worst walproposer would restart with 'term rejected', so leave + * as is for now. */ - ++n_connected; - if (n_connected <= quorum) + ++wp->n_connected; + if (wp->n_connected <= wp->quorum) { /* We're still collecting terms from the majority. */ - propTerm = Max(sk->greetResponse.term, propTerm); + wp->propTerm = Max(sk->greetResponse.term, wp->propTerm); /* Quorum is acquried, prepare the vote request. 
*/ - if (n_connected == quorum) + if (wp->n_connected == wp->quorum) { - propTerm++; - elog(LOG, "proposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT, quorum, propTerm); + wp->propTerm++; + elog(LOG, "proposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT, wp->quorum, wp->propTerm); - voteRequest = (VoteRequest) + wp->voteRequest = (VoteRequest) { .tag = 'v', - .term = propTerm + .term = wp->propTerm }; - memcpy(voteRequest.proposerId.data, greetRequest.proposerId.data, UUID_LEN); + memcpy(wp->voteRequest.proposerId.data, wp->greetRequest.proposerId.data, UUID_LEN); } } - else if (sk->greetResponse.term > propTerm) + else if (sk->greetResponse.term > wp->propTerm) { /* Another compute with higher term is running. */ elog(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", sk->host, sk->port, - sk->greetResponse.term, propTerm); + sk->greetResponse.term, wp->propTerm); } /* @@ -1128,27 +761,27 @@ RecvAcceptorGreeting(Safekeeper *sk) * * If we do have quorum, we can start an election. */ - if (n_connected < quorum) + if (wp->n_connected < wp->quorum) { /* * SS_VOTING is an idle state; read-ready indicates the connection * closed. */ - UpdateEventSet(sk, WL_SOCKET_READABLE); + wp->api.update_event_set(sk, WL_SOCKET_READABLE); } else { /* * Now send voting request to the cohort and wait responses */ - for (int j = 0; j < n_safekeepers; j++) + for (int j = 0; j < wp->n_safekeepers; j++) { /* * Remember: SS_VOTING indicates that the safekeeper is * participating in voting, but hasn't sent anything yet. 
*/ - if (safekeeper[j].state == SS_VOTING) - SendVoteRequest(&safekeeper[j]); + if (wp->safekeeper[j].state == SS_VOTING) + SendVoteRequest(&wp->safekeeper[j]); } } } @@ -1156,10 +789,12 @@ RecvAcceptorGreeting(Safekeeper *sk) static void SendVoteRequest(Safekeeper *sk) { + WalProposer *wp = sk->wp; + /* We have quorum for voting, send our vote request */ - elog(LOG, "requesting vote from %s:%s for term " UINT64_FORMAT, sk->host, sk->port, voteRequest.term); + elog(LOG, "requesting vote from %s:%s for term " UINT64_FORMAT, sk->host, sk->port, wp->voteRequest.term); /* On failure, logging & resetting is handled */ - if (!BlockingWrite(sk, &voteRequest, sizeof(voteRequest), SS_WAIT_VERDICT)) + if (!BlockingWrite(sk, &wp->voteRequest, sizeof(wp->voteRequest), SS_WAIT_VERDICT)) return; /* If successful, wait for read-ready with SS_WAIT_VERDICT */ @@ -1168,8 +803,10 @@ SendVoteRequest(Safekeeper *sk) static void RecvVoteResponse(Safekeeper *sk) { + WalProposer *wp = sk->wp; + sk->voteResponse.apm.tag = 'v'; - if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) & sk->voteResponse)) + if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->voteResponse)) return; elog(LOG, @@ -1185,21 +822,21 @@ RecvVoteResponse(Safekeeper *sk) * we are not elected yet and thus need the vote. */ if ((!sk->voteResponse.voteGiven) && - (sk->voteResponse.term > propTerm || n_votes < quorum)) + (sk->voteResponse.term > wp->propTerm || wp->n_votes < wp->quorum)) { elog(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", sk->host, sk->port, - sk->voteResponse.term, propTerm); + sk->voteResponse.term, wp->propTerm); } - Assert(sk->voteResponse.term == propTerm); + Assert(sk->voteResponse.term == wp->propTerm); /* Handshake completed, do we have quorum? 
*/ - n_votes++; - if (n_votes < quorum) + wp->n_votes++; + if (wp->n_votes < wp->quorum) { sk->state = SS_IDLE; /* can't do much yet, no quorum */ } - else if (n_votes > quorum) + else if (wp->n_votes > wp->quorum) { /* recovery already performed, just start streaming */ SendProposerElected(sk); @@ -1207,10 +844,10 @@ RecvVoteResponse(Safekeeper *sk) else { sk->state = SS_IDLE; - UpdateEventSet(sk, WL_SOCKET_READABLE); /* Idle states wait for - * read-ready */ + /* Idle state waits for read-ready events */ + wp->api.update_event_set(sk, WL_SOCKET_READABLE); - HandleElectedProposer(); + HandleElectedProposer(sk->wp); } } @@ -1222,36 +859,36 @@ RecvVoteResponse(Safekeeper *sk) * replication from walsender. */ static void -HandleElectedProposer(void) +HandleElectedProposer(WalProposer *wp) { - DetermineEpochStartLsn(); + DetermineEpochStartLsn(wp); /* * Check if not all safekeepers are up-to-date, we need to download WAL * needed to synchronize them */ - if (truncateLsn < propEpochStartLsn) + if (wp->truncateLsn < wp->propEpochStartLsn) { elog(LOG, "start recovery because truncateLsn=%X/%X is not " "equal to epochStartLsn=%X/%X", - LSN_FORMAT_ARGS(truncateLsn), - LSN_FORMAT_ARGS(propEpochStartLsn)); + LSN_FORMAT_ARGS(wp->truncateLsn), + LSN_FORMAT_ARGS(wp->propEpochStartLsn)); /* Perform recovery */ - if (!WalProposerRecovery(donor, greetRequest.timeline, truncateLsn, propEpochStartLsn)) + if (!wp->api.recovery_download(&wp->safekeeper[wp->donor], wp->greetRequest.timeline, wp->truncateLsn, wp->propEpochStartLsn)) elog(FATAL, "Failed to recover state"); } - else if (syncSafekeepers) + else if (wp->config->syncSafekeepers) { /* Sync is not needed: just exit */ - fprintf(stdout, "%X/%X\n", LSN_FORMAT_ARGS(propEpochStartLsn)); - exit(0); + wp->api.finish_sync_safekeepers(wp->propEpochStartLsn); + /* unreachable */ } - for (int i = 0; i < n_safekeepers; i++) + for (int i = 0; i < wp->n_safekeepers; i++) { - if (safekeeper[i].state == SS_IDLE) - 
SendProposerElected(&safekeeper[i]); + if (wp->safekeeper[i].state == SS_IDLE) + SendProposerElected(&wp->safekeeper[i]); } /* @@ -1260,7 +897,7 @@ HandleElectedProposer(void) * because that state is used only for quorum waiting. */ - if (syncSafekeepers) + if (wp->config->syncSafekeepers) { /* * Send empty message to enforce receiving feedback even from nodes @@ -1268,19 +905,19 @@ HandleElectedProposer(void) * epoch which finishes sync-safeekepers who doesn't generate any real * new records. Will go away once we switch to async acks. */ - BroadcastAppendRequest(); + BroadcastAppendRequest(wp); /* keep polling until all safekeepers are synced */ return; } - WalProposerStartStreaming(propEpochStartLsn); + wp->api.start_streaming(wp, wp->propEpochStartLsn); /* Should not return here */ } /* latest term in TermHistory, or 0 is there is no entries */ static term_t -GetHighestTerm(TermHistory * th) +GetHighestTerm(TermHistory *th) { return th->n_entries > 0 ? th->entries[th->n_entries - 1].term : 0; } @@ -1294,9 +931,9 @@ GetEpoch(Safekeeper *sk) /* If LSN points to the page header, skip it */ static XLogRecPtr -SkipXLogPageHeader(XLogRecPtr lsn) +SkipXLogPageHeader(WalProposer *wp, XLogRecPtr lsn) { - if (XLogSegmentOffset(lsn, wal_segment_size) == 0) + if (XLogSegmentOffset(lsn, wp->config->wal_segment_size) == 0) { lsn += SizeOfXLogLongPHD; } @@ -1316,41 +953,41 @@ SkipXLogPageHeader(XLogRecPtr lsn) * only for skipping recovery). 
*/ static void -DetermineEpochStartLsn(void) +DetermineEpochStartLsn(WalProposer *wp) { TermHistory *dth; - propEpochStartLsn = InvalidXLogRecPtr; - donorEpoch = 0; - truncateLsn = InvalidXLogRecPtr; - timelineStartLsn = InvalidXLogRecPtr; + wp->propEpochStartLsn = InvalidXLogRecPtr; + wp->donorEpoch = 0; + wp->truncateLsn = InvalidXLogRecPtr; + wp->timelineStartLsn = InvalidXLogRecPtr; - for (int i = 0; i < n_safekeepers; i++) + for (int i = 0; i < wp->n_safekeepers; i++) { - if (safekeeper[i].state == SS_IDLE) + if (wp->safekeeper[i].state == SS_IDLE) { - if (GetEpoch(&safekeeper[i]) > donorEpoch || - (GetEpoch(&safekeeper[i]) == donorEpoch && - safekeeper[i].voteResponse.flushLsn > propEpochStartLsn)) + if (GetEpoch(&wp->safekeeper[i]) > wp->donorEpoch || + (GetEpoch(&wp->safekeeper[i]) == wp->donorEpoch && + wp->safekeeper[i].voteResponse.flushLsn > wp->propEpochStartLsn)) { - donorEpoch = GetEpoch(&safekeeper[i]); - propEpochStartLsn = safekeeper[i].voteResponse.flushLsn; - donor = i; + wp->donorEpoch = GetEpoch(&wp->safekeeper[i]); + wp->propEpochStartLsn = wp->safekeeper[i].voteResponse.flushLsn; + wp->donor = i; } - truncateLsn = Max(safekeeper[i].voteResponse.truncateLsn, truncateLsn); + wp->truncateLsn = Max(wp->safekeeper[i].voteResponse.truncateLsn, wp->truncateLsn); - if (safekeeper[i].voteResponse.timelineStartLsn != InvalidXLogRecPtr) + if (wp->safekeeper[i].voteResponse.timelineStartLsn != InvalidXLogRecPtr) { /* timelineStartLsn should be the same everywhere or unknown */ - if (timelineStartLsn != InvalidXLogRecPtr && - timelineStartLsn != safekeeper[i].voteResponse.timelineStartLsn) + if (wp->timelineStartLsn != InvalidXLogRecPtr && + wp->timelineStartLsn != wp->safekeeper[i].voteResponse.timelineStartLsn) { elog(WARNING, "inconsistent timelineStartLsn: current %X/%X, received %X/%X", - LSN_FORMAT_ARGS(timelineStartLsn), - LSN_FORMAT_ARGS(safekeeper[i].voteResponse.timelineStartLsn)); + LSN_FORMAT_ARGS(wp->timelineStartLsn), + 
LSN_FORMAT_ARGS(wp->safekeeper[i].voteResponse.timelineStartLsn)); } - timelineStartLsn = safekeeper[i].voteResponse.timelineStartLsn; + wp->timelineStartLsn = wp->safekeeper[i].voteResponse.timelineStartLsn; } } } @@ -1359,14 +996,14 @@ DetermineEpochStartLsn(void) * If propEpochStartLsn is 0 everywhere, we are bootstrapping -- nothing * was committed yet. Start streaming then from the basebackup LSN. */ - if (propEpochStartLsn == InvalidXLogRecPtr && !syncSafekeepers) + if (wp->propEpochStartLsn == InvalidXLogRecPtr && !wp->config->syncSafekeepers) { - propEpochStartLsn = truncateLsn = GetRedoStartLsn(); - if (timelineStartLsn == InvalidXLogRecPtr) + wp->propEpochStartLsn = wp->truncateLsn = wp->api.get_redo_start_lsn(); + if (wp->timelineStartLsn == InvalidXLogRecPtr) { - timelineStartLsn = GetRedoStartLsn(); + wp->timelineStartLsn = wp->api.get_redo_start_lsn(); } - elog(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(propEpochStartLsn)); + elog(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(wp->propEpochStartLsn)); } /* @@ -1374,46 +1011,48 @@ DetermineEpochStartLsn(void) * some connected safekeeper; it must have carried truncateLsn pointing to * the first record. */ - Assert((truncateLsn != InvalidXLogRecPtr) || - (syncSafekeepers && truncateLsn == propEpochStartLsn)); + Assert((wp->truncateLsn != InvalidXLogRecPtr) || + (wp->config->syncSafekeepers && wp->truncateLsn == wp->propEpochStartLsn)); /* * We will be generating WAL since propEpochStartLsn, so we should set * availableLsn to mark this LSN as the latest available position. */ - availableLsn = propEpochStartLsn; + wp->availableLsn = wp->propEpochStartLsn; /* * Proposer's term history is the donor's + its own entry. 
*/ - dth = &safekeeper[donor].voteResponse.termHistory; - propTermHistory.n_entries = dth->n_entries + 1; - propTermHistory.entries = palloc(sizeof(TermSwitchEntry) * propTermHistory.n_entries); - memcpy(propTermHistory.entries, dth->entries, sizeof(TermSwitchEntry) * dth->n_entries); - propTermHistory.entries[propTermHistory.n_entries - 1].term = propTerm; - propTermHistory.entries[propTermHistory.n_entries - 1].lsn = propEpochStartLsn; + dth = &wp->safekeeper[wp->donor].voteResponse.termHistory; + wp->propTermHistory.n_entries = dth->n_entries + 1; + wp->propTermHistory.entries = palloc(sizeof(TermSwitchEntry) * wp->propTermHistory.n_entries); + memcpy(wp->propTermHistory.entries, dth->entries, sizeof(TermSwitchEntry) * dth->n_entries); + wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].term = wp->propTerm; + wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].lsn = wp->propEpochStartLsn; elog(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X", - quorum, - propTerm, - LSN_FORMAT_ARGS(propEpochStartLsn), - safekeeper[donor].host, safekeeper[donor].port, - LSN_FORMAT_ARGS(truncateLsn)); + wp->quorum, + wp->propTerm, + LSN_FORMAT_ARGS(wp->propEpochStartLsn), + wp->safekeeper[wp->donor].host, wp->safekeeper[wp->donor].port, + LSN_FORMAT_ARGS(wp->truncateLsn)); /* * Ensure the basebackup we are running (at RedoStartLsn) matches LSN * since which we are going to write according to the consensus. If not, * we must bail out, as clog and other non rel data is inconsistent. */ - if (!syncSafekeepers) + if (!wp->config->syncSafekeepers) { + WalproposerShmemState *walprop_shared = wp->api.get_shmem_state(); + /* * Basebackup LSN always points to the beginning of the record (not * the page), as StartupXLOG most probably wants it this way. * Safekeepers don't skip header as they need continious stream of * data, so correct LSN for comparison. 
*/ - if (SkipXLogPageHeader(propEpochStartLsn) != GetRedoStartLsn()) + if (SkipXLogPageHeader(wp, wp->propEpochStartLsn) != wp->api.get_redo_start_lsn()) { /* * However, allow to proceed if previously elected leader was me; @@ -1425,119 +1064,14 @@ DetermineEpochStartLsn(void) { elog(PANIC, "collected propEpochStartLsn %X/%X, but basebackup LSN %X/%X", - LSN_FORMAT_ARGS(propEpochStartLsn), - LSN_FORMAT_ARGS(GetRedoStartLsn())); + LSN_FORMAT_ARGS(wp->propEpochStartLsn), + LSN_FORMAT_ARGS(wp->api.get_redo_start_lsn())); } } - walprop_shared->mineLastElectedTerm = propTerm; + walprop_shared->mineLastElectedTerm = wp->propTerm; } } -/* - * Receive WAL from most advanced safekeeper - */ -static bool -WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos) -{ - char *err; - WalReceiverConn *wrconn; - WalRcvStreamOptions options; - char conninfo[MAXCONNINFO]; - - if (!neon_auth_token) - { - memcpy(conninfo, safekeeper[donor].conninfo, MAXCONNINFO); - } - else - { - int written = 0; - - written = snprintf((char *) conninfo, MAXCONNINFO, "password=%s %s", neon_auth_token, safekeeper[donor].conninfo); - if (written > MAXCONNINFO || written < 0) - elog(FATAL, "could not append password to the safekeeper connection string"); - } - -#if PG_MAJORVERSION_NUM < 16 - wrconn = walrcv_connect(conninfo, false, "wal_proposer_recovery", &err); -#else - wrconn = walrcv_connect(conninfo, false, false, "wal_proposer_recovery", &err); -#endif - - if (!wrconn) - { - ereport(WARNING, - (errmsg("could not connect to WAL acceptor %s:%s: %s", - safekeeper[donor].host, safekeeper[donor].port, - err))); - return false; - } - elog(LOG, - "start recovery from %s:%s starting from %X/%08X till %X/%08X timeline " - "%d", - safekeeper[donor].host, safekeeper[donor].port, (uint32) (startpos >> 32), - (uint32) startpos, (uint32) (endpos >> 32), (uint32) endpos, timeline); - - options.logical = false; - options.startpoint = startpos; - options.slotname = NULL; - 
options.proto.physical.startpointTLI = timeline; - - if (walrcv_startstreaming(wrconn, &options)) - { - XLogRecPtr rec_start_lsn; - XLogRecPtr rec_end_lsn = 0; - int len; - char *buf; - pgsocket wait_fd = PGINVALID_SOCKET; - - while ((len = walrcv_receive(wrconn, &buf, &wait_fd)) >= 0) - { - if (len == 0) - { - (void) WaitLatchOrSocket( - MyLatch, WL_EXIT_ON_PM_DEATH | WL_SOCKET_READABLE, wait_fd, - -1, WAIT_EVENT_WAL_RECEIVER_MAIN); - } - else - { - Assert(buf[0] == 'w' || buf[0] == 'k'); - if (buf[0] == 'k') - continue; /* keepalive */ - memcpy(&rec_start_lsn, &buf[XLOG_HDR_START_POS], - sizeof rec_start_lsn); - rec_start_lsn = pg_ntoh64(rec_start_lsn); - rec_end_lsn = rec_start_lsn + len - XLOG_HDR_SIZE; - - /* write WAL to disk */ - XLogWalPropWrite(&buf[XLOG_HDR_SIZE], len - XLOG_HDR_SIZE, rec_start_lsn); - - ereport(DEBUG1, - (errmsg("Recover message %X/%X length %d", - LSN_FORMAT_ARGS(rec_start_lsn), len))); - if (rec_end_lsn >= endpos) - break; - } - } - ereport(LOG, - (errmsg("end of replication stream at %X/%X: %m", - LSN_FORMAT_ARGS(rec_end_lsn)))); - walrcv_disconnect(wrconn); - - /* failed to receive all WAL till endpos */ - if (rec_end_lsn < endpos) - return false; - } - else - { - ereport(LOG, - (errmsg("primary server contains no more WAL on requested timeline %u LSN %X/%08X", - timeline, (uint32) (startpos >> 32), (uint32) startpos))); - return false; - } - - return true; -} - /* * Determine for sk the starting streaming point and send it message * 1) Announcing we are elected proposer (which immediately advances epoch if @@ -1550,6 +1084,7 @@ WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRec static void SendProposerElected(Safekeeper *sk) { + WalProposer *wp = sk->wp; ProposerElected msg; TermHistory *th; term_t lastCommonTerm; @@ -1567,22 +1102,22 @@ SendProposerElected(Safekeeper *sk) th = &sk->voteResponse.termHistory; /* We must start somewhere. 
*/ - Assert(propTermHistory.n_entries >= 1); + Assert(wp->propTermHistory.n_entries >= 1); - for (i = 0; i < Min(propTermHistory.n_entries, th->n_entries); i++) + for (i = 0; i < Min(wp->propTermHistory.n_entries, th->n_entries); i++) { - if (propTermHistory.entries[i].term != th->entries[i].term) + if (wp->propTermHistory.entries[i].term != th->entries[i].term) break; /* term must begin everywhere at the same point */ - Assert(propTermHistory.entries[i].lsn == th->entries[i].lsn); + Assert(wp->propTermHistory.entries[i].lsn == th->entries[i].lsn); } i--; /* step back to the last common term */ if (i < 0) { /* safekeeper is empty or no common point, start from the beginning */ - sk->startStreamingAt = propTermHistory.entries[0].lsn; + sk->startStreamingAt = wp->propTermHistory.entries[0].lsn; - if (sk->startStreamingAt < truncateLsn) + if (sk->startStreamingAt < wp->truncateLsn) { /* * There's a gap between the WAL starting point and a truncateLsn, @@ -1603,10 +1138,10 @@ SendProposerElected(Safekeeper *sk) * safekeeper, and it's aligned to the WAL record, so we can * safely start streaming from this point. */ - sk->startStreamingAt = truncateLsn; + sk->startStreamingAt = wp->truncateLsn; elog(WARNING, "empty safekeeper joined cluster as %s:%s, historyStart=%X/%X, sk->startStreamingAt=%X/%X", - sk->host, sk->port, LSN_FORMAT_ARGS(propTermHistory.entries[0].lsn), + sk->host, sk->port, LSN_FORMAT_ARGS(wp->propTermHistory.entries[0].lsn), LSN_FORMAT_ARGS(sk->startStreamingAt)); } } @@ -1618,28 +1153,28 @@ SendProposerElected(Safekeeper *sk) * proposer, LSN it is currently writing, but then we just pick * safekeeper pos as it obviously can't be higher. 
*/ - if (propTermHistory.entries[i].term == propTerm) + if (wp->propTermHistory.entries[i].term == wp->propTerm) { sk->startStreamingAt = sk->voteResponse.flushLsn; } else { - XLogRecPtr propEndLsn = propTermHistory.entries[i + 1].lsn; + XLogRecPtr propEndLsn = wp->propTermHistory.entries[i + 1].lsn; XLogRecPtr skEndLsn = (i + 1 < th->n_entries ? th->entries[i + 1].lsn : sk->voteResponse.flushLsn); sk->startStreamingAt = Min(propEndLsn, skEndLsn); } } - Assert(sk->startStreamingAt >= truncateLsn && sk->startStreamingAt <= availableLsn); + Assert(sk->startStreamingAt >= wp->truncateLsn && sk->startStreamingAt <= wp->availableLsn); msg.tag = 'e'; - msg.term = propTerm; + msg.term = wp->propTerm; msg.startStreamingAt = sk->startStreamingAt; - msg.termHistory = &propTermHistory; - msg.timelineStartLsn = timelineStartLsn; + msg.termHistory = &wp->propTermHistory; + msg.timelineStartLsn = wp->timelineStartLsn; - lastCommonTerm = i >= 0 ? propTermHistory.entries[i].term : 0; + lastCommonTerm = i >= 0 ? wp->propTermHistory.entries[i].term : 0; elog(LOG, "sending elected msg to node " UINT64_FORMAT " term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s, timelineStartLsn=%X/%X", sk->greetResponse.nodeId, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port, LSN_FORMAT_ARGS(msg.timelineStartLsn)); @@ -1662,22 +1197,6 @@ SendProposerElected(Safekeeper *sk) StartStreaming(sk); } -/* - * Start walsender streaming replication - */ -static void -WalProposerStartStreaming(XLogRecPtr startpos) -{ - StartReplicationCmd cmd; - - elog(LOG, "WAL proposer starts streaming at %X/%X", - LSN_FORMAT_ARGS(startpos)); - cmd.slotname = WAL_PROPOSER_SLOT_NAME; - cmd.timeline = greetRequest.timeline; - cmd.startpoint = startpos; - StartProposerReplication(&cmd); -} - /* * Start streaming to safekeeper sk, always updates state to SS_ACTIVE and sets * correct event set. 
@@ -1719,25 +1238,25 @@ SendMessageToNode(Safekeeper *sk) * Broadcast new message to all caught-up safekeepers */ static void -BroadcastAppendRequest() +BroadcastAppendRequest(WalProposer *wp) { - for (int i = 0; i < n_safekeepers; i++) - if (safekeeper[i].state == SS_ACTIVE) - SendMessageToNode(&safekeeper[i]); + for (int i = 0; i < wp->n_safekeepers; i++) + if (wp->safekeeper[i].state == SS_ACTIVE) + SendMessageToNode(&wp->safekeeper[i]); } static void -PrepareAppendRequest(AppendRequestHeader * req, XLogRecPtr beginLsn, XLogRecPtr endLsn) +PrepareAppendRequest(WalProposer *wp, AppendRequestHeader *req, XLogRecPtr beginLsn, XLogRecPtr endLsn) { Assert(endLsn >= beginLsn); req->tag = 'a'; - req->term = propTerm; - req->epochStartLsn = propEpochStartLsn; + req->term = wp->propTerm; + req->epochStartLsn = wp->propEpochStartLsn; req->beginLsn = beginLsn; req->endLsn = endLsn; - req->commitLsn = GetAcknowledgedByQuorumWALPosition(); - req->truncateLsn = truncateLsn; - req->proposerId = greetRequest.proposerId; + req->commitLsn = GetAcknowledgedByQuorumWALPosition(wp); + req->truncateLsn = wp->truncateLsn; + req->proposerId = wp->greetRequest.proposerId; } /* @@ -1746,6 +1265,8 @@ PrepareAppendRequest(AppendRequestHeader * req, XLogRecPtr beginLsn, XLogRecPtr static void HandleActiveState(Safekeeper *sk, uint32 events) { + WalProposer *wp = sk->wp; + uint32 newEvents = WL_SOCKET_READABLE; if (events & WL_SOCKET_WRITEABLE) @@ -1765,10 +1286,10 @@ HandleActiveState(Safekeeper *sk, uint32 events) * after arrival. But it's good to have it here in case we change this * behavior in the future. 
*/ - if (sk->streamingAt != availableLsn || sk->flushWrite) + if (sk->streamingAt != wp->availableLsn || sk->flushWrite) newEvents |= WL_SOCKET_WRITEABLE; - UpdateEventSet(sk, newEvents); + wp->api.update_event_set(sk, newEvents); } /* @@ -1783,10 +1304,10 @@ HandleActiveState(Safekeeper *sk, uint32 events) static bool SendAppendRequests(Safekeeper *sk) { + WalProposer *wp = sk->wp; XLogRecPtr endLsn; AppendRequestHeader *req; PGAsyncWriteResult writeResult; - WALReadError errinfo; bool sentAnything = false; if (sk->flushWrite) @@ -1803,7 +1324,7 @@ SendAppendRequests(Safekeeper *sk) sk->flushWrite = false; } - while (sk->streamingAt != availableLsn || !sentAnything) + while (sk->streamingAt != wp->availableLsn || !sentAnything) { sentAnything = true; @@ -1811,13 +1332,13 @@ SendAppendRequests(Safekeeper *sk) endLsn += MAX_SEND_SIZE; /* if we went beyond available WAL, back off */ - if (endLsn > availableLsn) + if (endLsn > wp->availableLsn) { - endLsn = availableLsn; + endLsn = wp->availableLsn; } req = &sk->appendRequest; - PrepareAppendRequest(&sk->appendRequest, sk->streamingAt, endLsn); + PrepareAppendRequest(sk->wp, &sk->appendRequest, sk->streamingAt, endLsn); ereport(DEBUG2, (errmsg("sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s", @@ -1825,7 +1346,7 @@ SendAppendRequests(Safekeeper *sk) LSN_FORMAT_ARGS(req->beginLsn), LSN_FORMAT_ARGS(req->endLsn), LSN_FORMAT_ARGS(req->commitLsn), - LSN_FORMAT_ARGS(truncateLsn), sk->host, sk->port))); + LSN_FORMAT_ARGS(wp->truncateLsn), sk->host, sk->port))); resetStringInfo(&sk->outbuf); @@ -1834,23 +1355,14 @@ SendAppendRequests(Safekeeper *sk) /* write the WAL itself */ enlargeStringInfo(&sk->outbuf, req->endLsn - req->beginLsn); - if (!WALRead(sk->xlogreader, - &sk->outbuf.data[sk->outbuf.len], - req->beginLsn, - req->endLsn - req->beginLsn, -#if PG_VERSION_NUM >= 150000 - /* FIXME don't use hardcoded timeline_id here */ - 1, -#else - ThisTimeLineID, -#endif - &errinfo)) - 
{ - WALReadRaiseError(&errinfo); - } + /* wal_read will raise error on failure */ + wp->api.wal_read(sk->xlogreader, + &sk->outbuf.data[sk->outbuf.len], + req->beginLsn, + req->endLsn - req->beginLsn); sk->outbuf.len += req->endLsn - req->beginLsn; - writeResult = walprop_async_write(sk->conn, sk->outbuf.data, sk->outbuf.len); + writeResult = wp->api.conn_async_write(sk->conn, sk->outbuf.data, sk->outbuf.len); /* Mark current message as sent, whatever the result is */ sk->streamingAt = endLsn; @@ -1874,7 +1386,7 @@ SendAppendRequests(Safekeeper *sk) case PG_ASYNC_WRITE_FAIL: elog(WARNING, "Failed to send to node %s:%s in %s state: %s", sk->host, sk->port, FormatSafekeeperState(sk->state), - walprop_error_message(sk->conn)); + wp->api.conn_error_message(sk->conn)); ShutdownConnection(sk); return false; default: @@ -1897,6 +1409,7 @@ SendAppendRequests(Safekeeper *sk) static bool RecvAppendResponses(Safekeeper *sk) { + WalProposer *wp = sk->wp; XLogRecPtr minQuorumLsn; bool readAnything = false; @@ -1908,7 +1421,7 @@ RecvAppendResponses(Safekeeper *sk) * work until later. */ sk->appendResponse.apm.tag = 'a'; - if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) & sk->appendResponse)) + if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->appendResponse)) break; ereport(DEBUG2, @@ -1918,12 +1431,12 @@ RecvAppendResponses(Safekeeper *sk) LSN_FORMAT_ARGS(sk->appendResponse.commitLsn), sk->host, sk->port))); - if (sk->appendResponse.term > propTerm) + if (sk->appendResponse.term > wp->propTerm) { /* Another compute with higher term is running. 
*/ elog(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT "", sk->host, sk->port, - sk->appendResponse.term, propTerm); + sk->appendResponse.term, wp->propTerm); } readAnything = true; @@ -1932,16 +1445,16 @@ RecvAppendResponses(Safekeeper *sk) if (!readAnything) return sk->state == SS_ACTIVE; - HandleSafekeeperResponse(); + HandleSafekeeperResponse(wp); /* * Also send the new commit lsn to all the safekeepers. */ - minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); - if (minQuorumLsn > lastSentCommitLsn) + minQuorumLsn = GetAcknowledgedByQuorumWALPosition(wp); + if (minQuorumLsn > wp->lastSentCommitLsn) { - BroadcastAppendRequest(); - lastSentCommitLsn = minQuorumLsn; + BroadcastAppendRequest(wp); + wp->lastSentCommitLsn = minQuorumLsn; } return sk->state == SS_ACTIVE; @@ -1949,7 +1462,7 @@ RecvAppendResponses(Safekeeper *sk) /* Parse a PageserverFeedback message, or the PageserverFeedback part of an AppendResponse */ void -ParsePageserverFeedbackMessage(StringInfo reply_message, PageserverFeedback * rf) +ParsePageserverFeedbackMessage(StringInfo reply_message, PageserverFeedback *rf) { uint8 nkeys; int i; @@ -2025,56 +1538,20 @@ ParsePageserverFeedbackMessage(StringInfo reply_message, PageserverFeedback * rf } } -/* - * Combine hot standby feedbacks from all safekeepers. 
- */ -static void -CombineHotStanbyFeedbacks(HotStandbyFeedback * hs) -{ - hs->ts = 0; - hs->xmin.value = ~0; /* largest unsigned value */ - hs->catalog_xmin.value = ~0; /* largest unsigned value */ - - for (int i = 0; i < n_safekeepers; i++) - { - if (safekeeper[i].appendResponse.hs.ts != 0) - { - HotStandbyFeedback *skhs = &safekeeper[i].appendResponse.hs; - if (FullTransactionIdIsNormal(skhs->xmin) - && FullTransactionIdPrecedes(skhs->xmin, hs->xmin)) - { - hs->xmin = skhs->xmin; - hs->ts = skhs->ts; - } - if (FullTransactionIdIsNormal(skhs->catalog_xmin) - && FullTransactionIdPrecedes(skhs->catalog_xmin, hs->xmin)) - { - hs->catalog_xmin = skhs->catalog_xmin; - hs->ts = skhs->ts; - } - } - } - - if (hs->xmin.value == ~0) - hs->xmin = InvalidFullTransactionId; - if (hs->catalog_xmin.value == ~0) - hs->catalog_xmin = InvalidFullTransactionId; -} - /* * Get minimum of flushed LSNs of all safekeepers, which is the LSN of the * last WAL record that can be safely discarded. */ static XLogRecPtr -CalculateMinFlushLsn(void) +CalculateMinFlushLsn(WalProposer *wp) { - XLogRecPtr lsn = n_safekeepers > 0 - ? safekeeper[0].appendResponse.flushLsn - : InvalidXLogRecPtr; + XLogRecPtr lsn = wp->n_safekeepers > 0 + ? 
wp->safekeeper[0].appendResponse.flushLsn + : InvalidXLogRecPtr; - for (int i = 1; i < n_safekeepers; i++) + for (int i = 1; i < wp->n_safekeepers; i++) { - lsn = Min(lsn, safekeeper[i].appendResponse.flushLsn); + lsn = Min(lsn, wp->safekeeper[i].appendResponse.flushLsn); } return lsn; } @@ -2083,163 +1560,37 @@ CalculateMinFlushLsn(void) * Calculate WAL position acknowledged by quorum */ static XLogRecPtr -GetAcknowledgedByQuorumWALPosition(void) +GetAcknowledgedByQuorumWALPosition(WalProposer *wp) { XLogRecPtr responses[MAX_SAFEKEEPERS]; /* * Sort acknowledged LSNs */ - for (int i = 0; i < n_safekeepers; i++) + for (int i = 0; i < wp->n_safekeepers; i++) { /* * Like in Raft, we aren't allowed to commit entries from previous * terms, so ignore reported LSN until it gets to epochStartLsn. */ - responses[i] = safekeeper[i].appendResponse.flushLsn >= propEpochStartLsn ? safekeeper[i].appendResponse.flushLsn : 0; + responses[i] = wp->safekeeper[i].appendResponse.flushLsn >= wp->propEpochStartLsn ? 
wp->safekeeper[i].appendResponse.flushLsn : 0; } - qsort(responses, n_safekeepers, sizeof(XLogRecPtr), CompareLsn); + qsort(responses, wp->n_safekeepers, sizeof(XLogRecPtr), CompareLsn); /* * Get the smallest LSN committed by quorum */ - return responses[n_safekeepers - quorum]; -} - -/* - * WalproposerShmemSize --- report amount of shared memory space needed - */ -Size -WalproposerShmemSize(void) -{ - return sizeof(WalproposerShmemState); -} - -bool -WalproposerShmemInit(void) -{ - bool found; - - LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); - walprop_shared = ShmemInitStruct("Walproposer shared state", - sizeof(WalproposerShmemState), - &found); - - if (!found) - { - memset(walprop_shared, 0, WalproposerShmemSize()); - SpinLockInit(&walprop_shared->mutex); - pg_atomic_init_u64(&walprop_shared->backpressureThrottlingTime, 0); - } - LWLockRelease(AddinShmemInitLock); - - return found; -} - -void -replication_feedback_set(PageserverFeedback * rf) -{ - SpinLockAcquire(&walprop_shared->mutex); - memcpy(&walprop_shared->feedback, rf, sizeof(PageserverFeedback)); - SpinLockRelease(&walprop_shared->mutex); -} - -void -replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn) -{ - SpinLockAcquire(&walprop_shared->mutex); - *writeLsn = walprop_shared->feedback.last_received_lsn; - *flushLsn = walprop_shared->feedback.disk_consistent_lsn; - *applyLsn = walprop_shared->feedback.remote_consistent_lsn; - SpinLockRelease(&walprop_shared->mutex); -} - -/* - * Get PageserverFeedback fields from the most advanced safekeeper - */ -static void -GetLatestNeonFeedback(PageserverFeedback * rf) -{ - int latest_safekeeper = 0; - XLogRecPtr last_received_lsn = InvalidXLogRecPtr; - - for (int i = 0; i < n_safekeepers; i++) - { - if (safekeeper[i].appendResponse.rf.last_received_lsn > last_received_lsn) - { - latest_safekeeper = i; - last_received_lsn = safekeeper[i].appendResponse.rf.last_received_lsn; - } - } - - rf->currentClusterSize = 
safekeeper[latest_safekeeper].appendResponse.rf.currentClusterSize; - rf->last_received_lsn = safekeeper[latest_safekeeper].appendResponse.rf.last_received_lsn; - rf->disk_consistent_lsn = safekeeper[latest_safekeeper].appendResponse.rf.disk_consistent_lsn; - rf->remote_consistent_lsn = safekeeper[latest_safekeeper].appendResponse.rf.remote_consistent_lsn; - rf->replytime = safekeeper[latest_safekeeper].appendResponse.rf.replytime; - - elog(DEBUG2, "GetLatestNeonFeedback: currentClusterSize %lu," - " last_received_lsn %X/%X, disk_consistent_lsn %X/%X, remote_consistent_lsn %X/%X, replytime %lu", - rf->currentClusterSize, - LSN_FORMAT_ARGS(rf->last_received_lsn), - LSN_FORMAT_ARGS(rf->disk_consistent_lsn), - LSN_FORMAT_ARGS(rf->remote_consistent_lsn), - rf->replytime); - - replication_feedback_set(rf); + return responses[wp->n_safekeepers - wp->quorum]; } static void -HandleSafekeeperResponse(void) +HandleSafekeeperResponse(WalProposer *wp) { - HotStandbyFeedback hsFeedback; XLogRecPtr minQuorumLsn; - XLogRecPtr diskConsistentLsn; XLogRecPtr minFlushLsn; - minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); - diskConsistentLsn = quorumFeedback.rf.disk_consistent_lsn; - - if (!syncSafekeepers) - { - /* Get PageserverFeedback fields from the most advanced safekeeper */ - GetLatestNeonFeedback(&quorumFeedback.rf); - SetZenithCurrentClusterSize(quorumFeedback.rf.currentClusterSize); - } - - if (minQuorumLsn > quorumFeedback.flushLsn || diskConsistentLsn != quorumFeedback.rf.disk_consistent_lsn) - { - - if (minQuorumLsn > quorumFeedback.flushLsn) - quorumFeedback.flushLsn = minQuorumLsn; - - /* advance the replication slot */ - if (!syncSafekeepers) - ProcessStandbyReply( - /* write_lsn - This is what durably stored in WAL service. */ - quorumFeedback.flushLsn, - /* flush_lsn - This is what durably stored in WAL service. */ - quorumFeedback.flushLsn, - - /* - * apply_lsn - This is what processed and durably saved at* - * pageserver. 
- */ - quorumFeedback.rf.disk_consistent_lsn, - GetCurrentTimestamp(), false); - } - - CombineHotStanbyFeedbacks(&hsFeedback); - if (hsFeedback.ts != 0 && memcmp(&hsFeedback, &quorumFeedback.hs, sizeof hsFeedback) != 0) - { - quorumFeedback.hs = hsFeedback; - if (!syncSafekeepers) - ProcessStandbyHSFeedback(hsFeedback.ts, - XidFromFullTransactionId(hsFeedback.xmin), - EpochFromFullTransactionId(hsFeedback.xmin), - XidFromFullTransactionId(hsFeedback.catalog_xmin), - EpochFromFullTransactionId(hsFeedback.catalog_xmin)); - } + minQuorumLsn = GetAcknowledgedByQuorumWALPosition(wp); + wp->api.process_safekeeper_feedback(wp, minQuorumLsn); /* * Try to advance truncateLsn to minFlushLsn, which is the last record @@ -2255,17 +1606,16 @@ HandleSafekeeperResponse(void) * term' in Raft); 2) chunks we read from WAL and send are plain sheets of * bytes, but safekeepers ack only on record boundaries. */ - minFlushLsn = CalculateMinFlushLsn(); - if (minFlushLsn > truncateLsn) + minFlushLsn = CalculateMinFlushLsn(wp); + if (minFlushLsn > wp->truncateLsn) { - truncateLsn = minFlushLsn; + wp->truncateLsn = minFlushLsn; /* * Advance the replication slot to free up old WAL files. Note that * slot doesn't exist if we are in syncSafekeepers mode. */ - if (MyReplicationSlot) - PhysicalConfirmReceivedLocation(truncateLsn); + wp->api.confirm_wal_streamed(wp->truncateLsn); } /* @@ -2280,15 +1630,15 @@ HandleSafekeeperResponse(void) * (due to pageserver connecting to not-synced-safekeeper) we currently * wait for all seemingly alive safekeepers to get synced. 
*/ - if (syncSafekeepers) + if (wp->config->syncSafekeepers) { int n_synced; n_synced = 0; - for (int i = 0; i < n_safekeepers; i++) + for (int i = 0; i < wp->n_safekeepers; i++) { - Safekeeper *sk = &safekeeper[i]; - bool synced = sk->appendResponse.commitLsn >= propEpochStartLsn; + Safekeeper *sk = &wp->safekeeper[i]; + bool synced = sk->appendResponse.commitLsn >= wp->propEpochStartLsn; /* alive safekeeper which is not synced yet; wait for it */ if (sk->state != SS_OFFLINE && !synced) @@ -2297,23 +1647,23 @@ HandleSafekeeperResponse(void) n_synced++; } - if (n_synced >= quorum) + if (n_synced >= wp->quorum) { /* A quorum of safekeepers has been synced! */ - - /* - * Send empty message to broadcast latest truncateLsn to all safekeepers. - * This helps to finish next sync-safekeepers eailier, by skipping recovery - * step. - * - * We don't need to wait for response because it doesn't affect correctness, - * and TCP should be able to deliver the message to safekeepers in case of - * network working properly. - */ - BroadcastAppendRequest(); - fprintf(stdout, "%X/%X\n", LSN_FORMAT_ARGS(propEpochStartLsn)); - exit(0); + /* + * Send empty message to broadcast latest truncateLsn to all + * safekeepers. This helps to finish next sync-safekeepers + * earlier, by skipping recovery step. + * + * We don't need to wait for response because it doesn't affect + * correctness, and TCP should be able to deliver the message to + * safekeepers in case of network working properly.
+ */ + BroadcastAppendRequest(wp); + + wp->api.finish_sync_safekeepers(wp->propEpochStartLsn); + /* unreachable */ } } } @@ -2325,7 +1675,9 @@ HandleSafekeeperResponse(void) static bool AsyncRead(Safekeeper *sk, char **buf, int *buf_size) { - switch (walprop_async_read(sk->conn, buf, buf_size)) + WalProposer *wp = sk->wp; + + switch (wp->api.conn_async_read(sk->conn, buf, buf_size)) { case PG_ASYNC_READ_SUCCESS: return true; @@ -2337,7 +1689,7 @@ AsyncRead(Safekeeper *sk, char **buf, int *buf_size) case PG_ASYNC_READ_FAIL: elog(WARNING, "Failed to read from node %s:%s in %s state: %s", sk->host, sk->port, FormatSafekeeperState(sk->state), - walprop_error_message(sk->conn)); + wp->api.conn_error_message(sk->conn)); ShutdownConnection(sk); return false; } @@ -2355,8 +1707,10 @@ AsyncRead(Safekeeper *sk, char **buf, int *buf_size) * failed, a warning is emitted and the connection is reset. */ static bool -AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage * anymsg) +AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg) { + WalProposer *wp = sk->wp; + char *buf; int buf_size; uint64 tag; @@ -2378,7 +1732,7 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage * anymsg) ResetConnection(sk); return false; } - sk->latestMsgReceivedAt = GetCurrentTimestamp(); + sk->latestMsgReceivedAt = wp->api.get_current_timestamp(); switch (tag) { case 'g': @@ -2444,13 +1798,14 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage * anymsg) static bool BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState success_state) { + WalProposer *wp = sk->wp; uint32 events; - if (!walprop_blocking_write(sk->conn, msg, msg_size)) + if (!wp->api.conn_blocking_write(sk->conn, msg, msg_size)) { elog(WARNING, "Failed to send to node %s:%s in %s state: %s", sk->host, sk->port, FormatSafekeeperState(sk->state), - walprop_error_message(sk->conn)); + wp->api.conn_error_message(sk->conn)); ShutdownConnection(sk); return false; } @@ -2463,7 +1818,7 @@ 
BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState succes */ events = SafekeeperStateDesiredEvents(success_state); if (events) - UpdateEventSet(sk, events); + wp->api.update_event_set(sk, events); return true; } @@ -2478,7 +1833,9 @@ BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState succes static bool AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_state) { - switch (walprop_async_write(sk->conn, msg, msg_size)) + WalProposer *wp = sk->wp; + + switch (wp->api.conn_async_write(sk->conn, msg, msg_size)) { case PG_ASYNC_WRITE_SUCCESS: return true; @@ -2490,12 +1847,12 @@ AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_sta * this function */ sk->state = flush_state; - UpdateEventSet(sk, WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE); + wp->api.update_event_set(sk, WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE); return false; case PG_ASYNC_WRITE_FAIL: elog(WARNING, "Failed to send to node %s:%s in %s state: %s", sk->host, sk->port, FormatSafekeeperState(sk->state), - walprop_error_message(sk->conn)); + wp->api.conn_error_message(sk->conn)); ShutdownConnection(sk); return false; default: @@ -2515,13 +1872,15 @@ AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_sta static bool AsyncFlush(Safekeeper *sk) { + WalProposer *wp = sk->wp; + /*--- * PQflush returns: * 0 if successful [we're good to move on] * 1 if unable to send everything yet [call PQflush again] * -1 if it failed [emit an error] */ - switch (walprop_flush(sk->conn)) + switch (wp->api.conn_flush(sk->conn)) { case 0: /* flush is done */ @@ -2532,7 +1891,7 @@ AsyncFlush(Safekeeper *sk) case -1: elog(WARNING, "Failed to flush write to node %s:%s in %s state: %s", sk->host, sk->port, FormatSafekeeperState(sk->state), - walprop_error_message(sk->conn)); + wp->api.conn_error_message(sk->conn)); ResetConnection(sk); return false; default: @@ -2541,88 +1900,210 @@ AsyncFlush(Safekeeper *sk) } } -/* 
Check if we need to suspend inserts because of lagging replication. */ -static uint64 -backpressure_lag_impl(void) +static int +CompareLsn(const void *a, const void *b) { - if (max_replication_apply_lag > 0 || max_replication_flush_lag > 0 || max_replication_write_lag > 0) - { - XLogRecPtr writePtr; - XLogRecPtr flushPtr; - XLogRecPtr applyPtr; -#if PG_VERSION_NUM >= 150000 - XLogRecPtr myFlushLsn = GetFlushRecPtr(NULL); -#else - XLogRecPtr myFlushLsn = GetFlushRecPtr(); -#endif - replication_feedback_get_lsns(&writePtr, &flushPtr, &applyPtr); -#define MB ((XLogRecPtr)1024 * 1024) + XLogRecPtr lsn1 = *((const XLogRecPtr *) a); + XLogRecPtr lsn2 = *((const XLogRecPtr *) b); - elog(DEBUG2, "current flushLsn %X/%X PageserverFeedback: write %X/%X flush %X/%X apply %X/%X", - LSN_FORMAT_ARGS(myFlushLsn), - LSN_FORMAT_ARGS(writePtr), - LSN_FORMAT_ARGS(flushPtr), - LSN_FORMAT_ARGS(applyPtr)); - - if ((writePtr != InvalidXLogRecPtr && max_replication_write_lag > 0 && myFlushLsn > writePtr + max_replication_write_lag * MB)) - { - return (myFlushLsn - writePtr - max_replication_write_lag * MB); - } - - if ((flushPtr != InvalidXLogRecPtr && max_replication_flush_lag > 0 && myFlushLsn > flushPtr + max_replication_flush_lag * MB)) - { - return (myFlushLsn - flushPtr - max_replication_flush_lag * MB); - } - - if ((applyPtr != InvalidXLogRecPtr && max_replication_apply_lag > 0 && myFlushLsn > applyPtr + max_replication_apply_lag * MB)) - { - return (myFlushLsn - applyPtr - max_replication_apply_lag * MB); - } - } - return 0; + if (lsn1 < lsn2) + return -1; + else if (lsn1 == lsn2) + return 0; + else + return 1; } -#define BACK_PRESSURE_DELAY 10000L // 0.01 sec - -static bool -backpressure_throttling_impl(void) +/* Returns a human-readable string corresponding to the SafekeeperState + * + * The string should not be freed.
+ * + * The strings are intended to be used as a prefix to "state", e.g.: + * + * elog(LOG, "currently in %s state", FormatSafekeeperState(sk->state)); + * + * If this sort of phrasing doesn't fit the message, instead use something like: + * + * elog(LOG, "currently in state [%s]", FormatSafekeeperState(sk->state)); + */ +static char * +FormatSafekeeperState(SafekeeperState state) { - int64 lag; - TimestampTz start, - stop; - bool retry = PrevProcessInterruptsCallback - ? PrevProcessInterruptsCallback() - : false; + char *return_val = NULL; + + switch (state) + { + case SS_OFFLINE: + return_val = "offline"; + break; + case SS_CONNECTING_READ: + case SS_CONNECTING_WRITE: + return_val = "connecting"; + break; + case SS_WAIT_EXEC_RESULT: + return_val = "receiving query result"; + break; + case SS_HANDSHAKE_RECV: + return_val = "handshake (receiving)"; + break; + case SS_VOTING: + return_val = "voting"; + break; + case SS_WAIT_VERDICT: + return_val = "wait-for-verdict"; + break; + case SS_SEND_ELECTED_FLUSH: + return_val = "send-announcement-flush"; + break; + case SS_IDLE: + return_val = "idle"; + break; + case SS_ACTIVE: + return_val = "active"; + break; + } + + Assert(return_val != NULL); + + return return_val; +} + +/* Asserts that the provided events are expected for given safekeeper's state */ +static void +AssertEventsOkForState(uint32 events, Safekeeper *sk) +{ + uint32 expected = SafekeeperStateDesiredEvents(sk->state); /* - * Don't throttle read only transactions or wal sender. - * Do throttle CREATE INDEX CONCURRENTLY, however. It performs some - * stages outside a transaction, even though it writes a lot of WAL. - * Check PROC_IN_SAFE_IC flag to cover that case. + * The events are in-line with what we're expecting, under two conditions: + * (a) if we aren't expecting anything, `events` has no read- or + * write-ready component. (b) if we are expecting something, there's + * overlap (i.e. 
`events & expected != 0`) */ - if (am_walsender - || (!(MyProc->statusFlags & PROC_IN_SAFE_IC) - && !TransactionIdIsValid(GetCurrentTransactionIdIfAny()))) - return retry; + bool events_ok_for_state; /* long name so the `Assert` is more + * clear later */ - /* Calculate replicas lag */ - lag = backpressure_lag_impl(); - if (lag == 0) - return retry; + if (expected == WL_NO_EVENTS) + events_ok_for_state = ((events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)) == 0); + else + events_ok_for_state = ((events & expected) != 0); - /* Suspend writers until replicas catch up */ - set_ps_display("backpressure throttling"); - - elog(DEBUG2, "backpressure throttling: lag %lu", lag); - start = GetCurrentTimestamp(); - pg_usleep(BACK_PRESSURE_DELAY); - stop = GetCurrentTimestamp(); - pg_atomic_add_fetch_u64(&walprop_shared->backpressureThrottlingTime, stop - start); - return true; + if (!events_ok_for_state) + { + /* + * To give a descriptive message in the case of failure, we use elog + * and then an assertion that's guaranteed to fail. + */ + elog(WARNING, "events %s mismatched for safekeeper %s:%s in state [%s]", + FormatEvents(events), sk->host, sk->port, FormatSafekeeperState(sk->state)); + Assert(events_ok_for_state); + } } -uint64 -BackpressureThrottlingTime(void) +/* Returns the set of events a safekeeper in this state should be waiting on + * + * This will return WL_NO_EVENTS (= 0) for some events. 
*/ +static uint32 +SafekeeperStateDesiredEvents(SafekeeperState state) { - return pg_atomic_read_u64(&walprop_shared->backpressureThrottlingTime); + uint32 result = WL_NO_EVENTS; + + /* If the state doesn't have a modifier, we can check the base state */ + switch (state) + { + /* Connecting states say what they want in the name */ + case SS_CONNECTING_READ: + result = WL_SOCKET_READABLE; + break; + case SS_CONNECTING_WRITE: + result = WL_SOCKET_WRITEABLE; + break; + + /* Reading states need the socket to be read-ready to continue */ + case SS_WAIT_EXEC_RESULT: + case SS_HANDSHAKE_RECV: + case SS_WAIT_VERDICT: + result = WL_SOCKET_READABLE; + break; + + /* + * Idle states use read-readiness as a sign that the connection + * has been disconnected. + */ + case SS_VOTING: + case SS_IDLE: + result = WL_SOCKET_READABLE; + break; + + /* + * Flush states require write-ready for flushing. Active state + * does both reading and writing. + * + * TODO: SS_ACTIVE sometimes doesn't need to be write-ready. We + * should check sk->flushWrite here to set WL_SOCKET_WRITEABLE. + */ + case SS_SEND_ELECTED_FLUSH: + case SS_ACTIVE: + result = WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE; + break; + + /* The offline state expects no events. */ + case SS_OFFLINE: + result = WL_NO_EVENTS; + break; + + default: + Assert(false); + break; + } + + return result; +} + +/* Returns a human-readable string corresponding to the event set + * + * If the events do not correspond to something set as the `events` field of a `WaitEvent`, the + * returned string may be meaningless. + * + * The string should not be freed. It should also not be expected to remain the same between + * function calls.
*/ +static char * +FormatEvents(uint32 events) +{ + static char return_str[9]; + + /* Helper variable to check if there's extra bits */ + uint32 all_flags = WL_LATCH_SET + | WL_SOCKET_READABLE + | WL_SOCKET_WRITEABLE + | WL_TIMEOUT + | WL_POSTMASTER_DEATH + | WL_EXIT_ON_PM_DEATH + | WL_SOCKET_CONNECTED; + + /* + * The formatting here isn't supposed to be *particularly* useful -- it's + * just to give a sense of what events have been triggered without + * needing to remember your powers of two. + */ + + return_str[0] = (events & WL_LATCH_SET) ? 'L' : '_'; + return_str[1] = (events & WL_SOCKET_READABLE) ? 'R' : '_'; + return_str[2] = (events & WL_SOCKET_WRITEABLE) ? 'W' : '_'; + return_str[3] = (events & WL_TIMEOUT) ? 'T' : '_'; + return_str[4] = (events & WL_POSTMASTER_DEATH) ? 'D' : '_'; + return_str[5] = (events & WL_EXIT_ON_PM_DEATH) ? 'E' : '_'; + return_str[6] = (events & WL_SOCKET_CONNECTED) ? 'C' : '_'; + + if (events & (~all_flags)) + { + elog(WARNING, "Event formatting found unexpected component %d", + events & (~all_flags)); + return_str[7] = '*'; + return_str[8] = '\0'; + } + else + return_str[7] = '\0'; + + return (char *) &return_str; } diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index fa1ba30a8f..a1a9ccdfdd 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -1,8 +1,8 @@ #ifndef __NEON_WALPROPOSER_H__ #define __NEON_WALPROPOSER_H__ -#include "access/xlogdefs.h" #include "postgres.h" +#include "access/xlogdefs.h" #include "port.h" #include "access/xlog_internal.h" #include "access/transam.h" @@ -16,29 +16,15 @@ #define MAX_SAFEKEEPERS 32 #define MAX_SEND_SIZE (XLOG_BLCKSZ * 16) /* max size of a single* WAL * message */ -#define XLOG_HDR_SIZE (1 + 8 * 3) /* 'w' + startPos + walEnd + timestamp */ -#define XLOG_HDR_START_POS 1 /* offset of start position in wal sender* - * message header */ -#define XLOG_HDR_END_POS (1 + 8) /* offset of end position in wal sender* - * message header */ - /* * In the spirit of
WL_SOCKET_READABLE and others, this corresponds to no events having occurred, * because all WL_* events are given flags equal to some (1 << i), starting from i = 0 */ #define WL_NO_EVENTS 0 -extern char *wal_acceptors_list; -extern int wal_acceptor_reconnect_timeout; -extern int wal_acceptor_connection_timeout; -extern bool am_wal_proposer; - -struct WalProposerConn; /* Defined in libpqwalproposer */ +struct WalProposerConn; /* Defined in implementation (walprop_pg.c) */ typedef struct WalProposerConn WalProposerConn; -struct WalMessage; -typedef struct WalMessage WalMessage; - /* Possible return values from ReadPGAsync */ typedef enum { @@ -52,7 +38,7 @@ typedef enum PG_ASYNC_READ_TRY_AGAIN, /* Reading failed. Check PQerrorMessage(conn) */ PG_ASYNC_READ_FAIL, -} PGAsyncReadResult; +} PGAsyncReadResult; /* Possible return values from WritePGAsync */ typedef enum @@ -71,7 +57,7 @@ typedef enum PG_ASYNC_WRITE_TRY_FLUSH, /* Writing failed. Check PQerrorMessage(conn) */ PG_ASYNC_WRITE_FAIL, -} PGAsyncWriteResult; +} PGAsyncWriteResult; /* * WAL safekeeper state, which is used to wait for some event. @@ -147,7 +133,7 @@ typedef enum * to read. */ SS_ACTIVE, -} SafekeeperState; +} SafekeeperState; /* Consensus logical timestamp. */ typedef uint64 term_t; @@ -171,12 +157,12 @@ typedef struct ProposerGreeting uint8 tenant_id[16]; TimeLineID timeline; uint32 walSegSize; -} ProposerGreeting; +} ProposerGreeting; typedef struct AcceptorProposerMessage { uint64 tag; -} AcceptorProposerMessage; +} AcceptorProposerMessage; /* * Acceptor -> Proposer initial response: the highest term acceptor voted for. @@ -186,7 +172,7 @@ typedef struct AcceptorGreeting AcceptorProposerMessage apm; term_t term; NNodeId nodeId; -} AcceptorGreeting; +} AcceptorGreeting; /* * Proposer -> Acceptor vote request. 
@@ -196,20 +182,20 @@ typedef struct VoteRequest uint64 tag; term_t term; pg_uuid_t proposerId; /* for monitoring/debugging */ -} VoteRequest; +} VoteRequest; /* Element of term switching chain. */ typedef struct TermSwitchEntry { term_t term; XLogRecPtr lsn; -} TermSwitchEntry; +} TermSwitchEntry; typedef struct TermHistory { uint32 n_entries; TermSwitchEntry *entries; -} TermHistory; +} TermHistory; /* Vote itself, sent from safekeeper to proposer */ typedef struct VoteResponse @@ -227,7 +213,7 @@ typedef struct VoteResponse * recovery of some safekeeper */ TermHistory termHistory; XLogRecPtr timelineStartLsn; /* timeline globally starts at this LSN */ -} VoteResponse; +} VoteResponse; /* * Proposer -> Acceptor message announcing proposer is elected and communicating @@ -243,7 +229,7 @@ typedef struct ProposerElected TermHistory *termHistory; /* timeline globally starts at this LSN */ XLogRecPtr timelineStartLsn; -} ProposerElected; +} ProposerElected; /* * Header of request with WAL message sent from proposer to safekeeper. 
@@ -268,7 +254,7 @@ typedef struct AppendRequestHeader */ XLogRecPtr truncateLsn; pg_uuid_t proposerId; /* for monitoring/debugging */ -} AppendRequestHeader; +} AppendRequestHeader; /* * Hot standby feedback received from replica @@ -278,7 +264,7 @@ typedef struct HotStandbyFeedback TimestampTz ts; FullTransactionId xmin; FullTransactionId catalog_xmin; -} HotStandbyFeedback; +} HotStandbyFeedback; typedef struct PageserverFeedback { @@ -289,7 +275,7 @@ typedef struct PageserverFeedback XLogRecPtr disk_consistent_lsn; XLogRecPtr remote_consistent_lsn; TimestampTz replytime; -} PageserverFeedback; +} PageserverFeedback; typedef struct WalproposerShmemState { @@ -297,7 +283,7 @@ typedef struct WalproposerShmemState PageserverFeedback feedback; term_t mineLastElectedTerm; pg_atomic_uint64 backpressureThrottlingTime; -} WalproposerShmemState; +} WalproposerShmemState; /* * Report safekeeper state to proposer @@ -321,17 +307,22 @@ typedef struct AppendResponse /* and custom neon feedback. */ /* This part of the message is extensible. */ PageserverFeedback rf; -} AppendResponse; +} AppendResponse; /* PageserverFeedback is extensible part of the message that is parsed separately */ /* Other fields are fixed part */ #define APPENDRESPONSE_FIXEDPART_SIZE offsetof(AppendResponse, rf) +struct WalProposer; +typedef struct WalProposer WalProposer; + /* * Descriptor of safekeeper */ typedef struct Safekeeper { + WalProposer *wp; + char const *host; char const *port; @@ -340,7 +331,7 @@ typedef struct Safekeeper * * May contain private information like password and should not be logged. */ - char conninfo[MAXCONNINFO]; + char conninfo[MAXCONNINFO]; /* * postgres protocol connection to the WAL acceptor @@ -373,27 +364,12 @@ typedef struct Safekeeper int eventPos; /* position in wait event set. 
Equal to -1 if* * no event */ SafekeeperState state; /* safekeeper state machine state */ - TimestampTz latestMsgReceivedAt; /* when latest msg is received */ + TimestampTz latestMsgReceivedAt; /* when latest msg is received */ AcceptorGreeting greetResponse; /* acceptor greeting */ VoteResponse voteResponse; /* the vote */ AppendResponse appendResponse; /* feedback for master */ } Safekeeper; -extern void PGDLLEXPORT WalProposerSync(int argc, char *argv[]); -extern void PGDLLEXPORT WalProposerMain(Datum main_arg); -extern void WalProposerBroadcast(XLogRecPtr startpos, XLogRecPtr endpos); -extern void WalProposerPoll(void); -extern void ParsePageserverFeedbackMessage(StringInfo reply_message, - PageserverFeedback *rf); -extern void StartProposerReplication(StartReplicationCmd *cmd); - -extern Size WalproposerShmemSize(void); -extern bool WalproposerShmemInit(void); -extern void replication_feedback_set(PageserverFeedback *rf); -extern void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn); - -/* libpqwalproposer hooks & helper type */ - /* Re-exported PostgresPollingStatusType */ typedef enum { @@ -406,7 +382,7 @@ typedef enum * 'libpq-fe.h' still has PGRES_POLLING_ACTIVE, but says it's unused. * We've removed it here to avoid clutter. */ -} WalProposerConnectPollStatusType; +} WalProposerConnectPollStatusType; /* Re-exported and modified ExecStatusType */ typedef enum @@ -431,7 +407,7 @@ typedef enum WP_EXEC_NEEDS_INPUT, /* Catch-all failure. Check PQerrorMessage. */ WP_EXEC_FAILED, -} WalProposerExecStatusType; +} WalProposerExecStatusType; /* Re-exported ConnStatusType */ typedef enum @@ -445,67 +421,252 @@ typedef enum * that extra functionality, so we collect them into a single tag here. 
*/ WP_CONNECTION_IN_PROGRESS, -} WalProposerConnStatusType; - -/* Re-exported PQerrorMessage */ -extern char *walprop_error_message(WalProposerConn *conn); - -/* Re-exported PQstatus */ -extern WalProposerConnStatusType walprop_status(WalProposerConn *conn); - -/* Re-exported PQconnectStart */ -extern WalProposerConn * walprop_connect_start(char *conninfo, char *password); - -/* Re-exported PQconectPoll */ -extern WalProposerConnectPollStatusType walprop_connect_poll(WalProposerConn *conn); - -/* Blocking wrapper around PQsendQuery */ -extern bool walprop_send_query(WalProposerConn *conn, char *query); - -/* Wrapper around PQconsumeInput + PQisBusy + PQgetResult */ -extern WalProposerExecStatusType walprop_get_query_result(WalProposerConn *conn); - -/* Re-exported PQsocket */ -extern pgsocket walprop_socket(WalProposerConn *conn); - -/* Wrapper around PQconsumeInput (if socket's read-ready) + PQflush */ -extern int walprop_flush(WalProposerConn *conn); - -/* Re-exported PQfinish */ -extern void walprop_finish(WalProposerConn *conn); +} WalProposerConnStatusType; /* - * Ergonomic wrapper around PGgetCopyData - * - * Reads a CopyData block from a safekeeper, setting *amount to the number - * of bytes returned. - * - * This function is allowed to assume certain properties specific to the - * protocol with the safekeepers, so it should not be used as-is for any - * other purpose. - * - * Note: If possible, using is generally preferred, because it - * performs a bit of extra checking work that's always required and is normally - * somewhat verbose. + * Collection of hooks for walproposer, to call postgres functions, + * read WAL and send it over the network. */ -extern PGAsyncReadResult walprop_async_read(WalProposerConn *conn, char **buf, int *amount); +typedef struct walproposer_api +{ + /* + * Get WalproposerShmemState. This is used to store information about last + * elected term. 
+ */ + WalproposerShmemState *(*get_shmem_state) (void); + + /* + * Start receiving notifications about new WAL. This is an infinite loop + * which calls WalProposerBroadcast() and WalProposerPoll() to send the + * WAL. + */ + void (*start_streaming) (WalProposer *wp, XLogRecPtr startpos); + + /* Get pointer to the latest available WAL. */ + XLogRecPtr (*get_flush_rec_ptr) (void); + + /* Get current time. */ + TimestampTz (*get_current_timestamp) (void); + + /* Get postgres timeline. */ + TimeLineID (*get_timeline_id) (void); + + /* Current error message, aka PQerrorMessage. */ + char *(*conn_error_message) (WalProposerConn *conn); + + /* Connection status, aka PQstatus. */ + WalProposerConnStatusType (*conn_status) (WalProposerConn *conn); + + /* Start the connection, aka PQconnectStart. */ + WalProposerConn *(*conn_connect_start) (char *conninfo); + + /* Poll an asynchronous connection, aka PQconnectPoll. */ + WalProposerConnectPollStatusType (*conn_connect_poll) (WalProposerConn *conn); + + /* Send a blocking SQL query, aka PQsendQuery. */ + bool (*conn_send_query) (WalProposerConn *conn, char *query); + + /* Read the query result, aka PQgetResult. */ + WalProposerExecStatusType (*conn_get_query_result) (WalProposerConn *conn); + + /* Flush buffer to the network, aka PQflush. */ + int (*conn_flush) (WalProposerConn *conn); + + /* Close the connection, aka PQfinish. */ + void (*conn_finish) (WalProposerConn *conn); + + /* Try to read CopyData message, aka PQgetCopyData. */ + PGAsyncReadResult (*conn_async_read) (WalProposerConn *conn, char **buf, int *amount); + + /* Try to write CopyData message, aka PQputCopyData. */ + PGAsyncWriteResult (*conn_async_write) (WalProposerConn *conn, void const *buf, size_t size); + + /* Blocking CopyData write, aka PQputCopyData + PQflush. */ + bool (*conn_blocking_write) (WalProposerConn *conn, void const *buf, size_t size); + + /* Download WAL from startpos to endpos and make it available locally. 
*/ + bool (*recovery_download) (Safekeeper *sk, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos); + + /* Read WAL from disk to buf. */ + void (*wal_read) (XLogReaderState *state, char *buf, XLogRecPtr startptr, Size count); + + /* Allocate WAL reader. */ + XLogReaderState *(*wal_reader_allocate) (void); + + /* Deallocate event set. */ + void (*free_event_set) (void); + + /* Initialize event set. */ + void (*init_event_set) (int n_safekeepers); + + /* Update events for an existing safekeeper connection. */ + void (*update_event_set) (Safekeeper *sk, uint32 events); + + /* Add a new safekeeper connection to the event set. */ + void (*add_safekeeper_event_set) (Safekeeper *sk, uint32 events); + + /* + * Wait until some event happens: - timeout is reached - socket event for + * safekeeper connection - new WAL is available + * + * Returns 0 if timeout is reached, 1 if some event happened. Updates + * events mask to indicate events and sets sk to the safekeeper which has + * an event. + */ + int (*wait_event_set) (long timeout, Safekeeper **sk, uint32 *events); + + /* Read random bytes. */ + bool (*strong_random) (void *buf, size_t len); + + /* + * Get a basebackup LSN. Used to cross-validate with the latest available + * LSN on the safekeepers. + */ + XLogRecPtr (*get_redo_start_lsn) (void); + + /* + * Finish sync safekeepers with the given LSN. This function should not + * return and should exit the program. + */ + void (*finish_sync_safekeepers) (XLogRecPtr lsn); + + /* + * Called after every new message from the safekeeper. Used to propagate + * backpressure feedback and to confirm WAL persistence (has been committed + * on the quorum of safekeepers). + */ + void (*process_safekeeper_feedback) (WalProposer *wp, XLogRecPtr commitLsn); + + /* + * Called on peer_horizon_lsn updates. Used to advance replication slot + * and to free up disk space by deleting unnecessary WAL.
+ */ + void (*confirm_wal_streamed) (XLogRecPtr lsn); +} walproposer_api; /* - * Ergonomic wrapper around PQputCopyData + PQflush - * - * Starts to write a CopyData block to a safekeeper. - * - * For information on the meaning of return codes, refer to PGAsyncWriteResult. + * Configuration of the WAL proposer. */ -extern PGAsyncWriteResult walprop_async_write(WalProposerConn *conn, void const *buf, size_t size); +typedef struct WalProposerConfig +{ + /* hex-encoded TenantId cstr */ + char *neon_tenant; + + /* hex-encoded TimelineId cstr */ + char *neon_timeline; + + /* + * Comma-separated list of safekeepers, in the following format: + * host1:port1,host2:port2,host3:port3 + * + * This cstr should be editable. + */ + char *safekeepers_list; + + /* + * WalProposer reconnects to offline safekeepers once in this interval. + * Time is in milliseconds. + */ + int safekeeper_reconnect_timeout; + + /* + * WalProposer terminates the connection if it doesn't receive any message + * from the safekeeper in this interval. Time is in milliseconds. + */ + int safekeeper_connection_timeout; + + /* + * WAL segment size. Will be passed to safekeepers in greet request. Also + * used to detect page headers. + */ + int wal_segment_size; + + /* + * If safekeeper was started in sync mode, walproposer will not subscribe + * for new WAL and will exit when quorum of safekeepers will be synced to + * the latest available LSN. + */ + bool syncSafekeepers; + + /* Will be passed to safekeepers in greet request. */ + uint64 systemId; +} WalProposerConfig; + /* - * Blocking equivalent to walprop_async_write_fn - * - * Returns 'true' if successful, 'false' on failure. + * WAL proposer state. 
*/ -extern bool walprop_blocking_write(WalProposerConn *conn, void const *buf, size_t size); +typedef struct WalProposer +{ + WalProposerConfig *config; + int n_safekeepers; -extern uint64 BackpressureThrottlingTime(void); + /* (n_safekeepers / 2) + 1 */ + int quorum; + + Safekeeper safekeeper[MAX_SAFEKEEPERS]; + + /* WAL has been generated up to this point */ + XLogRecPtr availableLsn; + + /* last commitLsn broadcasted to safekeepers */ + XLogRecPtr lastSentCommitLsn; + + ProposerGreeting greetRequest; + + /* Vote request for safekeeper */ + VoteRequest voteRequest; + + /* + * Minimal LSN which may be needed for recovery of some safekeeper, + * record-aligned (first record which might not yet received by someone). + */ + XLogRecPtr truncateLsn; + + /* + * Term of the proposer. We want our term to be highest and unique, so we + * collect terms from safekeepers quorum, choose max and +1. After that + * our term is fixed and must not change. If we observe that some + * safekeeper has higher term, it means that we have another running + * compute, so we must stop immediately. + */ + term_t propTerm; + + /* term history of the proposer */ + TermHistory propTermHistory; + + /* epoch start lsn of the proposer */ + XLogRecPtr propEpochStartLsn; + + /* Most advanced acceptor epoch */ + term_t donorEpoch; + + /* Most advanced acceptor */ + int donor; + + /* timeline globally starts at this LSN */ + XLogRecPtr timelineStartLsn; + + /* number of votes collected from safekeepers */ + int n_votes; + + /* number of successful connections over the lifetime of walproposer */ + int n_connected; + + /* + * Timestamp of the last reconnection attempt. 
Related to + * config->safekeeper_reconnect_timeout + */ + TimestampTz last_reconnect_attempt; + + walproposer_api api; +} WalProposer; + +extern WalProposer *WalProposerCreate(WalProposerConfig *config, walproposer_api api); +extern void WalProposerStart(WalProposer *wp); +extern void WalProposerBroadcast(WalProposer *wp, XLogRecPtr startpos, XLogRecPtr endpos); +extern void WalProposerPoll(WalProposer *wp); +extern void ParsePageserverFeedbackMessage(StringInfo reply_message, + PageserverFeedback *rf); #endif /* __NEON_WALPROPOSER_H__ */ diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c new file mode 100644 index 0000000000..654b411e94 --- /dev/null +++ b/pgxn/neon/walproposer_pg.c @@ -0,0 +1,1667 @@ +/* + * Implementation of postgres based walproposer disk and IO routines, i.e. the + * real ones. The reason this is separate from walproposer.c is ability to + * replace them with mocks, allowing to do simulation testing. + * + * Also contains initialization of postgres based walproposer. 
+ */ + +#include "postgres.h" + +#include +#include +#include +#include "access/xact.h" +#include "access/xlogdefs.h" +#include "access/xlogutils.h" +#include "access/xloginsert.h" +#if PG_VERSION_NUM >= 150000 +#include "access/xlogrecovery.h" +#endif +#include "storage/fd.h" +#include "storage/latch.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "access/xlog.h" +#include "libpq/pqformat.h" +#include "replication/slot.h" +#include "replication/walreceiver.h" +#include "replication/walsender_private.h" +#include "postmaster/bgworker.h" +#include "postmaster/interrupt.h" +#include "postmaster/postmaster.h" +#include "storage/pmsignal.h" +#include "storage/proc.h" +#include "storage/ipc.h" +#include "storage/lwlock.h" +#include "storage/shmem.h" +#include "storage/spin.h" +#include "tcop/tcopprot.h" +#include "utils/builtins.h" +#include "utils/guc.h" +#include "utils/memutils.h" +#include "utils/ps_status.h" +#include "utils/timestamp.h" + +#include "neon.h" +#include "walproposer.h" +#include "libpq-fe.h" + +#define XLOG_HDR_SIZE (1 + 8 * 3) /* 'w' + startPos + walEnd + timestamp */ +#define XLOG_HDR_START_POS 1 /* offset of start position in wal sender* + * message header */ + +#define WAL_PROPOSER_SLOT_NAME "wal_proposer_slot" + +char *wal_acceptors_list = ""; +int wal_acceptor_reconnect_timeout = 1000; +int wal_acceptor_connection_timeout = 10000; + +static AppendResponse quorumFeedback; +static WalproposerShmemState *walprop_shared; +static WalProposerConfig walprop_config; +static XLogRecPtr sentPtr = InvalidXLogRecPtr; +static const walproposer_api walprop_pg; + +static void nwp_shmem_startup_hook(void); +static void nwp_register_gucs(void); +static void nwp_prepare_shmem(void); +static uint64 backpressure_lag_impl(void); +static bool backpressure_throttling_impl(void); +static void walprop_register_bgworker(void); + +static void walprop_pg_init_standalone_sync_safekeepers(void); +static void walprop_pg_init_walsender(void); +static void 
walprop_pg_init_bgworker(void); +static TimestampTz walprop_pg_get_current_timestamp(void); +static void walprop_pg_load_libpqwalreceiver(void); + +static process_interrupts_callback_t PrevProcessInterruptsCallback; +static shmem_startup_hook_type prev_shmem_startup_hook_type; +#if PG_VERSION_NUM >= 150000 +static shmem_request_hook_type prev_shmem_request_hook = NULL; +static void walproposer_shmem_request(void); +#endif + +static void StartProposerReplication(WalProposer *wp, StartReplicationCmd *cmd); +static void WalSndLoop(WalProposer *wp); +static void XLogBroadcastWalProposer(WalProposer *wp); + +static void XLogWalPropWrite(char *buf, Size nbytes, XLogRecPtr recptr); +static void XLogWalPropClose(XLogRecPtr recptr); + +static void +init_walprop_config(bool syncSafekeepers) +{ + walprop_config.neon_tenant = neon_tenant; + walprop_config.neon_timeline = neon_timeline; + walprop_config.safekeepers_list = wal_acceptors_list; + walprop_config.safekeeper_reconnect_timeout = wal_acceptor_reconnect_timeout; + walprop_config.safekeeper_connection_timeout = wal_acceptor_connection_timeout; + walprop_config.wal_segment_size = wal_segment_size; + walprop_config.syncSafekeepers = syncSafekeepers; + if (!syncSafekeepers) + walprop_config.systemId = GetSystemIdentifier(); + else + walprop_config.systemId = 0; +} + +/* + * Entry point for `postgres --sync-safekeepers`. + */ +PGDLLEXPORT void +WalProposerSync(int argc, char *argv[]) +{ + WalProposer *wp; + + init_walprop_config(true); + walprop_pg_init_standalone_sync_safekeepers(); + walprop_pg_load_libpqwalreceiver(); + + wp = WalProposerCreate(&walprop_config, walprop_pg); + + WalProposerStart(wp); +} + +/* + * WAL proposer bgworker entry point. 
+ */ +PGDLLEXPORT void +WalProposerMain(Datum main_arg) +{ + WalProposer *wp; + + init_walprop_config(false); + walprop_pg_init_bgworker(); + walprop_pg_load_libpqwalreceiver(); + + wp = WalProposerCreate(&walprop_config, walprop_pg); + wp->last_reconnect_attempt = walprop_pg_get_current_timestamp(); + + walprop_pg_init_walsender(); + WalProposerStart(wp); +} + +/* + * Initialize GUCs, bgworker, shmem and backpressure. + */ +void +pg_init_walproposer(void) +{ + if (!process_shared_preload_libraries_in_progress) + return; + + nwp_register_gucs(); + + nwp_prepare_shmem(); + + delay_backend_us = &backpressure_lag_impl; + PrevProcessInterruptsCallback = ProcessInterruptsCallback; + ProcessInterruptsCallback = backpressure_throttling_impl; + + walprop_register_bgworker(); +} + +static void +nwp_register_gucs(void) +{ + DefineCustomStringVariable( + "neon.safekeepers", + "List of Neon WAL acceptors (host:port)", + NULL, /* long_desc */ + &wal_acceptors_list, /* valueAddr */ + "", /* bootValue */ + PGC_POSTMASTER, + GUC_LIST_INPUT, /* extensions can't use* + * GUC_LIST_QUOTE */ + NULL, NULL, NULL); + + DefineCustomIntVariable( + "neon.safekeeper_reconnect_timeout", + "Walproposer reconnects to offline safekeepers once in this interval.", + NULL, + &wal_acceptor_reconnect_timeout, + 1000, 0, INT_MAX, /* default, min, max */ + PGC_SIGHUP, /* context */ + GUC_UNIT_MS, /* flags */ + NULL, NULL, NULL); + + DefineCustomIntVariable( + "neon.safekeeper_connect_timeout", + "Connection or connection attempt to safekeeper is terminated if no message is received (or connection attempt doesn't finish) within this period.", + NULL, + &wal_acceptor_connection_timeout, + 10000, 0, INT_MAX, + PGC_SIGHUP, + GUC_UNIT_MS, + NULL, NULL, NULL); +} + +/* Check if we need to suspend inserts because of lagging replication. 
*/ +static uint64 +backpressure_lag_impl(void) +{ + if (max_replication_apply_lag > 0 || max_replication_flush_lag > 0 || max_replication_write_lag > 0) + { + XLogRecPtr writePtr; + XLogRecPtr flushPtr; + XLogRecPtr applyPtr; +#if PG_VERSION_NUM >= 150000 + XLogRecPtr myFlushLsn = GetFlushRecPtr(NULL); +#else + XLogRecPtr myFlushLsn = GetFlushRecPtr(); +#endif + replication_feedback_get_lsns(&writePtr, &flushPtr, &applyPtr); +#define MB ((XLogRecPtr)1024 * 1024) + + elog(DEBUG2, "current flushLsn %X/%X PageserverFeedback: write %X/%X flush %X/%X apply %X/%X", + LSN_FORMAT_ARGS(myFlushLsn), + LSN_FORMAT_ARGS(writePtr), + LSN_FORMAT_ARGS(flushPtr), + LSN_FORMAT_ARGS(applyPtr)); + + if ((writePtr != InvalidXLogRecPtr && max_replication_write_lag > 0 && myFlushLsn > writePtr + max_replication_write_lag * MB)) + { + return (myFlushLsn - writePtr - max_replication_write_lag * MB); + } + + if ((flushPtr != InvalidXLogRecPtr && max_replication_flush_lag > 0 && myFlushLsn > flushPtr + max_replication_flush_lag * MB)) + { + return (myFlushLsn - flushPtr - max_replication_flush_lag * MB); + } + + if ((applyPtr != InvalidXLogRecPtr && max_replication_apply_lag > 0 && myFlushLsn > applyPtr + max_replication_apply_lag * MB)) + { + return (myFlushLsn - applyPtr - max_replication_apply_lag * MB); + } + } + return 0; +} + +/* + * WalproposerShmemSize --- report amount of shared memory space needed + */ +static Size +WalproposerShmemSize(void) +{ + return sizeof(WalproposerShmemState); +} + +static bool +WalproposerShmemInit(void) +{ + bool found; + + LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); + walprop_shared = ShmemInitStruct("Walproposer shared state", + sizeof(WalproposerShmemState), + &found); + + if (!found) + { + memset(walprop_shared, 0, WalproposerShmemSize()); + SpinLockInit(&walprop_shared->mutex); + pg_atomic_init_u64(&walprop_shared->backpressureThrottlingTime, 0); + } + LWLockRelease(AddinShmemInitLock); + + return found; +} + +#define BACK_PRESSURE_DELAY 10000L 
// 0.01 sec + +static bool +backpressure_throttling_impl(void) +{ + uint64 lag; + TimestampTz start, + stop; + bool retry = PrevProcessInterruptsCallback + ? PrevProcessInterruptsCallback() + : false; + + /* + * Don't throttle read only transactions or wal sender. Do throttle CREATE + * INDEX CONCURRENTLY, however. It performs some stages outside a + * transaction, even though it writes a lot of WAL. Check PROC_IN_SAFE_IC + * flag to cover that case. + */ + if (am_walsender + || (!(MyProc->statusFlags & PROC_IN_SAFE_IC) + && !TransactionIdIsValid(GetCurrentTransactionIdIfAny()))) + return retry; + + /* Calculate replicas lag; zero means we are within the configured limits */ + lag = backpressure_lag_impl(); + if (lag == 0) + return retry; + + /* Suspend writers until replicas catch up */ + set_ps_display("backpressure throttling"); + + elog(DEBUG2, "backpressure throttling: lag " UINT64_FORMAT, lag); + start = GetCurrentTimestamp(); + pg_usleep(BACK_PRESSURE_DELAY); + stop = GetCurrentTimestamp(); + pg_atomic_add_fetch_u64(&walprop_shared->backpressureThrottlingTime, stop - start); + return true; +} + +uint64 +BackpressureThrottlingTime(void) +{ + return pg_atomic_read_u64(&walprop_shared->backpressureThrottlingTime); +} + +/* + * Register a background worker proposing WAL to wal acceptors. + */ +static void +walprop_register_bgworker(void) +{ + BackgroundWorker bgw; + + /* If no wal acceptors are specified, don't start the background worker.
*/ + if (*wal_acceptors_list == '\0') + return; + + memset(&bgw, 0, sizeof(bgw)); + bgw.bgw_flags = BGWORKER_SHMEM_ACCESS; + bgw.bgw_start_time = BgWorkerStart_RecoveryFinished; + snprintf(bgw.bgw_library_name, BGW_MAXLEN, "neon"); + snprintf(bgw.bgw_function_name, BGW_MAXLEN, "WalProposerMain"); + snprintf(bgw.bgw_name, BGW_MAXLEN, "WAL proposer"); + snprintf(bgw.bgw_type, BGW_MAXLEN, "WAL proposer"); + bgw.bgw_restart_time = 5; + bgw.bgw_notify_pid = 0; + bgw.bgw_main_arg = (Datum) 0; + + RegisterBackgroundWorker(&bgw); +} + +/* shmem handling */ + +static void +nwp_prepare_shmem(void) +{ +#if PG_VERSION_NUM >= 150000 + prev_shmem_request_hook = shmem_request_hook; + shmem_request_hook = walproposer_shmem_request; +#else + RequestAddinShmemSpace(WalproposerShmemSize()); +#endif + prev_shmem_startup_hook_type = shmem_startup_hook; + shmem_startup_hook = nwp_shmem_startup_hook; +} + +#if PG_VERSION_NUM >= 150000 +/* + * shmem_request hook: request additional shared resources. We'll allocate or + * attach to the shared resources in nwp_shmem_startup_hook(). 
+ */ +static void +walproposer_shmem_request(void) +{ + if (prev_shmem_request_hook) + prev_shmem_request_hook(); + + RequestAddinShmemSpace(WalproposerShmemSize()); +} +#endif + +static void +nwp_shmem_startup_hook(void) +{ + if (prev_shmem_startup_hook_type) + prev_shmem_startup_hook_type(); + + WalproposerShmemInit(); +} + +static WalproposerShmemState * +walprop_pg_get_shmem_state(void) +{ + Assert(walprop_shared != NULL); + return walprop_shared; +} + +void +replication_feedback_set(PageserverFeedback *rf) +{ + SpinLockAcquire(&walprop_shared->mutex); + memcpy(&walprop_shared->feedback, rf, sizeof(PageserverFeedback)); + SpinLockRelease(&walprop_shared->mutex); +} + +void +replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn) +{ + SpinLockAcquire(&walprop_shared->mutex); + *writeLsn = walprop_shared->feedback.last_received_lsn; + *flushLsn = walprop_shared->feedback.disk_consistent_lsn; + *applyLsn = walprop_shared->feedback.remote_consistent_lsn; + SpinLockRelease(&walprop_shared->mutex); +} + +/* + * Start walsender streaming replication + */ +static void +walprop_pg_start_streaming(WalProposer *wp, XLogRecPtr startpos) +{ + StartReplicationCmd cmd; + + elog(LOG, "WAL proposer starts streaming at %X/%X", + LSN_FORMAT_ARGS(startpos)); + cmd.slotname = WAL_PROPOSER_SLOT_NAME; + cmd.timeline = wp->greetRequest.timeline; + cmd.startpoint = startpos; + StartProposerReplication(wp, &cmd); +} + +static void +walprop_pg_init_walsender(void) +{ + am_walsender = true; + InitWalSender(); + InitProcessPhase2(); + + /* Create replication slot for WAL proposer if not exists */ + if (SearchNamedReplicationSlot(WAL_PROPOSER_SLOT_NAME, false) == NULL) + { + ReplicationSlotCreate(WAL_PROPOSER_SLOT_NAME, false, RS_PERSISTENT, false); + ReplicationSlotReserveWal(); + /* Write this slot to disk */ + ReplicationSlotMarkDirty(); + ReplicationSlotSave(); + ReplicationSlotRelease(); + } +} + +static void 
+walprop_pg_init_standalone_sync_safekeepers(void) +{ + struct stat stat_buf; + +#if PG_VERSION_NUM < 150000 + ThisTimeLineID = 1; +#endif + + /* + * Initialize postmaster_alive_fds as WaitEventSet checks them. + * + * Copied from InitPostmasterDeathWatchHandle() + */ + if (pipe(postmaster_alive_fds) < 0) + ereport(FATAL, + (errcode_for_file_access(), + errmsg_internal("could not create pipe to monitor postmaster death: %m"))); + if (fcntl(postmaster_alive_fds[POSTMASTER_FD_WATCH], F_SETFL, O_NONBLOCK) == -1) + ereport(FATAL, + (errcode_for_socket_access(), + errmsg_internal("could not set postmaster death monitoring pipe to nonblocking mode: %m"))); + + ChangeToDataDir(); + + /* Create pg_wal directory, if it doesn't exist */ + if (stat(XLOGDIR, &stat_buf) != 0) + { + ereport(LOG, (errmsg("creating missing WAL directory \"%s\"", XLOGDIR))); + if (MakePGDirectory(XLOGDIR) < 0) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create directory \"%s\": %m", + XLOGDIR))); + exit(1); + } + } + BackgroundWorkerUnblockSignals(); +} + +static void +walprop_pg_init_bgworker(void) +{ +#if PG_VERSION_NUM >= 150000 + TimeLineID tli; +#endif + + /* Establish signal handlers. */ + pqsignal(SIGUSR1, procsignal_sigusr1_handler); + pqsignal(SIGHUP, SignalHandlerForConfigReload); + pqsignal(SIGTERM, die); + + BackgroundWorkerUnblockSignals(); + + application_name = (char *) "walproposer"; /* for + * synchronous_standby_names */ + +#if PG_VERSION_NUM >= 150000 + /* FIXME pass proper tli to WalProposerInit ? 
*/ + GetXLogReplayRecPtr(&tli); +#else + GetXLogReplayRecPtr(&ThisTimeLineID); +#endif +} + +static XLogRecPtr +walprop_pg_get_flush_rec_ptr(void) +{ +#if PG_MAJORVERSION_NUM < 15 + return GetFlushRecPtr(); +#else + return GetFlushRecPtr(NULL); +#endif +} + +static TimestampTz +walprop_pg_get_current_timestamp(void) +{ + return GetCurrentTimestamp(); +} + +static TimeLineID +walprop_pg_get_timeline_id(void) +{ +#if PG_VERSION_NUM >= 150000 + /* FIXME don't use hardcoded timeline id */ + return 1; +#else + return ThisTimeLineID; +#endif +} + +static void +walprop_pg_load_libpqwalreceiver(void) +{ + load_file("libpqwalreceiver", false); + if (WalReceiverFunctions == NULL) + elog(ERROR, "libpqwalreceiver didn't initialize correctly"); +} + +/* Header in walproposer.h -- Wrapper struct to abstract away the libpq connection */ +struct WalProposerConn +{ + PGconn *pg_conn; + bool is_nonblocking; /* whether the connection is non-blocking */ + char *recvbuf; /* last received data from walprop_async_read */ +}; + +/* Helper function */ +static bool +ensure_nonblocking_status(WalProposerConn *conn, bool is_nonblocking) +{ + /* If we're already correctly blocking or nonblocking, all good */ + if (is_nonblocking == conn->is_nonblocking) + return true; + + /* Otherwise, set it appropriately */ + if (PQsetnonblocking(conn->pg_conn, is_nonblocking) == -1) + return false; + + conn->is_nonblocking = is_nonblocking; + return true; +} + +/* Exported function definitions */ +static char * +walprop_error_message(WalProposerConn *conn) +{ + return PQerrorMessage(conn->pg_conn); +} + +static WalProposerConnStatusType +walprop_status(WalProposerConn *conn) +{ + switch (PQstatus(conn->pg_conn)) + { + case CONNECTION_OK: + return WP_CONNECTION_OK; + case CONNECTION_BAD: + return WP_CONNECTION_BAD; + default: + return WP_CONNECTION_IN_PROGRESS; + } +} + +static WalProposerConn * +walprop_connect_start(char *conninfo) +{ + WalProposerConn *conn; + PGconn *pg_conn; + const char *keywords[3]; + 
const char *values[3]; + int n; + char *password = neon_auth_token; + + /* + * Connect using the given connection string. If neon_auth_token is set + * (populated from the NEON_AUTH_TOKEN environment variable, per the + * original wording of this comment), use that as the password. + * + * The connection options are parsed in the order they're given, so when + * we set the password before the connection string, the connection string + * can override the password from the env variable. Seems useful, although + * we don't currently use that capability anywhere. + * + * The final argument of PQconnectStartParams() (expand_dbname) is 1 so + * that the connection string passed as "dbname" is expanded by libpq. + */ + n = 0; + if (password) + { + keywords[n] = "password"; + values[n] = password; + n++; + } + keywords[n] = "dbname"; + values[n] = conninfo; + n++; + keywords[n] = NULL; + values[n] = NULL; + n++; + pg_conn = PQconnectStartParams(keywords, values, 1); + + /* + * Allocation of a PGconn can fail, and will return NULL. We want to fully + * replicate the behavior of PQconnectStart here. + */ + if (!pg_conn) + return NULL; + + /* + * And in theory this allocation can fail as well, but it's incredibly + * unlikely if we just successfully allocated a PGconn. + * + * palloc will exit on failure though, so there's not much we could do if + * it *did* fail. + */ + conn = palloc(sizeof(WalProposerConn)); + conn->pg_conn = pg_conn; + conn->is_nonblocking = false; /* connections always start in blocking + * mode */ + conn->recvbuf = NULL; + return conn; +} +/* + * Advance an in-progress connection with PQconnectPoll() and translate the + * libpq polling status into its WP_CONN_POLLING_* counterpart. + */ +static WalProposerConnectPollStatusType +walprop_connect_poll(WalProposerConn *conn) +{ + WalProposerConnectPollStatusType return_val; + + switch (PQconnectPoll(conn->pg_conn)) + { + case PGRES_POLLING_FAILED: + return_val = WP_CONN_POLLING_FAILED; + break; + case PGRES_POLLING_READING: + return_val = WP_CONN_POLLING_READING; + break; + case PGRES_POLLING_WRITING: + return_val = WP_CONN_POLLING_WRITING; + break; + case PGRES_POLLING_OK: + return_val = WP_CONN_POLLING_OK; + break; + + /* + * There's a comment at its source about this constant being + * unused. We'll expect it's never returned.
+ */ + case PGRES_POLLING_ACTIVE: + elog(FATAL, "Unexpected PGRES_POLLING_ACTIVE returned from PQconnectPoll"); + + /* + * This return is never actually reached, but it's here to make + * the compiler happy + */ + return WP_CONN_POLLING_FAILED; + + default: + Assert(false); + return_val = WP_CONN_POLLING_FAILED; /* keep the compiler quiet */ + } + + return return_val; +} +/* + * Send a query without waiting for its result. Returns false if the + * connection could not be switched to blocking mode or PQsendQuery() fails. + */ +static bool +walprop_send_query(WalProposerConn *conn, char *query) +{ + /* + * We need to be in blocking mode for sending the query to run without + * requiring a call to PQflush + */ + if (!ensure_nonblocking_status(conn, false)) + return false; + + /* PQsendQuery returns 1 on success, 0 on failure */ + if (!PQsendQuery(conn->pg_conn, query)) + return false; + + return true; +} +/* + * Fetch the result of a previously sent query and translate its status. + * Returns WP_EXEC_FAILED if PQconsumeInput() fails and WP_EXEC_NEEDS_INPUT + * while libpq reports the connection as busy. + */ +static WalProposerExecStatusType +walprop_get_query_result(WalProposerConn *conn) +{ + PGresult *result; + WalProposerExecStatusType return_val; + + /* Marker variable if we need to log an unexpected success result */ + char *unexpected_success = NULL; + + /* Consume any input that we might be missing */ + if (!PQconsumeInput(conn->pg_conn)) + return WP_EXEC_FAILED; + + if (PQisBusy(conn->pg_conn)) + return WP_EXEC_NEEDS_INPUT; + + + result = PQgetResult(conn->pg_conn); + + /* + * PQgetResult returns NULL only if getting the result was successful & + * there's no more of the result to get.
+ */ + if (!result) + { + elog(WARNING, "[libpqwalproposer] Unexpected successful end of command results"); + return WP_EXEC_UNEXPECTED_SUCCESS; + } + + /* Helper macro to reduce boilerplate */ +#define UNEXPECTED_SUCCESS(msg) \ + return_val = WP_EXEC_UNEXPECTED_SUCCESS; \ + unexpected_success = msg; \ + break; + + + switch (PQresultStatus(result)) + { + /* "true" success case */ + case PGRES_COPY_BOTH: + return_val = WP_EXEC_SUCCESS_COPYBOTH; + break; + + /* Unexpected success case */ + case PGRES_EMPTY_QUERY: + UNEXPECTED_SUCCESS("empty query return"); + case PGRES_COMMAND_OK: + UNEXPECTED_SUCCESS("data-less command end"); + case PGRES_TUPLES_OK: + UNEXPECTED_SUCCESS("tuples return"); + case PGRES_COPY_OUT: + UNEXPECTED_SUCCESS("'Copy Out' response"); + case PGRES_COPY_IN: + UNEXPECTED_SUCCESS("'Copy In' response"); + case PGRES_SINGLE_TUPLE: + UNEXPECTED_SUCCESS("single tuple return"); + case PGRES_PIPELINE_SYNC: + UNEXPECTED_SUCCESS("pipeline sync point"); + + /* Failure cases */ + case PGRES_BAD_RESPONSE: + case PGRES_NONFATAL_ERROR: + case PGRES_FATAL_ERROR: + case PGRES_PIPELINE_ABORTED: + return_val = WP_EXEC_FAILED; + break; + + default: + Assert(false); + return_val = WP_EXEC_FAILED; /* keep the compiler quiet */ + } + + /* + * Only the status of the result is used. libpq hands ownership of every + * PGresult to the caller, so free it here instead of leaking one per + * call. The messages logged below are string literals and stay valid. + */ + PQclear(result); + + if (unexpected_success) + elog(WARNING, "[libpqwalproposer] Unexpected successful %s", unexpected_success); + + return return_val; +} + +/* Return the socket of the underlying libpq connection. */ +static pgsocket +walprop_socket(WalProposerConn *conn) +{ + return PQsocket(conn->pg_conn); +} + +/* Thin wrapper around PQflush(); its return value is passed through. */ +static int +walprop_flush(WalProposerConn *conn) +{ + return (PQflush(conn->pg_conn)); +} + +/* + * Free the last receive buffer (if any), close the libpq connection and + * release the wrapper struct itself. + */ +static void +walprop_finish(WalProposerConn *conn) +{ + if (conn->recvbuf != NULL) + PQfreemem(conn->recvbuf); + PQfinish(conn->pg_conn); + pfree(conn); +} + +/* + * Receive a message from the safekeeper. + * + * On success, the data is placed in *buf. It is valid until the next call + * to this function.
+ */ +static PGAsyncReadResult +walprop_async_read(WalProposerConn *conn, char **buf, int *amount) +{ + int result; + + if (conn->recvbuf != NULL) /* free the buffer returned by the previous call */ + { + PQfreemem(conn->recvbuf); + conn->recvbuf = NULL; + } + + /* Call PQconsumeInput so that we have the data we need */ + if (!PQconsumeInput(conn->pg_conn)) + { + *amount = 0; + *buf = NULL; + return PG_ASYNC_READ_FAIL; + } + + /* + * The docs for PQgetCopyData list the return values as: + * 0 if the copy is still in progress, but no "complete row" is available + * -1 if the copy is done + * -2 if an error occurred + * (> 0) if it was successful; that value is the amount transferred. + * + * The last argument (async) is true, so the call does not block waiting + * for data; that is the case in which 0 can be returned. + * + * The protocol we use between walproposer and safekeeper means that we + * *usually* wouldn't expect to see that the copy is done, but this can + * sometimes be triggered by the server returning an ErrorResponse (which + * also happens to have the effect that the copy is done). + */ + switch (result = PQgetCopyData(conn->pg_conn, &conn->recvbuf, true)) + { + case 0: + *amount = 0; + *buf = NULL; + return PG_ASYNC_READ_TRY_AGAIN; + case -1: + { + /* + * If we get -1, it's probably because of a server error; the + * safekeeper won't normally send a CopyDone message.
+ * + * We can check PQgetResult to make sure that the server + * failed; it'll always result in PGRES_FATAL_ERROR + */ + ExecStatusType status = PQresultStatus(PQgetResult(conn->pg_conn)); + + if (status != PGRES_FATAL_ERROR) + elog(FATAL, "unexpected result status %d after failed PQgetCopyData", status); + + /* + * If there was actually an error, it'll be properly reported + * by calls to PQerrorMessage -- we don't have to do anything + * else + */ + *amount = 0; + *buf = NULL; + return PG_ASYNC_READ_FAIL; + } + case -2: + *amount = 0; + *buf = NULL; + return PG_ASYNC_READ_FAIL; + default: + /* Positive values indicate the size of the returned result */ + *amount = result; + *buf = conn->recvbuf; + return PG_ASYNC_READ_SUCCESS; + } +} + +static PGAsyncWriteResult +walprop_async_write(WalProposerConn *conn, void const *buf, size_t size) +{ + int result; + + /* If we aren't in non-blocking mode, switch to it. */ + if (!ensure_nonblocking_status(conn, true)) + return PG_ASYNC_WRITE_FAIL; + + /* + * The docs for PQputcopyData list the return values as: 1 if the data was + * queued, 0 if it was not queued because of full buffers, or -1 if an + * error occurred + */ + result = PQputCopyData(conn->pg_conn, buf, size); + + /* + * We won't get a result of zero because walproposer always empties the + * connection's buffers before sending more + */ + Assert(result != 0); + + switch (result) + { + case 1: + /* good -- continue */ + break; + case -1: + return PG_ASYNC_WRITE_FAIL; + default: + elog(FATAL, "invalid return %d from PQputCopyData", result); + } + + /* + * After queueing the data, we still need to flush to get it to send. This + * might take multiple tries, but we don't want to wait around until it's + * done. 
+ * + * PQflush has the following returns (directly quoting the docs): 0 if + * sucessful, 1 if it was unable to send all the data in the send queue + * yet -1 if it failed for some reason + */ + switch (result = PQflush(conn->pg_conn)) + { + case 0: + return PG_ASYNC_WRITE_SUCCESS; + case 1: + return PG_ASYNC_WRITE_TRY_FLUSH; + case -1: + return PG_ASYNC_WRITE_FAIL; + default: + elog(FATAL, "invalid return %d from PQflush", result); + } +} + +/* + * This function is very similar to walprop_async_write. For more + * information, refer to the comments there. + */ +static bool +walprop_blocking_write(WalProposerConn *conn, void const *buf, size_t size) +{ + int result; + + /* If we are in non-blocking mode, switch out of it. */ + if (!ensure_nonblocking_status(conn, false)) + return false; + + if ((result = PQputCopyData(conn->pg_conn, buf, size)) == -1) + return false; + + Assert(result == 1); + + /* Because the connection is non-blocking, flushing returns 0 or -1 */ + + if ((result = PQflush(conn->pg_conn)) == -1) + return false; + + Assert(result == 0); + return true; +} + +/* + * Subscribe for new WAL and stream it in the loop to safekeepers. + * + * At the moment, this never returns, but an ereport(ERROR) will take us back + * to the main loop. + */ +static void +StartProposerReplication(WalProposer *wp, StartReplicationCmd *cmd) +{ + XLogRecPtr FlushPtr; + TimeLineID currTLI; + +#if PG_VERSION_NUM < 150000 + if (ThisTimeLineID == 0) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("IDENTIFY_SYSTEM has not been run before START_REPLICATION"))); +#endif + + /* + * We assume here that we're logging enough information in the WAL for + * log-shipping, since this is checked in PostmasterMain(). + * + * NOTE: wal_level can only change at shutdown, so in most cases it is + * difficult for there to be WAL data that we can still see that was + * written at wal_level='minimal'. 
+ */ + + if (cmd->slotname) + { + ReplicationSlotAcquire(cmd->slotname, true); + if (SlotIsLogical(MyReplicationSlot)) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("cannot use a logical replication slot for physical replication"))); + + /* + * We don't need to verify the slot's restart_lsn here; instead we + * rely on the caller requesting the starting point to use. If the + * WAL segment doesn't exist, we'll fail later. + */ + } + + /* + * Select the timeline. If it was given explicitly by the client, use + * that. Otherwise use the timeline of the last replayed record, which is + * kept in ThisTimeLineID. + * + * Neon doesn't currently use PG Timelines, but it may in the future, so + * we keep this code around to lighten the load for when we need it. + * + * NOTE(review): as written, no client-supplied timeline is consulted. + * currTLI comes from GetFlushRecPtr() on PG15+ and from ThisTimeLineID + * before that, and it is not read again in this function. + */ +#if PG_VERSION_NUM >= 150000 + FlushPtr = GetFlushRecPtr(&currTLI); +#else + FlushPtr = GetFlushRecPtr(); + currTLI = ThisTimeLineID; +#endif + + /* + * When we first start replication the standby will be behind the primary. + * For some applications, for example synchronous replication, it is + * important to have a clear state for this initial catchup mode, so we + * can trigger actions when we change streaming state later. We may stay + * in this state for a long time, which is exactly why we want to be able + * to monitor whether or not we are still here. + */ + WalSndSetState(WALSNDSTATE_CATCHUP); + + /* + * Don't allow a request to stream from a future point in WAL that hasn't + * been flushed to disk in this server yet.
+ */ + if (FlushPtr < cmd->startpoint) + { + ereport(ERROR, + (errmsg("requested starting point %X/%X is ahead of the WAL flush position of this server %X/%X", + LSN_FORMAT_ARGS(cmd->startpoint), + LSN_FORMAT_ARGS(FlushPtr)))); + } + + /* Start streaming from the requested point */ + sentPtr = cmd->startpoint; + + /* Initialize shared memory status, too */ + SpinLockAcquire(&MyWalSnd->mutex); + MyWalSnd->sentPtr = sentPtr; + SpinLockRelease(&MyWalSnd->mutex); + + SyncRepInitConfig(); + + /* Infinite send loop, never returns */ + WalSndLoop(wp); + + WalSndSetState(WALSNDSTATE_STARTUP); /* only reached if WalSndLoop() ever returns */ + + if (cmd->slotname) + ReplicationSlotRelease(); +} + +/* + * Main loop that waits for LSN updates and calls the walproposer. + * Synchronous replication sets latch in WalSndWakeup at walsender.c + * + * Each iteration checks for interrupts, broadcasts newly flushed WAL, moves + * the walsender state from CATCHUP to STREAMING and then runs + * WalProposerPoll(). The loop has no exit condition of its own. + */ +static void +WalSndLoop(WalProposer *wp) +{ + /* Clear any already-pending wakeups */ + ResetLatch(MyLatch); + + for (;;) + { + CHECK_FOR_INTERRUPTS(); + + XLogBroadcastWalProposer(wp); + + if (MyWalSnd->state == WALSNDSTATE_CATCHUP) + WalSndSetState(WALSNDSTATE_STREAMING); + WalProposerPoll(wp); + } +} + +/* + * Notify walproposer about the new WAL position. + * + * Broadcasts the range from the last sent position up to the current flush + * position and then advances sentPtr; does nothing when there is no new WAL. + */ +static void +XLogBroadcastWalProposer(WalProposer *wp) +{ + XLogRecPtr startptr; + XLogRecPtr endptr; + + /* Start from the last sent position */ + startptr = sentPtr; + + /* + * Streaming the current timeline on a primary. + * + * Attempt to send all data that's already been written out and fsync'd to + * disk. We cannot go further than what's been written out given the + * current implementation of WALRead(). And in any case it's unsafe to + * send WAL that is not securely down to disk on the primary: if the + * primary subsequently crashes and restarts, standbys must not have + * applied any WAL that got lost on the primary.
+ */ +#if PG_VERSION_NUM >= 150000 + endptr = GetFlushRecPtr(NULL); /* the timeline output is not needed here */ +#else + endptr = GetFlushRecPtr(); +#endif + + /* + * Record the current system time as an approximation of the time at which + * this WAL location was written for the purposes of lag tracking. + * + * In theory we could make XLogFlush() record a time in shmem whenever WAL + * is flushed and we could get that time as well as the LSN when we call + * GetFlushRecPtr() above (and likewise for the cascading standby + * equivalent), but rather than putting any new code into the hot WAL path + * it seems good enough to capture the time here. We should reach this + * after XLogFlush() runs WalSndWakeupProcessRequests(), and although that + * may take some time, we read the WAL flush pointer and take the time + * very close together here so that we'll get a later position if it is + * still moving. + * + * Because LagTrackerWrite ignores samples when the LSN hasn't advanced, + * this gives us a cheap approximation for the WAL flush time for this + * LSN. + * + * Note that the LSN is not necessarily the LSN for the data contained in + * the present message; it's the end of the WAL, which might be further + * ahead. All the lag tracking machinery cares about is finding out when + * that arbitrary LSN is eventually reported as written, flushed and + * applied, so that it can measure the elapsed time. + */ + LagTrackerWrite(endptr, GetCurrentTimestamp()); + + /* Do we have any work to do?
*/ + Assert(startptr <= endptr); + if (endptr <= startptr) + return; + + WalProposerBroadcast(wp, startptr, endptr); + sentPtr = endptr; + + /* Update shared memory status */ + { + WalSnd *walsnd = MyWalSnd; + + SpinLockAcquire(&walsnd->mutex); + walsnd->sentPtr = sentPtr; + SpinLockRelease(&walsnd->mutex); + } + + /* Report progress of XLOG streaming in PS display */ + if (update_process_title) + { + char activitymsg[50]; + + snprintf(activitymsg, sizeof(activitymsg), "streaming %X/%X", + LSN_FORMAT_ARGS(sentPtr)); + set_ps_display(activitymsg); + } +} + +/* + * Receive WAL from most advanced safekeeper + * + * Connects to sk, streams WAL starting at startpos on the given timeline and + * writes it locally with XLogWalPropWrite(). Returns true only if WAL up to + * endpos was received; connection or streaming failures return false. + */ +static bool +WalProposerRecovery(Safekeeper *sk, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos) +{ + char *err; + WalReceiverConn *wrconn; + WalRcvStreamOptions options; + char conninfo[MAXCONNINFO]; + + if (!neon_auth_token) + { + memcpy(conninfo, sk->conninfo, MAXCONNINFO); + } + else + { + int written = 0; + + written = snprintf((char *) conninfo, MAXCONNINFO, "password=%s %s", neon_auth_token, sk->conninfo); + + /* + * snprintf returns the length the complete string would have had, so + * a value equal to the buffer size already means it was truncated. + */ + if (written >= MAXCONNINFO || written < 0) + elog(FATAL, "could not append password to the safekeeper connection string"); + } + +#if PG_MAJORVERSION_NUM < 16 + wrconn = walrcv_connect(conninfo, false, "wal_proposer_recovery", &err); +#else + wrconn = walrcv_connect(conninfo, false, false, "wal_proposer_recovery", &err); +#endif + + if (!wrconn) + { + ereport(WARNING, + (errmsg("could not connect to WAL acceptor %s:%s: %s", + sk->host, sk->port, + err))); + return false; + } + elog(LOG, + "start recovery from %s:%s starting from %X/%08X till %X/%08X timeline " + "%d", + sk->host, sk->port, (uint32) (startpos >> 32), + (uint32) startpos, (uint32) (endpos >> 32), (uint32) endpos, timeline); + + options.logical = false; + options.startpoint = startpos; + options.slotname = NULL; + options.proto.physical.startpointTLI = timeline; + + if (walrcv_startstreaming(wrconn, &options)) + { + XLogRecPtr rec_start_lsn; + XLogRecPtr rec_end_lsn = 0; +
int len; + char *buf; + pgsocket wait_fd = PGINVALID_SOCKET; + + while ((len = walrcv_receive(wrconn, &buf, &wait_fd)) >= 0) + { + if (len == 0) + { + (void) WaitLatchOrSocket( + MyLatch, WL_EXIT_ON_PM_DEATH | WL_SOCKET_READABLE, wait_fd, + -1, WAIT_EVENT_WAL_RECEIVER_MAIN); + } + else + { + Assert(buf[0] == 'w' || buf[0] == 'k'); + if (buf[0] == 'k') + continue; /* keepalive */ + memcpy(&rec_start_lsn, &buf[XLOG_HDR_START_POS], + sizeof rec_start_lsn); + rec_start_lsn = pg_ntoh64(rec_start_lsn); + rec_end_lsn = rec_start_lsn + len - XLOG_HDR_SIZE; + + /* write WAL to disk */ + XLogWalPropWrite(&buf[XLOG_HDR_SIZE], len - XLOG_HDR_SIZE, rec_start_lsn); + + ereport(DEBUG1, + (errmsg("Recover message %X/%X length %d", + LSN_FORMAT_ARGS(rec_start_lsn), len))); + if (rec_end_lsn >= endpos) + break; + } + } + ereport(LOG, + (errmsg("end of replication stream at %X/%X: %m", + LSN_FORMAT_ARGS(rec_end_lsn)))); + walrcv_disconnect(wrconn); + + /* failed to receive all WAL till endpos */ + if (rec_end_lsn < endpos) + return false; + } + else + { + ereport(LOG, + (errmsg("primary server contains no more WAL on requested timeline %u LSN %X/%08X", + timeline, (uint32) (startpos >> 32), (uint32) startpos))); + + /* Streaming never started; close the connection instead of leaking it */ + walrcv_disconnect(wrconn); + return false; + } + + return true; +} + +/* + * These variables are used similarly to openLogFile/SegNo, + * but for walproposer to write the XLOG during recovery. walpropFileTLI is the TimeLineID + * corresponding to the filename of walpropFile. + */ +static int walpropFile = -1; +static TimeLineID walpropFileTLI = 0; +static XLogSegNo walpropSegNo = 0; + +/* + * Write XLOG data to disk.
+ */ +static void +XLogWalPropWrite(char *buf, Size nbytes, XLogRecPtr recptr) +{ + int startoff; + int byteswritten; + + while (nbytes > 0) /* one iteration per write; a write never crosses a segment */ + { + int segbytes; + + /* Close the current segment if it's completed */ + if (walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size)) + XLogWalPropClose(recptr); + + if (walpropFile < 0) /* no segment open: open the one containing recptr */ + { +#if PG_VERSION_NUM >= 150000 + /* FIXME Is it ok to use hardcoded value here? */ + TimeLineID tli = 1; +#else + bool use_existent = true; +#endif + /* Create/use new log file */ + XLByteToSeg(recptr, walpropSegNo, wal_segment_size); +#if PG_VERSION_NUM >= 150000 + walpropFile = XLogFileInit(walpropSegNo, tli); + walpropFileTLI = tli; +#else + walpropFile = XLogFileInit(walpropSegNo, &use_existent, false); + walpropFileTLI = ThisTimeLineID; +#endif + } + + /* Calculate the start offset of the received logs */ + startoff = XLogSegmentOffset(recptr, wal_segment_size); + + if (startoff + nbytes > wal_segment_size) + segbytes = wal_segment_size - startoff; /* clamp to the end of the segment */ + else + segbytes = nbytes; + + /* OK to write the logs */ + errno = 0; + + byteswritten = pg_pwrite(walpropFile, buf, segbytes, (off_t) startoff); + if (byteswritten <= 0) + { + char xlogfname[MAXFNAMELEN]; + int save_errno; + + /* if write didn't set errno, assume no disk space */ + if (errno == 0) + errno = ENOSPC; + + save_errno = errno; /* keep errno for %m across XLogFileName() */ + XLogFileName(xlogfname, walpropFileTLI, walpropSegNo, wal_segment_size); + errno = save_errno; + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not write to log segment %s " + "at offset %u, length %lu: %m", + xlogfname, startoff, (unsigned long) segbytes))); + } + + /* Update state for write */ + recptr += byteswritten; + + nbytes -= byteswritten; + buf += byteswritten; + } + + /* + * Close the current segment if it's fully written up in the last cycle of + * the loop.
+ */ + if (walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size)) + { + XLogWalPropClose(recptr); + } +} + +/* + * Close the current segment. + * + * PANICs if close() fails. Afterwards walpropFile is reset to -1, which is + * what XLogWalPropWrite() checks before opening a segment. + */ +static void +XLogWalPropClose(XLogRecPtr recptr) +{ + Assert(walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size)); + + if (close(walpropFile) != 0) + { + char xlogfname[MAXFNAMELEN]; + + XLogFileName(xlogfname, walpropFileTLI, walpropSegNo, wal_segment_size); + + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not close log segment %s: %m", + xlogfname))); + } + + walpropFile = -1; +} +/* + * Read count bytes of WAL starting at startptr into buf with WALRead(), on + * the timeline from walprop_pg_get_timeline_id(). A failed read is raised + * through WALReadRaiseError(). + */ +static void +walprop_pg_wal_read(XLogReaderState *state, char *buf, XLogRecPtr startptr, Size count) +{ + WALReadError errinfo; + + if (!WALRead(state, + buf, + startptr, + count, + walprop_pg_get_timeline_id(), + &errinfo)) + { + WALReadRaiseError(&errinfo); + } +} +/* + * Allocate an XLogReaderState whose segments are opened and closed with + * wal_segment_open and wal_segment_close. + */ +static XLogReaderState * +walprop_pg_wal_reader_allocate(void) +{ + return XLogReaderAllocate(wal_segment_size, NULL, XL_ROUTINE(.segment_open = wal_segment_open,.segment_close = wal_segment_close), NULL); +} +/* Wait event set used by the event-set callbacks below; NULL while not created */ +static WaitEventSet *waitEvents; +/* Free the wait event set, if one exists, and reset the pointer. */ +static void +walprop_pg_free_event_set(void) +{ + if (waitEvents) + { + FreeWaitEventSet(waitEvents); + waitEvents = NULL; + } +} +/* + * Create the wait event set with room for two fixed events (latch and + * postmaster death) plus one per safekeeper. Calling this while a set + * already exists is FATAL. + */ +static void +walprop_pg_init_event_set(int n_safekeepers) +{ + if (waitEvents) + elog(FATAL, "double-initialization of event set"); + + waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + n_safekeepers); + AddWaitEventToSet(waitEvents, WL_LATCH_SET, PGINVALID_SOCKET, + MyLatch, NULL); + AddWaitEventToSet(waitEvents, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, + NULL, NULL); +} +/* Change the events waited for on an already registered safekeeper. */ +static void +walprop_pg_update_event_set(Safekeeper *sk, uint32 events) +{ + /* eventPos = -1 when we don't have an event */ + Assert(sk->eventPos != -1); + + ModifyWaitEvent(waitEvents, sk->eventPos, events, NULL); +} +/* + * Register the safekeeper's connection socket in the wait event set and + * remember the returned position in sk->eventPos. + */ +static void +walprop_pg_add_safekeeper_event_set(Safekeeper *sk, uint32 events) +{ + sk->eventPos = AddWaitEventToSet(waitEvents, events,
walprop_socket(sk->conn), NULL, sk); +} +/* + * Wait for at most one event on the wait event set, or until timeout. + * + * Returns 1 with *events = WL_LATCH_SET (and *sk left NULL) when the latch + * was set or, on PG16+, ConditionVariableCancelSleep() returned true. + * Returns 1 with *sk and *events taken from the wait event for a socket + * event. Otherwise returns the result of WaitEventSetWait() with *events + * copied from the event. + */ +static int +walprop_pg_wait_event_set(long timeout, Safekeeper **sk, uint32 *events) +{ + WaitEvent event = {0}; + int rc = 0; + bool late_cv_trigger = false; + + *sk = NULL; + *events = 0; + +#if PG_MAJORVERSION_NUM >= 16 + if (WalSndCtl != NULL) + ConditionVariablePrepareToSleep(&WalSndCtl->wal_flush_cv); +#endif + + /* + * Wait for a wait event to happen, or timeout: + * - Safekeeper socket can become available for READ or WRITE + * - Our latch got set, because + * PG15-: We got woken up by a process triggering the WalSender + * PG16+: WalSndCtl->wal_flush_cv was triggered + */ + rc = WaitEventSetWait(waitEvents, timeout, + &event, 1, WAIT_EVENT_WAL_SENDER_MAIN); +#if PG_MAJORVERSION_NUM >= 16 + if (WalSndCtl != NULL) + late_cv_trigger = ConditionVariableCancelSleep(); +#endif + + /* + * If wait is terminated by latch set (walsenders' latch is set on each + * wal flush). (no need for pm death check due to WL_EXIT_ON_PM_DEATH) + */ + if ((rc == 1 && event.events & WL_LATCH_SET) || late_cv_trigger) + { + /* Reset our latch */ + ResetLatch(MyLatch); + *events = WL_LATCH_SET; + return 1; + } + + /* + * If the event contains something about the socket, it means we got an + * event from a safekeeper socket. + */ + if (rc == 1 && (event.events & (WL_SOCKET_MASK))) + { + *sk = (Safekeeper *) event.user_data; /* user_data was set to the Safekeeper when the socket was added */ + *events = event.events; + return 1; + } + + /* XXX: Can we have non-timeout event here?
*/ + *events = event.events; + return rc; +} +/* + * Print the given LSN to stdout and terminate the process with exit code 0. + */ +static void +walprop_pg_finish_sync_safekeepers(XLogRecPtr lsn) +{ + fprintf(stdout, "%X/%X\n", LSN_FORMAT_ARGS(lsn)); + exit(0); +} + +/* + * Get PageserverFeedback fields from the most advanced safekeeper + * + * The most advanced safekeeper is the one with the largest + * last_received_lsn; if none is above InvalidXLogRecPtr, safekeeper 0 is + * used. The chosen values are copied into *rf and then passed to + * replication_feedback_set(). + */ +static void +GetLatestNeonFeedback(PageserverFeedback *rf, WalProposer *wp) +{ + int latest_safekeeper = 0; + XLogRecPtr last_received_lsn = InvalidXLogRecPtr; + + for (int i = 0; i < wp->n_safekeepers; i++) + { + if (wp->safekeeper[i].appendResponse.rf.last_received_lsn > last_received_lsn) + { + latest_safekeeper = i; + last_received_lsn = wp->safekeeper[i].appendResponse.rf.last_received_lsn; + } + } + + rf->currentClusterSize = wp->safekeeper[latest_safekeeper].appendResponse.rf.currentClusterSize; + rf->last_received_lsn = wp->safekeeper[latest_safekeeper].appendResponse.rf.last_received_lsn; + rf->disk_consistent_lsn = wp->safekeeper[latest_safekeeper].appendResponse.rf.disk_consistent_lsn; + rf->remote_consistent_lsn = wp->safekeeper[latest_safekeeper].appendResponse.rf.remote_consistent_lsn; + rf->replytime = wp->safekeeper[latest_safekeeper].appendResponse.rf.replytime; + + elog(DEBUG2, "GetLatestNeonFeedback: currentClusterSize %lu," + " last_received_lsn %X/%X, disk_consistent_lsn %X/%X, remote_consistent_lsn %X/%X, replytime %lu", + rf->currentClusterSize, + LSN_FORMAT_ARGS(rf->last_received_lsn), + LSN_FORMAT_ARGS(rf->disk_consistent_lsn), + LSN_FORMAT_ARGS(rf->remote_consistent_lsn), + rf->replytime); + + replication_feedback_set(rf); +} + +/* + * Combine hot standby feedbacks from all safekeepers.
+ */ +static void +CombineHotStanbyFeedbacks(HotStandbyFeedback *hs, WalProposer *wp) +{ + /* + * Start both minimums at the largest value so that any normal xid + * precedes them; ts stays 0 if no safekeeper contributed anything. + */ + hs->ts = 0; + hs->xmin.value = ~0; /* largest unsigned value */ + hs->catalog_xmin.value = ~0; /* largest unsigned value */ + + for (int i = 0; i < wp->n_safekeepers; i++) + { + if (wp->safekeeper[i].appendResponse.hs.ts != 0) + { + HotStandbyFeedback *skhs = &wp->safekeeper[i].appendResponse.hs; + + if (FullTransactionIdIsNormal(skhs->xmin) + && FullTransactionIdPrecedes(skhs->xmin, hs->xmin)) + { + hs->xmin = skhs->xmin; + hs->ts = skhs->ts; + } + + /* + * Compare against the running catalog_xmin minimum (not xmin), so + * that the oldest catalog_xmin of all safekeepers is kept. + */ + if (FullTransactionIdIsNormal(skhs->catalog_xmin) + && FullTransactionIdPrecedes(skhs->catalog_xmin, hs->catalog_xmin)) + { + hs->catalog_xmin = skhs->catalog_xmin; + hs->ts = skhs->ts; + } + } + } + + /* Nothing was reported for a field: turn the sentinel into "invalid" */ + if (hs->xmin.value == ~0) + hs->xmin = InvalidFullTransactionId; + if (hs->catalog_xmin.value == ~0) + hs->catalog_xmin = InvalidFullTransactionId; +} + +/* + * Process feedback gathered from the safekeepers: refresh the pageserver + * feedback, report progress through ProcessStandbyReply() when the commit + * LSN or disk_consistent_lsn moved, and forward the combined hot standby + * feedback when it changed. + */ +static void +walprop_pg_process_safekeeper_feedback(WalProposer *wp, XLogRecPtr commitLsn) +{ + HotStandbyFeedback hsFeedback; + XLogRecPtr diskConsistentLsn; + + diskConsistentLsn = quorumFeedback.rf.disk_consistent_lsn; + + if (!wp->config->syncSafekeepers) + { + /* Get PageserverFeedback fields from the most advanced safekeeper */ + GetLatestNeonFeedback(&quorumFeedback.rf, wp); + SetZenithCurrentClusterSize(quorumFeedback.rf.currentClusterSize); + } + + if (commitLsn > quorumFeedback.flushLsn || diskConsistentLsn != quorumFeedback.rf.disk_consistent_lsn) + { + + if (commitLsn > quorumFeedback.flushLsn) + quorumFeedback.flushLsn = commitLsn; + + /* advance the replication slot */ + if (!wp->config->syncSafekeepers) + ProcessStandbyReply( + /* write_lsn - This is what durably stored in WAL service. */ + quorumFeedback.flushLsn, + /* flush_lsn - This is what durably stored in WAL service. */ + quorumFeedback.flushLsn, + + /* + * apply_lsn - This is what processed and durably saved at + * pageserver.
+ */ + quorumFeedback.rf.disk_consistent_lsn, + walprop_pg_get_current_timestamp(), false); + } + + CombineHotStanbyFeedbacks(&hsFeedback, wp); + if (hsFeedback.ts != 0 && memcmp(&hsFeedback, &quorumFeedback.hs, sizeof hsFeedback) != 0) /* only when the combined feedback changed */ + { + quorumFeedback.hs = hsFeedback; + if (!wp->config->syncSafekeepers) + ProcessStandbyHSFeedback(hsFeedback.ts, + XidFromFullTransactionId(hsFeedback.xmin), + EpochFromFullTransactionId(hsFeedback.xmin), + XidFromFullTransactionId(hsFeedback.catalog_xmin), + EpochFromFullTransactionId(hsFeedback.catalog_xmin)); + } +} +/* + * Confirm the given LSN on the physical replication slot, if this process + * has one. + */ +static void +walprop_pg_confirm_wal_streamed(XLogRecPtr lsn) +{ + if (MyReplicationSlot) + PhysicalConfirmReceivedLocation(lsn); +} +/* + * Callback table that binds the walproposer API to the implementations in + * this file. + */ +static const walproposer_api walprop_pg = { + .get_shmem_state = walprop_pg_get_shmem_state, + .start_streaming = walprop_pg_start_streaming, + .get_flush_rec_ptr = walprop_pg_get_flush_rec_ptr, + .get_current_timestamp = walprop_pg_get_current_timestamp, + .get_timeline_id = walprop_pg_get_timeline_id, + .conn_error_message = walprop_error_message, + .conn_status = walprop_status, + .conn_connect_start = walprop_connect_start, + .conn_connect_poll = walprop_connect_poll, + .conn_send_query = walprop_send_query, + .conn_get_query_result = walprop_get_query_result, + .conn_flush = walprop_flush, + .conn_finish = walprop_finish, + .conn_async_read = walprop_async_read, + .conn_async_write = walprop_async_write, + .conn_blocking_write = walprop_blocking_write, + .recovery_download = WalProposerRecovery, + .wal_read = walprop_pg_wal_read, + .wal_reader_allocate = walprop_pg_wal_reader_allocate, + .free_event_set = walprop_pg_free_event_set, + .init_event_set = walprop_pg_init_event_set, + .update_event_set = walprop_pg_update_event_set, + .add_safekeeper_event_set = walprop_pg_add_safekeeper_event_set, + .wait_event_set = walprop_pg_wait_event_set, + .strong_random = pg_strong_random, + .get_redo_start_lsn = GetRedoStartLsn, + .finish_sync_safekeepers =
walprop_pg_finish_sync_safekeepers, + .process_safekeeper_feedback = walprop_pg_process_safekeeper_feedback, + .confirm_wal_streamed = walprop_pg_confirm_wal_streamed, +}; diff --git a/pgxn/neon/walproposer_utils.c b/pgxn/neon/walproposer_utils.c deleted file mode 100644 index 05030360f6..0000000000 --- a/pgxn/neon/walproposer_utils.c +++ /dev/null @@ -1,659 +0,0 @@ -#include "postgres.h" - -#include "access/timeline.h" -#include "access/xlogutils.h" -#include "common/logging.h" -#include "common/ip.h" -#include "funcapi.h" -#include "libpq/libpq.h" -#include "libpq/pqformat.h" -#include "miscadmin.h" -#include "postmaster/interrupt.h" -#include "replication/slot.h" -#include "walproposer_utils.h" -#include "replication/walsender_private.h" - -#include "storage/ipc.h" -#include "utils/builtins.h" -#include "utils/ps_status.h" - -#include "libpq-fe.h" -#include -#include - -#if PG_VERSION_NUM >= 150000 -#include "access/xlogutils.h" -#include "access/xlogrecovery.h" -#endif -#if PG_MAJORVERSION_NUM >= 16 -#include "utils/guc.h" -#endif - -/* - * These variables are used similarly to openLogFile/SegNo, - * but for walproposer to write the XLOG during recovery. walpropFileTLI is the TimeLineID - * corresponding the filename of walpropFile. - */ -static int walpropFile = -1; -static TimeLineID walpropFileTLI = 0; -static XLogSegNo walpropSegNo = 0; - -/* START cloned file-local variables and functions from walsender.c */ - -/* - * How far have we sent WAL already? This is also advertised in - * MyWalSnd->sentPtr. (Actually, this is the next WAL location to send.) 
- */ -static XLogRecPtr sentPtr = InvalidXLogRecPtr; - -static void WalSndLoop(void); -static void XLogBroadcastWalProposer(void); -/* END cloned file-level variables and functions from walsender.c */ - -int -CompareLsn(const void *a, const void *b) -{ - XLogRecPtr lsn1 = *((const XLogRecPtr *) a); - XLogRecPtr lsn2 = *((const XLogRecPtr *) b); - - if (lsn1 < lsn2) - return -1; - else if (lsn1 == lsn2) - return 0; - else - return 1; -} - -/* Returns a human-readable string corresonding to the SafekeeperState - * - * The string should not be freed. - * - * The strings are intended to be used as a prefix to "state", e.g.: - * - * elog(LOG, "currently in %s state", FormatSafekeeperState(sk->state)); - * - * If this sort of phrasing doesn't fit the message, instead use something like: - * - * elog(LOG, "currently in state [%s]", FormatSafekeeperState(sk->state)); - */ -char * -FormatSafekeeperState(SafekeeperState state) -{ - char *return_val = NULL; - - switch (state) - { - case SS_OFFLINE: - return_val = "offline"; - break; - case SS_CONNECTING_READ: - case SS_CONNECTING_WRITE: - return_val = "connecting"; - break; - case SS_WAIT_EXEC_RESULT: - return_val = "receiving query result"; - break; - case SS_HANDSHAKE_RECV: - return_val = "handshake (receiving)"; - break; - case SS_VOTING: - return_val = "voting"; - break; - case SS_WAIT_VERDICT: - return_val = "wait-for-verdict"; - break; - case SS_SEND_ELECTED_FLUSH: - return_val = "send-announcement-flush"; - break; - case SS_IDLE: - return_val = "idle"; - break; - case SS_ACTIVE: - return_val = "active"; - break; - } - - Assert(return_val != NULL); - - return return_val; -} - -/* Asserts that the provided events are expected for given safekeeper's state */ -void -AssertEventsOkForState(uint32 events, Safekeeper *sk) -{ - uint32 expected = SafekeeperStateDesiredEvents(sk->state); - - /* - * The events are in-line with what we're expecting, under two conditions: - * (a) if we aren't expecting anything, `events` has no 
read- or - * write-ready component. (b) if we are expecting something, there's - * overlap (i.e. `events & expected != 0`) - */ - bool events_ok_for_state; /* long name so the `Assert` is more - * clear later */ - - if (expected == WL_NO_EVENTS) - events_ok_for_state = ((events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)) == 0); - else - events_ok_for_state = ((events & expected) != 0); - - if (!events_ok_for_state) - { - /* - * To give a descriptive message in the case of failure, we use elog - * and then an assertion that's guaranteed to fail. - */ - elog(WARNING, "events %s mismatched for safekeeper %s:%s in state [%s]", - FormatEvents(events), sk->host, sk->port, FormatSafekeeperState(sk->state)); - Assert(events_ok_for_state); - } -} - -/* Returns the set of events a safekeeper in this state should be waiting on - * - * This will return WL_NO_EVENTS (= 0) for some events. */ -uint32 -SafekeeperStateDesiredEvents(SafekeeperState state) -{ - uint32 result = WL_NO_EVENTS; - - /* If the state doesn't have a modifier, we can check the base state */ - switch (state) - { - /* Connecting states say what they want in the name */ - case SS_CONNECTING_READ: - result = WL_SOCKET_READABLE; - break; - case SS_CONNECTING_WRITE: - result = WL_SOCKET_WRITEABLE; - break; - - /* Reading states need the socket to be read-ready to continue */ - case SS_WAIT_EXEC_RESULT: - case SS_HANDSHAKE_RECV: - case SS_WAIT_VERDICT: - result = WL_SOCKET_READABLE; - break; - - /* - * Idle states use read-readiness as a sign that the connection - * has been disconnected. - */ - case SS_VOTING: - case SS_IDLE: - result = WL_SOCKET_READABLE; - break; - - /* - * Flush states require write-ready for flushing. Active state - * does both reading and writing. - * - * TODO: SS_ACTIVE sometimes doesn't need to be write-ready. We - * should check sk->flushWrite here to set WL_SOCKET_WRITEABLE. 
- */ - case SS_SEND_ELECTED_FLUSH: - case SS_ACTIVE: - result = WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE; - break; - - /* The offline state expects no events. */ - case SS_OFFLINE: - result = WL_NO_EVENTS; - break; - - default: - Assert(false); - break; - } - - return result; -} - -/* Returns a human-readable string corresponding to the event set - * - * If the events do not correspond to something set as the `events` field of a `WaitEvent`, the - * returned string may be meaingless. - * - * The string should not be freed. It should also not be expected to remain the same between - * function calls. */ -char * -FormatEvents(uint32 events) -{ - static char return_str[8]; - - /* Helper variable to check if there's extra bits */ - uint32 all_flags = WL_LATCH_SET - | WL_SOCKET_READABLE - | WL_SOCKET_WRITEABLE - | WL_TIMEOUT - | WL_POSTMASTER_DEATH - | WL_EXIT_ON_PM_DEATH - | WL_SOCKET_CONNECTED; - - /* - * The formatting here isn't supposed to be *particularly* useful -- it's - * just to give an sense of what events have been triggered without - * needing to remember your powers of two. - */ - - return_str[0] = (events & WL_LATCH_SET) ? 'L' : '_'; - return_str[1] = (events & WL_SOCKET_READABLE) ? 'R' : '_'; - return_str[2] = (events & WL_SOCKET_WRITEABLE) ? 'W' : '_'; - return_str[3] = (events & WL_TIMEOUT) ? 'T' : '_'; - return_str[4] = (events & WL_POSTMASTER_DEATH) ? 'D' : '_'; - return_str[5] = (events & WL_EXIT_ON_PM_DEATH) ? 'E' : '_'; - return_str[5] = (events & WL_SOCKET_CONNECTED) ? 'C' : '_'; - - if (events & (~all_flags)) - { - elog(WARNING, "Event formatting found unexpected component %d", - events & (~all_flags)); - return_str[6] = '*'; - return_str[7] = '\0'; - } - else - return_str[6] = '\0'; - - return (char *) &return_str; -} - -/* - * Convert a character which represents a hexadecimal digit to an integer. - * - * Returns -1 if the character is not a hexadecimal digit. 
- */ -static int -HexDecodeChar(char c) -{ - if (c >= '0' && c <= '9') - return c - '0'; - if (c >= 'a' && c <= 'f') - return c - 'a' + 10; - if (c >= 'A' && c <= 'F') - return c - 'A' + 10; - - return -1; -} - -/* - * Decode a hex string into a byte string, 2 hex chars per byte. - * - * Returns false if invalid characters are encountered; otherwise true. - */ -bool -HexDecodeString(uint8 *result, char *input, int nbytes) -{ - int i; - - for (i = 0; i < nbytes; ++i) - { - int n1 = HexDecodeChar(input[i * 2]); - int n2 = HexDecodeChar(input[i * 2 + 1]); - - if (n1 < 0 || n2 < 0) - return false; - result[i] = n1 * 16 + n2; - } - - return true; -} - -/* -------------------------------- - * pq_getmsgint32_le - get a binary 4-byte int from a message buffer in native (LE) order - * -------------------------------- - */ -uint32 -pq_getmsgint32_le(StringInfo msg) -{ - uint32 n32; - - pq_copymsgbytes(msg, (char *) &n32, sizeof(n32)); - - return n32; -} - -/* -------------------------------- - * pq_getmsgint64 - get a binary 8-byte int from a message buffer in native (LE) order - * -------------------------------- - */ -uint64 -pq_getmsgint64_le(StringInfo msg) -{ - uint64 n64; - - pq_copymsgbytes(msg, (char *) &n64, sizeof(n64)); - - return n64; -} - -/* append a binary [u]int32 to a StringInfo buffer in native (LE) order */ -void -pq_sendint32_le(StringInfo buf, uint32 i) -{ - enlargeStringInfo(buf, sizeof(uint32)); - memcpy(buf->data + buf->len, &i, sizeof(uint32)); - buf->len += sizeof(uint32); -} - -/* append a binary [u]int64 to a StringInfo buffer in native (LE) order */ -void -pq_sendint64_le(StringInfo buf, uint64 i) -{ - enlargeStringInfo(buf, sizeof(uint64)); - memcpy(buf->data + buf->len, &i, sizeof(uint64)); - buf->len += sizeof(uint64); -} - -/* - * Write XLOG data to disk. 
- */ -void -XLogWalPropWrite(char *buf, Size nbytes, XLogRecPtr recptr) -{ - int startoff; - int byteswritten; - - while (nbytes > 0) - { - int segbytes; - - /* Close the current segment if it's completed */ - if (walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size)) - XLogWalPropClose(recptr); - - if (walpropFile < 0) - { -#if PG_VERSION_NUM >= 150000 - /* FIXME Is it ok to use hardcoded value here? */ - TimeLineID tli = 1; -#else - bool use_existent = true; -#endif - /* Create/use new log file */ - XLByteToSeg(recptr, walpropSegNo, wal_segment_size); -#if PG_VERSION_NUM >= 150000 - walpropFile = XLogFileInit(walpropSegNo, tli); - walpropFileTLI = tli; -#else - walpropFile = XLogFileInit(walpropSegNo, &use_existent, false); - walpropFileTLI = ThisTimeLineID; -#endif - } - - /* Calculate the start offset of the received logs */ - startoff = XLogSegmentOffset(recptr, wal_segment_size); - - if (startoff + nbytes > wal_segment_size) - segbytes = wal_segment_size - startoff; - else - segbytes = nbytes; - - /* OK to write the logs */ - errno = 0; - - byteswritten = pg_pwrite(walpropFile, buf, segbytes, (off_t) startoff); - if (byteswritten <= 0) - { - char xlogfname[MAXFNAMELEN]; - int save_errno; - - /* if write didn't set errno, assume no disk space */ - if (errno == 0) - errno = ENOSPC; - - save_errno = errno; - XLogFileName(xlogfname, walpropFileTLI, walpropSegNo, wal_segment_size); - errno = save_errno; - ereport(PANIC, - (errcode_for_file_access(), - errmsg("could not write to log segment %s " - "at offset %u, length %lu: %m", - xlogfname, startoff, (unsigned long) segbytes))); - } - - /* Update state for write */ - recptr += byteswritten; - - nbytes -= byteswritten; - buf += byteswritten; - } - - /* - * Close the current segment if it's fully written up in the last cycle of - * the loop. 
- */ - if (walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size)) - { - XLogWalPropClose(recptr); - } -} - -/* - * Close the current segment. - */ -void -XLogWalPropClose(XLogRecPtr recptr) -{ - Assert(walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size)); - - if (close(walpropFile) != 0) - { - char xlogfname[MAXFNAMELEN]; - - XLogFileName(xlogfname, walpropFileTLI, walpropSegNo, wal_segment_size); - - ereport(PANIC, - (errcode_for_file_access(), - errmsg("could not close log segment %s: %m", - xlogfname))); - } - - walpropFile = -1; -} - -/* START of cloned functions from walsender.c */ - -/* - * Subscribe for new WAL and stream it in the loop to safekeepers. - * - * At the moment, this never returns, but an ereport(ERROR) will take us back - * to the main loop. - */ -void -StartProposerReplication(StartReplicationCmd *cmd) -{ - XLogRecPtr FlushPtr; - TimeLineID currTLI; - -#if PG_VERSION_NUM < 150000 - if (ThisTimeLineID == 0) - ereport(ERROR, - (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("IDENTIFY_SYSTEM has not been run before START_REPLICATION"))); -#endif - - /* - * We assume here that we're logging enough information in the WAL for - * log-shipping, since this is checked in PostmasterMain(). - * - * NOTE: wal_level can only change at shutdown, so in most cases it is - * difficult for there to be WAL data that we can still see that was - * written at wal_level='minimal'. - */ - - if (cmd->slotname) - { - ReplicationSlotAcquire(cmd->slotname, true); - if (SlotIsLogical(MyReplicationSlot)) - ereport(ERROR, - (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("cannot use a logical replication slot for physical replication"))); - - /* - * We don't need to verify the slot's restart_lsn here; instead we - * rely on the caller requesting the starting point to use. If the - * WAL segment doesn't exist, we'll fail later. - */ - } - - /* - * Select the timeline. 
If it was given explicitly by the client, use - * that. Otherwise use the timeline of the last replayed record, which is - * kept in ThisTimeLineID. - * - * Neon doesn't currently use PG Timelines, but it may in the future, so - * we keep this code around to lighten the load for when we need it. - */ -#if PG_VERSION_NUM >= 150000 - FlushPtr = GetFlushRecPtr(&currTLI); -#else - FlushPtr = GetFlushRecPtr(); - currTLI = ThisTimeLineID; -#endif - - /* - * When we first start replication the standby will be behind the - * primary. For some applications, for example synchronous - * replication, it is important to have a clear state for this initial - * catchup mode, so we can trigger actions when we change streaming - * state later. We may stay in this state for a long time, which is - * exactly why we want to be able to monitor whether or not we are - * still here. - */ - WalSndSetState(WALSNDSTATE_CATCHUP); - - /* - * Don't allow a request to stream from a future point in WAL that - * hasn't been flushed to disk in this server yet. - */ - if (FlushPtr < cmd->startpoint) - { - ereport(ERROR, - (errmsg("requested starting point %X/%X is ahead of the WAL flush position of this server %X/%X", - LSN_FORMAT_ARGS(cmd->startpoint), - LSN_FORMAT_ARGS(FlushPtr)))); - } - - /* Start streaming from the requested point */ - sentPtr = cmd->startpoint; - - /* Initialize shared memory status, too */ - SpinLockAcquire(&MyWalSnd->mutex); - MyWalSnd->sentPtr = sentPtr; - SpinLockRelease(&MyWalSnd->mutex); - - SyncRepInitConfig(); - - /* Infinite send loop, never returns */ - WalSndLoop(); - - WalSndSetState(WALSNDSTATE_STARTUP); - - if (cmd->slotname) - ReplicationSlotRelease(); -} - -/* - * Main loop that waits for LSN updates and calls the walproposer. 
- * Synchronous replication sets latch in WalSndWakeup at walsender.c - */ -static void -WalSndLoop(void) -{ - /* Clear any already-pending wakeups */ - ResetLatch(MyLatch); - - for (;;) - { - CHECK_FOR_INTERRUPTS(); - - XLogBroadcastWalProposer(); - - if (MyWalSnd->state == WALSNDSTATE_CATCHUP) - WalSndSetState(WALSNDSTATE_STREAMING); - WalProposerPoll(); - } -} - -/* - * Notify walproposer about the new WAL position. - */ -static void -XLogBroadcastWalProposer(void) -{ - XLogRecPtr startptr; - XLogRecPtr endptr; - - /* Start from the last sent position */ - startptr = sentPtr; - - /* - * Streaming the current timeline on a primary. - * - * Attempt to send all data that's already been written out and - * fsync'd to disk. We cannot go further than what's been written out - * given the current implementation of WALRead(). And in any case - * it's unsafe to send WAL that is not securely down to disk on the - * primary: if the primary subsequently crashes and restarts, standbys - * must not have applied any WAL that got lost on the primary. - */ -#if PG_VERSION_NUM >= 150000 - endptr = GetFlushRecPtr(NULL); -#else - endptr = GetFlushRecPtr(); -#endif - - /* - * Record the current system time as an approximation of the time at which - * this WAL location was written for the purposes of lag tracking. - * - * In theory we could make XLogFlush() record a time in shmem whenever WAL - * is flushed and we could get that time as well as the LSN when we call - * GetFlushRecPtr() above (and likewise for the cascading standby - * equivalent), but rather than putting any new code into the hot WAL path - * it seems good enough to capture the time here. We should reach this - * after XLogFlush() runs WalSndWakeupProcessRequests(), and although that - * may take some time, we read the WAL flush pointer and take the time - * very close to together here so that we'll get a later position if it is - * still moving. 
- * - * Because LagTrackerWrite ignores samples when the LSN hasn't advanced, - * this gives us a cheap approximation for the WAL flush time for this - * LSN. - * - * Note that the LSN is not necessarily the LSN for the data contained in - * the present message; it's the end of the WAL, which might be further - * ahead. All the lag tracking machinery cares about is finding out when - * that arbitrary LSN is eventually reported as written, flushed and - * applied, so that it can measure the elapsed time. - */ - LagTrackerWrite(endptr, GetCurrentTimestamp()); - - /* Do we have any work to do? */ - Assert(startptr <= endptr); - if (endptr <= startptr) - return; - - WalProposerBroadcast(startptr, endptr); - sentPtr = endptr; - - /* Update shared memory status */ - { - WalSnd *walsnd = MyWalSnd; - - SpinLockAcquire(&walsnd->mutex); - walsnd->sentPtr = sentPtr; - SpinLockRelease(&walsnd->mutex); - } - - /* Report progress of XLOG streaming in PS display */ - if (update_process_title) - { - char activitymsg[50]; - - snprintf(activitymsg, sizeof(activitymsg), "streaming %X/%X", - LSN_FORMAT_ARGS(sentPtr)); - set_ps_display(activitymsg); - } -} diff --git a/pgxn/neon/walproposer_utils.h b/pgxn/neon/walproposer_utils.h deleted file mode 100644 index aa5df5fa43..0000000000 --- a/pgxn/neon/walproposer_utils.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef __NEON_WALPROPOSER_UTILS_H__ -#define __NEON_WALPROPOSER_UTILS_H__ - -#include "walproposer.h" - -int CompareLsn(const void *a, const void *b); -char *FormatSafekeeperState(SafekeeperState state); -void AssertEventsOkForState(uint32 events, Safekeeper *sk); -uint32 SafekeeperStateDesiredEvents(SafekeeperState state); -char *FormatEvents(uint32 events); -bool HexDecodeString(uint8 *result, char *input, int nbytes); -uint32 pq_getmsgint32_le(StringInfo msg); -uint64 pq_getmsgint64_le(StringInfo msg); -void pq_sendint32_le(StringInfo buf, uint32 i); -void pq_sendint64_le(StringInfo buf, uint64 i); -void XLogWalPropWrite(char *buf, Size 
nbytes, XLogRecPtr recptr); -void XLogWalPropClose(XLogRecPtr recptr); - -#endif /* __NEON_WALPROPOSER_UTILS_H__ */