diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 71b9e8d803..6e570b22d4 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -136,6 +136,10 @@ jobs: run: mold -run make postgres -j$(nproc) shell: bash -euxo pipefail {0} + - name: Build neon extensions + run: mold -run make neon-pg-ext -j$(nproc) + shell: bash -euxo pipefail {0} + - name: Run cargo build run: | ${cov_prefix} mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests diff --git a/.github/workflows/codestyle.yml b/.github/workflows/codestyle.yml index 029beba351..eddfee88fc 100644 --- a/.github/workflows/codestyle.yml +++ b/.github/workflows/codestyle.yml @@ -81,6 +81,9 @@ jobs: if: steps.cache_pg.outputs.cache-hit != 'true' run: make postgres + - name: Build neon extensions + run: make neon-pg-ext + # Plain configure output can contain weird errors like 'error: C compiler cannot create executables' # and the real cause will be inside config.log - name: Print configure logs in case of failure diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 117a4155cd..4527fb9ece 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -13,7 +13,8 @@ RUN cd postgres && \ make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \ make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \ # Install headers - make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install + make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \ + make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install # Build PostGIS from the upstream PostGIS mirror. PostGIS compiles against neon postgres sources without changes. 
# Perhaps we could even use the upstream binaries, compiled against vanilla Postgres, but it would require some
@@ -55,6 +56,16 @@ RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.3.tar.gz && \
     make install && \
     rm -rf /plv8-*
 
+# Compile neon extensions
+FROM build-deps AS neon-pg-ext-build
+COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY pgxn/ pgxn/
+
+RUN make -j $(getconf _NPROCESSORS_ONLN) \
+    PG_CONFIG=/usr/local/pgsql/bin/pg_config \
+    -C pgxn/neon \
+    -s install
+
 # Compile and run the Neon-specific `compute_ctl` binary
 FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:$TAG AS compute-tools
 USER nonroot
@@ -73,8 +84,8 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
     echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig
 
 # TODO: Check if we can make the extension setup more modular versus a linear build
-# currently plv8-build copies the output /usr/local/pgsql from postgis-build#
-COPY --from=plv8-build --chown=postgres /usr/local/pgsql /usr/local
+# currently plv8-build copies the output /usr/local/pgsql from postgis-build, etc.
+COPY --from=neon-pg-ext-build --chown=postgres /usr/local/pgsql /usr/local
 COPY --from=compute-tools --chown=postgres /home/nonroot/compute_tools/target/release/compute_ctl /usr/local/bin/compute_ctl
 
 RUN apt update && \
diff --git a/Makefile b/Makefile
index fc75e9fc5e..9d7e1497e5 100644
--- a/Makefile
+++ b/Makefile
@@ -51,7 +51,7 @@ CARGO_CMD_PREFIX += CARGO_TERM_PROGRESS_WHEN=never CI=1
 # Top level Makefile to build Zenith and PostgreSQL
 #
 .PHONY: all
-all: zenith postgres
+all: zenith postgres neon-pg-ext
 
 ### Zenith Rust bits
 #
@@ -87,25 +87,40 @@ postgres: postgres-configure \
 	postgres-headers # to prevent `make install` conflicts with zenith's `postgres-headers`
 	+@echo "Compiling PostgreSQL"
 	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build MAKELEVEL=0 install
-	+@echo "Compiling contrib/neon"
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/neon install
-	+@echo "Compiling contrib/neon_test_utils"
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/neon_test_utils install
+	+@echo "Compiling libpq"
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/src/interfaces/libpq install
 	+@echo "Compiling pg_buffercache"
 	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/pg_buffercache install
 	+@echo "Compiling pageinspect"
 	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/pageinspect install
-
 .PHONY: postgres-clean
 postgres-clean:
 	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build MAKELEVEL=0 clean
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/pg_buffercache clean
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/pageinspect clean
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/src/interfaces/libpq clean
+
+neon-pg-ext: postgres
+	+@echo "Compiling neon"
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/bin/pg_config \
+		-C $(ROOT_PROJECT_DIR)/pgxn/neon install
+	+@echo "Compiling neon_test_utils"
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/bin/pg_config \
+		-C $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils install
+
+.PHONY: neon-pg-ext-clean
+neon-pg-ext-clean:
+	$(MAKE) -C $(ROOT_PROJECT_DIR)/pgxn/neon clean
+	$(MAKE) -C $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils clean
 
 # This doesn't remove the effects of 'configure'.
.PHONY: clean clean: cd $(POSTGRES_INSTALL_DIR)/build && $(MAKE) clean $(CARGO_CMD_PREFIX) cargo clean + cd pgxn/neon && $(MAKE) clean + cd pgxn/neon_test_utils && $(MAKE) clean # This removes everything .PHONY: distclean diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile new file mode 100644 index 0000000000..a6ce611974 --- /dev/null +++ b/pgxn/neon/Makefile @@ -0,0 +1,26 @@ +# pgxs/neon/Makefile + + +MODULE_big = neon +OBJS = \ + $(WIN32RES) \ + inmem_smgr.o \ + libpagestore.o \ + libpqwalproposer.o \ + pagestore_smgr.o \ + relsize_cache.o \ + neon.o \ + walproposer.o \ + walproposer_utils.o + +PG_CPPFLAGS = -I$(libpq_srcdir) +SHLIB_LINK_INTERNAL = $(libpq) + +EXTENSION = neon +DATA = neon--1.0.sql +PGFILEDESC = "neon - cloud storage for PostgreSQL" + + +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) diff --git a/pgxn/neon/inmem_smgr.c b/pgxn/neon/inmem_smgr.c new file mode 100644 index 0000000000..7840292b08 --- /dev/null +++ b/pgxn/neon/inmem_smgr.c @@ -0,0 +1,286 @@ +/*------------------------------------------------------------------------- + * + * inmem_smgr.c + * + * This is an implementation of the SMGR interface, used in the WAL redo + * process (see src/backend/tcop/zenith_wal_redo.c). It has no persistent + * storage, the pages that are written out are kept in a small number of + * in-memory buffers. + * + * Normally, replaying a WAL record only needs to access a handful of + * buffers, which fit in the normal buffer cache, so this is just for + * "overflow" storage when the buffer cache is not large enough. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * contrib/neon/inmem_smgr.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/xlog.h" +#include "pagestore_client.h" +#include "storage/block.h" +#include "storage/buf_internals.h" +#include "storage/relfilenode.h" +#include "storage/smgr.h" + +/* Size of the in-memory smgr */ +#define MAX_PAGES 64 + +/* If more than WARN_PAGES are used, print a warning in the log */ +#define WARN_PAGES 32 + +static BufferTag page_tag[MAX_PAGES]; +static char page_body[MAX_PAGES][BLCKSZ]; +static int used_pages; + +static int +locate_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno) +{ + /* We only hold a small number of pages, so linear search */ + for (int i = 0; i < used_pages; i++) + { + if (RelFileNodeEquals(reln->smgr_rnode.node, page_tag[i].rnode) + && forknum == page_tag[i].forkNum + && blkno == page_tag[i].blockNum) + { + return i; + } + } + return -1; +} + +/* + * inmem_init() -- Initialize private state + */ +void +inmem_init(void) +{ + used_pages = 0; +} + +/* + * inmem_exists() -- Does the physical file exist? + */ +bool +inmem_exists(SMgrRelation reln, ForkNumber forknum) +{ + for (int i = 0; i < used_pages; i++) + { + if (RelFileNodeEquals(reln->smgr_rnode.node, page_tag[i].rnode) + && forknum == page_tag[i].forkNum) + { + return true; + } + } + return false; +} + +/* + * inmem_create() -- Create a new relation on zenithd storage + * + * If isRedo is true, it's okay for the relation to exist already. + */ +void +inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo) +{ +} + +/* + * inmem_unlink() -- Unlink a relation. 
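+ *
+ * Like inmem_create() above, this is a no-op: the in-memory smgr keeps no
+ * persistent state per relation, so there is nothing to remove.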
+ */ +void +inmem_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo) +{ +} + +/* + * inmem_extend() -- Add a block to the specified relation. + * + * The semantics are nearly the same as mdwrite(): write at the + * specified position. However, this is to be used for the case of + * extending a relation (i.e., blocknum is at or beyond the current + * EOF). Note that we assume writing a block beyond current EOF + * causes intervening file space to become filled with zeroes. + */ +void +inmem_extend(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, + char *buffer, bool skipFsync) +{ + /* same as smgwrite() for us */ + inmem_write(reln, forknum, blkno, buffer, skipFsync); +} + +/* + * inmem_open() -- Initialize newly-opened relation. + */ +void +inmem_open(SMgrRelation reln) +{ +} + +/* + * inmem_close() -- Close the specified relation, if it isn't closed already. + */ +void +inmem_close(SMgrRelation reln, ForkNumber forknum) +{ +} + +/* + * inmem_prefetch() -- Initiate asynchronous read of the specified block of a relation + */ +bool +inmem_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) +{ + return true; +} + +/* + * inmem_writeback() -- Tell the kernel to write pages back to storage. + */ +void +inmem_writeback(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks) +{ +} + +/* + * inmem_read() -- Read the specified block from a relation. + */ +void +inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, + char *buffer) +{ + int pg; + + pg = locate_page(reln, forknum, blkno); + if (pg < 0) + memset(buffer, 0, BLCKSZ); + else + memcpy(buffer, page_body[pg], BLCKSZ); +} + +/* + * inmem_write() -- Write the supplied block at the appropriate location. + * + * This is to be used only for updating already-existing blocks of a + * relation (ie, those before the current EOF). To extend a relation, + * use mdextend(). + */ +void +inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + char *buffer, bool skipFsync) +{ + int pg; + + pg = locate_page(reln, forknum, blocknum); + if (pg < 0) + { + /* + * We assume the buffer cache is large enough to hold all the buffers + * needed for most operations. Overflowing to this "in-mem smgr" in rare + * cases is OK. But if we find that we're using more than WARN_PAGES, + * print a warning so that we get alerted and get to investigate why + * we're accessing so many buffers. + */ + elog(used_pages >= WARN_PAGES ? WARNING : DEBUG1, + "inmem_write() called for %u/%u/%u.%u blk %u: used_pages %u", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum, + blocknum, + used_pages); + if (used_pages == MAX_PAGES) + elog(ERROR, "Inmem storage overflow"); + + pg = used_pages; + used_pages++; + INIT_BUFFERTAG(page_tag[pg], reln->smgr_rnode.node, forknum, blocknum); + } else { + elog(DEBUG1, "inmem_write() called for %u/%u/%u.%u blk %u: found at %u", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum, + blocknum, + used_pages); + } + memcpy(page_body[pg], buffer, BLCKSZ); +} + +/* + * inmem_nblocks() -- Get the number of blocks stored in a relation. + */ +BlockNumber +inmem_nblocks(SMgrRelation reln, ForkNumber forknum) +{ + /* + * It's not clear why a WAL redo function would call smgrnblocks(). 
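+ * (smgrnblocks() is what RelationGetNumberOfBlocks() resolves to, so any
+ * redo routine that checks a relation's size ends up here.)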
+ * During recovery, at least before reaching consistency, the size of a + * relation could be arbitrarily small, if it was truncated after the + * record being replayed, or arbitrarily large if it was extended + * afterwards. But one place where it's called is in + * XLogReadBufferExtended(): it extends the relation, if it's smaller than + * the requested page. That's a waste of time in the WAL redo + * process. Pretend that all relations are maximally sized to avoid it. + */ + return MaxBlockNumber; +} + +/* + * inmem_truncate() -- Truncate relation to specified number of blocks. + */ +void +inmem_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) +{ +} + +/* + * inmem_immedsync() -- Immediately sync a relation to stable storage. + */ +void +inmem_immedsync(SMgrRelation reln, ForkNumber forknum) +{ +} + +static const struct f_smgr inmem_smgr = +{ + .smgr_init = inmem_init, + .smgr_shutdown = NULL, + .smgr_open = inmem_open, + .smgr_close = inmem_close, + .smgr_create = inmem_create, + .smgr_exists = inmem_exists, + .smgr_unlink = inmem_unlink, + .smgr_extend = inmem_extend, + .smgr_prefetch = inmem_prefetch, + .smgr_read = inmem_read, + .smgr_write = inmem_write, + .smgr_writeback = inmem_writeback, + .smgr_nblocks = inmem_nblocks, + .smgr_truncate = inmem_truncate, + .smgr_immedsync = inmem_immedsync, +}; + +const f_smgr * +smgr_inmem(BackendId backend, RelFileNode rnode) +{ + Assert(InRecovery); + if (backend != InvalidBackendId) + return smgr_standard(backend, rnode); + else + return &inmem_smgr; +} + +void +smgr_init_inmem() +{ + inmem_init(); +} diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c new file mode 100644 index 0000000000..649fc1037e --- /dev/null +++ b/pgxn/neon/libpagestore.c @@ -0,0 +1,432 @@ +/*------------------------------------------------------------------------- + * + * libpagestore.c + * Handles network communications with the remote pagestore. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * contrib/neon/libpqpagestore.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "pagestore_client.h" +#include "fmgr.h" +#include "access/xlog.h" + +#include "libpq-fe.h" +#include "libpq/pqformat.h" +#include "libpq/libpq.h" + +#include "miscadmin.h" +#include "pgstat.h" +#include "utils/guc.h" + +#include "neon.h" +#include "walproposer.h" +#include "walproposer_utils.h" + + +#define PageStoreTrace DEBUG5 + +#define NEON_TAG "[NEON_SMGR] " +#define neon_log(tag, fmt, ...) 
ereport(tag, \ + (errmsg(NEON_TAG fmt, ## __VA_ARGS__), \ + errhidestmt(true), errhidecontext(true))) + +bool connected = false; +PGconn *pageserver_conn = NULL; + +char *page_server_connstring_raw; + +static ZenithResponse *pageserver_call(ZenithRequest *request); +page_server_api api = { + .request = pageserver_call +}; + +static void +pageserver_connect() +{ + char *query; + int ret; + + Assert(!connected); + + pageserver_conn = PQconnectdb(page_server_connstring); + + if (PQstatus(pageserver_conn) == CONNECTION_BAD) + { + char *msg = pchomp(PQerrorMessage(pageserver_conn)); + + PQfinish(pageserver_conn); + pageserver_conn = NULL; + ereport(ERROR, + (errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION), + errmsg(NEON_TAG "could not establish connection to pageserver"), + errdetail_internal("%s", msg))); + } + + query = psprintf("pagestream %s %s", zenith_tenant, zenith_timeline); + ret = PQsendQuery(pageserver_conn, query); + if (ret != 1) + { + PQfinish(pageserver_conn); + pageserver_conn = NULL; + neon_log(ERROR, "could not send pagestream command to pageserver"); + } + + while (PQisBusy(pageserver_conn)) + { + int wc; + + /* Sleep until there's something to do */ + wc = WaitLatchOrSocket(MyLatch, + WL_LATCH_SET | WL_SOCKET_READABLE | + WL_EXIT_ON_PM_DEATH, + PQsocket(pageserver_conn), + -1L, PG_WAIT_EXTENSION); + ResetLatch(MyLatch); + + CHECK_FOR_INTERRUPTS(); + + /* Data available in socket? */ + if (wc & WL_SOCKET_READABLE) + { + if (!PQconsumeInput(pageserver_conn)) + { + char *msg = pchomp(PQerrorMessage(pageserver_conn)); + + PQfinish(pageserver_conn); + pageserver_conn = NULL; + + neon_log(ERROR, "could not complete handshake with pageserver: %s", + msg); + } + } + } + + neon_log(LOG, "libpagestore: connected to '%s'", page_server_connstring_raw); + + connected = true; +} + +/* + * A wrapper around PQgetCopyData that checks for interrupts while sleeping. + */ +static int +call_PQgetCopyData(PGconn *conn, char **buffer) +{ + int ret; + +retry: + ret = PQgetCopyData(conn, buffer, 1 /* async */ ); + + if (ret == 0) + { + int wc; + + /* Sleep until there's something to do */ + wc = WaitLatchOrSocket(MyLatch, + WL_LATCH_SET | WL_SOCKET_READABLE | + WL_EXIT_ON_PM_DEATH, + PQsocket(conn), + -1L, PG_WAIT_EXTENSION); + ResetLatch(MyLatch); + + CHECK_FOR_INTERRUPTS(); + + /* Data available in socket? */ + if (wc & WL_SOCKET_READABLE) + { + if (!PQconsumeInput(conn)) + neon_log(ERROR, "could not get response from pageserver: %s", + PQerrorMessage(conn)); + } + + goto retry; + } + + return ret; +} + + +static ZenithResponse * +pageserver_call(ZenithRequest *request) +{ + StringInfoData req_buff; + StringInfoData resp_buff; + ZenithResponse *resp; + + PG_TRY(); + { + /* If the connection was lost for some reason, reconnect */ + if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD) + { + PQfinish(pageserver_conn); + pageserver_conn = NULL; + connected = false; + } + + if (!connected) + pageserver_connect(); + + req_buff = zm_pack_request(request); + + /* + * Send request. + * + * In principle, this could block if the output buffer is full, and we + * should use async mode and check for interrupts while waiting. In + * practice, our requests are small enough to always fit in the output + * and TCP buffer. 
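+		 * (If that assumption ever breaks, this backend would block in
+		 * send(); the fix would be a non-blocking write loop around a
+		 * WaitLatchOrSocket call, as call_PQgetCopyData does for reads.)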
+ */ + if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0 || PQflush(pageserver_conn)) + { + neon_log(ERROR, "failed to send page request: %s", + PQerrorMessage(pageserver_conn)); + } + pfree(req_buff.data); + + if (message_level_is_interesting(PageStoreTrace)) + { + char *msg = zm_to_string((ZenithMessage *) request); + + neon_log(PageStoreTrace, "sent request: %s", msg); + pfree(msg); + } + + /* read response */ + resp_buff.len = call_PQgetCopyData(pageserver_conn, &resp_buff.data); + resp_buff.cursor = 0; + + if (resp_buff.len == -1) + neon_log(ERROR, "end of COPY"); + else if (resp_buff.len == -2) + neon_log(ERROR, "could not read COPY data: %s", PQerrorMessage(pageserver_conn)); + + resp = zm_unpack_response(&resp_buff); + PQfreemem(resp_buff.data); + + if (message_level_is_interesting(PageStoreTrace)) + { + char *msg = zm_to_string((ZenithMessage *) resp); + + neon_log(PageStoreTrace, "got response: %s", msg); + pfree(msg); + } + } + PG_CATCH(); + { + /* + * If anything goes wrong while we were sending a request, it's not + * clear what state the connection is in. For example, if we sent the + * request but didn't receive a response yet, we might receive the + * response some time later after we have already sent a new unrelated + * request. Close the connection to avoid getting confused. + */ + if (connected) + { + neon_log(LOG, "dropping connection to page server due to error"); + PQfinish(pageserver_conn); + pageserver_conn = NULL; + connected = false; + } + PG_RE_THROW(); + } + PG_END_TRY(); + + return (ZenithResponse *) resp; +} + + +static bool +check_zenith_id(char **newval, void **extra, GucSource source) +{ + uint8 zid[16]; + + return **newval == '\0' || HexDecodeString(zid, *newval, 16); +} + +static char * +substitute_pageserver_password(const char *page_server_connstring_raw) +{ + char *host = NULL; + char *port = NULL; + char *user = NULL; + char *auth_token = NULL; + char *err = NULL; + char *page_server_connstring = NULL; + PQconninfoOption *conn_options; + PQconninfoOption *conn_option; + MemoryContext oldcontext; + + /* + * Here we substitute password in connection string with an environment + * variable. To simplify things we construct a connection string back with + * only known options. In particular: host port user and password. We do + * not currently use other options and constructing full connstring in an + * URI shape is quite messy. + */ + + if (page_server_connstring_raw == NULL || page_server_connstring_raw[0] == '\0') + return NULL; + + /* extract the auth token from the connection string */ + conn_options = PQconninfoParse(page_server_connstring_raw, &err); + if (conn_options == NULL) + { + /* The error string is malloc'd, so we must free it explicitly */ + char *errcopy = err ? pstrdup(err) : "out of memory"; + + PQfreemem(err); + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("invalid connection string syntax: %s", errcopy))); + } + + /* + * Trying to populate pageserver connection string with auth token from + * environment. We are looking for password in with placeholder value like + * $ENV_VAR_NAME, so if password field is present and starts with $ we try + * to fetch environment variable value and fail loudly if it is not set. 
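+	 *
+	 * For example, with "password=$NEON_AUTH_TOKEN" in the raw connection
+	 * string (the variable name here is illustrative), the value of the
+	 * NEON_AUTH_TOKEN environment variable is spliced into the connection
+	 * string built below.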
+	 */
+	for (conn_option = conn_options; conn_option->keyword != NULL; conn_option++)
+	{
+		if (strcmp(conn_option->keyword, "host") == 0)
+		{
+			if (conn_option->val != NULL && conn_option->val[0] != '\0')
+				host = conn_option->val;
+		}
+		else if (strcmp(conn_option->keyword, "port") == 0)
+		{
+			if (conn_option->val != NULL && conn_option->val[0] != '\0')
+				port = conn_option->val;
+		}
+		else if (strcmp(conn_option->keyword, "user") == 0)
+		{
+			if (conn_option->val != NULL && conn_option->val[0] != '\0')
+				user = conn_option->val;
+		}
+		else if (strcmp(conn_option->keyword, "password") == 0)
+		{
+			if (conn_option->val != NULL && conn_option->val[0] != '\0')
+			{
+				/* ensure that this is a template */
+				if (strncmp(conn_option->val, "$", 1) != 0)
+					ereport(ERROR,
+							(errcode(ERRCODE_CONNECTION_EXCEPTION),
+							 errmsg("expected placeholder value in pageserver password starting with $ but found: %s", conn_option->val)));
+
+				neon_log(LOG, "found auth token placeholder in pageserver conn string '%s'", &conn_option->val[1]);
+				auth_token = getenv(&conn_option->val[1]);
+				if (!auth_token)
+				{
+					ereport(ERROR,
+							(errcode(ERRCODE_CONNECTION_EXCEPTION),
+							 errmsg("cannot get auth token, environment variable %s is not set", &conn_option->val[1])));
+				}
+				else
+				{
+					neon_log(LOG, "using auth token from environment variable");
+				}
+			}
+		}
+	}
+
+	/*
+	 * Allocate the connection string in TopMemoryContext to make sure it is
+	 * never freed.
+	 */
+	oldcontext = CurrentMemoryContext;
+	MemoryContextSwitchTo(TopMemoryContext);
+	page_server_connstring = psprintf("postgresql://%s:%s@%s:%s", user, auth_token ? auth_token : "", host, port);
+	MemoryContextSwitchTo(oldcontext);
+
+	PQconninfoFree(conn_options);
+	return page_server_connstring;
+}
+
+/*
+ * Module initialization function
+ */
+void
+pg_init_libpagestore(void)
+{
+	DefineCustomStringVariable("neon.pageserver_connstring",
+							   "connection string to the page server",
+							   NULL,
+							   &page_server_connstring_raw,
+							   "",
+							   PGC_POSTMASTER,
+							   0,	/* no flags required */
+							   NULL, NULL, NULL);
+
+	DefineCustomStringVariable("neon.timeline_id",
+							   "Neon timeline id the server is running on",
+							   NULL,
+							   &zenith_timeline,
+							   "",
+							   PGC_POSTMASTER,
+							   0,	/* no flags required */
+							   check_zenith_id, NULL, NULL);
+
+	DefineCustomStringVariable("neon.tenant_id",
+							   "Neon tenant id the server is running on",
+							   NULL,
+							   &zenith_tenant,
+							   "",
+							   PGC_POSTMASTER,
+							   0,	/* no flags required */
+							   check_zenith_id, NULL, NULL);
+
+	DefineCustomBoolVariable("neon.wal_redo",
+							 "start in wal-redo mode",
+							 NULL,
+							 &wal_redo,
+							 false,
+							 PGC_POSTMASTER,
+							 0,
+							 NULL, NULL, NULL);
+
+	DefineCustomIntVariable("neon.max_cluster_size",
+							"cluster size limit",
+							NULL,
+							&max_cluster_size,
+							-1, -1, INT_MAX,
+							PGC_SIGHUP,
+							GUC_UNIT_MB,
+							NULL, NULL, NULL);
+
+	relsize_hash_init();
+
+	if (page_server != NULL)
+		neon_log(ERROR, "libpagestore already loaded");
+
+	neon_log(PageStoreTrace, "libpagestore loaded");
+	page_server = &api;
+
+	/* substitute password in pageserver_connstring */
+	page_server_connstring = substitute_pageserver_password(page_server_connstring_raw);
+
+	/* Is there a more correct way to pass a custom GUC to postgres code?
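+	 * (GetConfigOption("neon.timeline_id", false, false) would be an
+	 * alternative, but plain globals keep the walproposer path simple.)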
*/ + zenith_timeline_walproposer = zenith_timeline; + zenith_tenant_walproposer = zenith_tenant; + + if (wal_redo) + { + neon_log(PageStoreTrace, "set inmem_smgr hook"); + smgr_hook = smgr_inmem; + smgr_init_hook = smgr_init_inmem; + } + else if (page_server_connstring && page_server_connstring[0]) + { + neon_log(PageStoreTrace, "set neon_smgr hook"); + smgr_hook = smgr_zenith; + smgr_init_hook = smgr_init_zenith; + dbsize_hook = zenith_dbsize; + } +} diff --git a/pgxn/neon/libpqwalproposer.c b/pgxn/neon/libpqwalproposer.c new file mode 100644 index 0000000000..2b2b7a1a6a --- /dev/null +++ b/pgxn/neon/libpqwalproposer.c @@ -0,0 +1,413 @@ +#include "postgres.h" + +#include "libpq-fe.h" +#include "neon.h" +#include "walproposer.h" + +/* Header in walproposer.h -- Wrapper struct to abstract away the libpq connection */ +struct WalProposerConn +{ + PGconn* pg_conn; + bool is_nonblocking; /* whether the connection is non-blocking */ + char *recvbuf; /* last received data from libpqprop_async_read */ +}; + +/* Prototypes for exported functions */ +static char* libpqprop_error_message(WalProposerConn* conn); +static WalProposerConnStatusType libpqprop_status(WalProposerConn* conn); +static WalProposerConn* libpqprop_connect_start(char* conninfo); +static WalProposerConnectPollStatusType libpqprop_connect_poll(WalProposerConn* conn); +static bool libpqprop_send_query(WalProposerConn* conn, char* query); +static WalProposerExecStatusType libpqprop_get_query_result(WalProposerConn* conn); +static pgsocket libpqprop_socket(WalProposerConn* conn); +static int libpqprop_flush(WalProposerConn* conn); +static void libpqprop_finish(WalProposerConn* conn); +static PGAsyncReadResult libpqprop_async_read(WalProposerConn* conn, char** buf, int* amount); +static PGAsyncWriteResult libpqprop_async_write(WalProposerConn* conn, void const* buf, size_t size); +static bool libpqprop_blocking_write(WalProposerConn* conn, void const* buf, size_t size); + +static WalProposerFunctionsType PQWalProposerFunctions = { + libpqprop_error_message, + libpqprop_status, + libpqprop_connect_start, + libpqprop_connect_poll, + libpqprop_send_query, + libpqprop_get_query_result, + libpqprop_socket, + libpqprop_flush, + libpqprop_finish, + libpqprop_async_read, + libpqprop_async_write, + libpqprop_blocking_write, +}; + +/* Module initialization */ +void +pg_init_libpqwalproposer(void) +{ + if (WalProposerFunctions != NULL) + elog(ERROR, "libpqwalproposer already loaded"); + WalProposerFunctions = &PQWalProposerFunctions; +} + +/* Helper function */ +static bool +ensure_nonblocking_status(WalProposerConn* conn, bool is_nonblocking) +{ + /* If we're already correctly blocking or nonblocking, all good */ + if (is_nonblocking == conn->is_nonblocking) + return true; + + /* Otherwise, set it appropriately */ + if (PQsetnonblocking(conn->pg_conn, is_nonblocking) == -1) + return false; + + conn->is_nonblocking = is_nonblocking; + return true; +} + +/* Exported function definitions */ +static char* +libpqprop_error_message(WalProposerConn* conn) +{ + return PQerrorMessage(conn->pg_conn); +} + +static WalProposerConnStatusType +libpqprop_status(WalProposerConn* conn) +{ + switch (PQstatus(conn->pg_conn)) + { + case CONNECTION_OK: + return WP_CONNECTION_OK; + case CONNECTION_BAD: + return WP_CONNECTION_BAD; + default: + return WP_CONNECTION_IN_PROGRESS; + } +} + +static WalProposerConn* +libpqprop_connect_start(char* conninfo) +{ + WalProposerConn* conn; + PGconn* pg_conn; + + pg_conn = PQconnectStart(conninfo); + /* + * Allocation of a PQconn 
can fail, and will return NULL. We want to fully replicate the + * behavior of PQconnectStart here. + */ + if (!pg_conn) + return NULL; + + /* + * And in theory this allocation can fail as well, but it's incredibly unlikely if we just + * successfully allocated a PGconn. + * + * palloc will exit on failure though, so there's not much we could do if it *did* fail. + */ + conn = palloc(sizeof(WalProposerConn)); + conn->pg_conn = pg_conn; + conn->is_nonblocking = false; /* connections always start in blocking mode */ + conn->recvbuf = NULL; + return conn; +} + +static WalProposerConnectPollStatusType +libpqprop_connect_poll(WalProposerConn* conn) +{ + WalProposerConnectPollStatusType return_val; + + switch (PQconnectPoll(conn->pg_conn)) + { + case PGRES_POLLING_FAILED: + return_val = WP_CONN_POLLING_FAILED; + break; + case PGRES_POLLING_READING: + return_val = WP_CONN_POLLING_READING; + break; + case PGRES_POLLING_WRITING: + return_val = WP_CONN_POLLING_WRITING; + break; + case PGRES_POLLING_OK: + return_val = WP_CONN_POLLING_OK; + break; + + /* There's a comment at its source about this constant being unused. We'll expect it's never + * returned. */ + case PGRES_POLLING_ACTIVE: + elog(FATAL, "Unexpected PGRES_POLLING_ACTIVE returned from PQconnectPoll"); + /* This return is never actually reached, but it's here to make the compiler happy */ + return WP_CONN_POLLING_FAILED; + + default: + Assert(false); + return_val = WP_CONN_POLLING_FAILED; /* keep the compiler quiet */ + } + + return return_val; +} + +static bool +libpqprop_send_query(WalProposerConn* conn, char* query) +{ + /* We need to be in blocking mode for sending the query to run without + * requiring a call to PQflush */ + if (!ensure_nonblocking_status(conn, false)) + return false; + + /* PQsendQuery returns 1 on success, 0 on failure */ + if (!PQsendQuery(conn->pg_conn, query)) + return false; + + return true; +} + +static WalProposerExecStatusType +libpqprop_get_query_result(WalProposerConn* conn) +{ + PGresult* result; + WalProposerExecStatusType return_val; + + /* Marker variable if we need to log an unexpected success result */ + char* unexpected_success = NULL; + + /* Consume any input that we might be missing */ + if (!PQconsumeInput(conn->pg_conn)) + return WP_EXEC_FAILED; + + if (PQisBusy(conn->pg_conn)) + return WP_EXEC_NEEDS_INPUT; + + + result = PQgetResult(conn->pg_conn); + /* PQgetResult returns NULL only if getting the result was successful & there's no more of the + * result to get. 
*/ + if (!result) + { + elog(WARNING, "[libpqwalproposer] Unexpected successful end of command results"); + return WP_EXEC_UNEXPECTED_SUCCESS; + } + + /* Helper macro to reduce boilerplate */ + #define UNEXPECTED_SUCCESS(msg) \ + return_val = WP_EXEC_UNEXPECTED_SUCCESS; \ + unexpected_success = msg; \ + break; + + + switch (PQresultStatus(result)) + { + /* "true" success case */ + case PGRES_COPY_BOTH: + return_val = WP_EXEC_SUCCESS_COPYBOTH; + break; + + /* Unexpected success case */ + case PGRES_EMPTY_QUERY: + UNEXPECTED_SUCCESS("empty query return"); + case PGRES_COMMAND_OK: + UNEXPECTED_SUCCESS("data-less command end"); + case PGRES_TUPLES_OK: + UNEXPECTED_SUCCESS("tuples return"); + case PGRES_COPY_OUT: + UNEXPECTED_SUCCESS("'Copy Out' response"); + case PGRES_COPY_IN: + UNEXPECTED_SUCCESS("'Copy In' response"); + case PGRES_SINGLE_TUPLE: + UNEXPECTED_SUCCESS("single tuple return"); + case PGRES_PIPELINE_SYNC: + UNEXPECTED_SUCCESS("pipeline sync point"); + + /* Failure cases */ + case PGRES_BAD_RESPONSE: + case PGRES_NONFATAL_ERROR: + case PGRES_FATAL_ERROR: + case PGRES_PIPELINE_ABORTED: + return_val = WP_EXEC_FAILED; + break; + + default: + Assert(false); + return_val = WP_EXEC_FAILED; /* keep the compiler quiet */ + } + + if (unexpected_success) + elog(WARNING, "[libpqwalproposer] Unexpected successful %s", unexpected_success); + + return return_val; +} + +static pgsocket +libpqprop_socket(WalProposerConn* conn) +{ + return PQsocket(conn->pg_conn); +} + +static int +libpqprop_flush(WalProposerConn* conn) +{ + return (PQflush(conn->pg_conn)); +} + +static void +libpqprop_finish(WalProposerConn* conn) +{ + if (conn->recvbuf != NULL) + PQfreemem(conn->recvbuf); + PQfinish(conn->pg_conn); + pfree(conn); +} + +/* + * Receive a message from the safekeeper. + * + * On success, the data is placed in *buf. It is valid until the next call + * to this function. + */ +static PGAsyncReadResult +libpqprop_async_read(WalProposerConn* conn, char** buf, int* amount) +{ + int result; + + if (conn->recvbuf != NULL) + { + PQfreemem(conn->recvbuf); + conn->recvbuf = NULL; + } + + /* Call PQconsumeInput so that we have the data we need */ + if (!PQconsumeInput(conn->pg_conn)) + { + *amount = 0; + *buf = NULL; + return PG_ASYNC_READ_FAIL; + } + + /* The docs for PQgetCopyData list the return values as: + * 0 if the copy is still in progress, but no "complete row" is + * available + * -1 if the copy is done + * -2 if an error occured + * (> 0) if it was successful; that value is the amount transferred. + * + * The protocol we use between walproposer and safekeeper means that we + * *usually* wouldn't expect to see that the copy is done, but this can + * sometimes be triggered by the server returning an ErrorResponse (which + * also happens to have the effect that the copy is done). + */ + switch (result = PQgetCopyData(conn->pg_conn, &conn->recvbuf, true)) + { + case 0: + *amount = 0; + *buf = NULL; + return PG_ASYNC_READ_TRY_AGAIN; + case -1: + { + /* + * If we get -1, it's probably because of a server error; the + * safekeeper won't normally send a CopyDone message. 
+			 *
+			 * We can check PQgetResult to make sure that the server failed;
+			 * it'll always result in PGRES_FATAL_ERROR.
+			 */
+			ExecStatusType status = PQresultStatus(PQgetResult(conn->pg_conn));
+
+			if (status != PGRES_FATAL_ERROR)
+				elog(FATAL, "unexpected result status %d after failed PQgetCopyData", status);
+
+			/*
+			 * If there was actually an error, it'll be properly reported by
+			 * calls to PQerrorMessage -- we don't have to do anything else
+			 */
+			*amount = 0;
+			*buf = NULL;
+			return PG_ASYNC_READ_FAIL;
+		}
+		case -2:
+			*amount = 0;
+			*buf = NULL;
+			return PG_ASYNC_READ_FAIL;
+		default:
+			/* Positive values indicate the size of the returned result */
+			*amount = result;
+			*buf = conn->recvbuf;
+			return PG_ASYNC_READ_SUCCESS;
+	}
+}
+
+static PGAsyncWriteResult
+libpqprop_async_write(WalProposerConn* conn, void const* buf, size_t size)
+{
+	int result;
+
+	/* If we aren't in non-blocking mode, switch to it. */
+	if (!ensure_nonblocking_status(conn, true))
+		return PG_ASYNC_WRITE_FAIL;
+
+	/*
+	 * The docs for PQputCopyData list the return values as:
+	 *   1 if the data was queued,
+	 *   0 if it was not queued because of full buffers, or
+	 *  -1 if an error occurred
+	 */
+	result = PQputCopyData(conn->pg_conn, buf, size);
+
+	/*
+	 * We won't get a result of zero because walproposer always empties the
+	 * connection's buffers before sending more
+	 */
+	Assert(result != 0);
+
+	switch (result)
+	{
+		case 1:
+			/* good -- continue */
+			break;
+		case -1:
+			return PG_ASYNC_WRITE_FAIL;
+		default:
+			elog(FATAL, "invalid return %d from PQputCopyData", result);
+	}
+
+	/*
+	 * After queueing the data, we still need to flush to get it to send.
+	 * This might take multiple tries, but we don't want to wait around
+	 * until it's done.
+	 *
+	 * PQflush has the following returns (directly quoting the docs):
+	 *   0 if successful,
+	 *   1 if it was unable to send all the data in the send queue yet
+	 *  -1 if it failed for some reason
+	 */
+	switch (result = PQflush(conn->pg_conn))
+	{
+		case 0:
+			return PG_ASYNC_WRITE_SUCCESS;
+		case 1:
+			return PG_ASYNC_WRITE_TRY_FLUSH;
+		case -1:
+			return PG_ASYNC_WRITE_FAIL;
+		default:
+			elog(FATAL, "invalid return %d from PQflush", result);
+	}
+}
+
+static bool
+libpqprop_blocking_write(WalProposerConn* conn, void const* buf, size_t size)
+{
+	int result;
+
+	/* If we are in non-blocking mode, switch out of it. */
+	if (!ensure_nonblocking_status(conn, false))
+		return false;
+
+	/*
+	 * This function is very similar to libpqprop_async_write. For more
+	 * information, refer to the comments there.
+	 */
+	if ((result = PQputCopyData(conn->pg_conn, buf, size)) == -1)
+		return false;
+
+	Assert(result == 1);
+
+	/* Because the connection is blocking, PQflush returns only 0 or -1 */
+	if ((result = PQflush(conn->pg_conn)) == -1)
+		return false;
+
+	Assert(result == 0);
+	return true;
+}
diff --git a/pgxn/neon/neon--1.0.sql b/pgxn/neon/neon--1.0.sql
new file mode 100644
index 0000000000..34f1ba78d4
--- /dev/null
+++ b/pgxn/neon/neon--1.0.sql
@@ -0,0 +1,17 @@
+\echo Use "CREATE EXTENSION neon" to load this file.
\quit + +CREATE FUNCTION pg_cluster_size() +RETURNS bigint +AS 'MODULE_PATHNAME', 'pg_cluster_size' +LANGUAGE C STRICT +PARALLEL UNSAFE; + +CREATE FUNCTION backpressure_lsns( + OUT received_lsn pg_lsn, + OUT disk_consistent_lsn pg_lsn, + OUT remote_consistent_lsn pg_lsn +) +RETURNS record +AS 'MODULE_PATHNAME', 'backpressure_lsns' +LANGUAGE C STRICT +PARALLEL UNSAFE; diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c new file mode 100644 index 0000000000..595a126f04 --- /dev/null +++ b/pgxn/neon/neon.c @@ -0,0 +1,82 @@ +/*------------------------------------------------------------------------- + * + * neon.c + * Utility functions to expose neon specific information to user + * + * IDENTIFICATION + * contrib/neon/neon.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" +#include "fmgr.h" + +#include "access/xact.h" +#include "access/xlog.h" +#include "storage/buf_internals.h" +#include "storage/bufmgr.h" +#include "catalog/pg_type.h" +#include "replication/walsender.h" +#include "funcapi.h" +#include "access/htup_details.h" +#include "utils/pg_lsn.h" +#include "utils/guc.h" + +#include "neon.h" +#include "walproposer.h" + +PG_MODULE_MAGIC; +void _PG_init(void); + + +void _PG_init(void) +{ + pg_init_libpagestore(); + pg_init_libpqwalproposer(); + pg_init_walproposer(); + + EmitWarningsOnPlaceholders("neon"); +} + +PG_FUNCTION_INFO_V1(pg_cluster_size); +PG_FUNCTION_INFO_V1(backpressure_lsns); + +Datum +pg_cluster_size(PG_FUNCTION_ARGS) +{ + int64 size; + + size = GetZenithCurrentClusterSize(); + + if (size == 0) + PG_RETURN_NULL(); + + PG_RETURN_INT64(size); +} + + +Datum +backpressure_lsns(PG_FUNCTION_ARGS) +{ + XLogRecPtr writePtr; + XLogRecPtr flushPtr; + XLogRecPtr applyPtr; + Datum values[3]; + bool nulls[3]; + TupleDesc tupdesc; + + replication_feedback_get_lsns(&writePtr, &flushPtr, &applyPtr); + + tupdesc = CreateTemplateTupleDesc(3); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "received_lsn", PG_LSNOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "disk_consistent_lsn", PG_LSNOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 3, "remote_consistent_lsn", PG_LSNOID, -1, 0); + tupdesc = BlessTupleDesc(tupdesc); + + MemSet(nulls, 0, sizeof(nulls)); + values[0] = LSNGetDatum(writePtr); + values[1] = LSNGetDatum(flushPtr); + values[2] = LSNGetDatum(applyPtr); + + PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls))); +} diff --git a/pgxn/neon/neon.control b/pgxn/neon/neon.control new file mode 100644 index 0000000000..84f79881c1 --- /dev/null +++ b/pgxn/neon/neon.control @@ -0,0 +1,4 @@ +# neon extension +comment = 'cloud storage for PostgreSQL' +default_version = '1.0' +module_pathname = '$libdir/neon' diff --git a/pgxn/neon/neon.h b/pgxn/neon/neon.h new file mode 100644 index 0000000000..2c66bc7bf0 --- /dev/null +++ b/pgxn/neon/neon.h @@ -0,0 +1,19 @@ +/*------------------------------------------------------------------------- + * + * neon.h + * Functions used in the initialization of this extension. 
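+ *
+ * These are called from _PG_init() in neon.c when the library is loaded,
+ * e.g. via shared_preload_libraries.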
+ * + * IDENTIFICATION + * contrib/neon/neon.h + * + *------------------------------------------------------------------------- + */ + +#ifndef NEON_H +#define NEON_H + +extern void pg_init_libpagestore(void); +extern void pg_init_libpqwalproposer(void); +extern void pg_init_walproposer(void); + +#endif /* NEON_H */ diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h new file mode 100644 index 0000000000..f79a3c9142 --- /dev/null +++ b/pgxn/neon/pagestore_client.h @@ -0,0 +1,221 @@ +/*------------------------------------------------------------------------- + * + * pagestore_client.h + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * contrib/neon/pagestore_client.h + * + *------------------------------------------------------------------------- + */ +#ifndef pageserver_h +#define pageserver_h + +#include "postgres.h" + +#include "access/xlogdefs.h" +#include "storage/relfilenode.h" +#include "storage/block.h" +#include "storage/smgr.h" +#include "lib/stringinfo.h" +#include "libpq/pqformat.h" +#include "utils/memutils.h" + +#include "pg_config.h" + +typedef enum +{ + /* pagestore_client -> pagestore */ + T_ZenithExistsRequest = 0, + T_ZenithNblocksRequest, + T_ZenithGetPageRequest, + T_ZenithDbSizeRequest, + + /* pagestore -> pagestore_client */ + T_ZenithExistsResponse = 100, + T_ZenithNblocksResponse, + T_ZenithGetPageResponse, + T_ZenithErrorResponse, + T_ZenithDbSizeResponse, +} ZenithMessageTag; + + + +/* base struct for c-style inheritance */ +typedef struct +{ + ZenithMessageTag tag; +} ZenithMessage; + +#define messageTag(m) (((const ZenithMessage *)(m))->tag) + +/* + * supertype of all the Zenith*Request structs below + * + * If 'latest' is true, we are requesting the latest page version, and 'lsn' + * is just a hint to the server that we know there are no versions of the page + * (or relation size, for exists/nblocks requests) later than the 'lsn'. 
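+ *
+ * For example, (latest=true, lsn=X) asks for the newest version of the
+ * page and promises that it has not changed after X, so the server does
+ * not need to wait for WAL beyond X before answering.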
+ */ +typedef struct +{ + ZenithMessageTag tag; + bool latest; /* if true, request latest page version */ + XLogRecPtr lsn; /* request page version @ this LSN */ +} ZenithRequest; + +typedef struct +{ + ZenithRequest req; + RelFileNode rnode; + ForkNumber forknum; +} ZenithExistsRequest; + +typedef struct +{ + ZenithRequest req; + RelFileNode rnode; + ForkNumber forknum; +} ZenithNblocksRequest; + + +typedef struct +{ + ZenithRequest req; + Oid dbNode; +} ZenithDbSizeRequest; + + +typedef struct +{ + ZenithRequest req; + RelFileNode rnode; + ForkNumber forknum; + BlockNumber blkno; +} ZenithGetPageRequest; + +/* supertype of all the Zenith*Response structs below */ +typedef struct +{ + ZenithMessageTag tag; +} ZenithResponse; + +typedef struct +{ + ZenithMessageTag tag; + bool exists; +} ZenithExistsResponse; + +typedef struct +{ + ZenithMessageTag tag; + uint32 n_blocks; +} ZenithNblocksResponse; + +typedef struct +{ + ZenithMessageTag tag; + char page[FLEXIBLE_ARRAY_MEMBER]; +} ZenithGetPageResponse; + +typedef struct +{ + ZenithMessageTag tag; + int64 db_size; +} ZenithDbSizeResponse; + +typedef struct +{ + ZenithMessageTag tag; + char message[FLEXIBLE_ARRAY_MEMBER]; /* null-terminated error message */ +} ZenithErrorResponse; + +extern StringInfoData zm_pack_request(ZenithRequest *msg); +extern ZenithResponse *zm_unpack_response(StringInfo s); +extern char *zm_to_string(ZenithMessage *msg); + +/* + * API + */ + +typedef struct +{ + ZenithResponse *(*request) (ZenithRequest *request); +} page_server_api; + +extern page_server_api *page_server; + +extern char *page_server_connstring; +extern char *zenith_timeline; +extern char *zenith_tenant; +extern bool wal_redo; +extern int32 max_cluster_size; + +extern const f_smgr *smgr_zenith(BackendId backend, RelFileNode rnode); +extern void smgr_init_zenith(void); + +extern const f_smgr *smgr_inmem(BackendId backend, RelFileNode rnode); +extern void smgr_init_inmem(void); +extern void smgr_shutdown_inmem(void); + +/* zenith storage manager functionality */ + +extern void zenith_init(void); +extern void zenith_open(SMgrRelation reln); +extern void zenith_close(SMgrRelation reln, ForkNumber forknum); +extern void zenith_create(SMgrRelation reln, ForkNumber forknum, bool isRedo); +extern bool zenith_exists(SMgrRelation reln, ForkNumber forknum); +extern void zenith_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo); +extern void zenith_extend(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, bool skipFsync); +extern bool zenith_prefetch(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum); +extern void zenith_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + char *buffer); + +extern void zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, + XLogRecPtr request_lsn, bool request_latest, char *buffer); + +extern void zenith_write(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, bool skipFsync); +extern void zenith_writeback(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks); +extern BlockNumber zenith_nblocks(SMgrRelation reln, ForkNumber forknum); +extern const int64 zenith_dbsize(Oid dbNode); +extern void zenith_truncate(SMgrRelation reln, ForkNumber forknum, + BlockNumber nblocks); +extern void zenith_immedsync(SMgrRelation reln, ForkNumber forknum); + +/* zenith wal-redo storage manager functionality */ + +extern void inmem_init(void); +extern void inmem_open(SMgrRelation reln); +extern void 
inmem_close(SMgrRelation reln, ForkNumber forknum); +extern void inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo); +extern bool inmem_exists(SMgrRelation reln, ForkNumber forknum); +extern void inmem_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo); +extern void inmem_extend(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, bool skipFsync); +extern bool inmem_prefetch(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum); +extern void inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + char *buffer); +extern void inmem_write(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, bool skipFsync); +extern void inmem_writeback(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks); +extern BlockNumber inmem_nblocks(SMgrRelation reln, ForkNumber forknum); +extern void inmem_truncate(SMgrRelation reln, ForkNumber forknum, + BlockNumber nblocks); +extern void inmem_immedsync(SMgrRelation reln, ForkNumber forknum); + + +/* utils for zenith relsize cache */ +extern void relsize_hash_init(void); +extern bool get_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber* size); +extern void set_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size); +extern void update_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size); +extern void forget_cached_relsize(RelFileNode rnode, ForkNumber forknum); + +#endif diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c new file mode 100644 index 0000000000..3e1b74dba7 --- /dev/null +++ b/pgxn/neon/pagestore_smgr.c @@ -0,0 +1,1696 @@ +/*------------------------------------------------------------------------- + * + * pagestore_smgr.c + * + * + * + * Temporary and unlogged rels + * --------------------------- + * + * Temporary and unlogged tables are stored locally, by md.c. The functions + * here just pass the calls through to corresponding md.c functions. + * + * Index build operations that use the buffer cache are also handled locally, + * just like unlogged tables. Such operations must be marked by calling + * smgr_start_unlogged_build() and friends. + * + * In order to know what relations are permanent and which ones are not, we + * have added a 'smgr_relpersistence' field to SmgrRelationData, and it is set + * by smgropen() callers, when they have the relcache entry at hand. However, + * sometimes we need to open an SmgrRelation for a relation without the + * relcache. That is needed when we evict a buffer; we might not have the + * SmgrRelation for that relation open yet. To deal with that, the + * 'relpersistence' can be left to zero, meaning we don't know if it's + * permanent or not. Most operations are not allowed with relpersistence==0, + * but smgrwrite() does work, which is what we need for buffer eviction. and + * smgrunlink() so that a backend doesn't need to have the relcache entry at + * transaction commit, where relations that were dropped in the transaction + * are unlinked. + * + * If smgrwrite() is called and smgr_relpersistence == 0, we check if the + * relation file exists locally or not. If it does exist, we assume it's an + * unlogged relation and write the page there. Otherwise it must be a + * permanent relation, WAL-logged and stored on the page server, and we ignore + * the write like we do for permanent relations. 
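+ *
+ * In short: permanent relations (RELPERSISTENCE_PERMANENT) are served by
+ * the page server, temporary and unlogged ones by md.c, and
+ * relpersistence == 0 means "infer it from context" as described above.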
+ * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * contrib/neon/pagestore_smgr.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/xact.h" +#include "access/xlog.h" +#include "access/xloginsert.h" +#include "access/xlog_internal.h" +#include "catalog/pg_class.h" +#include "pagestore_client.h" +#include "pagestore_client.h" +#include "storage/smgr.h" +#include "access/xlogdefs.h" +#include "postmaster/interrupt.h" +#include "replication/walsender.h" +#include "storage/bufmgr.h" +#include "storage/md.h" +#include "fmgr.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "catalog/pg_tablespace_d.h" +#include "postmaster/autovacuum.h" + +/* + * If DEBUG_COMPARE_LOCAL is defined, we pass through all the SMGR API + * calls to md.c, and *also* do the calls to the Page Server. On every + * read, compare the versions we read from local disk and Page Server, + * and Assert that they are identical. + */ +/* #define DEBUG_COMPARE_LOCAL */ + +#ifdef DEBUG_COMPARE_LOCAL +#include "access/nbtree.h" +#include "storage/bufpage.h" +#include "access/xlog_internal.h" + +static char *hexdump_page(char *page); +#endif + +#define IS_LOCAL_REL(reln) (reln->smgr_rnode.node.dbNode != 0 && reln->smgr_rnode.node.relNode > FirstNormalObjectId) + +const int SmgrTrace = DEBUG5; + +page_server_api *page_server; + +/* GUCs */ +char *page_server_connstring; // with substituted password +char *zenith_timeline; +char *zenith_tenant; +bool wal_redo = false; +int32 max_cluster_size; + +/* unlogged relation build states */ +typedef enum +{ + UNLOGGED_BUILD_NOT_IN_PROGRESS = 0, + UNLOGGED_BUILD_PHASE_1, + UNLOGGED_BUILD_PHASE_2, + UNLOGGED_BUILD_NOT_PERMANENT +} UnloggedBuildPhase; + +static SMgrRelation unlogged_build_rel = NULL; +static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; + +StringInfoData +zm_pack_request(ZenithRequest *msg) +{ + StringInfoData s; + + initStringInfo(&s); + pq_sendbyte(&s, msg->tag); + + switch (messageTag(msg)) + { + /* pagestore_client -> pagestore */ + case T_ZenithExistsRequest: + { + ZenithExistsRequest *msg_req = (ZenithExistsRequest *) msg; + + pq_sendbyte(&s, msg_req->req.latest); + pq_sendint64(&s, msg_req->req.lsn); + pq_sendint32(&s, msg_req->rnode.spcNode); + pq_sendint32(&s, msg_req->rnode.dbNode); + pq_sendint32(&s, msg_req->rnode.relNode); + pq_sendbyte(&s, msg_req->forknum); + + break; + } + case T_ZenithNblocksRequest: + { + ZenithNblocksRequest *msg_req = (ZenithNblocksRequest *) msg; + + pq_sendbyte(&s, msg_req->req.latest); + pq_sendint64(&s, msg_req->req.lsn); + pq_sendint32(&s, msg_req->rnode.spcNode); + pq_sendint32(&s, msg_req->rnode.dbNode); + pq_sendint32(&s, msg_req->rnode.relNode); + pq_sendbyte(&s, msg_req->forknum); + + break; + } + case T_ZenithDbSizeRequest: + { + ZenithDbSizeRequest *msg_req = (ZenithDbSizeRequest *) msg; + + pq_sendbyte(&s, msg_req->req.latest); + pq_sendint64(&s, msg_req->req.lsn); + pq_sendint32(&s, msg_req->dbNode); + + break; + } + case T_ZenithGetPageRequest: + { + ZenithGetPageRequest *msg_req = (ZenithGetPageRequest *) msg; + + pq_sendbyte(&s, msg_req->req.latest); + pq_sendint64(&s, msg_req->req.lsn); + pq_sendint32(&s, msg_req->rnode.spcNode); + pq_sendint32(&s, msg_req->rnode.dbNode); + pq_sendint32(&s, msg_req->rnode.relNode); + pq_sendbyte(&s, msg_req->forknum); + pq_sendint32(&s, 
msg_req->blkno); + + break; + } + + /* pagestore -> pagestore_client. We never need to create these. */ + case T_ZenithExistsResponse: + case T_ZenithNblocksResponse: + case T_ZenithGetPageResponse: + case T_ZenithErrorResponse: + case T_ZenithDbSizeResponse: + default: + elog(ERROR, "unexpected zenith message tag 0x%02x", msg->tag); + break; + } + return s; +} + +ZenithResponse * +zm_unpack_response(StringInfo s) +{ + ZenithMessageTag tag = pq_getmsgbyte(s); + ZenithResponse *resp = NULL; + + switch (tag) + { + /* pagestore -> pagestore_client */ + case T_ZenithExistsResponse: + { + ZenithExistsResponse *msg_resp = palloc0(sizeof(ZenithExistsResponse)); + + msg_resp->tag = tag; + msg_resp->exists = pq_getmsgbyte(s); + pq_getmsgend(s); + + resp = (ZenithResponse *) msg_resp; + break; + } + + case T_ZenithNblocksResponse: + { + ZenithNblocksResponse *msg_resp = palloc0(sizeof(ZenithNblocksResponse)); + + msg_resp->tag = tag; + msg_resp->n_blocks = pq_getmsgint(s, 4); + pq_getmsgend(s); + + resp = (ZenithResponse *) msg_resp; + break; + } + + case T_ZenithGetPageResponse: + { + ZenithGetPageResponse *msg_resp = palloc0(offsetof(ZenithGetPageResponse, page) + BLCKSZ); + + msg_resp->tag = tag; + /* XXX: should be varlena */ + memcpy(msg_resp->page, pq_getmsgbytes(s, BLCKSZ), BLCKSZ); + pq_getmsgend(s); + + resp = (ZenithResponse *) msg_resp; + break; + } + + case T_ZenithDbSizeResponse: + { + ZenithDbSizeResponse *msg_resp = palloc0(sizeof(ZenithDbSizeResponse)); + + msg_resp->tag = tag; + msg_resp->db_size = pq_getmsgint64(s); + pq_getmsgend(s); + + resp = (ZenithResponse *) msg_resp; + break; + } + + case T_ZenithErrorResponse: + { + ZenithErrorResponse *msg_resp; + size_t msglen; + const char *msgtext; + + msgtext = pq_getmsgrawstring(s); + msglen = strlen(msgtext); + + msg_resp = palloc0(sizeof(ZenithErrorResponse) + msglen + 1); + msg_resp->tag = tag; + memcpy(msg_resp->message, msgtext, msglen + 1); + pq_getmsgend(s); + + resp = (ZenithResponse *) msg_resp; + break; + } + + /* + * pagestore_client -> pagestore + * + * We create these ourselves, and don't need to decode them. 
+ */ + case T_ZenithExistsRequest: + case T_ZenithNblocksRequest: + case T_ZenithGetPageRequest: + case T_ZenithDbSizeRequest: + default: + elog(ERROR, "unexpected zenith message tag 0x%02x", tag); + break; + } + + return resp; +} + +/* dump to json for debugging / error reporting purposes */ +char * +zm_to_string(ZenithMessage *msg) +{ + StringInfoData s; + + initStringInfo(&s); + + switch (messageTag(msg)) + { + /* pagestore_client -> pagestore */ + case T_ZenithExistsRequest: + { + ZenithExistsRequest *msg_req = (ZenithExistsRequest *) msg; + + appendStringInfoString(&s, "{\"type\": \"ZenithExistsRequest\""); + appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"", + msg_req->rnode.spcNode, + msg_req->rnode.dbNode, + msg_req->rnode.relNode); + appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); + appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfoChar(&s, '}'); + break; + } + + case T_ZenithNblocksRequest: + { + ZenithNblocksRequest *msg_req = (ZenithNblocksRequest *) msg; + + appendStringInfoString(&s, "{\"type\": \"ZenithNblocksRequest\""); + appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"", + msg_req->rnode.spcNode, + msg_req->rnode.dbNode, + msg_req->rnode.relNode); + appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); + appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfoChar(&s, '}'); + break; + } + + case T_ZenithGetPageRequest: + { + ZenithGetPageRequest *msg_req = (ZenithGetPageRequest *) msg; + + appendStringInfoString(&s, "{\"type\": \"ZenithGetPageRequest\""); + appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"", + msg_req->rnode.spcNode, + msg_req->rnode.dbNode, + msg_req->rnode.relNode); + appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); + appendStringInfo(&s, ", \"blkno\": %u", msg_req->blkno); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); + appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfoChar(&s, '}'); + break; + } + case T_ZenithDbSizeRequest: + { + ZenithDbSizeRequest *msg_req = (ZenithDbSizeRequest *) msg; + + appendStringInfoString(&s, "{\"type\": \"ZenithDbSizeRequest\""); + appendStringInfo(&s, ", \"dbnode\": \"%u\"", msg_req->dbNode); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); + appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfoChar(&s, '}'); + break; + } + + + /* pagestore -> pagestore_client */ + case T_ZenithExistsResponse: + { + ZenithExistsResponse *msg_resp = (ZenithExistsResponse *) msg; + + appendStringInfoString(&s, "{\"type\": \"ZenithExistsResponse\""); + appendStringInfo(&s, ", \"exists\": %d}", + msg_resp->exists + ); + appendStringInfoChar(&s, '}'); + + break; + } + case T_ZenithNblocksResponse: + { + ZenithNblocksResponse *msg_resp = (ZenithNblocksResponse *) msg; + + appendStringInfoString(&s, "{\"type\": \"ZenithNblocksResponse\""); + appendStringInfo(&s, ", \"n_blocks\": %u}", + msg_resp->n_blocks + ); + appendStringInfoChar(&s, '}'); + + break; + } + case T_ZenithGetPageResponse: + { +#if 0 + ZenithGetPageResponse *msg_resp = (ZenithGetPageResponse *) msg; +#endif + + appendStringInfoString(&s, "{\"type\": \"ZenithGetPageResponse\""); + appendStringInfo(&s, ", \"page\": \"XXX\"}"); + appendStringInfoChar(&s, '}'); + break; + } + case T_ZenithErrorResponse: + { + 
+				ZenithErrorResponse *msg_resp = (ZenithErrorResponse *) msg;
+
+				/* FIXME: escape double-quotes in the message */
+				appendStringInfoString(&s, "{\"type\": \"ZenithErrorResponse\"");
+				appendStringInfo(&s, ", \"message\": \"%s\"", msg_resp->message);
+				appendStringInfoChar(&s, '}');
+				break;
+			}
+		case T_ZenithDbSizeResponse:
+			{
+				ZenithDbSizeResponse *msg_resp = (ZenithDbSizeResponse *) msg;
+
+				appendStringInfoString(&s, "{\"type\": \"ZenithDbSizeResponse\"");
+				appendStringInfo(&s, ", \"db_size\": %ld",
+								 msg_resp->db_size);
+				appendStringInfoChar(&s, '}');
+
+				break;
+			}
+
+		default:
+			appendStringInfo(&s, "{\"type\": \"unknown 0x%02x\"}", msg->tag);
+	}
+	return s.data;
+}
+
+/*
+ * Wrapper around log_newpage() that makes a temporary copy of the block and
+ * WAL-logs that. This makes it safe to use while holding only a shared lock
+ * on the page, see XLogSaveBufferForHint. We don't use XLogSaveBufferForHint
+ * directly because it skips the logging if the LSN is new enough.
+ */
+static XLogRecPtr
+log_newpage_copy(RelFileNode *rnode, ForkNumber forkNum, BlockNumber blkno,
+				 Page page, bool page_std)
+{
+	PGAlignedBlock copied_buffer;
+
+	memcpy(copied_buffer.data, page, BLCKSZ);
+	return log_newpage(rnode, forkNum, blkno, copied_buffer.data, page_std);
+}
+
+/*
+ * Is 'buffer' identical to a freshly initialized empty heap page?
+ */
+static bool
+PageIsEmptyHeapPage(char *buffer)
+{
+	PGAlignedBlock empty_page;
+
+	PageInit((Page) empty_page.data, BLCKSZ, 0);
+
+	return memcmp(buffer, empty_page.data, BLCKSZ) == 0;
+}
+
+static void
+zenith_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer)
+{
+	XLogRecPtr	lsn = PageGetLSN(buffer);
+
+	if (ShutdownRequestPending)
+		return;
+
+	/*
+	 * Whenever a VM or FSM page is evicted, WAL-log it. FSM and (some) VM
+	 * changes are not WAL-logged when the changes are made, so this is our
+	 * last chance to log them, otherwise they're lost. That's OK for
+	 * correctness, the non-logged updates are not critical. But we want to
+	 * have a reasonably up-to-date VM and FSM in the page server.
+	 */
+	if (forknum == FSM_FORKNUM && !RecoveryInProgress())
+	{
+		/* FSM is never WAL-logged and we don't care. */
+		XLogRecPtr	recptr;
+
+		recptr = log_newpage_copy(&reln->smgr_rnode.node, forknum, blocknum, buffer, false);
+		XLogFlush(recptr);
+		lsn = recptr;
+		ereport(SmgrTrace,
+				(errmsg("FSM page %u of relation %u/%u/%u.%u was force logged. Evicted at lsn=%X/%X",
+						blocknum,
+						reln->smgr_rnode.node.spcNode,
+						reln->smgr_rnode.node.dbNode,
+						reln->smgr_rnode.node.relNode,
+						forknum, LSN_FORMAT_ARGS(lsn))));
+	}
+	else if (forknum == VISIBILITYMAP_FORKNUM && !RecoveryInProgress())
+	{
+		/*
+		 * Always WAL-log vm. We should never miss clearing visibility map
+		 * bits.
+		 *
+		 * TODO Is it too bad for performance? Hopefully we do not evict
+		 * actively used vm too often.
+		 */
+		XLogRecPtr	recptr;
+
+		recptr = log_newpage_copy(&reln->smgr_rnode.node, forknum, blocknum, buffer, false);
+		XLogFlush(recptr);
+		lsn = recptr;
+
+		ereport(SmgrTrace,
+				(errmsg("Visibilitymap page %u of relation %u/%u/%u.%u was force logged at lsn=%X/%X",
+						blocknum,
+						reln->smgr_rnode.node.spcNode,
+						reln->smgr_rnode.node.dbNode,
+						reln->smgr_rnode.node.relNode,
+						forknum, LSN_FORMAT_ARGS(lsn))));
+	}
+	else if (lsn == InvalidXLogRecPtr)
+	{
+		/*
+		 * When PostgreSQL extends a relation, it calls smgrextend() with an
+		 * all-zeros page, and we can just ignore that in Zenith.
+		 * We do need to remember the new size, though, so that smgrnblocks()
+		 * returns the right answer after the rel has been extended. We rely
+		 * on the relsize cache for that.
+		 *
+		 * A completely empty heap page doesn't need to be WAL-logged,
+		 * either. The heapam can leave such a page behind, if e.g. an insert
+		 * errors out after initializing the page, but before it has inserted
+		 * the tuple and WAL-logged the change. When we read the page from
+		 * the page server, it will come back as all-zeros. That's OK, the
+		 * heapam will initialize an all-zeros page on first use.
+		 *
+		 * In other scenarios, evicting a dirty page with no LSN is a bad
+		 * sign: it implies that the page was not WAL-logged, and its
+		 * contents will be lost when it's evicted.
+		 */
+		if (PageIsNew(buffer))
+		{
+			ereport(SmgrTrace,
+					(errmsg("Page %u of relation %u/%u/%u.%u is all-zeros",
+							blocknum,
+							reln->smgr_rnode.node.spcNode,
+							reln->smgr_rnode.node.dbNode,
+							reln->smgr_rnode.node.relNode,
+							forknum)));
+		}
+		else if (PageIsEmptyHeapPage(buffer))
+		{
+			ereport(SmgrTrace,
+					(errmsg("Page %u of relation %u/%u/%u.%u is an empty heap page with no LSN",
+							blocknum,
+							reln->smgr_rnode.node.spcNode,
+							reln->smgr_rnode.node.dbNode,
+							reln->smgr_rnode.node.relNode,
+							forknum)));
+		}
+		else
+		{
+			ereport(PANIC,
+					(errmsg("Page %u of relation %u/%u/%u.%u is evicted with zero LSN",
+							blocknum,
+							reln->smgr_rnode.node.spcNode,
+							reln->smgr_rnode.node.dbNode,
+							reln->smgr_rnode.node.relNode,
+							forknum)));
+		}
+	}
+	else
+	{
+		ereport(SmgrTrace,
+				(errmsg("Page %u of relation %u/%u/%u.%u is already wal logged at lsn=%X/%X",
+						blocknum,
+						reln->smgr_rnode.node.spcNode,
+						reln->smgr_rnode.node.dbNode,
+						reln->smgr_rnode.node.relNode,
+						forknum, LSN_FORMAT_ARGS(lsn))));
+	}
+
+	/*
+	 * Remember the LSN on this page. When we read the page again, we must
+	 * read the same or newer version of it.
+	 */
+	SetLastWrittenPageLSN(lsn);
+}
+
+
+/*
+ * zenith_init() -- Initialize private state
+ */
+void
+zenith_init(void)
+{
+	/* noop */
+#ifdef DEBUG_COMPARE_LOCAL
+	mdinit();
+#endif
+}
+
+/*
+ * GetXLogInsertRecPtr uses XLogBytePosToRecPtr to convert the logical insert
+ * (reserved) position to a physical position in WAL. It always adds
+ * SizeOfXLogShortPHD:
+ *		seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
+ * so even if there are no records on the page, the offset will be
+ * SizeOfXLogShortPHD. It may cause problems with XLogFlush. So move the
+ * pointer back to the origin of the page.
+ */
+static XLogRecPtr
+zm_adjust_lsn(XLogRecPtr lsn)
+{
+	/*
+	 * If lsn points to the beginning of the first record on a page or
+	 * segment, "return" it back to the page origin
+	 */
+	if ((lsn & (XLOG_BLCKSZ - 1)) == SizeOfXLogShortPHD)
+	{
+		lsn -= SizeOfXLogShortPHD;
+	}
+	else if ((lsn & (wal_segment_size - 1)) == SizeOfXLogLongPHD)
+	{
+		lsn -= SizeOfXLogLongPHD;
+	}
+	return lsn;
+}
+
+/*
+ * Return LSN for requesting pages and number of blocks from page server
+ */
+static XLogRecPtr
+zenith_get_request_lsn(bool *latest)
+{
+	XLogRecPtr	lsn;
+
+	if (RecoveryInProgress())
+	{
+		*latest = false;
+		lsn = GetXLogReplayRecPtr(NULL);
+		elog(DEBUG1, "zenith_get_request_lsn GetXLogReplayRecPtr %X/%X request lsn 0 ",
+			 (uint32) ((lsn) >> 32), (uint32) (lsn));
+	}
+	else if (am_walsender)
+	{
+		*latest = true;
+		lsn = InvalidXLogRecPtr;
+		elog(DEBUG1, "am walsender zenith_get_request_lsn lsn 0 ");
+	}
+	else
+	{
+		XLogRecPtr	flushlsn;
+
+		/*
+		 * Use the latest LSN that was evicted from the buffer cache.
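+		 * (That LSN is advanced by SetLastWrittenPageLSN() in
+		 * zenith_wallog_page() above whenever a page is evicted.)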
+		 * Any pages modified by later WAL records must still be in the
+		 * buffer cache, so our request cannot concern those.
+		 */
+		*latest = true;
+		lsn = GetLastWrittenPageLSN();
+		Assert(lsn != InvalidXLogRecPtr);
+		elog(DEBUG1, "zenith_get_request_lsn GetLastWrittenPageLSN lsn %X/%X ",
+			 (uint32) ((lsn) >> 32), (uint32) (lsn));
+
+		lsn = zm_adjust_lsn(lsn);
+
+		/*
+		 * Is it possible that the last-written LSN is ahead of the last
+		 * flush LSN? Generally not, we shouldn't evict a page from the
+		 * buffer cache before all its modifications have been safely
+		 * flushed. That's the "WAL before data" rule. However, such a case
+		 * does exist during index building: _bt_blwritepage logs the full
+		 * page without flushing WAL before smgrextend (files are fsynced
+		 * before build ends).
+		 */
+		flushlsn = GetFlushRecPtr();
+		if (lsn > flushlsn)
+		{
+			elog(DEBUG5, "last-written LSN %X/%X is ahead of last flushed LSN %X/%X",
+				 (uint32) (lsn >> 32), (uint32) lsn,
+				 (uint32) (flushlsn >> 32), (uint32) flushlsn);
+			XLogFlush(lsn);
+		}
+	}
+
+	return lsn;
+}
+
+
+/*
+ * zenith_exists() -- Does the physical file exist?
+ */
+bool
+zenith_exists(SMgrRelation reln, ForkNumber forkNum)
+{
+	bool		exists;
+	ZenithResponse *resp;
+	BlockNumber n_blocks;
+	bool		latest;
+	XLogRecPtr	request_lsn;
+
+	switch (reln->smgr_relpersistence)
+	{
+		case 0:
+			/*
+			 * We don't know if it's an unlogged rel stored locally, or a
+			 * permanent rel stored in the page server. First check if it
+			 * exists locally. If it does, great. Otherwise check if it
+			 * exists in the page server.
+			 */
+			if (mdexists(reln, forkNum))
+				return true;
+			break;
+
+		case RELPERSISTENCE_PERMANENT:
+			break;
+
+		case RELPERSISTENCE_TEMP:
+		case RELPERSISTENCE_UNLOGGED:
+			return mdexists(reln, forkNum);
+
+		default:
+			elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
+	}
+
+	if (get_cached_relsize(reln->smgr_rnode.node, forkNum, &n_blocks))
+	{
+		return true;
+	}
+
+	/*
+	 * \d+ on a view calls smgrexists with a 0/0/0 relfilenode. The page
+	 * server will error out if you check that, because the whole dbdir for
+	 * tablespace 0, db 0 doesn't exist. We possibly should change the page
+	 * server to accept that and return 'false', to be consistent with
+	 * mdexists(). But we probably also should fix pg_table_size() to not
+	 * call smgrexists() with a bogus relfilenode.
+	 *
+	 * For now, handle that special case here.
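+	 *
+	 * (Hypothetical repro: "CREATE VIEW v AS SELECT 1" followed by "\d+ v"
+	 * reaches this path with relfilenode 0/0/0 via pg_table_size().)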
+	 */
+	if (reln->smgr_rnode.node.spcNode == 0 &&
+		reln->smgr_rnode.node.dbNode == 0 &&
+		reln->smgr_rnode.node.relNode == 0)
+	{
+		return false;
+	}
+
+	request_lsn = zenith_get_request_lsn(&latest);
+	{
+		ZenithExistsRequest request = {
+			.req.tag = T_ZenithExistsRequest,
+			.req.latest = latest,
+			.req.lsn = request_lsn,
+			.rnode = reln->smgr_rnode.node,
+			.forknum = forkNum
+		};
+
+		resp = page_server->request((ZenithRequest *) &request);
+	}
+
+	switch (resp->tag)
+	{
+		case T_ZenithExistsResponse:
+			exists = ((ZenithExistsResponse *) resp)->exists;
+			break;
+
+		case T_ZenithErrorResponse:
+			ereport(ERROR,
+					(errcode(ERRCODE_IO_ERROR),
+					 errmsg("could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X",
+							reln->smgr_rnode.node.spcNode,
+							reln->smgr_rnode.node.dbNode,
+							reln->smgr_rnode.node.relNode,
+							forkNum,
+							(uint32) (request_lsn >> 32), (uint32) request_lsn),
+					 errdetail("page server returned error: %s",
+							   ((ZenithErrorResponse *) resp)->message)));
+			break;
+
+		default:
+			elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
+	}
+	pfree(resp);
+	return exists;
+}
+
+/*
+ * zenith_create() -- Create a new relation on zenithd storage
+ *
+ * If isRedo is true, it's okay for the relation to exist already.
+ */
+void
+zenith_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
+{
+	switch (reln->smgr_relpersistence)
+	{
+		case 0:
+			elog(ERROR, "cannot call smgrcreate() on rel with unknown persistence");
+
+		case RELPERSISTENCE_PERMANENT:
+			break;
+
+		case RELPERSISTENCE_TEMP:
+		case RELPERSISTENCE_UNLOGGED:
+			mdcreate(reln, forkNum, isRedo);
+			return;
+
+		default:
+			elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
+	}
+
+	elog(SmgrTrace, "Create relation %u/%u/%u.%u",
+		 reln->smgr_rnode.node.spcNode,
+		 reln->smgr_rnode.node.dbNode,
+		 reln->smgr_rnode.node.relNode,
+		 forkNum);
+
+	/*
+	 * A newly created relation is empty, remember that in the relsize cache.
+	 *
+	 * FIXME: This is currently not just an optimization, but required for
+	 * correctness. Postgres can call smgrnblocks() on the newly-created
+	 * relation. Currently, we don't call SetLastWrittenPageLSN() when a new
+	 * relation is created, so if we didn't remember the size in the relsize
+	 * cache, we might call smgrnblocks() on the newly-created relation
+	 * before the creation WAL record has been received by the page server.
+	 */
+	set_cached_relsize(reln->smgr_rnode.node, forkNum, 0);
+
+#ifdef DEBUG_COMPARE_LOCAL
+	if (IS_LOCAL_REL(reln))
+		mdcreate(reln, forkNum, isRedo);
+#endif
+}
+
+/*
+ * zenith_unlink() -- Unlink a relation.
+ *
+ * Note that we're passed a RelFileNodeBackend --- by the time this is called,
+ * there won't be an SMgrRelation hashtable entry anymore.
+ *
+ * forkNum can be a fork number to delete a specific fork, or InvalidForkNumber
+ * to delete all forks.
+ *
+ *
+ * If isRedo is true, it's unsurprising for the relation to be already gone.
+ * Also, we should remove the file immediately instead of queuing a request
+ * for later, since during redo there's no possibility of creating a
+ * conflicting relation.
+ *
+ * Note: any failure should be reported as WARNING not ERROR, because
+ * we are usually not in a transaction anymore when this is called.
+ */
+void
+zenith_unlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo)
+{
+	/*
+	 * Might or might not exist locally, depending on whether it's an
+	 * unlogged or permanent relation (or if DEBUG_COMPARE_LOCAL is set).
+	 * Try to unlink; it won't do any harm if the file doesn't exist.
+	 */
+	mdunlink(rnode, forkNum, isRedo);
+	if (!RelFileNodeBackendIsTemp(rnode)) {
+		forget_cached_relsize(rnode.node, forkNum);
+	}
+}
+
+/*
+ * zenith_extend() -- Add a block to the specified relation.
+ *
+ * The semantics are nearly the same as mdwrite(): write at the
+ * specified position. However, this is to be used for the case of
+ * extending a relation (i.e., blocknum is at or beyond the current
+ * EOF). Note that we assume writing a block beyond current EOF
+ * causes intervening file space to become filled with zeroes.
+ */
+void
+zenith_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
+			  char *buffer, bool skipFsync)
+{
+	XLogRecPtr	lsn;
+
+	switch (reln->smgr_relpersistence)
+	{
+		case 0:
+			elog(ERROR, "cannot call smgrextend() on rel with unknown persistence");
+
+		case RELPERSISTENCE_PERMANENT:
+			break;
+
+		case RELPERSISTENCE_TEMP:
+		case RELPERSISTENCE_UNLOGGED:
+			mdextend(reln, forkNum, blkno, buffer, skipFsync);
+			return;
+
+		default:
+			elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
+	}
+
+	/*
+	 * Check that the cluster size limit has not been exceeded.
+	 *
+	 * Temporary and unlogged relations are not included in the cluster size
+	 * measured by the page server, so ignore those. Autovacuum processes are
+	 * also exempt.
+	 */
+	if (max_cluster_size > 0 &&
+		reln->smgr_relpersistence == RELPERSISTENCE_PERMANENT &&
+		!IsAutoVacuumWorkerProcess())
+	{
+		uint64		current_size = GetZenithCurrentClusterSize();
+
+		if (current_size >= ((uint64) max_cluster_size) * 1024 * 1024)
+			ereport(ERROR,
+					(errcode(ERRCODE_DISK_FULL),
+					 errmsg("could not extend file because cluster size limit (%d MB) has been exceeded",
+							max_cluster_size),
+					 errhint("This limit is defined by the neon.max_cluster_size GUC.")));
+	}
+
+	zenith_wallog_page(reln, forkNum, blkno, buffer);
+	set_cached_relsize(reln->smgr_rnode.node, forkNum, blkno + 1);
+
+	lsn = PageGetLSN(buffer);
+	elog(SmgrTrace, "smgrextend called for %u/%u/%u.%u blk %u, page LSN: %X/%08X",
+		 reln->smgr_rnode.node.spcNode,
+		 reln->smgr_rnode.node.dbNode,
+		 reln->smgr_rnode.node.relNode,
+		 forkNum, blkno,
+		 (uint32) (lsn >> 32), (uint32) lsn);
+
+#ifdef DEBUG_COMPARE_LOCAL
+	if (IS_LOCAL_REL(reln))
+		mdextend(reln, forkNum, blkno, buffer, skipFsync);
+#endif
+}
+
+/*
+ * zenith_open() -- Initialize newly-opened relation.
+ */
+void
+zenith_open(SMgrRelation reln)
+{
+	/*
+	 * We don't have anything special to do here. Call mdopen() to let md.c
+	 * initialize itself. That's only needed for temporary or unlogged
+	 * relations, but it's dirt cheap so do it always to make sure the md
+	 * fields are initialized, for debugging purposes if nothing else.
+	 */
+	mdopen(reln);
+
+	/* no work */
+	elog(SmgrTrace, "[ZENITH_SMGR] open noop");
+}
+
+/*
+ * zenith_close() -- Close the specified relation, if it isn't closed already.
+ */
+void
+zenith_close(SMgrRelation reln, ForkNumber forknum)
+{
+	/*
+	 * Let md.c close it, if it had it open. Doesn't hurt to do this even for
+	 * permanent relations that have no local storage.
+	 */
+	mdclose(reln, forknum);
+}
+
+/*
+ * zenith_prefetch() -- Initiate asynchronous read of the specified block of a relation
+ */
+bool
+zenith_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
+{
+	switch (reln->smgr_relpersistence)
+	{
+		case 0:
+			/* probably shouldn't happen, but ignore it */
+			break;
+
+		case RELPERSISTENCE_PERMANENT:
+			break;
+
+		case RELPERSISTENCE_TEMP:
+		case RELPERSISTENCE_UNLOGGED:
+			return mdprefetch(reln, forknum, blocknum);
+
+		default:
+			elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
+	}
+
+	/* not implemented */
+	elog(SmgrTrace, "[ZENITH_SMGR] prefetch noop");
+	return true;
+}
+
+/*
+ * zenith_writeback() -- Tell the kernel to write pages back to storage.
+ *
+ * This accepts a range of blocks because flushing several pages at once is
+ * considerably more efficient than doing so individually.
+ */
+void
+zenith_writeback(SMgrRelation reln, ForkNumber forknum,
+				 BlockNumber blocknum, BlockNumber nblocks)
+{
+	switch (reln->smgr_relpersistence)
+	{
+		case 0:
+			/* mdwriteback() does nothing if the file doesn't exist */
+			mdwriteback(reln, forknum, blocknum, nblocks);
+			break;
+
+		case RELPERSISTENCE_PERMANENT:
+			break;
+
+		case RELPERSISTENCE_TEMP:
+		case RELPERSISTENCE_UNLOGGED:
+			mdwriteback(reln, forknum, blocknum, nblocks);
+			return;
+
+		default:
+			elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
+	}
+
+	/* not implemented */
+	elog(SmgrTrace, "[ZENITH_SMGR] writeback noop");
+
+#ifdef DEBUG_COMPARE_LOCAL
+	if (IS_LOCAL_REL(reln))
+		mdwriteback(reln, forknum, blocknum, nblocks);
+#endif
+}
+
+/*
+ * While this function is defined in the zenith extension, it is used by
+ * neon_test_utils directly. To avoid breaking tests at runtime, please keep
+ * the function signature in sync.
+ */
+void
+zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
+				   XLogRecPtr request_lsn, bool request_latest, char *buffer)
+{
+	ZenithResponse *resp;
+
+	{
+		ZenithGetPageRequest request = {
+			.req.tag = T_ZenithGetPageRequest,
+			.req.latest = request_latest,
+			.req.lsn = request_lsn,
+			.rnode = rnode,
+			.forknum = forkNum,
+			.blkno = blkno
+		};
+
+		resp = page_server->request((ZenithRequest *) &request);
+	}
+
+	switch (resp->tag)
+	{
+		case T_ZenithGetPageResponse:
+			memcpy(buffer, ((ZenithGetPageResponse *) resp)->page, BLCKSZ);
+			break;
+
+		case T_ZenithErrorResponse:
+			ereport(ERROR,
+					(errcode(ERRCODE_IO_ERROR),
+					 errmsg("could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X",
+							blkno,
+							rnode.spcNode,
+							rnode.dbNode,
+							rnode.relNode,
+							forkNum,
+							(uint32) (request_lsn >> 32), (uint32) request_lsn),
+					 errdetail("page server returned error: %s",
+							   ((ZenithErrorResponse *) resp)->message)));
+			break;
+
+		default:
+			elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
+	}
+
+	pfree(resp);
+}
+
+/*
+ * zenith_read() -- Read the specified block from a relation.
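+ *
+ * (The flow below: pick a request LSN with zenith_get_request_lsn(), then
+ * fetch the page through zenith_read_at_lsn() above.)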
+ */
+void
+zenith_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
+			char *buffer)
+{
+	bool		latest;
+	XLogRecPtr	request_lsn;
+
+	switch (reln->smgr_relpersistence)
+	{
+		case 0:
+			elog(ERROR, "cannot call smgrread() on rel with unknown persistence");
+
+		case RELPERSISTENCE_PERMANENT:
+			break;
+
+		case RELPERSISTENCE_TEMP:
+		case RELPERSISTENCE_UNLOGGED:
+			mdread(reln, forkNum, blkno, buffer);
+			return;
+
+		default:
+			elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
+	}
+
+	request_lsn = zenith_get_request_lsn(&latest);
+	zenith_read_at_lsn(reln->smgr_rnode.node, forkNum, blkno, request_lsn, latest, buffer);
+
+#ifdef DEBUG_COMPARE_LOCAL
+	if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln))
+	{
+		char		pageserver_masked[BLCKSZ];
+		char		mdbuf[BLCKSZ];
+		char		mdbuf_masked[BLCKSZ];
+
+		mdread(reln, forkNum, blkno, mdbuf);
+
+		memcpy(pageserver_masked, buffer, BLCKSZ);
+		memcpy(mdbuf_masked, mdbuf, BLCKSZ);
+
+		if (PageIsNew(mdbuf))
+		{
+			if (!PageIsNew(pageserver_masked))
+			{
+				elog(PANIC, "page is new in MD but not in Page Server at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n",
+					 blkno,
+					 reln->smgr_rnode.node.spcNode,
+					 reln->smgr_rnode.node.dbNode,
+					 reln->smgr_rnode.node.relNode,
+					 forkNum,
+					 (uint32) (request_lsn >> 32), (uint32) request_lsn,
+					 hexdump_page(buffer));
+			}
+		}
+		else if (PageIsNew(buffer))
+		{
+			elog(PANIC, "page is new in Page Server but not in MD at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n",
+				 blkno,
+				 reln->smgr_rnode.node.spcNode,
+				 reln->smgr_rnode.node.dbNode,
+				 reln->smgr_rnode.node.relNode,
+				 forkNum,
+				 (uint32) (request_lsn >> 32), (uint32) request_lsn,
+				 hexdump_page(mdbuf));
+		}
+		else if (PageGetSpecialSize(mdbuf) == 0)
+		{
+			/* assume heap */
+			RmgrTable[RM_HEAP_ID].rm_mask(mdbuf_masked, blkno);
+			RmgrTable[RM_HEAP_ID].rm_mask(pageserver_masked, blkno);
+
+			if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0)
+			{
+				elog(PANIC, "heap buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n",
+					 blkno,
+					 reln->smgr_rnode.node.spcNode,
+					 reln->smgr_rnode.node.dbNode,
+					 reln->smgr_rnode.node.relNode,
+					 forkNum,
+					 (uint32) (request_lsn >> 32), (uint32) request_lsn,
+					 hexdump_page(mdbuf_masked),
+					 hexdump_page(pageserver_masked));
+			}
+		}
+		else if (PageGetSpecialSize(mdbuf) == MAXALIGN(sizeof(BTPageOpaqueData)))
+		{
+			if (((BTPageOpaqueData *) PageGetSpecialPointer(mdbuf))->btpo_cycleid < MAX_BT_CYCLE_ID)
+			{
+				/* assume btree */
+				RmgrTable[RM_BTREE_ID].rm_mask(mdbuf_masked, blkno);
+				RmgrTable[RM_BTREE_ID].rm_mask(pageserver_masked, blkno);
+
+				if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0)
+				{
+					elog(PANIC, "btree buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n",
+						 blkno,
+						 reln->smgr_rnode.node.spcNode,
+						 reln->smgr_rnode.node.dbNode,
+						 reln->smgr_rnode.node.relNode,
+						 forkNum,
+						 (uint32) (request_lsn >> 32), (uint32) request_lsn,
+						 hexdump_page(mdbuf_masked),
+						 hexdump_page(pageserver_masked));
+				}
+			}
+		}
+	}
+#endif
+}
+
+#ifdef DEBUG_COMPARE_LOCAL
+static char *
+hexdump_page(char *page)
+{
+	StringInfoData result;
+
+	initStringInfo(&result);
+
+	for (int i = 0; i < BLCKSZ; i++)
+	{
+		if (i % 8 == 0)
+			appendStringInfo(&result, " ");
+		if (i % 40 == 0)
+			appendStringInfo(&result, "\n");
+		appendStringInfo(&result, "%02x", (unsigned char) (page[i]));
+	}
+
+	return result.data;
+}
+#endif
+
+/*
+ * zenith_write() -- Write the supplied block at the appropriate location.
+ *
+ * This is to be used only for updating already-existing blocks of a
+ * relation (ie, those before the current EOF). To extend a relation,
+ * use mdextend().
+ */
+void
+zenith_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+			 char *buffer, bool skipFsync)
+{
+	XLogRecPtr	lsn;
+
+	switch (reln->smgr_relpersistence)
+	{
+		case 0:
+			/* This is a bit tricky. Check if the relation exists locally */
+			if (mdexists(reln, forknum))
+			{
+				/* It exists locally. Guess it's unlogged then. */
+				mdwrite(reln, forknum, blocknum, buffer, skipFsync);
+
+				/*
+				 * We could set relpersistence now that we have determined
+				 * that it's local. But we don't dare to do it, because that
+				 * would immediately allow reads as well, which shouldn't
+				 * happen. We could cache it with a different
+				 * 'relpersistence' value, but this isn't performance
+				 * critical.
+				 */
+				return;
+			}
+			break;
+
+		case RELPERSISTENCE_PERMANENT:
+			break;
+
+		case RELPERSISTENCE_TEMP:
+		case RELPERSISTENCE_UNLOGGED:
+			mdwrite(reln, forknum, blocknum, buffer, skipFsync);
+			return;
+
+		default:
+			elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
+	}
+
+	zenith_wallog_page(reln, forknum, blocknum, buffer);
+
+	lsn = PageGetLSN(buffer);
+	elog(SmgrTrace, "smgrwrite called for %u/%u/%u.%u blk %u, page LSN: %X/%08X",
+		 reln->smgr_rnode.node.spcNode,
+		 reln->smgr_rnode.node.dbNode,
+		 reln->smgr_rnode.node.relNode,
+		 forknum, blocknum,
+		 (uint32) (lsn >> 32), (uint32) lsn);
+
+#ifdef DEBUG_COMPARE_LOCAL
+	if (IS_LOCAL_REL(reln))
+		mdwrite(reln, forknum, blocknum, buffer, skipFsync);
+#endif
+}
+
+/*
+ * zenith_nblocks() -- Get the number of blocks stored in a relation.
+ */
+BlockNumber
+zenith_nblocks(SMgrRelation reln, ForkNumber forknum)
+{
+	ZenithResponse *resp;
+	BlockNumber n_blocks;
+	bool		latest;
+	XLogRecPtr	request_lsn;
+
+	switch (reln->smgr_relpersistence)
+	{
+		case 0:
+			elog(ERROR, "cannot call smgrnblocks() on rel with unknown persistence");
+			break;
+
+		case RELPERSISTENCE_PERMANENT:
+			break;
+
+		case RELPERSISTENCE_TEMP:
+		case RELPERSISTENCE_UNLOGGED:
+			return mdnblocks(reln, forknum);
+
+		default:
+			elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
+	}
+
+	if (get_cached_relsize(reln->smgr_rnode.node, forknum, &n_blocks))
+	{
+		elog(SmgrTrace, "cached nblocks for %u/%u/%u.%u: %u blocks",
+			 reln->smgr_rnode.node.spcNode,
+			 reln->smgr_rnode.node.dbNode,
+			 reln->smgr_rnode.node.relNode,
+			 forknum, n_blocks);
+		return n_blocks;
+	}
+
+	request_lsn = zenith_get_request_lsn(&latest);
+	{
+		ZenithNblocksRequest request = {
+			.req.tag = T_ZenithNblocksRequest,
+			.req.latest = latest,
+			.req.lsn = request_lsn,
+			.rnode = reln->smgr_rnode.node,
+			.forknum = forknum,
+		};
+
+		resp = page_server->request((ZenithRequest *) &request);
+	}
+
+	switch (resp->tag)
+	{
+		case T_ZenithNblocksResponse:
+			n_blocks = ((ZenithNblocksResponse *) resp)->n_blocks;
+			break;
+
+		case T_ZenithErrorResponse:
+			ereport(ERROR,
+					(errcode(ERRCODE_IO_ERROR),
+					 errmsg("could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X",
+							reln->smgr_rnode.node.spcNode,
+							reln->smgr_rnode.node.dbNode,
+							reln->smgr_rnode.node.relNode,
+							forknum,
+							(uint32) (request_lsn >> 32), (uint32) request_lsn),
+					 errdetail("page server returned error: %s",
+							   ((ZenithErrorResponse *) resp)->message)));
+			break;
+
+		default:
+			elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
+	}
+	update_cached_relsize(reln->smgr_rnode.node, forknum, n_blocks);
+
+	elog(SmgrTrace, "zenith_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks",
+		 reln->smgr_rnode.node.spcNode,
+		 reln->smgr_rnode.node.dbNode,
+		 reln->smgr_rnode.node.relNode,
+		 forknum,
+		 (uint32) (request_lsn >> 32), (uint32) request_lsn,
+		 n_blocks);
+
+	pfree(resp);
+	return n_blocks;
+}
+
+/*
+ * zenith_dbsize() -- Get the size of the database in bytes.
+ */
+int64
+zenith_dbsize(Oid dbNode)
+{
+	ZenithResponse *resp;
+	int64		db_size;
+	XLogRecPtr	request_lsn;
+	bool		latest;
+
+	request_lsn = zenith_get_request_lsn(&latest);
+	{
+		ZenithDbSizeRequest request = {
+			.req.tag = T_ZenithDbSizeRequest,
+			.req.latest = latest,
+			.req.lsn = request_lsn,
+			.dbNode = dbNode,
+		};
+
+		resp = page_server->request((ZenithRequest *) &request);
+	}
+
+	switch (resp->tag)
+	{
+		case T_ZenithDbSizeResponse:
+			db_size = ((ZenithDbSizeResponse *) resp)->db_size;
+			break;
+
+		case T_ZenithErrorResponse:
+			ereport(ERROR,
+					(errcode(ERRCODE_IO_ERROR),
+					 errmsg("could not read db size of db %u from page server at lsn %X/%08X",
+							dbNode,
+							(uint32) (request_lsn >> 32), (uint32) request_lsn),
+					 errdetail("page server returned error: %s",
+							   ((ZenithErrorResponse *) resp)->message)));
+			break;
+
+		default:
+			elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
+	}
+
+	elog(SmgrTrace, "zenith_dbsize: db %u (request LSN %X/%08X): %ld bytes",
+		 dbNode,
+		 (uint32) (request_lsn >> 32), (uint32) request_lsn,
+		 db_size);
+
+	pfree(resp);
+	return db_size;
+}
+
+/*
+ * zenith_truncate() -- Truncate relation to specified number of blocks.
+ */
+void
+zenith_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
+{
+	XLogRecPtr	lsn;
+
+	switch (reln->smgr_relpersistence)
+	{
+		case 0:
+			elog(ERROR, "cannot call smgrtruncate() on rel with unknown persistence");
+			break;
+
+		case RELPERSISTENCE_PERMANENT:
+			break;
+
+		case RELPERSISTENCE_TEMP:
+		case RELPERSISTENCE_UNLOGGED:
+			mdtruncate(reln, forknum, nblocks);
+			return;
+
+		default:
+			elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
+	}
+
+	set_cached_relsize(reln->smgr_rnode.node, forknum, nblocks);
+
+	/*
+	 * Truncating a relation drops all its buffers from the buffer cache
+	 * without calling smgrwrite() on them. But we must account for that in
+	 * our tracking of last-written-LSN all the same: any future
+	 * smgrnblocks() request must return the new size after the truncation.
+	 * We don't know what the LSN of the truncation record was, so be
+	 * conservative and use the most recently inserted WAL record's LSN.
+	 */
+	lsn = GetXLogInsertRecPtr();
+
+	lsn = zm_adjust_lsn(lsn);
+
+	/*
+	 * Flush it, too. We don't actually care about it here, but let's uphold
+	 * the invariant that last-written LSN <= flush LSN.
+	 */
+	XLogFlush(lsn);
+
+	SetLastWrittenPageLSN(lsn);
+
+#ifdef DEBUG_COMPARE_LOCAL
+	if (IS_LOCAL_REL(reln))
+		mdtruncate(reln, forknum, nblocks);
+#endif
+}
+
+/*
+ * zenith_immedsync() -- Immediately sync a relation to stable storage.
+ *
+ * Note that only writes already issued are synced; this routine knows
+ * nothing of dirty buffers that may exist inside the buffer manager. We
+ * sync active and inactive segments; smgrDoPendingSyncs() relies on this.
+ * Consider a relation skipping WAL. Suppose a checkpoint syncs blocks of
+ * some segment, then mdtruncate() renders that segment inactive. If we
+ * crash before the next checkpoint syncs the newly-inactive segment, that
+ * segment may survive recovery, reintroducing unwanted data into the table.
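+ *
+ * (In Zenith this is a no-op for permanent relations, as seen below:
+ * durability comes from the WAL stream and the safekeepers, not from local
+ * files.)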
+ */
+void
+zenith_immedsync(SMgrRelation reln, ForkNumber forknum)
+{
+	switch (reln->smgr_relpersistence)
+	{
+		case 0:
+			elog(ERROR, "cannot call smgrimmedsync() on rel with unknown persistence");
+			break;
+
+		case RELPERSISTENCE_PERMANENT:
+			break;
+
+		case RELPERSISTENCE_TEMP:
+		case RELPERSISTENCE_UNLOGGED:
+			mdimmedsync(reln, forknum);
+			return;
+
+		default:
+			elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
+	}
+
+	elog(SmgrTrace, "[ZENITH_SMGR] immedsync noop");
+
+#ifdef DEBUG_COMPARE_LOCAL
+	if (IS_LOCAL_REL(reln))
+		mdimmedsync(reln, forknum);
+#endif
+}
+
+/*
+ * zenith_start_unlogged_build() -- Starting build operation on a rel.
+ *
+ * Some indexes are built in two phases, by first populating the table with
+ * regular inserts, using the shared buffer cache but skipping WAL-logging,
+ * and WAL-logging the whole relation after it's done. Zenith relies on the
+ * WAL to reconstruct pages, so we cannot use the page server in the
+ * first phase when the changes are not logged.
+ */
+static void
+zenith_start_unlogged_build(SMgrRelation reln)
+{
+	/*
+	 * Currently, there can be only one unlogged relation build operation in
+	 * progress at a time. That's enough for the current usage.
+	 */
+	if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS)
+		elog(ERROR, "unlogged relation build is already in progress");
+	Assert(unlogged_build_rel == NULL);
+
+	ereport(SmgrTrace,
+			(errmsg("starting unlogged build of relation %u/%u/%u",
+					reln->smgr_rnode.node.spcNode,
+					reln->smgr_rnode.node.dbNode,
+					reln->smgr_rnode.node.relNode)));
+
+	switch (reln->smgr_relpersistence)
+	{
+		case 0:
+			elog(ERROR, "cannot call smgr_start_unlogged_build() on rel with unknown persistence");
+			break;
+
+		case RELPERSISTENCE_PERMANENT:
+			break;
+
+		case RELPERSISTENCE_TEMP:
+		case RELPERSISTENCE_UNLOGGED:
+			unlogged_build_rel = reln;
+			unlogged_build_phase = UNLOGGED_BUILD_NOT_PERMANENT;
+			return;
+
+		default:
+			elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
+	}
+
+	if (smgrnblocks(reln, MAIN_FORKNUM) != 0)
+		elog(ERROR, "cannot perform unlogged index build, index is not empty");
+
+	unlogged_build_rel = reln;
+	unlogged_build_phase = UNLOGGED_BUILD_PHASE_1;
+
+	/* Make the relation look like it's unlogged */
+	reln->smgr_relpersistence = RELPERSISTENCE_UNLOGGED;
+
+	/*
+	 * FIXME: should we pass isRedo true to create the tablespace dir if it
+	 * doesn't exist? Is it needed?
+	 */
+	mdcreate(reln, MAIN_FORKNUM, false);
+}
+
+/*
+ * zenith_finish_unlogged_build_phase_1()
+ *
+ * Call this after you have finished populating a relation in unlogged mode,
+ * before you start WAL-logging it.
+ */
+static void
+zenith_finish_unlogged_build_phase_1(SMgrRelation reln)
+{
+	Assert(unlogged_build_rel == reln);
+
+	ereport(SmgrTrace,
+			(errmsg("finishing phase 1 of unlogged build of relation %u/%u/%u",
+					reln->smgr_rnode.node.spcNode,
+					reln->smgr_rnode.node.dbNode,
+					reln->smgr_rnode.node.relNode)));
+
+	if (unlogged_build_phase == UNLOGGED_BUILD_NOT_PERMANENT)
+		return;
+
+	Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_1);
+	Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED);
+
+	unlogged_build_phase = UNLOGGED_BUILD_PHASE_2;
+}
+
+/*
+ * zenith_end_unlogged_build() -- Finish an unlogged rel build.
+ *
+ * Call this after you have finished WAL-logging a relation that was
+ * first populated without WAL-logging.
+ *
+ * This removes the local copy of the rel, since it's now been fully
+ * WAL-logged and is present in the page server.
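+ *
+ * (A sketch of the full protocol, pieced together from the comments here;
+ * the caller is the index build code:
+ *
+ *	smgr_start_unlogged_build(reln);	-- rel switches to local, unlogged storage
+ *	... populate the rel via shared buffers, without WAL ...
+ *	smgr_finish_unlogged_build_phase_1(reln);
+ *	... WAL-log the entire relation ...
+ *	smgr_end_unlogged_build(reln);		-- local copy is removed
+ *
+ * These are the .smgr_* callbacks installed in zenith_smgr below.)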
+ */
+static void
+zenith_end_unlogged_build(SMgrRelation reln)
+{
+	Assert(unlogged_build_rel == reln);
+
+	ereport(SmgrTrace,
+			(errmsg("ending unlogged build of relation %u/%u/%u",
+					reln->smgr_rnode.node.spcNode,
+					reln->smgr_rnode.node.dbNode,
+					reln->smgr_rnode.node.relNode)));
+
+	if (unlogged_build_phase != UNLOGGED_BUILD_NOT_PERMANENT)
+	{
+		RelFileNodeBackend rnode;
+
+		Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_2);
+		Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED);
+
+		/* Make the relation look permanent again */
+		reln->smgr_relpersistence = RELPERSISTENCE_PERMANENT;
+
+		/* Remove local copy */
+		rnode = reln->smgr_rnode;
+		for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+		{
+			elog(SmgrTrace, "forgetting cached relsize for %u/%u/%u.%u",
+				 rnode.node.spcNode,
+				 rnode.node.dbNode,
+				 rnode.node.relNode,
+				 forknum);
+
+			forget_cached_relsize(rnode.node, forknum);
+			mdclose(reln, forknum);
+			/* use isRedo == true, so that we drop it immediately */
+			mdunlink(rnode, forknum, true);
+		}
+	}
+
+	unlogged_build_rel = NULL;
+	unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
+}
+
+static void
+AtEOXact_zenith(XactEvent event, void *arg)
+{
+	switch (event)
+	{
+		case XACT_EVENT_ABORT:
+		case XACT_EVENT_PARALLEL_ABORT:
+
+			/*
+			 * Forget about any build we might have had in progress. The
+			 * local file will be unlinked by smgrDoPendingDeletes()
+			 */
+			unlogged_build_rel = NULL;
+			unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
+			break;
+
+		case XACT_EVENT_COMMIT:
+		case XACT_EVENT_PARALLEL_COMMIT:
+		case XACT_EVENT_PREPARE:
+		case XACT_EVENT_PRE_COMMIT:
+		case XACT_EVENT_PARALLEL_PRE_COMMIT:
+		case XACT_EVENT_PRE_PREPARE:
+			if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS)
+			{
+				unlogged_build_rel = NULL;
+				unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
+				ereport(ERROR,
+						(errcode(ERRCODE_INTERNAL_ERROR),
+						 (errmsg("unlogged index build was not properly finished"))));
+			}
+			break;
+	}
+}
+
+static const struct f_smgr zenith_smgr =
+{
+	.smgr_init = zenith_init,
+	.smgr_shutdown = NULL,
+	.smgr_open = zenith_open,
+	.smgr_close = zenith_close,
+	.smgr_create = zenith_create,
+	.smgr_exists = zenith_exists,
+	.smgr_unlink = zenith_unlink,
+	.smgr_extend = zenith_extend,
+	.smgr_prefetch = zenith_prefetch,
+	.smgr_read = zenith_read,
+	.smgr_write = zenith_write,
+	.smgr_writeback = zenith_writeback,
+	.smgr_nblocks = zenith_nblocks,
+	.smgr_truncate = zenith_truncate,
+	.smgr_immedsync = zenith_immedsync,
+
+	.smgr_start_unlogged_build = zenith_start_unlogged_build,
+	.smgr_finish_unlogged_build_phase_1 = zenith_finish_unlogged_build_phase_1,
+	.smgr_end_unlogged_build = zenith_end_unlogged_build,
+};
+
+
+const f_smgr *
+smgr_zenith(BackendId backend, RelFileNode rnode)
+{
+	/* Don't use page server for temp relations */
+	if (backend != InvalidBackendId)
+		return smgr_standard(backend, rnode);
+	else
+		return &zenith_smgr;
+}
+
+void
+smgr_init_zenith(void)
+{
+	RegisterXactCallback(AtEOXact_zenith, NULL);
+
+	smgr_init_standard();
+	zenith_init();
+}
diff --git a/pgxn/neon/relsize_cache.c b/pgxn/neon/relsize_cache.c
new file mode 100644
index 0000000000..8dfcffe1d1
--- /dev/null
+++ b/pgxn/neon/relsize_cache.c
@@ -0,0 +1,167 @@
+/*-------------------------------------------------------------------------
+ *
+ * relsize_cache.c
+ *      Relation size cache for better zenith performance.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	 pgxn/neon/relsize_cache.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "pagestore_client.h"
+#include "storage/relfilenode.h"
+#include "storage/smgr.h"
+#include "storage/lwlock.h"
+#include "storage/ipc.h"
+#include "storage/shmem.h"
+#include "catalog/pg_tablespace_d.h"
+#include "utils/dynahash.h"
+#include "utils/guc.h"
+
+
+typedef struct
+{
+	RelFileNode rnode;
+	ForkNumber	forknum;
+} RelTag;
+
+typedef struct
+{
+	RelTag		tag;
+	BlockNumber size;
+} RelSizeEntry;
+
+static HTAB *relsize_hash;
+static LWLockId relsize_lock;
+static int	relsize_hash_size;
+static shmem_startup_hook_type prev_shmem_startup_hook = NULL;
+
+/*
+ * Size of a cache entry is 20 bytes. So this default will take about 1.2 MB,
+ * which seems reasonable.
+ */
+#define DEFAULT_RELSIZE_HASH_SIZE (64 * 1024)
+
+static void
+zenith_smgr_shmem_startup(void)
+{
+	static HASHCTL info;
+
+	if (prev_shmem_startup_hook)
+		prev_shmem_startup_hook();
+
+	LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
+	relsize_lock = (LWLockId) GetNamedLWLockTranche("neon_relsize");
+	info.keysize = sizeof(RelTag);
+	info.entrysize = sizeof(RelSizeEntry);
+	relsize_hash = ShmemInitHash("neon_relsize",
+								 relsize_hash_size, relsize_hash_size,
+								 &info,
+								 HASH_ELEM | HASH_BLOBS);
+	LWLockRelease(AddinShmemInitLock);
+}
+
+bool
+get_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber *size)
+{
+	bool		found = false;
+
+	if (relsize_hash_size > 0)
+	{
+		RelTag		tag;
+		RelSizeEntry *entry;
+
+		tag.rnode = rnode;
+		tag.forknum = forknum;
+		LWLockAcquire(relsize_lock, LW_SHARED);
+		entry = hash_search(relsize_hash, &tag, HASH_FIND, NULL);
+		if (entry != NULL)
+		{
+			*size = entry->size;
+			found = true;
+		}
+		LWLockRelease(relsize_lock);
+	}
+	return found;
+}
+
+void
+set_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size)
+{
+	if (relsize_hash_size > 0)
+	{
+		RelTag		tag;
+		RelSizeEntry *entry;
+
+		tag.rnode = rnode;
+		tag.forknum = forknum;
+		LWLockAcquire(relsize_lock, LW_EXCLUSIVE);
+		entry = hash_search(relsize_hash, &tag, HASH_ENTER, NULL);
+		entry->size = size;
+		LWLockRelease(relsize_lock);
+	}
+}
+
+void
+update_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size)
+{
+	if (relsize_hash_size > 0)
+	{
+		RelTag		tag;
+		RelSizeEntry *entry;
+		bool		found;
+
+		tag.rnode = rnode;
+		tag.forknum = forknum;
+		LWLockAcquire(relsize_lock, LW_EXCLUSIVE);
+		entry = hash_search(relsize_hash, &tag, HASH_ENTER, &found);
+		if (!found || entry->size < size)
+			entry->size = size;
+		LWLockRelease(relsize_lock);
+	}
+}
+
+void
+forget_cached_relsize(RelFileNode rnode, ForkNumber forknum)
+{
+	if (relsize_hash_size > 0)
+	{
+		RelTag		tag;
+
+		tag.rnode = rnode;
+		tag.forknum = forknum;
+		LWLockAcquire(relsize_lock, LW_EXCLUSIVE);
+		hash_search(relsize_hash, &tag, HASH_REMOVE, NULL);
+		LWLockRelease(relsize_lock);
+	}
+}
+
+void
+relsize_hash_init(void)
+{
+	DefineCustomIntVariable("neon.relsize_hash_size",
+							"Sets the maximum number of cached relation sizes for neon",
+							NULL,
+							&relsize_hash_size,
+							DEFAULT_RELSIZE_HASH_SIZE,
+							0,
+							INT_MAX,
+							PGC_POSTMASTER,
+							0,
+							NULL, NULL, NULL);
+
+	if (relsize_hash_size > 0)
+	{
+		RequestAddinShmemSpace(hash_estimate_size(relsize_hash_size, sizeof(RelSizeEntry)));
+		RequestNamedLWLockTranche("neon_relsize", 1);
+
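+		/*
+		 * Chain our shmem startup hook after any previously installed one;
+		 * the usual pattern for libraries loaded via
+		 * shared_preload_libraries.
+		 */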
+		prev_shmem_startup_hook = shmem_startup_hook;
+		shmem_startup_hook = zenith_smgr_shmem_startup;
+	}
+}
diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c
new file mode 100644
index 0000000000..9625325c0a
--- /dev/null
+++ b/pgxn/neon/walproposer.c
@@ -0,0 +1,2403 @@
+/*-------------------------------------------------------------------------
+ *
+ * walproposer.c
+ *
+ * Proposer/leader part of the total order broadcast protocol between postgres
+ * and WAL safekeepers.
+ *
+ * We have two ways of launching WalProposer:
+ *
+ *   1. As a background worker which will run physical WalSender with
+ *      am_wal_proposer flag set to true. WalSender in turn would handle the
+ *      WAL reading part and call WalProposer when ready to scatter WAL.
+ *
+ *   2. As a standalone utility by running `postgres --sync-safekeepers`. That
+ *      is needed to create an LSN from which it is safe to start postgres.
+ *      More specifically, it addresses the following problems:
+ *
+ *      a) Chicken-or-the-egg problem: compute postgres needs a data directory
+ *         with non-rel files that are downloaded from pageserver by calling
+ *         basebackup@LSN. This LSN is not arbitrary, it must include all
+ *         previously committed transactions and is defined through consensus
+ *         voting, which happens... in walproposer, a part of compute node.
+ *
+ *      b) Just warranting such an LSN is not enough, we must also actually
+ *         commit it and make sure there is a safekeeper who knows this LSN is
+ *         committed so WAL before it can be streamed to pageserver --
+ *         otherwise basebackup will hang waiting for WAL. Advancing
+ *         commit_lsn without playing the consensus game is impossible, so the
+ *         speculative "let's just poll safekeepers, learn the start LSN of
+ *         the future epoch and run basebackup" approach won't work.
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include
+#include
+#include
+#include "access/xlogdefs.h"
+#include "access/xlogutils.h"
+#include "storage/latch.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "access/xlog.h"
+#include "libpq/pqformat.h"
+#include "replication/slot.h"
+#include "replication/walreceiver.h"
+#include "postmaster/bgworker.h"
+#include "postmaster/interrupt.h"
+#include "postmaster/postmaster.h"
+#include "storage/pmsignal.h"
+#include "storage/proc.h"
+#include "storage/ipc.h"
+#include "storage/lwlock.h"
+#include "storage/shmem.h"
+#include "storage/spin.h"
+#include "tcop/tcopprot.h"
+#include "utils/builtins.h"
+#include "utils/guc.h"
+#include "utils/memutils.h"
+#include "utils/timestamp.h"
+
+#include "neon.h"
+#include "walproposer.h"
+#include "walproposer_utils.h"
+#include "replication/walpropshim.h"
+
+
+char	   *wal_acceptors_list;
+int			wal_acceptor_reconnect_timeout;
+int			wal_acceptor_connect_timeout;
+bool		am_wal_proposer;
+
+char	   *zenith_timeline_walproposer = NULL;
+char	   *zenith_tenant_walproposer = NULL;
+
+/* Declared in walproposer.h, defined here, initialized in libpqwalproposer.c */
+WalProposerFunctionsType *WalProposerFunctions = NULL;
+
+#define WAL_PROPOSER_SLOT_NAME "wal_proposer_slot"
+
+static int	n_safekeepers = 0;
+static int	quorum = 0;
+static Safekeeper safekeeper[MAX_SAFEKEEPERS];
+static XLogRecPtr availableLsn;	/* WAL has been generated up to this point */
+static XLogRecPtr lastSentCommitLsn;	/* last commitLsn broadcast to
+										 * safekeepers */
+static ProposerGreeting greetRequest;
+static VoteRequest voteRequest; /* Vote request for safekeeper */
+static WaitEventSet *waitEvents;
+static AppendResponse quorumFeedback;
+/*
+ * Minimal LSN which may be needed for recovery of some safekeeper,
+ * record-aligned (the first record which might not yet have been received by
+ * someone).
+ */
+static XLogRecPtr truncateLsn;
+/*
+ * Term of the proposer. We want our term to be the highest and unique, so we
+ * collect terms from the safekeepers' quorum, choose the max and +1. After
+ * that our term is fixed and must not change. If we observe that some
+ * safekeeper has a higher term, it means that we have another running
+ * compute, so we must stop immediately.
+ */
+static term_t propTerm;
+static TermHistory propTermHistory;	/* term history of the proposer */
+static XLogRecPtr propEpochStartLsn;	/* epoch start lsn of the proposer */
+static term_t donorEpoch;		/* Most advanced acceptor epoch */
+static int	donor;				/* Most advanced acceptor */
+static XLogRecPtr timelineStartLsn;	/* timeline globally starts at this LSN */
+static int	n_votes = 0;
+static int	n_connected = 0;
+static TimestampTz last_reconnect_attempt;
+
+static WalproposerShmemState *walprop_shared;
+
+/* Prototypes for private functions */
+static void WalProposerInitImpl(XLogRecPtr flushRecPtr, uint64 systemId);
+static void WalProposerStartImpl(void);
+static void WalProposerLoop(void);
+static void InitEventSet(void);
+static void UpdateEventSet(Safekeeper *sk, uint32 events);
+static void HackyRemoveWalProposerEvent(Safekeeper *to_remove);
+static void ShutdownConnection(Safekeeper *sk);
+static void ResetConnection(Safekeeper *sk);
+static long TimeToReconnect(TimestampTz now);
+static void ReconnectSafekeepers(void);
+static void AdvancePollState(Safekeeper *sk, uint32 events);
+static void HandleConnectionEvent(Safekeeper *sk);
+static void SendStartWALPush(Safekeeper *sk);
+static void RecvStartWALPushResult(Safekeeper *sk);
+static void SendProposerGreeting(Safekeeper *sk);
+static void RecvAcceptorGreeting(Safekeeper *sk);
+static void SendVoteRequest(Safekeeper *sk);
+static void RecvVoteResponse(Safekeeper *sk);
+static void HandleElectedProposer(void);
+static term_t GetHighestTerm(TermHistory *th);
+static term_t GetEpoch(Safekeeper *sk);
+static void DetermineEpochStartLsn(void);
+static bool WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos);
+static void SendProposerElected(Safekeeper *sk);
+static void WalProposerStartStreaming(XLogRecPtr startpos);
+static void StartStreaming(Safekeeper *sk);
+static void SendMessageToNode(Safekeeper *sk);
+static void BroadcastAppendRequest(void);
+static void HandleActiveState(Safekeeper *sk, uint32 events);
+static bool SendAppendRequests(Safekeeper *sk);
+static bool RecvAppendResponses(Safekeeper *sk);
+static void CombineHotStanbyFeedbacks(HotStandbyFeedback * hs);
+static XLogRecPtr CalculateMinFlushLsn(void);
+static XLogRecPtr GetAcknowledgedByQuorumWALPosition(void);
+static void HandleSafekeeperResponse(void);
+static bool AsyncRead(Safekeeper *sk, char **buf, int *buf_size);
+static bool AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg);
+static bool BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState success_state);
+static bool AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_state);
+static bool AsyncFlush(Safekeeper *sk);
+
+
+static void nwp_shmem_startup_hook(void);
+static void nwp_register_gucs(void);
+static void nwp_prepare_shmem(void);
+static uint64 backpressure_lag_impl(void);
+
+
+static shmem_startup_hook_type prev_shmem_startup_hook_type;
+
+
+
+void
+pg_init_walproposer(void)
+{
+	if (!process_shared_preload_libraries_in_progress)
+		return;
+
+	nwp_register_gucs();
+
+	nwp_prepare_shmem();
+
+	delay_backend_us = &backpressure_lag_impl;
+
+	WalProposerRegister();
+
+	WalProposerInit = &WalProposerInitImpl;
+	WalProposerStart = &WalProposerStartImpl;
+}
+
+static void
+nwp_register_gucs(void)
+{
+	DefineCustomStringVariable(
+							   "neon.safekeepers",
+							   "List of Neon WAL acceptors (host:port)",
+							   NULL,	/* long_desc */
+							   &wal_acceptors_list, /* valueAddr */
+							   "",	/* bootValue */
+							   PGC_POSTMASTER,
+							   GUC_LIST_INPUT,	/* extensions can't use GUC_LIST_QUOTE */
+							   NULL, NULL, NULL
+		);
+
+	DefineCustomIntVariable(
+							"neon.safekeeper_reconnect_timeout",
+							"Timeout for reconnecting to an offline WAL acceptor.",
+							NULL,
+							&wal_acceptor_reconnect_timeout,
+							1000, 0, INT_MAX,	/* default, min, max */
+							PGC_SIGHUP, /* context */
+							GUC_UNIT_MS,	/* flags */
+							NULL, NULL, NULL
+		);
+
+	DefineCustomIntVariable(
+							"neon.safekeeper_connect_timeout",
+							"Timeout after which to give up a connection attempt to a safekeeper.",
+							NULL,
+							&wal_acceptor_connect_timeout,
+							5000, 0, INT_MAX,
+							PGC_SIGHUP,
+							GUC_UNIT_MS,
+							NULL, NULL, NULL
+		);
+}
+
+/* shmem handling */
+
+static void
+nwp_prepare_shmem(void)
+{
+	RequestAddinShmemSpace(WalproposerShmemSize());
+
+	prev_shmem_startup_hook_type = shmem_startup_hook;
+	shmem_startup_hook = nwp_shmem_startup_hook;
+}
+
+static void
+nwp_shmem_startup_hook(void)
+{
+	if (prev_shmem_startup_hook_type)
+		prev_shmem_startup_hook_type();
+
+	WalproposerShmemInit();
+}
+
+/*
+ * WAL proposer bgworker entry point.
+ */
+void
+WalProposerMain(Datum main_arg)
+{
+	/* Establish signal handlers. */
+	pqsignal(SIGUSR1, procsignal_sigusr1_handler);
+	pqsignal(SIGHUP, SignalHandlerForConfigReload);
+	pqsignal(SIGTERM, die);
+
+	BackgroundWorkerUnblockSignals();
+
+	GetXLogReplayRecPtr(&ThisTimeLineID);
+
+	WalProposerInit(GetFlushRecPtr(), GetSystemIdentifier());
+
+	last_reconnect_attempt = GetCurrentTimestamp();
+
+	application_name = (char *) "walproposer";	/* for
+												 * synchronous_standby_names */
+	am_wal_proposer = true;
+	am_walsender = true;
+	InitWalSender();
+	InitProcessPhase2();
+
+	/* Create a replication slot for the WAL proposer if it doesn't exist */
+	if (SearchNamedReplicationSlot(WAL_PROPOSER_SLOT_NAME, false) == NULL)
+	{
+		ReplicationSlotCreate(WAL_PROPOSER_SLOT_NAME, false, RS_PERSISTENT, false);
+		ReplicationSlotReserveWal();
+		/* Write this slot to disk */
+		ReplicationSlotMarkDirty();
+		ReplicationSlotSave();
+		ReplicationSlotRelease();
+	}
+
+	WalProposerStart();
+}
+
+/*
+ * Create a new AppendRequest message and start sending it. This function is
+ * called from walsender every time new WAL is available.
+ */
+void
+WalProposerBroadcast(XLogRecPtr startpos, XLogRecPtr endpos)
+{
+	Assert(startpos == availableLsn && endpos >= availableLsn);
+	availableLsn = endpos;
+	BroadcastAppendRequest();
+}
+
+/*
+ * Advance the WAL proposer state machine, waiting each time for events to
+ * occur. Will exit only when the latch is set, i.e. new WAL should be pushed
+ * from walsender to walproposer.
+ */
+void
+WalProposerPoll(void)
+{
+	while (true)
+	{
+		Safekeeper *sk;
+		int			rc;
+		WaitEvent	event;
+		TimestampTz now = GetCurrentTimestamp();
+
+		rc = WaitEventSetWait(waitEvents, TimeToReconnect(now),
+							  &event, 1, WAIT_EVENT_WAL_SENDER_MAIN);
+		sk = (Safekeeper *) event.user_data;
+
+		/*
+		 * If the event contains something that one of our safekeeper states
+		 * was waiting for, we'll advance its state.
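+		 * (For example, a safekeeper in SS_WAIT_VERDICT advances when its
+		 * vote response arrives and its socket becomes readable; see
+		 * AdvancePollState() below.)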
+		 */
+		if (rc != 0 && (event.events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)))
+			AdvancePollState(sk, event.events);
+
+		/*
+		 * If the timeout expired, attempt to reconnect to any safekeepers
+		 * that we dropped
+		 */
+		ReconnectSafekeepers();
+
+		/*
+		 * If the wait was terminated by a latch set (walsenders' latch is
+		 * set on each wal flush), then exit the loop. (no need for a pm
+		 * death check due to WL_EXIT_ON_PM_DEATH)
+		 */
+		if (rc != 0 && (event.events & WL_LATCH_SET))
+		{
+			ResetLatch(MyLatch);
+			break;
+		}
+		if (rc == 0)			/* timeout expired: poll state */
+		{
+			TimestampTz now;
+
+			/*
+			 * If no WAL was generated during the timeout (and we have
+			 * already collected the quorum), then send a keepalive message
+			 */
+			if (availableLsn != InvalidXLogRecPtr)
+			{
+				BroadcastAppendRequest();
+			}
+
+			/*
+			 * Abandon connection attempts which take too long.
+			 */
+			now = GetCurrentTimestamp();
+			for (int i = 0; i < n_safekeepers; i++)
+			{
+				Safekeeper *sk = &safekeeper[i];
+
+				if ((sk->state == SS_CONNECTING_WRITE ||
+					 sk->state == SS_CONNECTING_READ) &&
+					TimestampDifferenceExceeds(sk->startedConnAt, now,
+											   wal_acceptor_connect_timeout))
+				{
+					elog(WARNING, "failed to connect to node '%s:%s': exceeded connection timeout %dms",
+						 sk->host, sk->port, wal_acceptor_connect_timeout);
+					ShutdownConnection(sk);
+				}
+			}
+		}
+	}
+}
+
+/*
+ * Register a background worker proposing WAL to wal acceptors.
+ */
+void
+WalProposerRegister(void)
+{
+	BackgroundWorker bgw;
+
+	if (*wal_acceptors_list == '\0')
+		return;
+
+	memset(&bgw, 0, sizeof(bgw));
+	bgw.bgw_flags = BGWORKER_SHMEM_ACCESS;
+	bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
+	snprintf(bgw.bgw_library_name, BGW_MAXLEN, "neon");
+	snprintf(bgw.bgw_function_name, BGW_MAXLEN, "WalProposerMain");
+	snprintf(bgw.bgw_name, BGW_MAXLEN, "WAL proposer");
+	snprintf(bgw.bgw_type, BGW_MAXLEN, "WAL proposer");
+	bgw.bgw_restart_time = 5;
+	bgw.bgw_notify_pid = 0;
+	bgw.bgw_main_arg = (Datum) 0;
+
+	RegisterBackgroundWorker(&bgw);
+}
+
+static void
+WalProposerInitImpl(XLogRecPtr flushRecPtr, uint64 systemId)
+{
+	char	   *host;
+	char	   *sep;
+	char	   *port;
+
+	/* Load the libpq-specific functions */
+	if (WalProposerFunctions == NULL)
+		elog(ERROR, "libpqwalproposer didn't initialize correctly");
+
+	load_file("libpqwalreceiver", false);
+	if (WalReceiverFunctions == NULL)
+		elog(ERROR, "libpqwalreceiver didn't initialize correctly");
+
+	for (host = wal_acceptors_list; host != NULL && *host != '\0'; host = sep)
+	{
+		port = strchr(host, ':');
+		if (port == NULL)
+		{
+			elog(FATAL, "port is not specified");
+		}
+		*port++ = '\0';
+		sep = strchr(port, ',');
+		if (sep != NULL)
+			*sep++ = '\0';
+		if (n_safekeepers + 1 >= MAX_SAFEKEEPERS)
+		{
+			elog(FATAL, "Too many safekeepers");
+		}
+		safekeeper[n_safekeepers].host = host;
+		safekeeper[n_safekeepers].port = port;
+		safekeeper[n_safekeepers].state = SS_OFFLINE;
+		safekeeper[n_safekeepers].conn = NULL;
+
+		/*
+		 * Set conninfo to empty. We'll fill it in later, in
+		 * `ResetConnection`, as needed
+		 */
+		safekeeper[n_safekeepers].conninfo[0] = '\0';
+		initStringInfo(&safekeeper[n_safekeepers].outbuf);
+		safekeeper[n_safekeepers].xlogreader = XLogReaderAllocate(wal_segment_size, NULL, XL_ROUTINE(.segment_open = wal_segment_open, .segment_close = wal_segment_close), NULL);
+		if (safekeeper[n_safekeepers].xlogreader == NULL)
+			elog(FATAL, "Failed to allocate xlog reader");
+		safekeeper[n_safekeepers].flushWrite = false;
+		safekeeper[n_safekeepers].startStreamingAt = InvalidXLogRecPtr;
+		safekeeper[n_safekeepers].streamingAt = InvalidXLogRecPtr;
+		n_safekeepers += 1;
+	}
+	if (n_safekeepers < 1)
+	{
+		elog(FATAL, "Safekeeper addresses are not specified");
+	}
+	quorum = n_safekeepers / 2 + 1;
+
+	/* Fill the greeting package */
+	greetRequest.tag = 'g';
+	greetRequest.protocolVersion = SK_PROTOCOL_VERSION;
+	greetRequest.pgVersion = PG_VERSION_NUM;
+	pg_strong_random(&greetRequest.proposerId, sizeof(greetRequest.proposerId));
+	greetRequest.systemId = systemId;
+	if (!zenith_timeline_walproposer)
+		elog(FATAL, "neon.timeline_id is not provided");
+	if (*zenith_timeline_walproposer != '\0' &&
+		!HexDecodeString(greetRequest.ztimelineid, zenith_timeline_walproposer, 16))
+		elog(FATAL, "Could not parse neon.timeline_id, %s", zenith_timeline_walproposer);
+	if (!zenith_tenant_walproposer)
+		elog(FATAL, "neon.tenant_id is not provided");
+	if (*zenith_tenant_walproposer != '\0' &&
+		!HexDecodeString(greetRequest.ztenantid, zenith_tenant_walproposer, 16))
+		elog(FATAL, "Could not parse neon.tenant_id, %s", zenith_tenant_walproposer);
+
+	greetRequest.timeline = ThisTimeLineID;
+	greetRequest.walSegSize = wal_segment_size;
+
+	InitEventSet();
+}
+
+static void
+WalProposerStartImpl(void)
+{
+	/* Initiate connections to all safekeeper nodes */
+	for (int i = 0; i < n_safekeepers; i++)
+	{
+		ResetConnection(&safekeeper[i]);
+	}
+
+	WalProposerLoop();
+}
+
+static void
+WalProposerLoop(void)
+{
+	while (true)
+		WalProposerPoll();
+}
+
+/* Initializes the internal event set, provided that it is currently null */
+static void
+InitEventSet(void)
+{
+	if (waitEvents)
+		elog(FATAL, "double-initialization of event set");
+
+	waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + n_safekeepers);
+	AddWaitEventToSet(waitEvents, WL_LATCH_SET, PGINVALID_SOCKET,
+					  MyLatch, NULL);
+	AddWaitEventToSet(waitEvents, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
+					  NULL, NULL);
+}
+
+/*
+ * Updates the events we're already waiting on for the safekeeper, setting it
+ * to the provided `events`
+ *
+ * This function is called any time the safekeeper's state switches to one
+ * where it has to wait to continue. This includes the full body of
+ * AdvancePollState and calls to IO helper functions.
+ */
+static void
+UpdateEventSet(Safekeeper *sk, uint32 events)
+{
+	/* eventPos = -1 when we don't have an event */
+	Assert(sk->eventPos != -1);
+
+	ModifyWaitEvent(waitEvents, sk->eventPos, events, NULL);
+}
+
+/*
+ * Hack: provides a way to remove the event corresponding to an individual
+ * walproposer from the set.
+ *
+ * Note: Internally, this completely reconstructs the event set. It should be
+ * avoided if possible.
+ */
+static void
+HackyRemoveWalProposerEvent(Safekeeper *to_remove)
+{
+	/* Remove the existing event set */
+	if (waitEvents)
+	{
+		FreeWaitEventSet(waitEvents);
+		waitEvents = NULL;
+	}
+	/* Re-initialize it without adding any safekeeper events */
+	InitEventSet();
+
+	/*
+	 * loop through the existing safekeepers.
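+	 * (A full rebuild is the only way: WaitEventSet offers no API for
+	 * removing a single event once it has been added.)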
If they aren't the one we're + * removing, and if they have a socket we can use, re-add the applicable + * events. + */ + for (int i = 0; i < n_safekeepers; i++) + { + uint32 desired_events = WL_NO_EVENTS; + Safekeeper *sk = &safekeeper[i]; + + sk->eventPos = -1; + + if (sk == to_remove) + continue; + + /* If this safekeeper isn't offline, add an event for it! */ + if (sk->conn != NULL) + { + desired_events = SafekeeperStateDesiredEvents(sk->state); + sk->eventPos = AddWaitEventToSet(waitEvents, desired_events, walprop_socket(sk->conn), NULL, sk); + } + } +} + +/* Shuts down and cleans up the connection for a safekeeper. Sets its state to SS_OFFLINE */ +static void +ShutdownConnection(Safekeeper *sk) +{ + if (sk->conn) + walprop_finish(sk->conn); + sk->conn = NULL; + sk->state = SS_OFFLINE; + sk->flushWrite = false; + sk->streamingAt = InvalidXLogRecPtr; + + if (sk->voteResponse.termHistory.entries) + pfree(sk->voteResponse.termHistory.entries); + sk->voteResponse.termHistory.entries = NULL; + + HackyRemoveWalProposerEvent(sk); +} + +/* + * This function is called to establish new connection or to reestablish + * connection in case of connection failure. + * + * On success, sets the state to SS_CONNECTING_WRITE. + */ +static void +ResetConnection(Safekeeper *sk) +{ + pgsocket sock; /* socket of the new connection */ + + if (sk->state != SS_OFFLINE) + { + ShutdownConnection(sk); + } + + /* + * Try to establish new connection + * + * If the connection information hasn't been filled out, we need to do + * that here. + */ + if (sk->conninfo[0] == '\0') + { + int written = 0; + written = snprintf((char *) &sk->conninfo, MAXCONNINFO, + "host=%s port=%s dbname=replication options='-c ztimelineid=%s ztenantid=%s'", + sk->host, sk->port, zenith_timeline_walproposer, zenith_tenant_walproposer); + // currently connection string is not that long, but once we pass something like jwt we might overflow the buffer, + // so it is better to be defensive and check that everything aligns well + if (written > MAXCONNINFO || written < 0) + elog(FATAL, "could not create connection string for safekeeper %s:%s", sk->host, sk->port); + } + + sk->conn = walprop_connect_start((char *) &sk->conninfo); + + /* + * "If the result is null, then libpq has been unable to allocate a new + * PGconn structure" + */ + if (!sk->conn) + elog(FATAL, "failed to allocate new PGconn object"); + + /* + * PQconnectStart won't actually start connecting until we run + * PQconnectPoll. Before we do that though, we need to check that it + * didn't immediately fail. + */ + if (walprop_status(sk->conn) == WP_CONNECTION_BAD) + { + /*--- + * According to libpq docs: + * "If the result is CONNECTION_BAD, the connection attempt has already failed, + * typically because of invalid connection parameters." + * We should report this failure. + * + * https://www.postgresql.org/docs/devel/libpq-connect.html#LIBPQ-PQCONNECTSTARTPARAMS + */ + elog(WARNING, "Immediate failure to connect with node:\n\t%s\n\terror: %s", + sk->conninfo, walprop_error_message(sk->conn)); + + /* + * Even though the connection failed, we still need to clean up the + * object + */ + walprop_finish(sk->conn); + sk->conn = NULL; + return; + } + + /* + * The documentation for PQconnectStart states that we should call + * PQconnectPoll in a loop until it returns PGRES_POLLING_OK or + * PGRES_POLLING_FAILED. The other two possible returns indicate whether + * we should wait for reading or writing on the socket. 
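One nuance in the truncation check above: snprintf returns the number of characters it *would* have written, so truncation is signalled by a return value greater than or equal to the buffer size; `written > MAXCONNINFO` misses the exact-fit case. The conventional idiom, as a sketch with a hypothetical helper:

```c
#include <stdbool.h>
#include <stdio.h>

/* Returns true iff the formatted string fit into dst (hypothetical sketch). */
static bool
format_conninfo(char *dst, size_t size, const char *host, const char *port)
{
	int			written = snprintf(dst, size, "host=%s port=%s dbname=replication",
								   host, port);

	/* snprintf reports the length it wanted to write; >= size means truncated */
	return written >= 0 && (size_t) written < size;
}
```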
+	 * For the first iteration of the loop, we're expected to wait until the
+	 * socket becomes writable.
+	 *
+	 * The wording of the documentation is a little ambiguous; thankfully
+	 * there's an example in the postgres source itself showing this behavior.
+	 * (see libpqrcv_connect, defined in
+	 * src/backend/replication/libpqwalreceiver/libpqwalreceiver.c)
+	 */
+	elog(LOG, "connecting with node %s:%s", sk->host, sk->port);
+
+	sk->state = SS_CONNECTING_WRITE;
+	sk->startedConnAt = GetCurrentTimestamp();
+
+	sock = walprop_socket(sk->conn);
+	sk->eventPos = AddWaitEventToSet(waitEvents, WL_SOCKET_WRITEABLE, sock, NULL, sk);
+	return;
+}
+
+/*
+ * How many milliseconds are left until we should attempt reconnection to
+ * safekeepers? Returns 0 if it is already high time, -1 if we never reconnect
+ * (do we actually need this?).
+ */
+static long
+TimeToReconnect(TimestampTz now)
+{
+	TimestampTz passed;
+	TimestampTz till_reconnect;
+
+	if (wal_acceptor_reconnect_timeout <= 0)
+		return -1;
+
+	passed = now - last_reconnect_attempt;
+	till_reconnect = wal_acceptor_reconnect_timeout * 1000 - passed;
+	if (till_reconnect <= 0)
+		return 0;
+	return (long) (till_reconnect / 1000);
+}
+
+/* If the timeout has expired, attempt to reconnect to all offline safekeepers */
+static void
+ReconnectSafekeepers(void)
+{
+	TimestampTz now = GetCurrentTimestamp();
+
+	if (TimeToReconnect(now) == 0)
+	{
+		last_reconnect_attempt = now;
+		for (int i = 0; i < n_safekeepers; i++)
+		{
+			if (safekeeper[i].state == SS_OFFLINE)
+				ResetConnection(&safekeeper[i]);
+		}
+	}
+}
+
+/*
+ * Performs the logic for advancing the state machine of the specified
+ * safekeeper, given that a certain set of events has occurred.
+ */
+static void
+AdvancePollState(Safekeeper *sk, uint32 events)
+{
+	/*
+	 * Sanity check. We assume further down that the operations don't
+	 * block because the socket is ready.
+	 */
+	AssertEventsOkForState(events, sk);
+
+	/* Execute the code corresponding to the current state */
+	switch (sk->state)
+	{
+			/*
+			 * safekeepers are only taken out of SS_OFFLINE by calls to
+			 * ResetConnection
+			 */
+		case SS_OFFLINE:
+			elog(FATAL, "Unexpected safekeeper %s:%s state advancement: is offline",
+				 sk->host, sk->port);
+			break;				/* actually unreachable, but prevents
+								 * -Wimplicit-fallthrough */
+
+			/*
+			 * Both connecting states run the same logic. The only
+			 * difference is the events they're expecting
+			 */
+		case SS_CONNECTING_READ:
+		case SS_CONNECTING_WRITE:
+			HandleConnectionEvent(sk);
+			break;
+
+			/*
+			 * Waiting for a successful CopyBoth response.
+			 */
+		case SS_WAIT_EXEC_RESULT:
+			RecvStartWALPushResult(sk);
+			break;
+
+			/*
+			 * Finish handshake comms: receive information about the safekeeper.
+			 */
+		case SS_HANDSHAKE_RECV:
+			RecvAcceptorGreeting(sk);
+			break;
+
+			/*
+			 * Voting is an idle state - we don't expect any events to trigger.
+			 * Refer to the execution of SS_HANDSHAKE_RECV to see how nodes are
+			 * transferred from SS_VOTING to sending actual vote requests.
+			 */
+		case SS_VOTING:
+			elog(WARNING, "EOF from node %s:%s in %s state", sk->host,
+				 sk->port, FormatSafekeeperState(sk->state));
+			ResetConnection(sk);
+			return;
+
+			/* Read the safekeeper response for our candidate */
+		case SS_WAIT_VERDICT:
+			RecvVoteResponse(sk);
+			break;
+
+			/* Flush proposer announcement message */
+		case SS_SEND_ELECTED_FLUSH:
+
+			/*
+			 * AsyncFlush ensures we only move on to SS_ACTIVE once the flush
+			 * completes. If we still have more to do, we'll wait until the
+			 * next poll comes along.
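TimeToReconnect above mixes units: TimestampTz is in microseconds while wal_acceptor_reconnect_timeout is in milliseconds, hence the `* 1000` and `/ 1000`. A self-contained restatement (hypothetical standalone types):

```c
#include <stdint.h>

typedef int64_t TimestampTz;	/* microseconds, as in PostgreSQL */

/* Milliseconds until the next reconnect attempt: 0 if due, -1 if disabled. */
static long
time_to_reconnect(TimestampTz now, TimestampTz last_attempt, int timeout_ms)
{
	int64_t		remaining_us;

	if (timeout_ms <= 0)
		return -1;				/* reconnection disabled */

	remaining_us = (int64_t) timeout_ms * 1000 - (now - last_attempt);
	return remaining_us <= 0 ? 0 : (long) (remaining_us / 1000);
}
```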
+ */ + if (!AsyncFlush(sk)) + return; + + /* flush is done, event set and state will be updated later */ + StartStreaming(sk); + break; + + /* + * Idle state for waiting votes from quorum. + */ + case SS_IDLE: + elog(WARNING, "EOF from node %s:%s in %s state", sk->host, + sk->port, FormatSafekeeperState(sk->state)); + ResetConnection(sk); + return; + + /* + * Active state is used for streaming WAL and receiving feedback. + */ + case SS_ACTIVE: + HandleActiveState(sk, events); + break; + } +} + +static void +HandleConnectionEvent(Safekeeper *sk) +{ + WalProposerConnectPollStatusType result = walprop_connect_poll(sk->conn); + + /* The new set of events we'll wait on, after updating */ + uint32 new_events = WL_NO_EVENTS; + + switch (result) + { + case WP_CONN_POLLING_OK: + elog(LOG, "connected with node %s:%s", sk->host, + sk->port); + + /* + * We have to pick some event to update event set. + * We'll eventually need the socket to be readable, + * so we go with that. + */ + new_events = WL_SOCKET_READABLE; + break; + + /* + * If we need to poll to finish connecting, + * continue doing that + */ + case WP_CONN_POLLING_READING: + sk->state = SS_CONNECTING_READ; + new_events = WL_SOCKET_READABLE; + break; + case WP_CONN_POLLING_WRITING: + sk->state = SS_CONNECTING_WRITE; + new_events = WL_SOCKET_WRITEABLE; + break; + + case WP_CONN_POLLING_FAILED: + elog(WARNING, "failed to connect to node '%s:%s': %s", + sk->host, sk->port, walprop_error_message(sk->conn)); + + /* + * If connecting failed, we don't want to restart + * the connection because that might run us into a + * loop. Instead, shut it down -- it'll naturally + * restart at a slower interval on calls to + * ReconnectSafekeepers. + */ + ShutdownConnection(sk); + return; + } + + /* + * Because PQconnectPoll can change the socket, we have to + * un-register the old event and re-register an event on + * the new socket. + */ + HackyRemoveWalProposerEvent(sk); + sk->eventPos = AddWaitEventToSet(waitEvents, new_events, walprop_socket(sk->conn), NULL, sk); + + /* If we successfully connected, send START_WAL_PUSH query */ + if (result == WP_CONN_POLLING_OK) + SendStartWALPush(sk); +} + +/* + * Send "START_WAL_PUSH" message as an empty query to the safekeeper. Performs + * a blocking send, then immediately moves to SS_WAIT_EXEC_RESULT. If something + * goes wrong, change state to SS_OFFLINE and shutdown the connection. + */ +static void +SendStartWALPush(Safekeeper *sk) +{ + if (!walprop_send_query(sk->conn, "START_WAL_PUSH")) + { + elog(WARNING, "Failed to send 'START_WAL_PUSH' query to safekeeper %s:%s: %s", + sk->host, sk->port, walprop_error_message(sk->conn)); + ShutdownConnection(sk); + return; + } + sk->state = SS_WAIT_EXEC_RESULT; + UpdateEventSet(sk, WL_SOCKET_READABLE); +} + +static void +RecvStartWALPushResult(Safekeeper *sk) +{ + switch (walprop_get_query_result(sk->conn)) + { + /* + * Successful result, move on to starting the + * handshake + */ + case WP_EXEC_SUCCESS_COPYBOTH: + + SendProposerGreeting(sk); + break; + + /* + * Needs repeated calls to finish. 
+			 * Wait until the socket is readable
+			 */
+		case WP_EXEC_NEEDS_INPUT:
+
+			/*
+			 * SS_WAIT_EXEC_RESULT is always reached through an
+			 * event, so we don't need to update the event set
+			 */
+			break;
+
+		case WP_EXEC_FAILED:
+			elog(WARNING, "Failed to send query to safekeeper %s:%s: %s",
+				 sk->host, sk->port, walprop_error_message(sk->conn));
+			ShutdownConnection(sk);
+			return;
+
+			/*
+			 * Unexpected result -- fundamentally an error, but we
+			 * want to produce a custom message, rather than a
+			 * generic "something went wrong"
+			 */
+		case WP_EXEC_UNEXPECTED_SUCCESS:
+			elog(WARNING, "Received bad response from safekeeper %s:%s query execution",
+				 sk->host, sk->port);
+			ShutdownConnection(sk);
+			return;
+	}
+}
+
+/*
+ * Start handshake: first of all, send our information to the
+ * safekeeper. After sending, we wait on SS_HANDSHAKE_RECV for
+ * a response to finish the handshake.
+ */
+static void
+SendProposerGreeting(Safekeeper *sk)
+{
+	/*
+	 * On failure, logging & resetting the connection is handled.
+	 * We just need to handle the control flow.
+	 */
+	BlockingWrite(sk, &greetRequest, sizeof(greetRequest), SS_HANDSHAKE_RECV);
+}
+
+static void
+RecvAcceptorGreeting(Safekeeper *sk)
+{
+	/*
+	 * If our reading doesn't immediately succeed, any necessary
+	 * error handling or state setting is taken care of. We can
+	 * leave any other work until later.
+	 */
+	sk->greetResponse.apm.tag = 'g';
+	if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->greetResponse))
+		return;
+
+	/* Protocol is all good, move to voting. */
+	sk->state = SS_VOTING;
+
+	++n_connected;
+	if (n_connected <= quorum)
+	{
+		/* We're still collecting terms from the majority. */
+		propTerm = Max(sk->greetResponse.term, propTerm);
+
+		/* Quorum is acquired, prepare the vote request. */
+		if (n_connected == quorum)
+		{
+			propTerm++;
+			elog(LOG, "proposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT, quorum, propTerm);
+
+			voteRequest = (VoteRequest)
+			{
+				.tag = 'v',
+				.term = propTerm
+			};
+			memcpy(voteRequest.proposerId.data, greetRequest.proposerId.data, UUID_LEN);
+		}
+	}
+	else if (sk->greetResponse.term > propTerm)
+	{
+		/* Another compute with higher term is running. */
+		elog(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "",
+			 sk->host, sk->port,
+			 sk->greetResponse.term, propTerm);
+	}
+
+	/*
+	 * Check if we have quorum. If there aren't enough safekeepers,
+	 * wait and do nothing. We'll eventually get a task when the
+	 * election starts.
+	 *
+	 * If we do have quorum, we can start an election.
+	 */
+	if (n_connected < quorum)
+	{
+		/*
+		 * SS_VOTING is an idle state; read-ready indicates the
+		 * connection closed.
+		 */
+		UpdateEventSet(sk, WL_SOCKET_READABLE);
+	}
+	else
+	{
+		/*
+		 * Now send the voting request to the cohort and wait for
+		 * responses
+		 */
+		for (int j = 0; j < n_safekeepers; j++)
+		{
+			/*
+			 * Remember: SS_VOTING indicates that the safekeeper is
+			 * participating in voting, but hasn't sent anything
+			 * yet.
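The election arithmetic above, condensed: the proposer collects terms from the first quorum of greetings and then picks a term strictly greater than any of them. A standalone sketch (hypothetical names):

```c
#include <stdint.h>

/* Pick a proposer term strictly greater than every term seen so far. */
static uint64_t
choose_proposer_term(const uint64_t *seen_terms, int n)
{
	uint64_t	term = 0;

	for (int i = 0; i < n; i++)
		if (seen_terms[i] > term)
			term = seen_terms[i];
	return term + 1;
}
```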
+ */ + if (safekeeper[j].state == SS_VOTING) + SendVoteRequest(&safekeeper[j]); + } + } +} + +static void +SendVoteRequest(Safekeeper *sk) +{ + /* We have quorum for voting, send our vote request */ + elog(LOG, "requesting vote from %s:%s for term " UINT64_FORMAT, sk->host, sk->port, voteRequest.term); + /* On failure, logging & resetting is handled */ + if (!BlockingWrite(sk, &voteRequest, sizeof(voteRequest), SS_WAIT_VERDICT)) + return; + + /* If successful, wait for read-ready with SS_WAIT_VERDICT */ +} + +static void +RecvVoteResponse(Safekeeper *sk) +{ + sk->voteResponse.apm.tag = 'v'; + if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->voteResponse)) + return; + + elog(LOG, + "got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X, timelineStartLsn=%X/%X", + sk->host, sk->port, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory), + LSN_FORMAT_ARGS(sk->voteResponse.flushLsn), + LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn), + LSN_FORMAT_ARGS(sk->voteResponse.timelineStartLsn)); + + /* + * In case of acceptor rejecting our vote, bail out, but only + * if either it already lives in strictly higher term + * (concurrent compute spotted) or we are not elected yet and + * thus need the vote. + */ + if ((!sk->voteResponse.voteGiven) && + (sk->voteResponse.term > propTerm || n_votes < quorum)) + { + elog(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", + sk->host, sk->port, + sk->voteResponse.term, propTerm); + } + Assert(sk->voteResponse.term == propTerm); + + /* Handshake completed, do we have quorum? */ + n_votes++; + if (n_votes < quorum) + { + sk->state = SS_IDLE; /* can't do much yet, no quorum */ + } + else if (n_votes > quorum) + { + /* recovery already performed, just start streaming */ + SendProposerElected(sk); + } + else + { + sk->state = SS_IDLE; + UpdateEventSet(sk, WL_SOCKET_READABLE); /* Idle states wait for + * read-ready */ + + HandleElectedProposer(); + } +} + +/* + * Called once a majority of acceptors have voted for us and current proposer + * has been elected. + * + * Sends ProposerElected message to all acceptors in SS_IDLE state and starts + * replication from walsender. + */ +static void +HandleElectedProposer(void) +{ + DetermineEpochStartLsn(); + + /* + * Check if not all safekeepers are up-to-date, we need to + * download WAL needed to synchronize them + */ + if (truncateLsn < propEpochStartLsn) + { + elog(LOG, + "start recovery because truncateLsn=%X/%X is not " + "equal to epochStartLsn=%X/%X", + LSN_FORMAT_ARGS(truncateLsn), + LSN_FORMAT_ARGS(propEpochStartLsn)); + /* Perform recovery */ + if (!WalProposerRecovery(donor, greetRequest.timeline, truncateLsn, propEpochStartLsn)) + elog(FATAL, "Failed to recover state"); + } + else if (syncSafekeepers) + { + /* Sync is not needed: just exit */ + fprintf(stdout, "%X/%X\n", LSN_FORMAT_ARGS(propEpochStartLsn)); + exit(0); + } + + for (int i = 0; i < n_safekeepers; i++) + { + if (safekeeper[i].state == SS_IDLE) + SendProposerElected(&safekeeper[i]); + } + + /* + * The proposer has been elected, and there will be no quorum waiting + * after this point. There will be no safekeeper with state SS_IDLE + * also, because that state is used only for quorum waiting. 
+	 */
+
+	if (syncSafekeepers)
+	{
+		/*
+		 * Send an empty message to enforce receiving feedback even from
+		 * nodes that are fully recovered; this is required to learn that
+		 * they switched the epoch, which finishes sync-safekeepers runs
+		 * that don't generate any real new records. Will go away once we
+		 * switch to async acks.
+		 */
+		BroadcastAppendRequest();
+
+		/* keep polling until all safekeepers are synced */
+		return;
+	}
+
+	WalProposerStartStreaming(propEpochStartLsn);
+	/* Should not return here */
+}
+
+/* latest term in TermHistory, or 0 if there are no entries */
+static term_t
+GetHighestTerm(TermHistory *th)
+{
+	return th->n_entries > 0 ? th->entries[th->n_entries - 1].term : 0;
+}
+
+/* safekeeper's epoch is the term of the highest entry in the log */
+static term_t
+GetEpoch(Safekeeper *sk)
+{
+	return GetHighestTerm(&sk->voteResponse.termHistory);
+}
+
+/* If LSN points to the page header, skip it */
+static XLogRecPtr
+SkipXLogPageHeader(XLogRecPtr lsn)
+{
+	if (XLogSegmentOffset(lsn, wal_segment_size) == 0)
+	{
+		lsn += SizeOfXLogLongPHD;
+	}
+	else if (lsn % XLOG_BLCKSZ == 0)
+	{
+		lsn += SizeOfXLogShortPHD;
+	}
+	return lsn;
+}
+
+/*
+ * Called after a majority of acceptors have voted. It calculates the most
+ * advanced safekeeper (who will be the donor) and epochStartLsn -- the LSN
+ * since which we'll write WAL in our term.
+ *
+ * Sets truncateLsn along the way (though it is not of much use at this point --
+ * only for skipping recovery).
+ */
+static void
+DetermineEpochStartLsn(void)
+{
+	TermHistory *dth;
+
+	propEpochStartLsn = InvalidXLogRecPtr;
+	donorEpoch = 0;
+	truncateLsn = InvalidXLogRecPtr;
+	timelineStartLsn = InvalidXLogRecPtr;
+
+	for (int i = 0; i < n_safekeepers; i++)
+	{
+		if (safekeeper[i].state == SS_IDLE)
+		{
+			if (GetEpoch(&safekeeper[i]) > donorEpoch ||
+				(GetEpoch(&safekeeper[i]) == donorEpoch &&
+				 safekeeper[i].voteResponse.flushLsn > propEpochStartLsn))
+			{
+				donorEpoch = GetEpoch(&safekeeper[i]);
+				propEpochStartLsn = safekeeper[i].voteResponse.flushLsn;
+				donor = i;
+			}
+			truncateLsn = Max(safekeeper[i].voteResponse.truncateLsn, truncateLsn);
+
+			if (safekeeper[i].voteResponse.timelineStartLsn != InvalidXLogRecPtr)
+			{
+				/* timelineStartLsn should be the same everywhere or unknown */
+				if (timelineStartLsn != InvalidXLogRecPtr &&
+					timelineStartLsn != safekeeper[i].voteResponse.timelineStartLsn)
+				{
+					elog(WARNING,
+						 "inconsistent timelineStartLsn: current %X/%X, received %X/%X",
+						 LSN_FORMAT_ARGS(timelineStartLsn),
+						 LSN_FORMAT_ARGS(safekeeper[i].voteResponse.timelineStartLsn));
+				}
+				timelineStartLsn = safekeeper[i].voteResponse.timelineStartLsn;
+			}
+		}
+	}
+
+	/*
+	 * If propEpochStartLsn is 0 everywhere, we are bootstrapping -- nothing
+	 * was committed yet. Then start streaming from the basebackup LSN.
+	 */
+	if (propEpochStartLsn == InvalidXLogRecPtr && !syncSafekeepers)
+	{
+		propEpochStartLsn = truncateLsn = GetRedoStartLsn();
+		if (timelineStartLsn == InvalidXLogRecPtr)
+		{
+			timelineStartLsn = GetRedoStartLsn();
+		}
+		elog(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(propEpochStartLsn));
+	}
+
+	/*
+	 * If propEpochStartLsn is not 0, at least one msg with WAL was sent to
+	 * some connected safekeeper; it must have carried truncateLsn pointing to
+	 * the first record.
+	 */
+	Assert((truncateLsn != InvalidXLogRecPtr) ||
+		   (syncSafekeepers && truncateLsn == propEpochStartLsn));
+
+	/*
+	 * We will be generating WAL since propEpochStartLsn, so we should set
+	 * availableLsn to mark this LSN as the latest available position.
+	 */
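The donor choice in DetermineEpochStartLsn is a lexicographic maximum over (epoch, flushLsn), taken only over safekeepers that voted. Isolated as a sketch (hypothetical struct):

```c
#include <stdint.h>

typedef struct
{
	uint64_t	epoch;			/* term of the last entry in the term history */
	uint64_t	flush_lsn;		/* end of WAL on this safekeeper */
} SkVote;

/* Index of the most advanced safekeeper: highest epoch, then highest flushLsn. */
static int
choose_donor(const SkVote *votes, int n)
{
	int			donor = 0;

	for (int i = 1; i < n; i++)
		if (votes[i].epoch > votes[donor].epoch ||
			(votes[i].epoch == votes[donor].epoch &&
			 votes[i].flush_lsn > votes[donor].flush_lsn))
			donor = i;
	return donor;
}
```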
+	availableLsn = propEpochStartLsn;
+
+	/*
+	 * Proposer's term history is the donor's + its own entry.
+	 */
+	dth = &safekeeper[donor].voteResponse.termHistory;
+	propTermHistory.n_entries = dth->n_entries + 1;
+	propTermHistory.entries = palloc(sizeof(TermSwitchEntry) * propTermHistory.n_entries);
+	memcpy(propTermHistory.entries, dth->entries, sizeof(TermSwitchEntry) * dth->n_entries);
+	propTermHistory.entries[propTermHistory.n_entries - 1].term = propTerm;
+	propTermHistory.entries[propTermHistory.n_entries - 1].lsn = propEpochStartLsn;
+
+	elog(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X",
+		 quorum,
+		 propTerm,
+		 LSN_FORMAT_ARGS(propEpochStartLsn),
+		 safekeeper[donor].host, safekeeper[donor].port,
+		 LSN_FORMAT_ARGS(truncateLsn));
+
+	/*
+	 * Ensure the basebackup we are running (at RedoStartLsn) matches the LSN
+	 * since which we are going to write according to the consensus. If not,
+	 * we must bail out, as clog and other non-rel data is inconsistent.
+	 */
+	if (!syncSafekeepers)
+	{
+		/*
+		 * Basebackup LSN always points to the beginning of the record (not
+		 * the page), as StartupXLOG most probably wants it this way.
+		 * Safekeepers don't skip the header as they need a continuous stream
+		 * of data, so correct the LSN for comparison.
+		 */
+		if (SkipXLogPageHeader(propEpochStartLsn) != GetRedoStartLsn())
+		{
+			/*
+			 * However, allow proceeding if the previously elected leader was
+			 * me; a plain restart of walproposer without an intervening
+			 * concurrent compute (who could generate WAL) is ok.
+			 */
+			if (!((dth->n_entries >= 1) && (dth->entries[dth->n_entries - 1].term ==
+											walprop_shared->mineLastElectedTerm)))
+			{
+				elog(PANIC,
+					 "collected propEpochStartLsn %X/%X, but basebackup LSN %X/%X",
+					 LSN_FORMAT_ARGS(propEpochStartLsn),
+					 LSN_FORMAT_ARGS(GetRedoStartLsn()));
+			}
+		}
+		walprop_shared->mineLastElectedTerm = propTerm;
+	}
+}
+
+/*
+ * Receive WAL from the most advanced safekeeper
+ */
+static bool
+WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos)
+{
+	char		conninfo[MAXCONNINFO];
+	char	   *err;
+	WalReceiverConn *wrconn;
+	WalRcvStreamOptions options;
+
+	sprintf(conninfo, "host=%s port=%s dbname=replication options='-c ztimelineid=%s ztenantid=%s'",
+			safekeeper[donor].host, safekeeper[donor].port, zenith_timeline_walproposer, zenith_tenant_walproposer);
+	wrconn = walrcv_connect(conninfo, false, "wal_proposer_recovery", &err);
+	if (!wrconn)
+	{
+		ereport(WARNING,
+				(errmsg("could not connect to WAL acceptor %s:%s: %s",
+						safekeeper[donor].host, safekeeper[donor].port,
+						err)));
+		return false;
+	}
+	elog(LOG,
+		 "start recovery from %s:%s starting from %X/%08X till %X/%08X timeline "
+		 "%d",
+		 safekeeper[donor].host, safekeeper[donor].port, (uint32) (startpos >> 32),
+		 (uint32) startpos, (uint32) (endpos >> 32), (uint32) endpos, timeline);
+
+	options.logical = false;
+	options.startpoint = startpos;
+	options.slotname = NULL;
+	options.proto.physical.startpointTLI = timeline;
+
+	if (walrcv_startstreaming(wrconn, &options))
+	{
+		XLogRecPtr	rec_start_lsn;
+		XLogRecPtr	rec_end_lsn = 0;
+		int			len;
+		char	   *buf;
+		pgsocket	wait_fd = PGINVALID_SOCKET;
+
+		while ((len = walrcv_receive(wrconn, &buf, &wait_fd)) >= 0)
+		{
+			if (len == 0)
+			{
+				(void) WaitLatchOrSocket(
+										 MyLatch, WL_EXIT_ON_PM_DEATH | WL_SOCKET_READABLE, wait_fd,
+										 -1, WAIT_EVENT_WAL_RECEIVER_MAIN);
+			}
+			else
+			{
+				Assert(buf[0] == 'w' || buf[0] == 'k');
+				if (buf[0] == 'k')
+					continue;	/* keepalive */
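+				/*
+				 * 'w' (XLogData) messages carry the WAL start position in
+				 * network byte order right after the one-byte type; the
+				 * payload itself begins at XLOG_HDR_SIZE.
+				 */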
+				memcpy(&rec_start_lsn, &buf[XLOG_HDR_START_POS],
+					   sizeof rec_start_lsn);
+				rec_start_lsn = pg_ntoh64(rec_start_lsn);
+				rec_end_lsn = rec_start_lsn + len - XLOG_HDR_SIZE;
+
+				/* write WAL to disk */
+				XLogWalPropWrite(&buf[XLOG_HDR_SIZE], len - XLOG_HDR_SIZE, rec_start_lsn);
+
+				ereport(DEBUG1,
+						(errmsg("Recover message %X/%X length %d",
+								LSN_FORMAT_ARGS(rec_start_lsn), len)));
+				if (rec_end_lsn >= endpos)
+					break;
+			}
+		}
+		ereport(LOG,
+				(errmsg("end of replication stream at %X/%X: %m",
+						LSN_FORMAT_ARGS(rec_end_lsn))));
+		walrcv_disconnect(wrconn);
+
+		/* failed to receive all WAL till endpos */
+		if (rec_end_lsn < endpos)
+			return false;
+	}
+	else
+	{
+		ereport(LOG,
+				(errmsg("primary server contains no more WAL on requested timeline %u LSN %X/%08X",
+						timeline, (uint32) (startpos >> 32), (uint32) startpos)));
+		return false;
+	}
+
+	return true;
+}
+
+/*
+ * Determine for sk the starting streaming point and send it a message
+ * 1) announcing that we are the elected proposer (which immediately advances
+ *    the epoch if the safekeeper is synced, being important for
+ *    sync-safekeepers)
+ * 2) communicating the starting streaming point -- the safekeeper must
+ *    truncate its WAL beyond it -- and the history of term switches.
+ *
+ * Sets sk->startStreamingAt.
+ */
+static void
+SendProposerElected(Safekeeper *sk)
+{
+	ProposerElected msg;
+	TermHistory *th;
+	term_t		lastCommonTerm;
+	int			i;
+
+	/*
+	 * Determine the start LSN by comparing the safekeeper's log term switch
+	 * history and the proposer's, searching for the divergence point.
+	 *
+	 * Note: there is a vanishingly small chance of no common point even if
+	 * there is some WAL on the safekeeper, if immediately after bootstrap
+	 * the compute wrote some WAL on a single sk and died; we stream since
+	 * the beginning then.
+	 */
+	th = &sk->voteResponse.termHistory;
+
+	/*
+	 * If any WAL is present on the sk, it must be authorized by some term.
+	 * OTOH, without any WAL there are no term switches in the log.
+	 */
+	Assert((th->n_entries == 0) ==
+		   (sk->voteResponse.flushLsn == InvalidXLogRecPtr));
+	/* We must start somewhere. */
+	Assert(propTermHistory.n_entries >= 1);
+
+	for (i = 0; i < Min(propTermHistory.n_entries, th->n_entries); i++)
+	{
+		if (propTermHistory.entries[i].term != th->entries[i].term)
+			break;
+		/* term must begin everywhere at the same point */
+		Assert(propTermHistory.entries[i].lsn == th->entries[i].lsn);
+	}
+	i--;						/* step back to the last common term */
+	if (i < 0)
+	{
+		/* safekeeper is empty or no common point, start from the beginning */
+		sk->startStreamingAt = propTermHistory.entries[0].lsn;
+
+		if (sk->startStreamingAt < truncateLsn)
+		{
+			/*
+			 * There's a gap between the WAL starting point and truncateLsn,
+			 * which can't appear in a normal working cluster. That gap means
+			 * that all safekeepers reported that they had persisted WAL up
+			 * to truncateLsn before, but now the current safekeeper tells
+			 * otherwise.
+			 *
+			 * Also we have a special condition here, which is an empty
+			 * safekeeper with no history. In combination with a gap, that
+			 * can happen when we introduce a new safekeeper to the cluster.
+			 * This is a rare case, which is triggered manually for now, and
+			 * should be treated with care.
+			 */
+
+			/*
+			 * truncateLsn will not change without an ack from the current
+			 * safekeeper, and it's aligned to the WAL record, so we can
+			 * safely start streaming from this point.
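The divergence search above reduces to: walk both term histories in lockstep and return the index of the last common entry. As an isolated sketch (hypothetical standalone types):

```c
#include <stdint.h>

typedef struct
{
	uint64_t	term;
	uint64_t	lsn;
} Entry;

/* Index of the last entry common to both histories, or -1 if none. */
static int
last_common_index(const Entry *a, int na, const Entry *b, int nb)
{
	int			i;
	int			n = na < nb ? na : nb;

	for (i = 0; i < n; i++)
		if (a[i].term != b[i].term)
			break;
	return i - 1;
}
```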
+ */ + sk->startStreamingAt = truncateLsn; + + elog(WARNING, "empty safekeeper joined cluster as %s:%s, historyStart=%X/%X, sk->startStreamingAt=%X/%X", + sk->host, sk->port, LSN_FORMAT_ARGS(propTermHistory.entries[0].lsn), + LSN_FORMAT_ARGS(sk->startStreamingAt)); + } + } + else + { + /* + * End of (common) term is the start of the next except it is the last + * one; there it is flush_lsn in case of safekeeper or, in case of + * proposer, LSN it is currently writing, but then we just pick + * safekeeper pos as it obviously can't be higher. + */ + if (propTermHistory.entries[i].term == propTerm) + { + sk->startStreamingAt = sk->voteResponse.flushLsn; + } + else + { + XLogRecPtr propEndLsn = propTermHistory.entries[i + 1].lsn; + XLogRecPtr skEndLsn = (i + 1 < th->n_entries ? th->entries[i + 1].lsn : + sk->voteResponse.flushLsn); + sk->startStreamingAt = Min(propEndLsn, skEndLsn); + } + } + + Assert(sk->startStreamingAt >= truncateLsn && sk->startStreamingAt <= availableLsn); + + msg.tag = 'e'; + msg.term = propTerm; + msg.startStreamingAt = sk->startStreamingAt; + msg.termHistory = &propTermHistory; + msg.timelineStartLsn = timelineStartLsn; + + lastCommonTerm = i >= 0 ? propTermHistory.entries[i].term : 0; + elog(LOG, + "sending elected msg to node " UINT64_FORMAT " term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s, timelineStartLsn=%X/%X", + sk->greetResponse.nodeId, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port, LSN_FORMAT_ARGS(msg.timelineStartLsn)); + + resetStringInfo(&sk->outbuf); + pq_sendint64_le(&sk->outbuf, msg.tag); + pq_sendint64_le(&sk->outbuf, msg.term); + pq_sendint64_le(&sk->outbuf, msg.startStreamingAt); + pq_sendint32_le(&sk->outbuf, msg.termHistory->n_entries); + for (int i = 0; i < msg.termHistory->n_entries; i++) + { + pq_sendint64_le(&sk->outbuf, msg.termHistory->entries[i].term); + pq_sendint64_le(&sk->outbuf, msg.termHistory->entries[i].lsn); + } + pq_sendint64_le(&sk->outbuf, msg.timelineStartLsn); + + if (!AsyncWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_SEND_ELECTED_FLUSH)) + return; + + StartStreaming(sk); +} + +/* + * Start walsender streaming replication + */ +static void +WalProposerStartStreaming(XLogRecPtr startpos) +{ + StartReplicationCmd cmd; + + elog(LOG, "WAL proposer starts streaming at %X/%X", + LSN_FORMAT_ARGS(startpos)); + cmd.slotname = WAL_PROPOSER_SLOT_NAME; + cmd.timeline = greetRequest.timeline; + cmd.startpoint = startpos; + StartProposerReplication(&cmd); +} + +/* + * Start streaming to safekeeper sk, always updates state to SS_ACTIVE and sets + * correct event set. + */ +static void +StartStreaming(Safekeeper *sk) +{ + /* + * This is the only entrypoint to state SS_ACTIVE. It's executed + * exactly once for a connection. + */ + sk->state = SS_ACTIVE; + sk->streamingAt = sk->startStreamingAt; + + /* event set will be updated inside SendMessageToNode */ + SendMessageToNode(sk); +} + +/* + * Try to send message to the particular node. Always updates event set. Will + * send at least one message, if socket is ready. + * + * Can be used only for safekeepers in SS_ACTIVE state. State can be changed + * in case of errors. 
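For reference, the little-endian serialization above produces a fixed-size frame plus 16 bytes per term switch; a sketch of the resulting wire size (hypothetical helper, sizes taken from the pq_sendint*_le calls):

```c
#include <stddef.h>
#include <stdint.h>

/*
 * Wire size of a ProposerElected message as serialized above:
 * tag(8) + term(8) + startStreamingAt(8) + n_entries(4)
 * + n_entries * (term(8) + lsn(8)) + timelineStartLsn(8).
 */
static size_t
proposer_elected_wire_size(uint32_t n_entries)
{
	return 8 + 8 + 8 + 4 + (size_t) n_entries * 16 + 8;
}
```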
+ */ +static void +SendMessageToNode(Safekeeper *sk) +{ + Assert(sk->state == SS_ACTIVE); + + /* Note: we always send everything to the safekeeper until WOULDBLOCK or nothing left to send */ + HandleActiveState(sk, WL_SOCKET_WRITEABLE); +} + +/* + * Broadcast new message to all caught-up safekeepers + */ +static void +BroadcastAppendRequest() +{ + for (int i = 0; i < n_safekeepers; i++) + if (safekeeper[i].state == SS_ACTIVE) + SendMessageToNode(&safekeeper[i]); +} + +static void +PrepareAppendRequest(AppendRequestHeader *req, XLogRecPtr beginLsn, XLogRecPtr endLsn) +{ + Assert(endLsn >= beginLsn); + req->tag = 'a'; + req->term = propTerm; + req->epochStartLsn = propEpochStartLsn; + req->beginLsn = beginLsn; + req->endLsn = endLsn; + req->commitLsn = GetAcknowledgedByQuorumWALPosition(); + req->truncateLsn = truncateLsn; + req->proposerId = greetRequest.proposerId; +} + +/* + * Process all events happened in SS_ACTIVE state, update event set after that. + */ +static void +HandleActiveState(Safekeeper *sk, uint32 events) +{ + uint32 newEvents = WL_SOCKET_READABLE; + + if (events & WL_SOCKET_WRITEABLE) + if (!SendAppendRequests(sk)) + return; + + if (events & WL_SOCKET_READABLE) + if (!RecvAppendResponses(sk)) + return; + + /* + * We should wait for WL_SOCKET_WRITEABLE event if we have unflushed data + * in the buffer. + * + * LSN comparison checks if we have pending unsent messages. This check isn't + * necessary now, because we always send append messages immediately after + * arrival. But it's good to have it here in case we change this behavior + * in the future. + */ + if (sk->streamingAt != availableLsn || sk->flushWrite) + newEvents |= WL_SOCKET_WRITEABLE; + + UpdateEventSet(sk, newEvents); +} + +/* + * Send WAL messages starting from sk->streamingAt until the end or non-writable + * socket, whichever comes first. Caller should take care of updating event set. + * Even if no unsent WAL is available, at least one empty message will be sent + * as a heartbeat, if socket is ready. + * + * Can change state if Async* functions encounter errors and reset connection. + * Returns false in this case, true otherwise. + */ +static bool +SendAppendRequests(Safekeeper *sk) +{ + XLogRecPtr endLsn; + AppendRequestHeader *req; + PGAsyncWriteResult writeResult; + WALReadError errinfo; + bool sentAnything = false; + + if (sk->flushWrite) + { + if (!AsyncFlush(sk)) + /* + * AsyncFlush failed, that could happen if the socket is closed or + * we have nothing to write and should wait for writeable socket. 
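HandleActiveState recomputes the wait mask from scratch on every pass: read interest is permanent (feedback can arrive at any time), write interest only while there is unsent WAL or an unflushed buffer. Condensed as a sketch (WL_* values as in PostgreSQL's storage/latch.h):

```c
#include <stdbool.h>
#include <stdint.h>

#define WL_SOCKET_READABLE	(1 << 1)	/* values as in storage/latch.h */
#define WL_SOCKET_WRITEABLE (1 << 2)

/* Wait-event mask for an SS_ACTIVE safekeeper. */
static uint32_t
active_event_mask(uint64_t streaming_at, uint64_t available_lsn, bool flush_pending)
{
	uint32_t	events = WL_SOCKET_READABLE;	/* always watch for feedback */

	if (streaming_at != available_lsn || flush_pending)
		events |= WL_SOCKET_WRITEABLE;	/* unsent WAL or unflushed bytes */
	return events;
}
```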
+ */ + return sk->state == SS_ACTIVE; + + /* Event set will be updated in the end of HandleActiveState */ + sk->flushWrite = false; + } + + while (sk->streamingAt != availableLsn || !sentAnything) + { + sentAnything = true; + + endLsn = sk->streamingAt; + endLsn += MAX_SEND_SIZE; + + /* if we went beyond available WAL, back off */ + if (endLsn > availableLsn) { + endLsn = availableLsn; + } + + req = &sk->appendRequest; + PrepareAppendRequest(&sk->appendRequest, sk->streamingAt, endLsn); + + ereport(DEBUG2, + (errmsg("sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s", + req->endLsn - req->beginLsn, + LSN_FORMAT_ARGS(req->beginLsn), + LSN_FORMAT_ARGS(req->endLsn), + LSN_FORMAT_ARGS(req->commitLsn), + LSN_FORMAT_ARGS(truncateLsn), sk->host, sk->port))); + + resetStringInfo(&sk->outbuf); + + /* write AppendRequest header */ + appendBinaryStringInfo(&sk->outbuf, (char*) req, sizeof(AppendRequestHeader)); + + /* write the WAL itself */ + enlargeStringInfo(&sk->outbuf, req->endLsn - req->beginLsn); + if (!WALRead(sk->xlogreader, + &sk->outbuf.data[sk->outbuf.len], + req->beginLsn, + req->endLsn - req->beginLsn, + ThisTimeLineID, + &errinfo)) + { + WALReadRaiseError(&errinfo); + } + sk->outbuf.len += req->endLsn - req->beginLsn; + + writeResult = walprop_async_write(sk->conn, sk->outbuf.data, sk->outbuf.len); + + /* Mark current message as sent, whatever the result is */ + sk->streamingAt = endLsn; + + switch (writeResult) + { + case PG_ASYNC_WRITE_SUCCESS: + /* Continue writing the next message */ + break; + + case PG_ASYNC_WRITE_TRY_FLUSH: + /* + * We still need to call PQflush some more to finish the job. + * Caller function will handle this by setting right event set. + */ + sk->flushWrite = true; + return true; + + case PG_ASYNC_WRITE_FAIL: + elog(WARNING, "Failed to send to node %s:%s in %s state: %s", + sk->host, sk->port, FormatSafekeeperState(sk->state), + walprop_error_message(sk->conn)); + ShutdownConnection(sk); + return false; + default: + Assert(false); + return false; + } + } + + return true; +} + +/* + * Receive and process all available feedback. + * + * Can change state if Async* functions encounter errors and reset connection. + * Returns false in this case, true otherwise. + * + * NB: This function can call SendMessageToNode and produce new messages. + */ +static bool +RecvAppendResponses(Safekeeper *sk) +{ + XLogRecPtr minQuorumLsn; + bool readAnything = false; + + while (true) + { + /* + * If our reading doesn't immediately succeed, any + * necessary error handling or state setting is taken care + * of. We can leave any other work until later. + */ + sk->appendResponse.apm.tag = 'a'; + if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->appendResponse)) + break; + + ereport(DEBUG2, + (errmsg("received message term=" INT64_FORMAT " flushLsn=%X/%X commitLsn=%X/%X from %s:%s", + sk->appendResponse.term, + LSN_FORMAT_ARGS(sk->appendResponse.flushLsn), + LSN_FORMAT_ARGS(sk->appendResponse.commitLsn), + sk->host, sk->port))); + + if (sk->appendResponse.term > propTerm) + { + /* Another compute with higher term is running. */ + elog(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT "", + sk->host, sk->port, + sk->appendResponse.term, propTerm); + } + + readAnything = true; + } + + if (!readAnything) + return sk->state == SS_ACTIVE; + + HandleSafekeeperResponse(); + + /* + * Also send the new commit lsn to all the safekeepers. 
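SendAppendRequests slices the outstanding WAL into MAX_SEND_SIZE chunks; the bound computation isolated (constants mirroring walproposer.h, with the default 8 kB XLOG_BLCKSZ assumed):

```c
#include <stdint.h>

#define XLOG_BLCKSZ		8192				/* default WAL block size */
#define MAX_SEND_SIZE	(XLOG_BLCKSZ * 16)	/* as defined in walproposer.h */

/* End LSN of the next append chunk, clamped to the available WAL. */
static uint64_t
next_chunk_end(uint64_t streaming_at, uint64_t available_lsn)
{
	uint64_t	end = streaming_at + MAX_SEND_SIZE;

	return end > available_lsn ? available_lsn : end;
}
```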
+ */ + minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); + if (minQuorumLsn > lastSentCommitLsn) + { + BroadcastAppendRequest(); + lastSentCommitLsn = minQuorumLsn; + } + + return sk->state == SS_ACTIVE; +} + +/* Parse a ReplicationFeedback message, or the ReplicationFeedback part of an AppendResponse */ +void +ParseReplicationFeedbackMessage(StringInfo reply_message, ReplicationFeedback *rf) +{ + uint8 nkeys; + int i; + int32 len; + + /* get number of custom keys */ + nkeys = pq_getmsgbyte(reply_message); + + for (i = 0; i < nkeys; i++) + { + const char *key = pq_getmsgstring(reply_message); + if (strcmp(key, "current_timeline_size") == 0) + { + pq_getmsgint(reply_message, sizeof(int32)); // read value length + rf->currentClusterSize = pq_getmsgint64(reply_message); + elog(DEBUG2, "ParseReplicationFeedbackMessage: current_timeline_size %lu", + rf->currentClusterSize); + } + else if (strcmp(key, "ps_writelsn") == 0) + { + pq_getmsgint(reply_message, sizeof(int32)); // read value length + rf->ps_writelsn = pq_getmsgint64(reply_message); + elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_writelsn %X/%X", + LSN_FORMAT_ARGS(rf->ps_writelsn)); + } + else if (strcmp(key, "ps_flushlsn") == 0) + { + pq_getmsgint(reply_message, sizeof(int32)); // read value length + rf->ps_flushlsn = pq_getmsgint64(reply_message); + elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_flushlsn %X/%X", + LSN_FORMAT_ARGS(rf->ps_flushlsn)); + } + else if (strcmp(key, "ps_applylsn") == 0) + { + pq_getmsgint(reply_message, sizeof(int32)); // read value length + rf->ps_applylsn = pq_getmsgint64(reply_message); + elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_applylsn %X/%X", + LSN_FORMAT_ARGS(rf->ps_applylsn)); + } + else if (strcmp(key, "ps_replytime") == 0) + { + pq_getmsgint(reply_message, sizeof(int32)); // read value length + rf->ps_replytime = pq_getmsgint64(reply_message); + { + char *replyTimeStr; + + /* Copy because timestamptz_to_str returns a static buffer */ + replyTimeStr = pstrdup(timestamptz_to_str(rf->ps_replytime)); + elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_replytime %lu reply_time: %s", + rf->ps_replytime, replyTimeStr); + + pfree(replyTimeStr); + } + } + else + { + len = pq_getmsgint(reply_message, sizeof(int32)); // read value length + // Skip unknown keys to support backward compatibile protocol changes + elog(LOG, "ParseReplicationFeedbackMessage: unknown key: %s len %d", key, len); + pq_getmsgbytes(reply_message, len); + }; + } +} + +/* + * Combine hot standby feedbacks from all safekeepers. + */ +static void +CombineHotStanbyFeedbacks(HotStandbyFeedback * hs) +{ + hs->ts = 0; + hs->xmin.value = ~0; /* largest unsigned value */ + hs->catalog_xmin.value = ~0; /* largest unsigned value */ + + for (int i = 0; i < n_safekeepers; i++) + { + if (safekeeper[i].appendResponse.hs.ts != 0) + { + if (FullTransactionIdPrecedes(safekeeper[i].appendResponse.hs.xmin, hs->xmin)) + { + hs->xmin = safekeeper[i].appendResponse.hs.xmin; + hs->ts = safekeeper[i].appendResponse.hs.ts; + } + if (FullTransactionIdPrecedes(safekeeper[i].appendResponse.hs.catalog_xmin, hs->catalog_xmin)) + { + hs->catalog_xmin = safekeeper[i].appendResponse.hs.catalog_xmin; + hs->ts = safekeeper[i].appendResponse.hs.ts; + } + } + } +} + + +/* + * Get minimum of flushed LSNs of all safekeepers, which is the LSN of the + * last WAL record that can be safely discarded. + */ +static XLogRecPtr +CalculateMinFlushLsn(void) +{ + XLogRecPtr lsn = n_safekeepers > 0 + ? 
safekeeper[0].appendResponse.flushLsn + : InvalidXLogRecPtr; + for (int i = 1; i < n_safekeepers; i++) + { + lsn = Min(lsn, safekeeper[i].appendResponse.flushLsn); + } + return lsn; +} + +/* + * Calculate WAL position acknowledged by quorum + */ +static XLogRecPtr +GetAcknowledgedByQuorumWALPosition(void) +{ + XLogRecPtr responses[MAX_SAFEKEEPERS]; + + /* + * Sort acknowledged LSNs + */ + for (int i = 0; i < n_safekeepers; i++) + { + /* + * Like in Raft, we aren't allowed to commit entries from previous + * terms, so ignore reported LSN until it gets to epochStartLsn. + */ + responses[i] = safekeeper[i].appendResponse.flushLsn >= propEpochStartLsn ? + safekeeper[i].appendResponse.flushLsn : 0; + } + qsort(responses, n_safekeepers, sizeof(XLogRecPtr), CompareLsn); + + /* + * Get the smallest LSN committed by quorum + */ + return responses[n_safekeepers - quorum]; +} + +/* + * ReplicationFeedbackShmemSize --- report amount of shared memory space needed + */ +Size +WalproposerShmemSize(void) +{ + return sizeof(WalproposerShmemState); +} + +bool +WalproposerShmemInit(void) +{ + bool found; + + LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); + walprop_shared = ShmemInitStruct("Walproposer shared state", + sizeof(WalproposerShmemState), + &found); + + if (!found) + { + memset(walprop_shared, 0, WalproposerShmemSize()); + SpinLockInit(&walprop_shared->mutex); + } + LWLockRelease(AddinShmemInitLock); + + return found; +} + +void +replication_feedback_set(ReplicationFeedback *rf) +{ + SpinLockAcquire(&walprop_shared->mutex); + memcpy(&walprop_shared->feedback, rf, sizeof(ReplicationFeedback)); + SpinLockRelease(&walprop_shared->mutex); +} + + +void +replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn) +{ + SpinLockAcquire(&walprop_shared->mutex); + *writeLsn = walprop_shared->feedback.ps_writelsn; + *flushLsn = walprop_shared->feedback.ps_flushlsn; + *applyLsn = walprop_shared->feedback.ps_applylsn; + SpinLockRelease(&walprop_shared->mutex); +} + + +/* + * Get ReplicationFeedback fields from the most advanced safekeeper + */ +static void +GetLatestZentihFeedback(ReplicationFeedback *rf) +{ + int latest_safekeeper = 0; + XLogRecPtr ps_writelsn = InvalidXLogRecPtr; + for (int i = 0; i < n_safekeepers; i++) + { + if (safekeeper[i].appendResponse.rf.ps_writelsn > ps_writelsn) + { + latest_safekeeper = i; + ps_writelsn = safekeeper[i].appendResponse.rf.ps_writelsn; + } + } + + rf->currentClusterSize = safekeeper[latest_safekeeper].appendResponse.rf.currentClusterSize; + rf->ps_writelsn = safekeeper[latest_safekeeper].appendResponse.rf.ps_writelsn; + rf->ps_flushlsn = safekeeper[latest_safekeeper].appendResponse.rf.ps_flushlsn; + rf->ps_applylsn = safekeeper[latest_safekeeper].appendResponse.rf.ps_applylsn; + rf->ps_replytime = safekeeper[latest_safekeeper].appendResponse.rf.ps_replytime; + + elog(DEBUG2, "GetLatestZentihFeedback: currentClusterSize %lu," + " ps_writelsn %X/%X, ps_flushlsn %X/%X, ps_applylsn %X/%X, ps_replytime %lu", + rf->currentClusterSize, + LSN_FORMAT_ARGS(rf->ps_writelsn), + LSN_FORMAT_ARGS(rf->ps_flushlsn), + LSN_FORMAT_ARGS(rf->ps_applylsn), + rf->ps_replytime); + + replication_feedback_set(rf); +} + +static void +HandleSafekeeperResponse(void) +{ + HotStandbyFeedback hsFeedback; + XLogRecPtr minQuorumLsn; + XLogRecPtr diskConsistentLsn; + XLogRecPtr minFlushLsn; + + + minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); + diskConsistentLsn = quorumFeedback.rf.ps_flushlsn; + + if (!syncSafekeepers) + { + // Get ReplicationFeedback fields 
from the most advanced safekeeper + GetLatestZentihFeedback(&quorumFeedback.rf); + SetZenithCurrentClusterSize(quorumFeedback.rf.currentClusterSize); + } + + if (minQuorumLsn > quorumFeedback.flushLsn || diskConsistentLsn != quorumFeedback.rf.ps_flushlsn) + { + + if (minQuorumLsn > quorumFeedback.flushLsn) + quorumFeedback.flushLsn = minQuorumLsn; + + /* advance the replication slot */ + if (!syncSafekeepers) + ProcessStandbyReply( + // write_lsn - This is what durably stored in WAL service. + quorumFeedback.flushLsn, + //flush_lsn - This is what durably stored in WAL service. + quorumFeedback.flushLsn, + //apply_lsn - This is what processed and durably saved at pageserver. + quorumFeedback.rf.ps_flushlsn, + GetCurrentTimestamp(), false); + } + + CombineHotStanbyFeedbacks(&hsFeedback); + if (hsFeedback.ts != 0 && memcmp(&hsFeedback, &quorumFeedback.hs, sizeof hsFeedback) != 0) + { + quorumFeedback.hs = hsFeedback; + if (!syncSafekeepers) + ProcessStandbyHSFeedback(hsFeedback.ts, + XidFromFullTransactionId(hsFeedback.xmin), + EpochFromFullTransactionId(hsFeedback.xmin), + XidFromFullTransactionId(hsFeedback.catalog_xmin), + EpochFromFullTransactionId(hsFeedback.catalog_xmin)); + } + + /* + * Try to advance truncateLsn to minFlushLsn, which is the last record + * flushed to all safekeepers. We must always start streaming from the + * beginning of the record, which simplifies decoding on the far end. + * + * Advanced truncateLsn should be not further than nearest commitLsn. + * This prevents surprising violation of truncateLsn <= commitLsn + * invariant which might occur because 1) truncateLsn can be advanced + * immediately once chunk is broadcast to all safekeepers, and + * commitLsn generally can't be advanced based on feedback from + * safekeeper who is still in the previous epoch (similar to 'leader + * can't commit entries from previous term' in Raft); 2) chunks we + * read from WAL and send are plain sheets of bytes, but safekeepers + * ack only on record boundaries. + */ + minFlushLsn = CalculateMinFlushLsn(); + if (minFlushLsn > truncateLsn) + { + truncateLsn = minFlushLsn; + + /* + * Advance the replication slot to free up old WAL files. Note + * that slot doesn't exist if we are in syncSafekeepers mode. + */ + if (MyReplicationSlot) + PhysicalConfirmReceivedLocation(truncateLsn); + } + + /* + * Generally sync is done when majority switched the epoch so we committed + * epochStartLsn and made the majority aware of it, ensuring they are + * ready to give all WAL to pageserver. It would mean whichever majority + * is alive, there will be at least one safekeeper who is able to stream + * WAL to pageserver to make basebackup possible. However, since at the + * moment we don't have any good mechanism of defining the healthy and + * most advanced safekeeper who should push the wal into pageserver and + * basically the random one gets connected, to prevent hanging basebackup + * (due to pageserver connecting to not-synced-safekeeper) we currently + * wait for all seemingly alive safekeepers to get synced. + */ + if (syncSafekeepers) + { + int n_synced; + + n_synced = 0; + for (int i = 0; i < n_safekeepers; i++) + { + Safekeeper *sk = &safekeeper[i]; + bool synced = sk->appendResponse.commitLsn >= propEpochStartLsn; + + /* alive safekeeper which is not synced yet; wait for it */ + if (sk->state != SS_OFFLINE && !synced) + return; + if (synced) + n_synced++; + } + if (n_synced >= quorum) + { + /* All safekeepers synced! 
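GetAcknowledgedByQuorumWALPosition above is the classic sorted-ack trick: after sorting ascending, the element at index n - quorum is acknowledged by at least quorum nodes (the real code additionally zeroes out LSNs below epochStartLsn, per the Raft rule against committing entries from previous terms). A standalone sketch:

```c
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

static int
cmp_u64(const void *a, const void *b)
{
	uint64_t	x = *(const uint64_t *) a;
	uint64_t	y = *(const uint64_t *) b;

	return (x > y) - (x < y);
}

/* LSN acknowledged by at least `quorum` of `n` safekeepers (n <= 32). */
static uint64_t
quorum_ack_lsn(const uint64_t *acked, int n, int quorum)
{
	uint64_t	sorted[32];		/* MAX_SAFEKEEPERS in the real code */

	memcpy(sorted, acked, n * sizeof(uint64_t));
	qsort(sorted, n, sizeof(uint64_t), cmp_u64);
	return sorted[n - quorum];	/* at least `quorum` entries are >= this */
}
```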
*/ + fprintf(stdout, "%X/%X\n", LSN_FORMAT_ARGS(propEpochStartLsn)); + exit(0); + } + } +} + +/* + * Try to read CopyData message from i'th safekeeper, resetting connection on + * failure. + */ +static bool +AsyncRead(Safekeeper *sk, char **buf, int *buf_size) +{ + switch (walprop_async_read(sk->conn, buf, buf_size)) + { + case PG_ASYNC_READ_SUCCESS: + return true; + + case PG_ASYNC_READ_TRY_AGAIN: + /* WL_SOCKET_READABLE is always set during copyboth */ + return false; + + case PG_ASYNC_READ_FAIL: + elog(WARNING, "Failed to read from node %s:%s in %s state: %s", sk->host, + sk->port, FormatSafekeeperState(sk->state), + walprop_error_message(sk->conn)); + ShutdownConnection(sk); + return false; + } + Assert(false); + return false; +} + +/* + * Read next message with known type into provided struct, by reading a CopyData + * block from the safekeeper's postgres connection, returning whether the read + * was successful. + * + * If the read needs more polling, we return 'false' and keep the state + * unmodified, waiting until it becomes read-ready to try again. If it fully + * failed, a warning is emitted and the connection is reset. + */ +static bool +AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg) +{ + char *buf; + int buf_size; + uint64 tag; + StringInfoData s; + + if (!(AsyncRead(sk, &buf, &buf_size))) + return false; + + /* parse it */ + s.data = buf; + s.len = buf_size; + s.cursor = 0; + + tag = pq_getmsgint64_le(&s); + if (tag != anymsg->tag) + { + elog(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host, + sk->port, FormatSafekeeperState(sk->state)); + ResetConnection(sk); + return false; + } + + switch (tag) + { + case 'g': + { + AcceptorGreeting *msg = (AcceptorGreeting *) anymsg; + msg->term = pq_getmsgint64_le(&s); + msg->nodeId = pq_getmsgint64_le(&s); + pq_getmsgend(&s); + return true; + } + + case 'v': + { + VoteResponse *msg = (VoteResponse *) anymsg; + + msg->term = pq_getmsgint64_le(&s); + msg->voteGiven = pq_getmsgint64_le(&s); + msg->flushLsn = pq_getmsgint64_le(&s); + msg->truncateLsn = pq_getmsgint64_le(&s); + msg->termHistory.n_entries = pq_getmsgint32_le(&s); + msg->termHistory.entries = palloc(sizeof(TermSwitchEntry) * msg->termHistory.n_entries); + for (int i = 0; i < msg->termHistory.n_entries; i++) + { + msg->termHistory.entries[i].term = pq_getmsgint64_le(&s); + msg->termHistory.entries[i].lsn = pq_getmsgint64_le(&s); + } + msg->timelineStartLsn = pq_getmsgint64_le(&s); + pq_getmsgend(&s); + return true; + } + + case 'a': + { + AppendResponse *msg = (AppendResponse *) anymsg; + msg->term = pq_getmsgint64_le(&s); + msg->flushLsn = pq_getmsgint64_le(&s); + msg->commitLsn = pq_getmsgint64_le(&s); + msg->hs.ts = pq_getmsgint64_le(&s); + msg->hs.xmin.value = pq_getmsgint64_le(&s); + msg->hs.catalog_xmin.value = pq_getmsgint64_le(&s); + if (buf_size > APPENDRESPONSE_FIXEDPART_SIZE) + ParseReplicationFeedbackMessage(&s, &msg->rf); + pq_getmsgend(&s); + return true; + } + + default: + { + Assert(false); + return false; + } + } +} + +/* + * Blocking equivalent to AsyncWrite. + * + * We use this everywhere messages are small enough that they should fit in a + * single packet. 
+ */ +static bool +BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState success_state) +{ + uint32 events; + + if (!walprop_blocking_write(sk->conn, msg, msg_size)) + { + elog(WARNING, "Failed to send to node %s:%s in %s state: %s", + sk->host, sk->port, FormatSafekeeperState(sk->state), + walprop_error_message(sk->conn)); + ShutdownConnection(sk); + return false; + } + + sk->state = success_state; + + /* + * If the new state will be waiting for events to happen, update the event + * set to wait for those + */ + events = SafekeeperStateDesiredEvents(success_state); + if (events) + UpdateEventSet(sk, events); + + return true; +} + +/* + * Starts a write into the 'i'th safekeeper's postgres connection, moving to + * flush_state (adjusting eventset) if write still needs flushing. + * + * Returns false if sending is unfinished (requires flushing or conn failed). + * Upon failure, a warning is emitted and the connection is reset. + */ +static bool +AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_state) +{ + switch (walprop_async_write(sk->conn, msg, msg_size)) + { + case PG_ASYNC_WRITE_SUCCESS: + return true; + case PG_ASYNC_WRITE_TRY_FLUSH: + + /* + * We still need to call PQflush some more to finish the job; go + * to the appropriate state. Update the event set at the bottom of + * this function + */ + sk->state = flush_state; + UpdateEventSet(sk, WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE); + return false; + case PG_ASYNC_WRITE_FAIL: + elog(WARNING, "Failed to send to node %s:%s in %s state: %s", + sk->host, sk->port, FormatSafekeeperState(sk->state), + walprop_error_message(sk->conn)); + ShutdownConnection(sk); + return false; + default: + Assert(false); + return false; + } +} + +/* + * Flushes a previous call to AsyncWrite. This only needs to be called when the + * socket becomes read or write ready *after* calling AsyncWrite. + * + * If flushing successfully completes returns true, otherwise false. Event set + * is updated only if connection fails, otherwise caller should manually unset + * WL_SOCKET_WRITEABLE. + */ +static bool +AsyncFlush(Safekeeper *sk) +{ + /*--- + * PQflush returns: + * 0 if successful [we're good to move on] + * 1 if unable to send everything yet [call PQflush again] + * -1 if it failed [emit an error] + */ + switch (walprop_flush(sk->conn)) + { + case 0: + /* flush is done */ + return true; + case 1: + /* Nothing to do; try again when the socket's ready */ + return false; + case -1: + elog(WARNING, "Failed to flush write to node %s:%s in %s state: %s", + sk->host, sk->port, FormatSafekeeperState(sk->state), + walprop_error_message(sk->conn)); + ResetConnection(sk); + return false; + default: + Assert(false); + return false; + } +} + +// Check if we need to suspend inserts because of lagging replication. 
+static uint64 +backpressure_lag_impl(void) +{ + if (max_replication_apply_lag > 0 || max_replication_flush_lag > 0 || max_replication_write_lag > 0) + { + XLogRecPtr writePtr; + XLogRecPtr flushPtr; + XLogRecPtr applyPtr; + XLogRecPtr myFlushLsn = GetFlushRecPtr(); + + replication_feedback_get_lsns(&writePtr, &flushPtr, &applyPtr); +#define MB ((XLogRecPtr)1024*1024) + + elog(DEBUG2, "current flushLsn %X/%X ReplicationFeedback: write %X/%X flush %X/%X apply %X/%X", + LSN_FORMAT_ARGS(myFlushLsn), + LSN_FORMAT_ARGS(writePtr), + LSN_FORMAT_ARGS(flushPtr), + LSN_FORMAT_ARGS(applyPtr)); + + if ((writePtr != InvalidXLogRecPtr + && max_replication_write_lag > 0 + && myFlushLsn > writePtr + max_replication_write_lag*MB)) + { + return (myFlushLsn - writePtr - max_replication_write_lag*MB); + } + + if ((flushPtr != InvalidXLogRecPtr + && max_replication_flush_lag > 0 + && myFlushLsn > flushPtr + max_replication_flush_lag*MB)) + { + return (myFlushLsn - flushPtr - max_replication_flush_lag*MB); + } + + if ((applyPtr != InvalidXLogRecPtr + && max_replication_apply_lag > 0 + && myFlushLsn > applyPtr + max_replication_apply_lag*MB)) + { + return (myFlushLsn - applyPtr - max_replication_apply_lag*MB); + } + } + return 0; +} diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h new file mode 100644 index 0000000000..b684d5264f --- /dev/null +++ b/pgxn/neon/walproposer.h @@ -0,0 +1,540 @@ +#ifndef __NEON_WALPROPOSER_H__ +#define __NEON_WALPROPOSER_H__ + +#include "access/xlogdefs.h" +#include "postgres.h" +#include "port.h" +#include "access/xlog_internal.h" +#include "access/transam.h" +#include "nodes/replnodes.h" +#include "utils/uuid.h" +#include "replication/walreceiver.h" + +#define SK_MAGIC 0xCafeCeefu +#define SK_PROTOCOL_VERSION 2 + +#define MAX_SAFEKEEPERS 32 +#define MAX_SEND_SIZE (XLOG_BLCKSZ * 16) /* max size of a single WAL message */ +#define XLOG_HDR_SIZE (1+8*3) /* 'w' + startPos + walEnd + timestamp */ +#define XLOG_HDR_START_POS 1 /* offset of start position in wal sender message header */ +#define XLOG_HDR_END_POS (1+8) /* offset of end position in wal sender message header */ + +/* + * In the spirit of WL_SOCKET_READABLE and others, this corresponds to no events having occured, + * because all WL_* events are given flags equal to some (1 << i), starting from i = 0 + */ +#define WL_NO_EVENTS 0 + +extern char* wal_acceptors_list; +extern int wal_acceptor_reconnect_timeout; +extern int wal_acceptor_connect_timeout; +extern bool am_wal_proposer; + +struct WalProposerConn; /* Defined in libpqwalproposer */ +typedef struct WalProposerConn WalProposerConn; + +struct WalMessage; +typedef struct WalMessage WalMessage; + +extern char *zenith_timeline_walproposer; +extern char *zenith_tenant_walproposer; + +/* Possible return values from ReadPGAsync */ +typedef enum +{ + /* The full read was successful. buf now points to the data */ + PG_ASYNC_READ_SUCCESS, + /* The read is ongoing. Wait until the connection is read-ready, then try + * again. */ + PG_ASYNC_READ_TRY_AGAIN, + /* Reading failed. Check PQerrorMessage(conn) */ + PG_ASYNC_READ_FAIL, +} PGAsyncReadResult; + +/* Possible return values from WritePGAsync */ +typedef enum +{ + /* The write fully completed */ + PG_ASYNC_WRITE_SUCCESS, + /* The write started, but you'll need to call PQflush some more times + * to finish it off. We just tried, so it's best to wait until the + * connection is read- or write-ready to try again. + * + * If it becomes read-ready, call PQconsumeInput and flush again. 
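backpressure_lag_impl above returns how far the local flush LSN has run ahead of a feedback LSN beyond the configured window (the GUCs are in megabytes, the result in bytes); the per-limit check isolated as a sketch:

```c
#include <stdint.h>

#define MB	((uint64_t) 1024 * 1024)

/* Bytes by which my_flush_lsn exceeds remote_lsn + limit_mb MB; 0 if within. */
static uint64_t
lag_over_limit(uint64_t my_flush_lsn, uint64_t remote_lsn, int limit_mb)
{
	if (remote_lsn == 0 || limit_mb <= 0)
		return 0;				/* no feedback yet, or the check is disabled */
	if (my_flush_lsn > remote_lsn + (uint64_t) limit_mb * MB)
		return my_flush_lsn - remote_lsn - (uint64_t) limit_mb * MB;
	return 0;
}
```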
If it + * becomes write-ready, just call PQflush. + */ + PG_ASYNC_WRITE_TRY_FLUSH, + /* Writing failed. Check PQerrorMessage(conn) */ + PG_ASYNC_WRITE_FAIL, +} PGAsyncWriteResult; + +/* + * WAL safekeeper state, which is used to wait for some event. + * + * States are listed here in the order that they're executed. + * + * Most states, upon failure, will move back to SS_OFFLINE by calls to + * ResetConnection or ShutdownConnection. + */ +typedef enum +{ + /* + * Does not have an active connection and will stay that way until + * further notice. + * + * Moves to SS_CONNECTING_WRITE by calls to ResetConnection. + */ + SS_OFFLINE, + + /* + * Connecting states. "_READ" waits for the socket to be available for + * reading, "_WRITE" waits for writing. There's no difference in the code + * they execute when polled, but we have this distinction in order to + * recreate the event set in HackyRemoveWalProposerEvent. + * + * After the connection is made, "START_WAL_PUSH" query is sent. + */ + SS_CONNECTING_WRITE, + SS_CONNECTING_READ, + + /* + * Waiting for the result of the "START_WAL_PUSH" command. + * + * After we get a successful result, sends handshake to safekeeper. + */ + SS_WAIT_EXEC_RESULT, + + /* + * Executing the receiving half of the handshake. After receiving, moves to + * SS_VOTING. + */ + SS_HANDSHAKE_RECV, + + /* + * Waiting to participate in voting, but a quorum hasn't yet been reached. + * This is an idle state - we do not expect AdvancePollState to be called. + * + * Moved externally by execution of SS_HANDSHAKE_RECV, when we received a + * quorum of handshakes. + */ + SS_VOTING, + + /* + * Already sent voting information, waiting to receive confirmation from the + * node. After receiving, moves to SS_IDLE, if the quorum isn't reached yet. + */ + SS_WAIT_VERDICT, + + /* Need to flush ProposerElected message. */ + SS_SEND_ELECTED_FLUSH, + + /* + * Waiting for quorum to send WAL. Idle state. If the socket becomes + * read-ready, the connection has been closed. + * + * Moves to SS_ACTIVE only by call to StartStreaming. + */ + SS_IDLE, + + /* + * Active phase, when we acquired quorum and have WAL to send or feedback + * to read. + */ + SS_ACTIVE, +} SafekeeperState; + +/* Consensus logical timestamp. */ +typedef uint64 term_t; + +/* neon storage node id */ +typedef uint64 NNodeId; + +/* + * Proposer <-> Acceptor messaging. + */ + +/* Initial Proposer -> Acceptor message */ +typedef struct ProposerGreeting +{ + uint64 tag; /* message tag */ + uint32 protocolVersion; /* proposer-safekeeper protocol version */ + uint32 pgVersion; + pg_uuid_t proposerId; + uint64 systemId; /* Postgres system identifier */ + uint8 ztimelineid[16]; /* Zenith timeline id */ + uint8 ztenantid[16]; + TimeLineID timeline; + uint32 walSegSize; +} ProposerGreeting; + +typedef struct AcceptorProposerMessage +{ + uint64 tag; +} AcceptorProposerMessage; + +/* + * Acceptor -> Proposer initial response: the highest term acceptor voted for. + */ +typedef struct AcceptorGreeting +{ + AcceptorProposerMessage apm; + term_t term; + NNodeId nodeId; +} AcceptorGreeting; + +/* + * Proposer -> Acceptor vote request. + */ +typedef struct VoteRequest +{ + uint64 tag; + term_t term; + pg_uuid_t proposerId; /* for monitoring/debugging */ +} VoteRequest; + +/* Element of term switching chain. 
+
+/* Element of term switching chain. */
+typedef struct TermSwitchEntry
+{
+    term_t      term;
+    XLogRecPtr  lsn;
+} TermSwitchEntry;
+
+typedef struct TermHistory
+{
+    uint32      n_entries;
+    TermSwitchEntry *entries;
+} TermHistory;
+
+/* Vote itself, sent from safekeeper to proposer */
+typedef struct VoteResponse {
+    AcceptorProposerMessage apm;
+    term_t      term;
+    uint64      voteGiven;
+    /*
+     * Safekeeper flush_lsn (end of WAL) + history of term switches allow
+     * proposer to choose the most advanced one.
+     */
+    XLogRecPtr  flushLsn;
+    XLogRecPtr  truncateLsn;    /* minimal LSN which may be needed for recovery of some safekeeper */
+    TermHistory termHistory;
+    XLogRecPtr  timelineStartLsn;   /* timeline globally starts at this LSN */
+} VoteResponse;
+
+/*
+ * Proposer -> Acceptor message announcing proposer is elected and communicating
+ * epoch history to it.
+ */
+typedef struct ProposerElected
+{
+    uint64      tag;
+    term_t      term;
+    /* proposer will send since this point */
+    XLogRecPtr  startStreamingAt;
+    /* history of term switches up to this proposer */
+    TermHistory *termHistory;
+    /* timeline globally starts at this LSN */
+    XLogRecPtr  timelineStartLsn;
+} ProposerElected;
+
+/*
+ * Header of request with WAL message sent from proposer to safekeeper.
+ */
+typedef struct AppendRequestHeader
+{
+    uint64      tag;
+    term_t      term;           /* term of the proposer */
+    /*
+     * LSN since which current proposer appends WAL (begin_lsn of its first
+     * record); determines epoch switch point.
+     */
+    XLogRecPtr  epochStartLsn;
+    XLogRecPtr  beginLsn;       /* start position of message in WAL */
+    XLogRecPtr  endLsn;         /* end position of message in WAL */
+    XLogRecPtr  commitLsn;      /* LSN committed by quorum of safekeepers */
+    /*
+     * minimal LSN which may be needed for recovery of some safekeeper (end
+     * lsn + 1 of last chunk streamed to everyone)
+     */
+    XLogRecPtr  truncateLsn;
+    pg_uuid_t   proposerId;     /* for monitoring/debugging */
+} AppendRequestHeader;
+
+/*
+ * Hot standby feedback received from replica
+ */
+typedef struct HotStandbyFeedback
+{
+    TimestampTz ts;
+    FullTransactionId xmin;
+    FullTransactionId catalog_xmin;
+} HotStandbyFeedback;
+
+
+typedef struct ReplicationFeedback
+{
+    // current size of the timeline on pageserver
+    uint64      currentClusterSize;
+    // standby_status_update fields that safekeeper received from pageserver
+    XLogRecPtr  ps_writelsn;
+    XLogRecPtr  ps_flushlsn;
+    XLogRecPtr  ps_applylsn;
+    TimestampTz ps_replytime;
+} ReplicationFeedback;
+
+
+typedef struct WalproposerShmemState
+{
+    slock_t     mutex;
+    ReplicationFeedback feedback;
+    term_t      mineLastElectedTerm;
+} WalproposerShmemState;
+
+/*
+ * Report safekeeper state to proposer
+ */
+typedef struct AppendResponse
+{
+    AcceptorProposerMessage apm;
+    /*
+     * Current term of the safekeeper; if it is higher than proposer's, the
+     * compute is out of date.
+     */
+    term_t      term;
+    // TODO: add comment
+    XLogRecPtr  flushLsn;
+    // Safekeeper reports back its awareness about which WAL is committed, as
+    // this is a criterion for walproposer --sync mode exit
+    XLogRecPtr  commitLsn;
+    HotStandbyFeedback hs;
+    // Feedback received from pageserver includes standby_status_update fields
+    // and custom zenith feedback.
+    // This part of the message is extensible.
+    ReplicationFeedback rf;
+} AppendResponse;
+
+// ReplicationFeedback is the extensible part of the message, parsed separately;
+// the other fields are the fixed part.
+#define APPENDRESPONSE_FIXEDPART_SIZE offsetof(AppendResponse, rf)
+
+
+/*
+ * Descriptor of safekeeper
+ */
+typedef struct Safekeeper
+{
+    char const* host;
+    char const* port;
+    char        conninfo[MAXCONNINFO];  /* connection info for connecting/reconnecting */
+
+    /*
+     * postgres protocol connection to the WAL acceptor
+     *
+     * Equals NULL only when state = SS_OFFLINE. Nonblocking is set once we
+     * reach SS_ACTIVE; not before.
+     */
+    WalProposerConn* conn;
+    /*
+     * Temporary buffer for the message being sent to the safekeeper.
+     */
+    StringInfoData outbuf;
+    /*
+     * WAL reader, allocated for each safekeeper.
+     */
+    XLogReaderState* xlogreader;
+
+    /*
+     * Streaming will start here; must be record boundary.
+     */
+    XLogRecPtr  startStreamingAt;
+
+    bool        flushWrite;     /* set to true if we need to call AsyncFlush, to flush pending messages */
+    XLogRecPtr  streamingAt;    /* current streaming position */
+    AppendRequestHeader appendRequest;  /* request for sending to safekeeper */
+
+    int         eventPos;       /* position in wait event set. Equal to -1 if no event */
+    SafekeeperState state;      /* safekeeper state machine state */
+    TimestampTz startedConnAt;  /* when connection attempt started */
+    AcceptorGreeting greetResponse; /* acceptor greeting */
+    VoteResponse voteResponse;  /* the vote */
+    AppendResponse appendResponse;  /* feedback for master */
+} Safekeeper;
+
+
+extern PGDLLIMPORT void WalProposerMain(Datum main_arg);
+void WalProposerBroadcast(XLogRecPtr startpos, XLogRecPtr endpos);
+void WalProposerPoll(void);
+void WalProposerRegister(void);
+void ParseReplicationFeedbackMessage(StringInfo reply_message,
+                                     ReplicationFeedback *rf);
+extern void StartProposerReplication(StartReplicationCmd *cmd);
+
+Size WalproposerShmemSize(void);
+bool WalproposerShmemInit(void);
+void replication_feedback_set(ReplicationFeedback *rf);
+void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn);
+
+/* libpqwalproposer hooks & helper type */
+
+/* Re-exported PostgresPollingStatusType */
+typedef enum
+{
+    WP_CONN_POLLING_FAILED = 0,
+    WP_CONN_POLLING_READING,
+    WP_CONN_POLLING_WRITING,
+    WP_CONN_POLLING_OK,
+    /*
+     * 'libpq-fe.h' still has PGRES_POLLING_ACTIVE, but says it's unused.
+     * We've removed it here to avoid clutter.
+     */
+} WalProposerConnectPollStatusType;
+
+/* Re-exported and modified ExecStatusType */
+typedef enum
+{
+    /* We received a single CopyBoth result */
+    WP_EXEC_SUCCESS_COPYBOTH,
+    /*
+     * Any success result other than a single CopyBoth was received. The
+     * specifics of the result were already logged, but it may be useful to
+     * provide an error message indicating which safekeeper messed up.
+     *
+     * Do not expect PQerrorMessage to be appropriately set.
+     */
+    WP_EXEC_UNEXPECTED_SUCCESS,
+    /*
+     * No result available at this time. Wait until read-ready, then call
+     * again. Internally, this is returned when PQisBusy indicates that
+     * PQgetResult would block.
+     */
+    WP_EXEC_NEEDS_INPUT,
+    /* Catch-all failure. Check PQerrorMessage. */
+    WP_EXEC_FAILED,
+} WalProposerExecStatusType;
+
+/* Re-exported ConnStatusType */
+typedef enum
+{
+    WP_CONNECTION_OK,
+    WP_CONNECTION_BAD,
+
+    /*
+     * The original ConnStatusType has many more tags, but requests that
+     * they not be relied upon (except for displaying to the user). We
+     * don't need that extra functionality, so we collect them into a
+     * single tag here.
+     */
+    WP_CONNECTION_IN_PROGRESS,
+} WalProposerConnStatusType;
+
+/* Re-exported PQerrorMessage */
+typedef char* (*walprop_error_message_fn) (WalProposerConn* conn);
+
+/* Re-exported PQstatus */
+typedef WalProposerConnStatusType (*walprop_status_fn) (WalProposerConn* conn);
+
+/* Re-exported PQconnectStart */
+typedef WalProposerConn* (*walprop_connect_start_fn) (char* conninfo);
+
+/* Re-exported PQconnectPoll */
+typedef WalProposerConnectPollStatusType (*walprop_connect_poll_fn) (WalProposerConn* conn);
+
+/* Blocking wrapper around PQsendQuery */
+typedef bool (*walprop_send_query_fn) (WalProposerConn* conn, char* query);
+
+/* Wrapper around PQconsumeInput + PQisBusy + PQgetResult */
+typedef WalProposerExecStatusType (*walprop_get_query_result_fn) (WalProposerConn* conn);
+
+/* Re-exported PQsocket */
+typedef pgsocket (*walprop_socket_fn) (WalProposerConn* conn);
+
+/* Wrapper around PQconsumeInput (if socket's read-ready) + PQflush */
+typedef int (*walprop_flush_fn) (WalProposerConn* conn);
+
+/* Re-exported PQfinish */
+typedef void (*walprop_finish_fn) (WalProposerConn* conn);
+
+/*
+ * Ergonomic wrapper around PQgetCopyData
+ *
+ * Reads a CopyData block from a safekeeper, setting *amount to the number
+ * of bytes returned.
+ *
+ * This function is allowed to assume certain properties specific to the
+ * protocol with the safekeepers, so it should not be used as-is for any
+ * other purpose.
+ *
+ * Note: where possible, a wrapper around this function is generally
+ * preferred, because it performs a bit of extra checking work that's
+ * always required and is normally somewhat verbose.
+ */
+typedef PGAsyncReadResult (*walprop_async_read_fn) (WalProposerConn* conn,
+                                                    char** buf,
+                                                    int* amount);
+
+/*
+ * Ergonomic wrapper around PQputCopyData + PQflush
+ *
+ * Starts to write a CopyData block to a safekeeper.
+ *
+ * For information on the meaning of return codes, refer to PGAsyncWriteResult.
+ */
+typedef PGAsyncWriteResult (*walprop_async_write_fn) (WalProposerConn* conn,
+                                                      void const* buf,
+                                                      size_t size);
+
+/*
+ * Blocking equivalent to walprop_async_write_fn
+ *
+ * Returns 'true' if successful, 'false' on failure.
+ */
+typedef bool (*walprop_blocking_write_fn) (WalProposerConn* conn, void const* buf, size_t size);
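(Editor's illustration, not part of the patch: the function table defined just below is filled in by libpqwalproposer at load time. A minimal sketch of how its initializer might publish the table; the *_impl statics are hypothetical names for the callback implementations:)

    /* Hypothetical callback implementations inside libpqwalproposer.c */
    static char *walprop_error_message_impl(WalProposerConn *conn);
    static WalProposerConnStatusType walprop_status_impl(WalProposerConn *conn);
    /* ... one static per callback ... */

    static WalProposerFunctionsType PQWalProposerAPI = {
        .walprop_error_message = walprop_error_message_impl,
        .walprop_status = walprop_status_impl,
        /* ... remaining callbacks filled in the same way ... */
    };

    void
    _PG_init(void)
    {
        if (WalProposerFunctions != NULL)
            elog(ERROR, "libpqwalproposer already loaded");
        WalProposerFunctions = &PQWalProposerAPI;
    }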
+
+/* All libpqwalproposer exported functions collected together. */
+typedef struct WalProposerFunctionsType
+{
+    walprop_error_message_fn    walprop_error_message;
+    walprop_status_fn           walprop_status;
+    walprop_connect_start_fn    walprop_connect_start;
+    walprop_connect_poll_fn     walprop_connect_poll;
+    walprop_send_query_fn       walprop_send_query;
+    walprop_get_query_result_fn walprop_get_query_result;
+    walprop_socket_fn           walprop_socket;
+    walprop_flush_fn            walprop_flush;
+    walprop_finish_fn           walprop_finish;
+    walprop_async_read_fn       walprop_async_read;
+    walprop_async_write_fn      walprop_async_write;
+    walprop_blocking_write_fn   walprop_blocking_write;
+} WalProposerFunctionsType;
+
+/* Allow the above functions to be "called" with normal syntax */
+#define walprop_error_message(conn) \
+    WalProposerFunctions->walprop_error_message(conn)
+#define walprop_status(conn) \
+    WalProposerFunctions->walprop_status(conn)
+#define walprop_connect_start(conninfo) \
+    WalProposerFunctions->walprop_connect_start(conninfo)
+#define walprop_connect_poll(conn) \
+    WalProposerFunctions->walprop_connect_poll(conn)
+#define walprop_send_query(conn, query) \
+    WalProposerFunctions->walprop_send_query(conn, query)
+#define walprop_get_query_result(conn) \
+    WalProposerFunctions->walprop_get_query_result(conn)
+#define walprop_socket(conn) \
+    WalProposerFunctions->walprop_socket(conn)
+#define walprop_flush(conn) \
+    WalProposerFunctions->walprop_flush(conn)
+#define walprop_finish(conn) \
+    WalProposerFunctions->walprop_finish(conn)
+#define walprop_async_read(conn, buf, amount) \
+    WalProposerFunctions->walprop_async_read(conn, buf, amount)
+#define walprop_async_write(conn, buf, size) \
+    WalProposerFunctions->walprop_async_write(conn, buf, size)
+#define walprop_blocking_write(conn, buf, size) \
+    WalProposerFunctions->walprop_blocking_write(conn, buf, size)
+
+/*
+ * The runtime location of the libpqwalproposer functions.
+ *
+ * This pointer is set by the initializer in libpqwalproposer, so that we
+ * can use it later.
+ */
+extern PGDLLIMPORT WalProposerFunctionsType *WalProposerFunctions;
+
+#endif                          /* __NEON_WALPROPOSER_H__ */
diff --git a/pgxn/neon/walproposer_utils.c b/pgxn/neon/walproposer_utils.c
new file mode 100644
index 0000000000..7b96fd580c
--- /dev/null
+++ b/pgxn/neon/walproposer_utils.c
@@ -0,0 +1,1110 @@
+#include "postgres.h"
+
+#include "access/timeline.h"
+#include "access/xlogutils.h"
+#include "common/logging.h"
+#include "common/ip.h"
+#include "funcapi.h"
+#include "libpq/libpq.h"
+#include "libpq/pqformat.h"
+#include "miscadmin.h"
+#include "postmaster/interrupt.h"
+#include "replication/slot.h"
+#include "walproposer_utils.h"
+#include "replication/walsender_private.h"
+
+#include "storage/ipc.h"
+#include "utils/builtins.h"
+#include "utils/ps_status.h"
+
+#include "libpq-fe.h"
+#include <netinet/tcp.h>
+#include <unistd.h>
+
+/*
+ * These variables are used similarly to openLogFile/SegNo,
+ * but for walproposer to write the XLOG during recovery. walpropFileTLI is
+ * the TimeLineID corresponding to the filename of walpropFile.
+ */
+static int  walpropFile = -1;
+static TimeLineID walpropFileTLI = 0;
+static XLogSegNo walpropSegNo = 0;
+
+/* START cloned file-local variables and functions from walsender.c */
+
+/*
+ * xlogreader used for replication. Note that a WAL sender doing physical
+ * replication does not need xlogreader to read WAL, but it needs one to
+ * keep a state of its work.
+ */
+static XLogReaderState *xlogreader = NULL;
+
+/*
+ * These variables keep track of the state of the timeline we're currently
+ * sending. sendTimeLine identifies the timeline. If sendTimeLineIsHistoric,
+ * the timeline is not the latest timeline on this server, and the server's
+ * history forked off from that timeline at sendTimeLineValidUpto.
+ */
+static TimeLineID sendTimeLine = 0;
+static TimeLineID sendTimeLineNextTLI = 0;
+static bool sendTimeLineIsHistoric = false;
+static XLogRecPtr sendTimeLineValidUpto = InvalidXLogRecPtr;
+
+/*
+ * Timestamp of last ProcessRepliesIfAny() that saw a reply from the
+ * standby. Set to 0 if wal_sender_timeout doesn't need to be active.
+ */
+static TimestampTz last_reply_timestamp = 0;
+
+/* Have we sent a heartbeat message asking for reply, since last reply? */
+static bool waiting_for_ping_response = false;
+
+static bool streamingDoneSending;
+static bool streamingDoneReceiving;
+
+/* Are we there yet? */
+static bool WalSndCaughtUp = false;
+
+/* Flags set by signal handlers for later service in main loop */
+static volatile sig_atomic_t got_STOPPING = false;
+
+/*
+ * How far have we sent WAL already? This is also advertised in
+ * MyWalSnd->sentPtr. (Actually, this is the next WAL location to send.)
+ */
+static XLogRecPtr sentPtr = InvalidXLogRecPtr;
+
+/*
+ * This is set while we are streaming. When not set
+ * PROCSIG_WALSND_INIT_STOPPING signal will be handled like SIGTERM. When set,
+ * the main loop is responsible for checking got_STOPPING and terminating when
+ * it's set (after streaming any remaining WAL).
+ */
+static volatile sig_atomic_t replication_active = false;
+
+typedef void (*WalSndSendDataCallback) (void);
+static void WalSndLoop(WalSndSendDataCallback send_data);
+static void XLogSendPhysical(void);
+static XLogRecPtr GetStandbyFlushRecPtr(void);
+
+static void WalSndSegmentOpen(XLogReaderState *state, XLogSegNo nextSegNo,
+                              TimeLineID *tli_p);
+
+/* END cloned file-local variables and functions from walsender.c */
+
+int
+CompareLsn(const void *a, const void *b)
+{
+    XLogRecPtr  lsn1 = *((const XLogRecPtr *) a);
+    XLogRecPtr  lsn2 = *((const XLogRecPtr *) b);
+
+    if (lsn1 < lsn2)
+        return -1;
+    else if (lsn1 == lsn2)
+        return 0;
+    else
+        return 1;
+}
+
+/*
+ * Returns a human-readable string corresponding to the SafekeeperState
+ *
+ * The string should not be freed.
+ *
+ * The strings are intended to be used as a prefix to "state", e.g.:
+ *
+ *   elog(LOG, "currently in %s state", FormatSafekeeperState(sk->state));
+ *
+ * If this sort of phrasing doesn't fit the message, instead use something like:
+ *
+ *   elog(LOG, "currently in state [%s]", FormatSafekeeperState(sk->state));
+ */
+char*
+FormatSafekeeperState(SafekeeperState state)
+{
+    char       *return_val = NULL;
+
+    switch (state)
+    {
+        case SS_OFFLINE:
+            return_val = "offline";
+            break;
+        case SS_CONNECTING_READ:
+        case SS_CONNECTING_WRITE:
+            return_val = "connecting";
+            break;
+        case SS_WAIT_EXEC_RESULT:
+            return_val = "receiving query result";
+            break;
+        case SS_HANDSHAKE_RECV:
+            return_val = "handshake (receiving)";
+            break;
+        case SS_VOTING:
+            return_val = "voting";
+            break;
+        case SS_WAIT_VERDICT:
+            return_val = "wait-for-verdict";
+            break;
+        case SS_SEND_ELECTED_FLUSH:
+            return_val = "send-announcement-flush";
+            break;
+        case SS_IDLE:
+            return_val = "idle";
+            break;
+        case SS_ACTIVE:
+            return_val = "active";
+            break;
+    }
+
+    Assert(return_val != NULL);
+
+    return return_val;
+}
+
+/* Asserts that the provided events are expected for given safekeeper's state */
+void
+AssertEventsOkForState(uint32 events, Safekeeper* sk)
+{
+    uint32      expected = SafekeeperStateDesiredEvents(sk->state);
+
+    /*
+     * The events are in-line with what we're expecting, under two conditions:
+     *   (a) if we aren't expecting anything, `events` has no read- or
+     *       write-ready component.
+     *   (b) if we are expecting something, there's overlap
+     *       (i.e. `events & expected != 0`)
+     */
+    bool        events_ok_for_state;    /* long name so the `Assert` is more clear later */
+
+    if (expected == WL_NO_EVENTS)
+        events_ok_for_state = ((events & (WL_SOCKET_READABLE|WL_SOCKET_WRITEABLE)) == 0);
+    else
+        events_ok_for_state = ((events & expected) != 0);
+
+    if (!events_ok_for_state)
+    {
+        /*
+         * To give a descriptive message in the case of failure, we use elog
+         * and then an assertion that's guaranteed to fail.
+         */
+        elog(WARNING, "events %s mismatched for safekeeper %s:%s in state [%s]",
+             FormatEvents(events), sk->host, sk->port, FormatSafekeeperState(sk->state));
+        Assert(events_ok_for_state);
+    }
+}
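(Editor's illustration, not part of the patch: SafekeeperStateDesiredEvents, defined next, is typically consumed by keeping a WaitEventSet entry in sync with a safekeeper's state. A sketch with a hypothetical UpdateEventSet() helper; ModifyWaitEvent is the standard latch API:)

    static void
    UpdateEventSet(WaitEventSet *wait_event_set, Safekeeper *sk)
    {
        uint32      events = SafekeeperStateDesiredEvents(sk->state);

        /* eventPos == -1 means the socket is not registered in the set. */
        if (sk->eventPos >= 0)
            ModifyWaitEvent(wait_event_set, sk->eventPos, events, NULL);
    }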
+
+/*
+ * Returns the set of events a safekeeper in this state should be waiting on
+ *
+ * This will return WL_NO_EVENTS (= 0) for some states.
+ */
+uint32
+SafekeeperStateDesiredEvents(SafekeeperState state)
+{
+    uint32      result = WL_NO_EVENTS;
+
+    /* If the state doesn't have a modifier, we can check the base state */
+    switch (state)
+    {
+            /* Connecting states say what they want in the name */
+        case SS_CONNECTING_READ:
+            result = WL_SOCKET_READABLE;
+            break;
+        case SS_CONNECTING_WRITE:
+            result = WL_SOCKET_WRITEABLE;
+            break;
+
+            /* Reading states need the socket to be read-ready to continue */
+        case SS_WAIT_EXEC_RESULT:
+        case SS_HANDSHAKE_RECV:
+        case SS_WAIT_VERDICT:
+            result = WL_SOCKET_READABLE;
+            break;
+
+            /*
+             * Idle states use read-readiness as a sign that the connection
+             * has been disconnected.
+             */
+        case SS_VOTING:
+        case SS_IDLE:
+            result = WL_SOCKET_READABLE;
+            break;
+
+            /*
+             * Flush states require write-ready for flushing.
+             * Active state does both reading and writing.
+             *
+             * TODO: SS_ACTIVE sometimes doesn't need to be write-ready. We
+             * should check sk->flushWrite here to set WL_SOCKET_WRITEABLE.
+             */
+        case SS_SEND_ELECTED_FLUSH:
+        case SS_ACTIVE:
+            result = WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE;
+            break;
+
+            /* The offline state expects no events. */
+        case SS_OFFLINE:
+            result = WL_NO_EVENTS;
+            break;
+
+        default:
+            Assert(false);
+            break;
+    }
+
+    return result;
+}
+
+/*
+ * Returns a human-readable string corresponding to the event set
+ *
+ * If the events do not correspond to something set as the `events` field of
+ * a `WaitEvent`, the returned string may be meaningless.
+ *
+ * The string should not be freed. It should also not be expected to remain
+ * the same between function calls.
+ */
+char*
+FormatEvents(uint32 events)
+{
+    static char return_str[9];
+
+    /* Helper variable to check if there are extra bits */
+    uint32      all_flags = WL_LATCH_SET
+        | WL_SOCKET_READABLE
+        | WL_SOCKET_WRITEABLE
+        | WL_TIMEOUT
+        | WL_POSTMASTER_DEATH
+        | WL_EXIT_ON_PM_DEATH
+        | WL_SOCKET_CONNECTED;
+
+    /*
+     * The formatting here isn't supposed to be *particularly* useful -- it's
+     * just to give a sense of what events have been triggered without
+     * needing to remember your powers of two.
+     */
+    return_str[0] = (events & WL_LATCH_SET       ) ? 'L' : '_';
+    return_str[1] = (events & WL_SOCKET_READABLE ) ? 'R' : '_';
+    return_str[2] = (events & WL_SOCKET_WRITEABLE) ? 'W' : '_';
+    return_str[3] = (events & WL_TIMEOUT         ) ? 'T' : '_';
+    return_str[4] = (events & WL_POSTMASTER_DEATH) ? 'D' : '_';
+    return_str[5] = (events & WL_EXIT_ON_PM_DEATH) ? 'E' : '_';
+    return_str[6] = (events & WL_SOCKET_CONNECTED) ? 'C' : '_';
+
+    if (events & (~all_flags))
+    {
+        elog(WARNING, "Event formatting found unexpected component %d",
+             events & (~all_flags));
+        return_str[7] = '*';
+        return_str[8] = '\0';
+    }
+    else
+        return_str[7] = '\0';
+
+    return return_str;
+}
+
+/*
+ * Convert a character which represents a hexadecimal digit to an integer.
+ *
+ * Returns -1 if the character is not a hexadecimal digit.
+ */
+static int
+HexDecodeChar(char c)
+{
+    if (c >= '0' && c <= '9')
+        return c - '0';
+    if (c >= 'a' && c <= 'f')
+        return c - 'a' + 10;
+    if (c >= 'A' && c <= 'F')
+        return c - 'A' + 10;
+
+    return -1;
+}
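(Editor's illustration, not part of the patch: HexDecodeString, defined next, is the natural tool for turning the 32-hex-character timeline/tenant GUCs into the 16-byte ids carried in ProposerGreeting. A sketch; the error messages are illustrative:)

    ProposerGreeting greet;

    if (!HexDecodeString(greet.ztimelineid, zenith_timeline_walproposer, 16))
        elog(FATAL, "could not parse zenith.timeline_id: %s", zenith_timeline_walproposer);
    if (!HexDecodeString(greet.ztenantid, zenith_tenant_walproposer, 16))
        elog(FATAL, "could not parse zenith.tenant_id: %s", zenith_tenant_walproposer);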
+
+/*
+ * Decode a hex string into a byte string, 2 hex chars per byte.
+ *
+ * Returns false if invalid characters are encountered; otherwise true.
+ */
+bool
+HexDecodeString(uint8 *result, char *input, int nbytes)
+{
+    int         i;
+
+    for (i = 0; i < nbytes; ++i)
+    {
+        int         n1 = HexDecodeChar(input[i * 2]);
+        int         n2 = HexDecodeChar(input[i * 2 + 1]);
+
+        if (n1 < 0 || n2 < 0)
+            return false;
+        result[i] = n1 * 16 + n2;
+    }
+
+    return true;
+}
+
+/* --------------------------------
+ *  pq_getmsgint32_le   - get a binary 4-byte int from a message buffer in native (LE) order
+ * --------------------------------
+ */
+uint32
+pq_getmsgint32_le(StringInfo msg)
+{
+    uint32      n32;
+
+    pq_copymsgbytes(msg, (char *) &n32, sizeof(n32));
+
+    return n32;
+}
+
+/* --------------------------------
+ *  pq_getmsgint64_le   - get a binary 8-byte int from a message buffer in native (LE) order
+ * --------------------------------
+ */
+uint64
+pq_getmsgint64_le(StringInfo msg)
+{
+    uint64      n64;
+
+    pq_copymsgbytes(msg, (char *) &n64, sizeof(n64));
+
+    return n64;
+}
+
+/* append a binary [u]int32 to a StringInfo buffer in native (LE) order */
+void
+pq_sendint32_le(StringInfo buf, uint32 i)
+{
+    enlargeStringInfo(buf, sizeof(uint32));
+    memcpy(buf->data + buf->len, &i, sizeof(uint32));
+    buf->len += sizeof(uint32);
+}
+
+/* append a binary [u]int64 to a StringInfo buffer in native (LE) order */
+void
+pq_sendint64_le(StringInfo buf, uint64 i)
+{
+    enlargeStringInfo(buf, sizeof(uint64));
+    memcpy(buf->data + buf->len, &i, sizeof(uint64));
+    buf->len += sizeof(uint64);
+}
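(Editor's illustration, not part of the patch: these little-endian helpers exist because the safekeeper wire format is little-endian, unlike the big-endian order used by the standard pq_sendint32/pq_getmsgint. A sketch of both directions; `greet` is an assumed ProposerGreeting and `reply` an assumed StringInfo positioned at a message start:)

    /* Send side: serialize fields in wire order. */
    StringInfoData s;

    initStringInfo(&s);
    pq_sendint64_le(&s, greet.tag);
    pq_sendint32_le(&s, greet.protocolVersion);
    pq_sendint32_le(&s, greet.pgVersion);

    /* Receive side mirrors it field by field: */
    uint64      tag = pq_getmsgint64_le(reply);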
+
+/*
+ * Write XLOG data to disk.
+ */
+void
+XLogWalPropWrite(char *buf, Size nbytes, XLogRecPtr recptr)
+{
+    int         startoff;
+    int         byteswritten;
+
+    while (nbytes > 0)
+    {
+        int         segbytes;
+
+        /* Close the current segment if it's completed */
+        if (walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size))
+            XLogWalPropClose(recptr);
+
+        if (walpropFile < 0)
+        {
+            bool        use_existent = true;
+
+            /* Create/use new log file */
+            XLByteToSeg(recptr, walpropSegNo, wal_segment_size);
+            walpropFile = XLogFileInit(walpropSegNo, &use_existent, false);
+            walpropFileTLI = ThisTimeLineID;
+        }
+
+        /* Calculate the start offset of the received logs */
+        startoff = XLogSegmentOffset(recptr, wal_segment_size);
+
+        if (startoff + nbytes > wal_segment_size)
+            segbytes = wal_segment_size - startoff;
+        else
+            segbytes = nbytes;
+
+        /* OK to write the logs */
+        errno = 0;
+
+        byteswritten = pg_pwrite(walpropFile, buf, segbytes, (off_t) startoff);
+        if (byteswritten <= 0)
+        {
+            char        xlogfname[MAXFNAMELEN];
+            int         save_errno;
+
+            /* if write didn't set errno, assume no disk space */
+            if (errno == 0)
+                errno = ENOSPC;
+
+            save_errno = errno;
+            XLogFileName(xlogfname, walpropFileTLI, walpropSegNo, wal_segment_size);
+            errno = save_errno;
+            ereport(PANIC,
+                    (errcode_for_file_access(),
+                     errmsg("could not write to log segment %s "
+                            "at offset %u, length %lu: %m",
+                            xlogfname, startoff, (unsigned long) segbytes)));
+        }
+
+        /* Update state for write */
+        recptr += byteswritten;
+
+        nbytes -= byteswritten;
+        buf += byteswritten;
+    }
+
+    /*
+     * Close the current segment if it's fully written up in the last cycle
+     * of the loop.
+     */
+    if (walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size))
+    {
+        XLogWalPropClose(recptr);
+    }
+}
+
+/*
+ * Close the current segment.
+ */
+void
+XLogWalPropClose(XLogRecPtr recptr)
+{
+    Assert(walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size));
+
+    if (close(walpropFile) != 0)
+    {
+        char        xlogfname[MAXFNAMELEN];
+
+        XLogFileName(xlogfname, walpropFileTLI, walpropSegNo, wal_segment_size);
+
+        ereport(PANIC,
+                (errcode_for_file_access(),
+                 errmsg("could not close log segment %s: %m",
+                        xlogfname)));
+    }
+
+    walpropFile = -1;
+}
+
+/* START of cloned functions from walsender.c */
+
+/*
+ * Handle START_REPLICATION command.
+ *
+ * At the moment, this never returns, but an ereport(ERROR) will take us back
+ * to the main loop.
+ */
+void
+StartProposerReplication(StartReplicationCmd *cmd)
+{
+    XLogRecPtr  FlushPtr;
+
+    if (ThisTimeLineID == 0)
+        ereport(ERROR,
+                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                 errmsg("IDENTIFY_SYSTEM has not been run before START_REPLICATION")));
+
+    /* create xlogreader for physical replication */
+    xlogreader =
+        XLogReaderAllocate(wal_segment_size, NULL,
+                           XL_ROUTINE(.segment_open = WalSndSegmentOpen,
+                                      .segment_close = wal_segment_close),
+                           NULL);
+
+    if (!xlogreader)
+        ereport(ERROR,
+                (errcode(ERRCODE_OUT_OF_MEMORY),
+                 errmsg("out of memory")));
+
+    /*
+     * We assume here that we're logging enough information in the WAL for
+     * log-shipping, since this is checked in PostmasterMain().
+     *
+     * NOTE: wal_level can only change at shutdown, so in most cases it is
+     * difficult for there to be WAL data that we can still see that was
+     * written at wal_level='minimal'.
+     */
+
+    if (cmd->slotname)
+    {
+        ReplicationSlotAcquire(cmd->slotname, true);
+        if (SlotIsLogical(MyReplicationSlot))
+            ereport(ERROR,
+                    (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                     errmsg("cannot use a logical replication slot for physical replication")));
+
+        /*
+         * We don't need to verify the slot's restart_lsn here; instead we
+         * rely on the caller requesting the starting point to use. If the
+         * WAL segment doesn't exist, we'll fail later.
+         */
+    }
+
+    /*
+     * Select the timeline. If it was given explicitly by the client, use
+     * that. Otherwise use the timeline of the last replayed record, which is
+     * kept in ThisTimeLineID.
+     *
+     * Neon doesn't currently use PG Timelines, but it may in the future, so
+     * we keep this code around to lighten the load for when we need it.
+     */
+    if (am_cascading_walsender)
+    {
+        /* this also updates ThisTimeLineID */
+        FlushPtr = GetStandbyFlushRecPtr();
+    }
+    else
+        FlushPtr = GetFlushRecPtr();
+
+    if (cmd->timeline != 0)
+    {
+        XLogRecPtr  switchpoint;
+
+        sendTimeLine = cmd->timeline;
+        if (sendTimeLine == ThisTimeLineID)
+        {
+            sendTimeLineIsHistoric = false;
+            sendTimeLineValidUpto = InvalidXLogRecPtr;
+        }
+        else
+        {
+            List       *timeLineHistory;
+
+            sendTimeLineIsHistoric = true;
+
+            /*
+             * Check that the timeline the client requested exists, and the
+             * requested start location is on that timeline.
+             */
+            timeLineHistory = readTimeLineHistory(ThisTimeLineID);
+            switchpoint = tliSwitchPoint(cmd->timeline, timeLineHistory,
+                                         &sendTimeLineNextTLI);
+            list_free_deep(timeLineHistory);
+
+            /*
+             * Found the requested timeline in the history. Check that
+             * requested startpoint is on that timeline in our history.
+             *
+             * This is quite loose on purpose. We only check that we didn't
+             * fork off the requested timeline before the switchpoint. We
+             * don't check that we switched *to* it before the requested
+             * starting point. This is because the client can legitimately
+             * request to start replication from the beginning of the WAL
+             * segment that contains switchpoint, but on the new timeline, so
+             * that it doesn't end up with a partial segment. If you ask for
+             * too old a starting point, you'll get an error later when we
+             * fail to find the requested WAL segment in pg_wal.
+             *
+             * XXX: we could be more strict here and only allow a startpoint
+             * that's older than the switchpoint, if it's still in the same
+             * WAL segment.
+             */
+            if (!XLogRecPtrIsInvalid(switchpoint) &&
+                switchpoint < cmd->startpoint)
+            {
+                ereport(ERROR,
+                        (errmsg("requested starting point %X/%X on timeline %u is not in this server's history",
+                                LSN_FORMAT_ARGS(cmd->startpoint),
+                                cmd->timeline),
+                         errdetail("This server's history forked from timeline %u at %X/%X.",
+                                   cmd->timeline,
+                                   LSN_FORMAT_ARGS(switchpoint))));
+            }
+            sendTimeLineValidUpto = switchpoint;
+        }
+    }
+    else
+    {
+        sendTimeLine = ThisTimeLineID;
+        sendTimeLineValidUpto = InvalidXLogRecPtr;
+        sendTimeLineIsHistoric = false;
+    }
+
+    streamingDoneSending = streamingDoneReceiving = false;
+
+    /* If there is nothing to stream, don't even enter COPY mode */
+    if (!sendTimeLineIsHistoric || cmd->startpoint < sendTimeLineValidUpto)
+    {
+        /*
+         * When we first start replication the standby will be behind the
+         * primary. For some applications, for example synchronous
+         * replication, it is important to have a clear state for this
+         * initial catchup mode, so we can trigger actions when we change
+         * streaming state later. We may stay in this state for a long time,
+         * which is exactly why we want to be able to monitor whether or not
+         * we are still here.
+         */
+        WalSndSetState(WALSNDSTATE_CATCHUP);
+
+        /*
+         * Don't allow a request to stream from a future point in WAL that
+         * hasn't been flushed to disk in this server yet.
+         */
+        if (FlushPtr < cmd->startpoint)
+        {
+            ereport(ERROR,
+                    (errmsg("requested starting point %X/%X is ahead of the WAL flush position of this server %X/%X",
+                            LSN_FORMAT_ARGS(cmd->startpoint),
+                            LSN_FORMAT_ARGS(FlushPtr))));
+        }
+
+        /* Start streaming from the requested point */
+        sentPtr = cmd->startpoint;
+
+        /* Initialize shared memory status, too */
+        SpinLockAcquire(&MyWalSnd->mutex);
+        MyWalSnd->sentPtr = sentPtr;
+        SpinLockRelease(&MyWalSnd->mutex);
+
+        SyncRepInitConfig();
+
+        /* Main loop of walsender */
+        replication_active = true;
+
+        WalSndLoop(XLogSendPhysical);
+
+        replication_active = false;
+        if (got_STOPPING)
+            proc_exit(0);
+        WalSndSetState(WALSNDSTATE_STARTUP);
+
+        Assert(streamingDoneSending && streamingDoneReceiving);
+    }
+
+    if (cmd->slotname)
+        ReplicationSlotRelease();
+
+    /*
+     * Copy is finished now. Send a single-row result set indicating the next
+     * timeline.
+     */
+    if (sendTimeLineIsHistoric)
+    {
+        char        startpos_str[8 + 1 + 8 + 1];
+        DestReceiver *dest;
+        TupOutputState *tstate;
+        TupleDesc   tupdesc;
+        Datum       values[2];
+        bool        nulls[2];
+
+        snprintf(startpos_str, sizeof(startpos_str), "%X/%X",
+                 LSN_FORMAT_ARGS(sendTimeLineValidUpto));
+
+        dest = CreateDestReceiver(DestRemoteSimple);
+        MemSet(nulls, false, sizeof(nulls));
+
+        /*
+         * Need a tuple descriptor representing two columns. int8 may seem
+         * like a surprising data type for this, but in theory int4 would not
+         * be wide enough for this, as TimeLineID is unsigned.
+         */
+        tupdesc = CreateTemplateTupleDesc(2);
+        TupleDescInitBuiltinEntry(tupdesc, (AttrNumber) 1, "next_tli",
+                                  INT8OID, -1, 0);
+        TupleDescInitBuiltinEntry(tupdesc, (AttrNumber) 2, "next_tli_startpos",
+                                  TEXTOID, -1, 0);
+
+        /* prepare for projection of tuple */
+        tstate = begin_tup_output_tupdesc(dest, tupdesc, &TTSOpsVirtual);
+
+        values[0] = Int64GetDatum((int64) sendTimeLineNextTLI);
+        values[1] = CStringGetTextDatum(startpos_str);
+
+        /* send it to dest */
+        do_tup_output(tstate, values, nulls);
+
+        end_tup_output(tstate);
+    }
+
+    /* Send CommandComplete message */
+    EndReplicationCommand("START_STREAMING");
+}
+
+/*
+ * Returns the latest point in WAL that has been safely flushed to disk, and
+ * can be sent to the standby. This should only be called when in recovery,
+ * ie. we're streaming to a cascaded standby.
+ *
+ * As a side-effect, ThisTimeLineID is updated to the TLI of the last
+ * replayed WAL record.
+ */
+static XLogRecPtr
+GetStandbyFlushRecPtr(void)
+{
+    XLogRecPtr  replayPtr;
+    TimeLineID  replayTLI;
+    XLogRecPtr  receivePtr;
+    TimeLineID  receiveTLI;
+    XLogRecPtr  result;
+
+    /*
+     * We can safely send what's already been replayed. Also, if walreceiver
+     * is streaming WAL from the same timeline, we can send anything that it
+     * has streamed, but hasn't been replayed yet.
+     */
+
+    receivePtr = GetWalRcvFlushRecPtr(NULL, &receiveTLI);
+    replayPtr = GetXLogReplayRecPtr(&replayTLI);
+
+    ThisTimeLineID = replayTLI;
+
+    result = replayPtr;
+    if (receiveTLI == ThisTimeLineID && receivePtr > replayPtr)
+        result = receivePtr;
+
+    return result;
+}
+
+/* XLogReaderRoutine->segment_open callback */
+static void
+WalSndSegmentOpen(XLogReaderState *state, XLogSegNo nextSegNo,
+                  TimeLineID *tli_p)
+{
+    char        path[MAXPGPATH];
+
+    /*-------
+     * When reading from a historic timeline, and there is a timeline switch
+     * within this segment, read from the WAL segment belonging to the new
+     * timeline.
+     *
+     * For example, imagine that this server is currently on timeline 5, and
+     * we're streaming timeline 4. The switch from timeline 4 to 5 happened at
+     * 0/13002088. In pg_wal, we have these files:
+     *
+     * ...
+     * 000000040000000000000012
+     * 000000040000000000000013
+     * 000000050000000000000013
+     * 000000050000000000000014
+     * ...
+     *
+     * In this situation, when requested to send the WAL from segment 0x13, on
+     * timeline 4, we read the WAL from file 000000050000000000000013. Archive
+     * recovery prefers files from newer timelines, so if the segment was
+     * restored from the archive on this server, the file belonging to the old
+     * timeline, 000000040000000000000013, might not exist. Their contents are
+     * equal up to the switchpoint, because at a timeline switch, the used
+     * portion of the old segment is copied to the new file.
+     *-------
+     */
+    *tli_p = sendTimeLine;
+    if (sendTimeLineIsHistoric)
+    {
+        XLogSegNo   endSegNo;
+
+        XLByteToSeg(sendTimeLineValidUpto, endSegNo, state->segcxt.ws_segsize);
+        if (nextSegNo == endSegNo)
+            *tli_p = sendTimeLineNextTLI;
+    }
+
+    XLogFilePath(path, *tli_p, nextSegNo, state->segcxt.ws_segsize);
+    state->seg.ws_file = BasicOpenFile(path, O_RDONLY | PG_BINARY);
+    if (state->seg.ws_file >= 0)
+        return;
+
+    /*
+     * If the file is not found, assume it's because the standby asked for a
+     * too old WAL segment that has already been removed or recycled.
+     */
+    if (errno == ENOENT)
+    {
+        char        xlogfname[MAXFNAMELEN];
+        int         save_errno = errno;
+
+        XLogFileName(xlogfname, *tli_p, nextSegNo, wal_segment_size);
+        errno = save_errno;
+        ereport(ERROR,
+                (errcode_for_file_access(),
+                 errmsg("requested WAL segment %s has already been removed",
+                        xlogfname)));
+    }
+    else
+        ereport(ERROR,
+                (errcode_for_file_access(),
+                 errmsg("could not open file \"%s\": %m",
+                        path)));
+}
+
+
+/* Main loop of walsender process that streams the WAL over Copy messages. */
+static void
+WalSndLoop(WalSndSendDataCallback send_data)
+{
+    /*
+     * Initialize the last reply timestamp. That enables timeout processing
+     * from hereon.
+     */
+    last_reply_timestamp = GetCurrentTimestamp();
+    waiting_for_ping_response = false;
+
+    /*
+     * Loop until we reach the end of this timeline or the client requests to
+     * stop streaming.
+     */
+    for (;;)
+    {
+        /* Clear any already-pending wakeups */
+        ResetLatch(MyLatch);
+
+        CHECK_FOR_INTERRUPTS();
+
+        /* Process any requests or signals received recently */
+        if (ConfigReloadPending)
+        {
+            ConfigReloadPending = false;
+            ProcessConfigFile(PGC_SIGHUP);
+            SyncRepInitConfig();
+        }
+
+        /* always true */
+        if (am_wal_proposer)
+        {
+            send_data();
+            if (WalSndCaughtUp)
+            {
+                if (MyWalSnd->state == WALSNDSTATE_CATCHUP)
+                    WalSndSetState(WALSNDSTATE_STREAMING);
+                WalProposerPoll();
+                WalSndCaughtUp = false;
+            }
+            continue;
+        }
+    }
+}
+
+/*
+ * Send out the WAL in its normal physical/stored form.
+ *
+ * Read up to MAX_SEND_SIZE bytes of WAL that's been flushed to disk,
+ * but not yet sent to the client, and buffer it in the libpq output
+ * buffer.
+ *
+ * If there is no unsent WAL remaining, WalSndCaughtUp is set to true,
+ * otherwise WalSndCaughtUp is set to false.
+ */
+static void
+XLogSendPhysical(void)
+{
+    XLogRecPtr  SendRqstPtr;
+    XLogRecPtr  startptr;
+    XLogRecPtr  endptr;
+    Size        nbytes PG_USED_FOR_ASSERTS_ONLY;
+
+    /* If requested, switch the WAL sender to the stopping state. */
+    if (got_STOPPING)
+        WalSndSetState(WALSNDSTATE_STOPPING);
+
+    if (streamingDoneSending)
+    {
+        WalSndCaughtUp = true;
+        return;
+    }
+
+    /* Figure out how far we can safely send the WAL. */
+    if (sendTimeLineIsHistoric)
+    {
+        /*
+         * Streaming an old timeline that's in this server's history, but is
+         * not the one we're currently inserting or replaying. It can be
+         * streamed up to the point where we switched off that timeline.
+         */
+        SendRqstPtr = sendTimeLineValidUpto;
+    }
+    else if (am_cascading_walsender)
+    {
+        /*
+         * Streaming the latest timeline on a standby.
+         *
+         * Attempt to send all WAL that has already been replayed, so that we
+         * know it's valid. If we're receiving WAL through streaming
+         * replication, it's also OK to send any WAL that has been received
+         * but not replayed.
+         *
+         * The timeline we're recovering from can change, or we can be
+         * promoted. In either case, the current timeline becomes historic.
+         * We need to detect that so that we don't try to stream past the
+         * point where we switched to another timeline. We check for
+         * promotion or timeline switch after calculating FlushPtr, to avoid
+         * a race condition: if the timeline becomes historic just after we
+         * checked that it was still current, it's still OK to stream it up
+         * to the FlushPtr that was calculated before it became historic.
+         */
+        bool        becameHistoric = false;
+
+        SendRqstPtr = GetStandbyFlushRecPtr();
+
+        if (!RecoveryInProgress())
+        {
+            /*
+             * We have been promoted. RecoveryInProgress() updated
+             * ThisTimeLineID to the new current timeline.
+             */
+            am_cascading_walsender = false;
+            becameHistoric = true;
+        }
+        else
+        {
+            /*
+             * Still a cascading standby. But is the timeline we're sending
+             * still the one recovery is recovering from? ThisTimeLineID was
+             * updated by the GetStandbyFlushRecPtr() call above.
+             */
+            if (sendTimeLine != ThisTimeLineID)
+                becameHistoric = true;
+        }
+
+        if (becameHistoric)
+        {
+            /*
+             * The timeline we were sending has become historic. Read the
+             * timeline history file of the new timeline to see where exactly
+             * we forked off from the timeline we were sending.
+             */
+            List       *history;
+
+            history = readTimeLineHistory(ThisTimeLineID);
+            sendTimeLineValidUpto = tliSwitchPoint(sendTimeLine, history, &sendTimeLineNextTLI);
+
+            Assert(sendTimeLine < sendTimeLineNextTLI);
+            list_free_deep(history);
+
+            sendTimeLineIsHistoric = true;
+
+            SendRqstPtr = sendTimeLineValidUpto;
+        }
+    }
+    else
+    {
+        /*
+         * Streaming the current timeline on a primary.
+         *
+         * Attempt to send all data that's already been written out and
+         * fsync'd to disk. We cannot go further than what's been written out
+         * given the current implementation of WALRead(). And in any case
+         * it's unsafe to send WAL that is not securely down to disk on the
+         * primary: if the primary subsequently crashes and restarts,
+         * standbys must not have applied any WAL that got lost on the
+         * primary.
+         */
+        SendRqstPtr = GetFlushRecPtr();
+    }
+
+    /*
+     * Record the current system time as an approximation of the time at
+     * which this WAL location was written for the purposes of lag tracking.
+     *
+     * In theory we could make XLogFlush() record a time in shmem whenever
+     * WAL is flushed and we could get that time as well as the LSN when we
+     * call GetFlushRecPtr() above (and likewise for the cascading standby
+     * equivalent), but rather than putting any new code into the hot WAL
+     * path it seems good enough to capture the time here. We should reach
+     * this after XLogFlush() runs WalSndWakeupProcessRequests(), and
+     * although that may take some time, we read the WAL flush pointer and
+     * take the time very close together here so that we'll get a later
+     * position if it is still moving.
+     *
+     * Because LagTrackerWrite ignores samples when the LSN hasn't advanced,
+     * this gives us a cheap approximation for the WAL flush time for this
+     * LSN.
+     *
+     * Note that the LSN is not necessarily the LSN for the data contained in
+     * the present message; it's the end of the WAL, which might be further
+     * ahead. All the lag tracking machinery cares about is finding out when
+     * that arbitrary LSN is eventually reported as written, flushed and
+     * applied, so that it can measure the elapsed time.
+     */
+    LagTrackerWrite(SendRqstPtr, GetCurrentTimestamp());
+
+    /*
+     * If this is a historic timeline and we've reached the point where we
+     * forked to the next timeline, stop streaming.
+     *
+     * Note: We might already have sent WAL > sendTimeLineValidUpto. The
+     * startup process will normally replay all WAL that has been received
+     * from the primary, before promoting, but if the WAL streaming is
+     * terminated at a WAL page boundary, the valid portion of the timeline
+     * might end in the middle of a WAL record. We might've already sent the
+     * first half of that partial WAL record to the cascading standby, so
+     * that sentPtr > sendTimeLineValidUpto. That's OK; the cascading standby
+     * can't replay the partial WAL record either, so it can still follow our
+     * timeline switch.
+     */
+    if (sendTimeLineIsHistoric && sendTimeLineValidUpto <= sentPtr)
+    {
+        /* close the current file. */
+        if (xlogreader->seg.ws_file >= 0)
+            wal_segment_close(xlogreader);
+
+        /* Send CopyDone */
+        pq_putmessage_noblock('c', NULL, 0);
+        streamingDoneSending = true;
+
+        WalSndCaughtUp = true;
+
+        elog(DEBUG1, "walsender reached end of timeline at %X/%X (sent up to %X/%X)",
+             LSN_FORMAT_ARGS(sendTimeLineValidUpto),
+             LSN_FORMAT_ARGS(sentPtr));
+        return;
+    }
+
+    /* Do we have any work to do? */
+    Assert(sentPtr <= SendRqstPtr);
+    if (SendRqstPtr <= sentPtr)
+    {
+        WalSndCaughtUp = true;
+        return;
+    }
+
+    /*
+     * Figure out how much to send in one message. If there's no more than
+     * MAX_SEND_SIZE bytes to send, send everything. Otherwise send
+     * MAX_SEND_SIZE bytes, but round back to logfile or page boundary.
+     *
+     * The rounding is not only for performance reasons. Walreceiver relies
+     * on the fact that we never split a WAL record across two messages.
+     * Since a long WAL record is split at page boundary into continuation
+     * records, page boundary is always a safe cut-off point. We also assume
+     * that SendRqstPtr never points to the middle of a WAL record.
+     */
+    startptr = sentPtr;
+    endptr = startptr;
+    endptr += MAX_SEND_SIZE;
+
+    /* if we went beyond SendRqstPtr, back off */
+    if (SendRqstPtr <= endptr)
+    {
+        endptr = SendRqstPtr;
+        if (sendTimeLineIsHistoric)
+            WalSndCaughtUp = false;
+        else
+            WalSndCaughtUp = true;
+    }
+    else
+    {
+        /* round down to page boundary. */
+        endptr -= (endptr % XLOG_BLCKSZ);
+        WalSndCaughtUp = false;
+    }
+
+    nbytes = endptr - startptr;
+    Assert(nbytes <= MAX_SEND_SIZE);
+
+    /* always true */
+    if (am_wal_proposer)
+    {
+        WalProposerBroadcast(startptr, endptr);
+    }
+    else
+    {
+        /* code removed for brevity */
+    }
+    sentPtr = endptr;
+
+    /* Update shared memory status */
+    {
+        WalSnd     *walsnd = MyWalSnd;
+
+        SpinLockAcquire(&walsnd->mutex);
+        walsnd->sentPtr = sentPtr;
+        SpinLockRelease(&walsnd->mutex);
+    }
+
+    /* Report progress of XLOG streaming in PS display */
+    if (update_process_title)
+    {
+        char        activitymsg[50];
+
+        snprintf(activitymsg, sizeof(activitymsg), "streaming %X/%X",
+                 LSN_FORMAT_ARGS(sentPtr));
+        set_ps_display(activitymsg);
+    }
+}
+
diff --git a/pgxn/neon/walproposer_utils.h b/pgxn/neon/walproposer_utils.h
new file mode 100644
index 0000000000..4771d3ff82
--- /dev/null
+++ b/pgxn/neon/walproposer_utils.h
@@ -0,0 +1,19 @@
+#ifndef __NEON_WALPROPOSER_UTILS_H__
+#define __NEON_WALPROPOSER_UTILS_H__
+
+#include "walproposer.h"
+
+int CompareLsn(const void *a, const void *b);
+char* FormatSafekeeperState(SafekeeperState state);
+void AssertEventsOkForState(uint32 events, Safekeeper* sk);
+uint32 SafekeeperStateDesiredEvents(SafekeeperState state);
+char* FormatEvents(uint32 events);
+bool HexDecodeString(uint8 *result, char *input, int nbytes);
+uint32 pq_getmsgint32_le(StringInfo msg);
+uint64 pq_getmsgint64_le(StringInfo msg);
+void pq_sendint32_le(StringInfo buf, uint32 i);
+void pq_sendint64_le(StringInfo buf, uint64 i);
+void XLogWalPropWrite(char *buf, Size nbytes, XLogRecPtr recptr);
+void XLogWalPropClose(XLogRecPtr recptr);
+
+#endif                          /* __NEON_WALPROPOSER_UTILS_H__ */
diff --git a/pgxn/neon_test_utils/Makefile b/pgxn/neon_test_utils/Makefile
new file mode 100644
index 0000000000..9c774ec185
--- /dev/null
+++ b/pgxn/neon_test_utils/Makefile
@@ -0,0 +1,15 @@
+# pgxn/neon_test_utils/Makefile
+
+
+MODULE_big = neon_test_utils
+OBJS = \
+	$(WIN32RES) \
+	neontest.o
+
+EXTENSION = neon_test_utils
+DATA = neon_test_utils--1.0.sql
+PGFILEDESC = "neon_test_utils - helpers for neon testing and debugging"
+
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
= "neon_test_utils - helpers for neon testing and debugging" + +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) diff --git a/pgxn/neon_test_utils/neon_test_utils--1.0.sql b/pgxn/neon_test_utils/neon_test_utils--1.0.sql new file mode 100644 index 0000000000..402981a9a6 --- /dev/null +++ b/pgxn/neon_test_utils/neon_test_utils--1.0.sql @@ -0,0 +1,29 @@ +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION neon_test_utils" to load this file. \quit + +CREATE FUNCTION test_consume_xids(nxids int) +RETURNS VOID +AS 'MODULE_PATHNAME', 'test_consume_xids' +LANGUAGE C STRICT +PARALLEL UNSAFE; + +CREATE FUNCTION clear_buffer_cache() +RETURNS VOID +AS 'MODULE_PATHNAME', 'clear_buffer_cache' +LANGUAGE C STRICT +PARALLEL UNSAFE; + +CREATE FUNCTION get_raw_page_at_lsn(relname text, forkname text, blocknum int8, lsn pg_lsn) +RETURNS bytea +AS 'MODULE_PATHNAME', 'get_raw_page_at_lsn' +LANGUAGE C PARALLEL UNSAFE; + +CREATE FUNCTION get_raw_page_at_lsn(tbspc oid, db oid, relfilenode oid, forknum int8, blocknum int8, lsn pg_lsn) +RETURNS bytea +AS 'MODULE_PATHNAME', 'get_raw_page_at_lsn_ex' +LANGUAGE C PARALLEL UNSAFE; + +CREATE FUNCTION neon_xlogflush(lsn pg_lsn) +RETURNS VOID +AS 'MODULE_PATHNAME', 'neon_xlogflush' +LANGUAGE C PARALLEL UNSAFE; diff --git a/pgxn/neon_test_utils/neon_test_utils.control b/pgxn/neon_test_utils/neon_test_utils.control new file mode 100644 index 0000000000..94e6720503 --- /dev/null +++ b/pgxn/neon_test_utils/neon_test_utils.control @@ -0,0 +1,5 @@ +# neon_test_utils extension +comment = 'helpers for neon testing and debugging' +default_version = '1.0' +module_pathname = '$libdir/neon_test_utils' +relocatable = true diff --git a/pgxn/neon_test_utils/neontest.c b/pgxn/neon_test_utils/neontest.c new file mode 100644 index 0000000000..3e30065cd3 --- /dev/null +++ b/pgxn/neon_test_utils/neontest.c @@ -0,0 +1,304 @@ +/*------------------------------------------------------------------------- + * + * neontest.c + * Helpers for neon testing and debugging + * + * IDENTIFICATION + * contrib/neon_test_utils/neontest.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/relation.h" +#include "access/xact.h" +#include "access/xlog.h" +#include "catalog/namespace.h" +#include "fmgr.h" +#include "funcapi.h" +#include "miscadmin.h" +#include "storage/buf_internals.h" +#include "storage/bufmgr.h" +#include "utils/builtins.h" +#include "utils/pg_lsn.h" +#include "utils/rel.h" +#include "utils/varlena.h" +#include "../neon/pagestore_client.h" + +PG_MODULE_MAGIC; + +extern void _PG_init(void); + +PG_FUNCTION_INFO_V1(test_consume_xids); +PG_FUNCTION_INFO_V1(clear_buffer_cache); +PG_FUNCTION_INFO_V1(get_raw_page_at_lsn); +PG_FUNCTION_INFO_V1(get_raw_page_at_lsn_ex); +PG_FUNCTION_INFO_V1(neon_xlogflush); + +/* + * Linkage to functions in zenith module. + * The signature here would need to be updated whenever function parameters change in pagestore_smgr.c + */ +typedef void (*zenith_read_at_lsn_type)(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, + XLogRecPtr request_lsn, bool request_latest, char *buffer); + +static zenith_read_at_lsn_type zenith_read_at_lsn_ptr; + +/* + * Module initialize function: fetch function pointers for cross-module calls. 
+void
+_PG_init(void)
+{
+	/* Asserts verify that typedefs above match original declarations */
+	AssertVariableIsOfType(&zenith_read_at_lsn, zenith_read_at_lsn_type);
+	zenith_read_at_lsn_ptr = (zenith_read_at_lsn_type)
+		load_external_function("$libdir/neon", "zenith_read_at_lsn",
+							   true, NULL);
+}
+
+#define zenith_read_at_lsn zenith_read_at_lsn_ptr
+
+/*
+ * test_consume_xids(int4), for rapidly consuming XIDs, to test wraparound.
+ */
+Datum
+test_consume_xids(PG_FUNCTION_ARGS)
+{
+	int32		nxids = PG_GETARG_INT32(0);
+	TransactionId topxid;
+	FullTransactionId fullxid;
+	TransactionId xid;
+	TransactionId targetxid;
+
+	/* make sure we have a top-XID first */
+	topxid = GetTopTransactionId();
+
+	xid = ReadNextTransactionId();
+
+	targetxid = xid + nxids;
+	while (targetxid < FirstNormalTransactionId)
+		targetxid++;
+
+	while (TransactionIdPrecedes(xid, targetxid))
+	{
+		fullxid = GetNewTransactionId(true);
+		xid = XidFromFullTransactionId(fullxid);
+		elog(DEBUG1, "topxid: %u xid: %u", topxid, xid);
+	}
+
+	PG_RETURN_VOID();
+}
+
+/*
+ * Flush the buffer cache, evicting all pages that are not currently pinned.
+ */
+Datum
+clear_buffer_cache(PG_FUNCTION_ARGS)
+{
+	bool		save_zenith_test_evict;
+
+	/*
+	 * Temporarily set the zenith_test_evict GUC, so that when we pin and
+	 * unpin a buffer, the buffer is evicted. We use that hack to evict all
+	 * buffers, as there is no explicit "evict this buffer" function in the
+	 * buffer manager.
+	 */
+	save_zenith_test_evict = zenith_test_evict;
+	zenith_test_evict = true;
+	PG_TRY();
+	{
+		/* Scan through all the buffers */
+		for (int i = 0; i < NBuffers; i++)
+		{
+			BufferDesc *bufHdr;
+			uint32		buf_state;
+			Buffer		bufferid;
+			bool		isvalid;
+			RelFileNode rnode;
+			ForkNumber	forknum;
+			BlockNumber blocknum;
+
+			/* Peek into the buffer header to see what page it holds. */
+			bufHdr = GetBufferDescriptor(i);
+			buf_state = LockBufHdr(bufHdr);
+
+			if ((buf_state & BM_VALID) && (buf_state & BM_TAG_VALID))
+				isvalid = true;
+			else
+				isvalid = false;
+			bufferid = BufferDescriptorGetBuffer(bufHdr);
+			rnode = bufHdr->tag.rnode;
+			forknum = bufHdr->tag.forkNum;
+			blocknum = bufHdr->tag.blockNum;
+
+			UnlockBufHdr(bufHdr, buf_state);
+
+			/*
+			 * Pin the buffer, and release it again. Because we have
+			 * zenith_test_evict==true, this will evict the page from the
+			 * buffer cache if no one else is holding a pin on it.
+			 */
+			if (isvalid)
+			{
+				if (ReadRecentBuffer(rnode, forknum, blocknum, bufferid))
+					ReleaseBuffer(bufferid);
+			}
+		}
+	}
+	PG_FINALLY();
+	{
+		/* restore the GUC */
+		zenith_test_evict = save_zenith_test_evict;
+	}
+	PG_END_TRY();
+
+	PG_RETURN_VOID();
+}
+
+
+/*
+ * Reads the page from the page server without going through the buffer
+ * cache. Mimics get_raw_page() in pageinspect, but allows reading a page
+ * version at a specific LSN. A NULL read LSN results in reading the latest
+ * version.
+ *
+ * Note: reading the latest version will wait for the latest changes to
+ * reach the page server. If this is undesirable, use pageinspect's
+ * get_raw_page, which uses buffered access to the latest page.
+ */
+Datum
+get_raw_page_at_lsn(PG_FUNCTION_ARGS)
+{
+	bytea	   *raw_page;
+	ForkNumber	forknum;
+	RangeVar   *relrv;
+	Relation	rel;
+	char	   *raw_page_data;
+	text	   *relname;
+	text	   *forkname;
+	uint32		blkno;
+
+	bool		request_latest = PG_ARGISNULL(3);
+	uint64		read_lsn = request_latest ? GetXLogInsertRecPtr() : PG_GETARG_INT64(3);
+
+	if (PG_ARGISNULL(0) || PG_ARGISNULL(1) || PG_ARGISNULL(2))
+		PG_RETURN_NULL();
+
+	relname = PG_GETARG_TEXT_PP(0);
+	forkname = PG_GETARG_TEXT_PP(1);
+	blkno = PG_GETARG_UINT32(2);
+
+	if (!superuser())
+		ereport(ERROR,
+				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+				 errmsg("must be superuser to use raw page functions")));
+
+	relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname));
+	rel = relation_openrv(relrv, AccessShareLock);
+
+	/* Check that this relation has storage */
+	if (rel->rd_rel->relkind == RELKIND_VIEW)
+		ereport(ERROR,
+				(errcode(ERRCODE_WRONG_OBJECT_TYPE),
+				 errmsg("cannot get raw page from view \"%s\"",
+						RelationGetRelationName(rel))));
+	if (rel->rd_rel->relkind == RELKIND_COMPOSITE_TYPE)
+		ereport(ERROR,
+				(errcode(ERRCODE_WRONG_OBJECT_TYPE),
+				 errmsg("cannot get raw page from composite type \"%s\"",
+						RelationGetRelationName(rel))));
+	if (rel->rd_rel->relkind == RELKIND_FOREIGN_TABLE)
+		ereport(ERROR,
+				(errcode(ERRCODE_WRONG_OBJECT_TYPE),
+				 errmsg("cannot get raw page from foreign table \"%s\"",
+						RelationGetRelationName(rel))));
+	if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
+		ereport(ERROR,
+				(errcode(ERRCODE_WRONG_OBJECT_TYPE),
+				 errmsg("cannot get raw page from partitioned table \"%s\"",
+						RelationGetRelationName(rel))));
+	if (rel->rd_rel->relkind == RELKIND_PARTITIONED_INDEX)
+		ereport(ERROR,
+				(errcode(ERRCODE_WRONG_OBJECT_TYPE),
+				 errmsg("cannot get raw page from partitioned index \"%s\"",
+						RelationGetRelationName(rel))));
+
+	/*
+	 * Reject attempts to read non-local temporary relations; we would be
+	 * likely to get wrong data since we have no visibility into the owning
+	 * session's local buffers.
+	 */
+	if (RELATION_IS_OTHER_TEMP(rel))
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("cannot access temporary tables of other sessions")));
+
+
+	forknum = forkname_to_number(text_to_cstring(forkname));
+
+	/* Initialize buffer to copy to */
+	raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ);
+	SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ);
+	raw_page_data = VARDATA(raw_page);
+
+	zenith_read_at_lsn(rel->rd_node, forknum, blkno, read_lsn, request_latest, raw_page_data);
+
+	relation_close(rel, AccessShareLock);
+
+	PG_RETURN_BYTEA_P(raw_page);
+}
+
+/*
+ * Another way to read a relation page from the page server without the
+ * buffer cache. This version doesn't validate input and allows reading
+ * blocks of dropped relations.
+ *
+ * Note: reading the latest version will wait for the latest changes to
+ * reach the page server. If this is undesirable, use pageinspect's
+ * get_raw_page, which uses buffered access to the latest page.
+ */
+Datum
+get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS)
+{
+	char	   *raw_page_data;
+
+	if (!superuser())
+		ereport(ERROR,
+				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+				 errmsg("must be superuser to use raw page functions")));
+
+	if (PG_ARGISNULL(0) || PG_ARGISNULL(1) || PG_ARGISNULL(2) ||
+		PG_ARGISNULL(3) || PG_ARGISNULL(4))
+		PG_RETURN_NULL();
+
+	{
+		RelFileNode rnode = {
+			.spcNode = PG_GETARG_OID(0),
+			.dbNode = PG_GETARG_OID(1),
+			.relNode = PG_GETARG_OID(2)
+		};
+
+		ForkNumber	forknum = PG_GETARG_UINT32(3);
+
+		uint32		blkno = PG_GETARG_UINT32(4);
+		bool		request_latest = PG_ARGISNULL(5);
+		uint64		read_lsn = request_latest ? GetXLogInsertRecPtr() : PG_GETARG_INT64(5);
+
+
+		/* Initialize buffer to copy to */
+		bytea	   *raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ);
+
+		SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ);
+		raw_page_data = VARDATA(raw_page);
+
+		zenith_read_at_lsn(rnode, forknum, blkno, read_lsn, request_latest, raw_page_data);
+		PG_RETURN_BYTEA_P(raw_page);
+	}
+}
+
+/*
+ * Directly calls XLogFlush(lsn) to flush WAL buffers.
+ */
+Datum
+neon_xlogflush(PG_FUNCTION_ARGS)
+{
+	XLogRecPtr	lsn = PG_GETARG_LSN(0);
+
+	XLogFlush(lsn);
+	PG_RETURN_VOID();
+}
diff --git a/vendor/postgres b/vendor/postgres
index a479855158..8f132d968c 160000
--- a/vendor/postgres
+++ b/vendor/postgres
@@ -1 +1 @@
-Subproject commit a4798551587fb5a52740687a341af83b28733dc6
+Subproject commit 8f132d968cd44068fc6f72e4047f7d3d6320f4bb
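(Editor's illustration, not part of the patch: backpressure_lag_impl() at the top of this change depends on replication_feedback_get_lsns(), declared in walproposer.h. A minimal sketch of how the shared-memory accessors are plausibly implemented; the `walprop_shared` static is an assumed pointer to the WalproposerShmemState set up by WalproposerShmemInit():)

    static WalproposerShmemState *walprop_shared;

    void
    replication_feedback_set(ReplicationFeedback *rf)
    {
        SpinLockAcquire(&walprop_shared->mutex);
        memcpy(&walprop_shared->feedback, rf, sizeof(ReplicationFeedback));
        SpinLockRelease(&walprop_shared->mutex);
    }

    void
    replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn)
    {
        SpinLockAcquire(&walprop_shared->mutex);
        *writeLsn = walprop_shared->feedback.ps_writelsn;
        *flushLsn = walprop_shared->feedback.ps_flushlsn;
        *applyLsn = walprop_shared->feedback.ps_applylsn;
        SpinLockRelease(&walprop_shared->mutex);
    }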