diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 71b9e8d803..6e570b22d4 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -136,6 +136,10 @@ jobs: run: mold -run make postgres -j$(nproc) shell: bash -euxo pipefail {0} + - name: Build neon extensions + run: mold -run make neon-pg-ext -j$(nproc) + shell: bash -euxo pipefail {0} + - name: Run cargo build run: | ${cov_prefix} mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests diff --git a/.github/workflows/codestyle.yml b/.github/workflows/codestyle.yml index 029beba351..eddfee88fc 100644 --- a/.github/workflows/codestyle.yml +++ b/.github/workflows/codestyle.yml @@ -81,6 +81,9 @@ jobs: if: steps.cache_pg.outputs.cache-hit != 'true' run: make postgres + - name: Build neon extensions + run: make neon-pg-ext + # Plain configure output can contain weird errors like 'error: C compiler cannot create executables' # and the real cause will be inside config.log - name: Print configure logs in case of failure diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 117a4155cd..4527fb9ece 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -13,7 +13,8 @@ RUN cd postgres && \ make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \ make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \ # Install headers - make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install + make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \ + make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install # Build PostGIS from the upstream PostGIS mirror. PostGIS compiles against neon postgres sources without changes. 
# Perhaps we could even use the upstream binaries, compiled against vanilla Postgres, but it would require some
@@ -55,6 +56,16 @@ RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.3.tar.gz && \
     make install && \
     rm -rf /plv8-*
 
+# Compile neon extensions
+FROM build-deps AS neon-pg-ext-build
+COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY pgxn/ pgxn/
+
+RUN make -j $(getconf _NPROCESSORS_ONLN) \
+    PG_CONFIG=/usr/local/pgsql/bin/pg_config \
+    -C pgxn/neon \
+    -s install
+
 # Compile and run the Neon-specific `compute_ctl` binary
 FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:$TAG AS compute-tools
 USER nonroot
@@ -73,8 +84,8 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
     echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig
 
 # TODO: Check if we can make the extension setup more modular versus a linear build
-# currently plv8-build copies the output /usr/local/pgsql from postgis-build#
-COPY --from=plv8-build --chown=postgres /usr/local/pgsql /usr/local
+# currently plv8-build copies the output /usr/local/pgsql from postgis-build, etc.
+COPY --from=neon-pg-ext-build --chown=postgres /usr/local/pgsql /usr/local
 COPY --from=compute-tools --chown=postgres /home/nonroot/compute_tools/target/release/compute_ctl /usr/local/bin/compute_ctl
 
 RUN apt update && \
diff --git a/Makefile b/Makefile
index fc75e9fc5e..9d7e1497e5 100644
--- a/Makefile
+++ b/Makefile
@@ -51,7 +51,7 @@ CARGO_CMD_PREFIX += CARGO_TERM_PROGRESS_WHEN=never CI=1
 # Top level Makefile to build Zenith and PostgreSQL
 #
 .PHONY: all
-all: zenith postgres
+all: zenith postgres neon-pg-ext
 
 ### Zenith Rust bits
 #
@@ -87,25 +87,40 @@ postgres: postgres-configure \
 	postgres-headers # to prevent `make install` conflicts with zenith's `postgres-headers`
 	+@echo "Compiling PostgreSQL"
 	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build MAKELEVEL=0 install
-	+@echo "Compiling contrib/neon"
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/neon install
-	+@echo "Compiling contrib/neon_test_utils"
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/neon_test_utils install
+	+@echo "Compiling libpq"
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/src/interfaces/libpq install
 	+@echo "Compiling pg_buffercache"
 	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/pg_buffercache install
 	+@echo "Compiling pageinspect"
 	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/pageinspect install
-
 .PHONY: postgres-clean
 postgres-clean:
 	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build MAKELEVEL=0 clean
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/pg_buffercache clean
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/pageinspect clean
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/src/interfaces/libpq clean
+
+neon-pg-ext: postgres
+	+@echo "Compiling neon"
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/bin/pg_config \
+		-C $(ROOT_PROJECT_DIR)/pgxn/neon install
+	+@echo "Compiling neon_test_utils"
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/bin/pg_config \
+		-C $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils install
+
+.PHONY: neon-pg-ext-clean
+neon-pg-ext-clean:
+	$(MAKE) -C $(ROOT_PROJECT_DIR)/pgxn/neon clean
+	$(MAKE) -C $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils clean
 
 # This doesn't remove the effects of 'configure'.
.PHONY: clean clean: cd $(POSTGRES_INSTALL_DIR)/build && $(MAKE) clean $(CARGO_CMD_PREFIX) cargo clean + cd pgxn/neon && $(MAKE) clean + cd pgxn/neon_test_utils && $(MAKE) clean # This removes everything .PHONY: distclean diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile new file mode 100644 index 0000000000..a6ce611974 --- /dev/null +++ b/pgxn/neon/Makefile @@ -0,0 +1,26 @@ +# pgxs/neon/Makefile + + +MODULE_big = neon +OBJS = \ + $(WIN32RES) \ + inmem_smgr.o \ + libpagestore.o \ + libpqwalproposer.o \ + pagestore_smgr.o \ + relsize_cache.o \ + neon.o \ + walproposer.o \ + walproposer_utils.o + +PG_CPPFLAGS = -I$(libpq_srcdir) +SHLIB_LINK_INTERNAL = $(libpq) + +EXTENSION = neon +DATA = neon--1.0.sql +PGFILEDESC = "neon - cloud storage for PostgreSQL" + + +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) diff --git a/pgxn/neon/inmem_smgr.c b/pgxn/neon/inmem_smgr.c new file mode 100644 index 0000000000..7840292b08 --- /dev/null +++ b/pgxn/neon/inmem_smgr.c @@ -0,0 +1,286 @@ +/*------------------------------------------------------------------------- + * + * inmem_smgr.c + * + * This is an implementation of the SMGR interface, used in the WAL redo + * process (see src/backend/tcop/zenith_wal_redo.c). It has no persistent + * storage, the pages that are written out are kept in a small number of + * in-memory buffers. + * + * Normally, replaying a WAL record only needs to access a handful of + * buffers, which fit in the normal buffer cache, so this is just for + * "overflow" storage when the buffer cache is not large enough. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * contrib/neon/inmem_smgr.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/xlog.h" +#include "pagestore_client.h" +#include "storage/block.h" +#include "storage/buf_internals.h" +#include "storage/relfilenode.h" +#include "storage/smgr.h" + +/* Size of the in-memory smgr */ +#define MAX_PAGES 64 + +/* If more than WARN_PAGES are used, print a warning in the log */ +#define WARN_PAGES 32 + +static BufferTag page_tag[MAX_PAGES]; +static char page_body[MAX_PAGES][BLCKSZ]; +static int used_pages; + +static int +locate_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno) +{ + /* We only hold a small number of pages, so linear search */ + for (int i = 0; i < used_pages; i++) + { + if (RelFileNodeEquals(reln->smgr_rnode.node, page_tag[i].rnode) + && forknum == page_tag[i].forkNum + && blkno == page_tag[i].blockNum) + { + return i; + } + } + return -1; +} + +/* + * inmem_init() -- Initialize private state + */ +void +inmem_init(void) +{ + used_pages = 0; +} + +/* + * inmem_exists() -- Does the physical file exist? + */ +bool +inmem_exists(SMgrRelation reln, ForkNumber forknum) +{ + for (int i = 0; i < used_pages; i++) + { + if (RelFileNodeEquals(reln->smgr_rnode.node, page_tag[i].rnode) + && forknum == page_tag[i].forkNum) + { + return true; + } + } + return false; +} + +/* + * inmem_create() -- Create a new relation on zenithd storage + * + * If isRedo is true, it's okay for the relation to exist already. + */ +void +inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo) +{ +} + +/* + * inmem_unlink() -- Unlink a relation. 
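+ *
+ * Like inmem_create() above, this is a no-op: the in-memory smgr keeps no
+ * persistent state per relation, so there is nothing to remove.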
+ */ +void +inmem_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo) +{ +} + +/* + * inmem_extend() -- Add a block to the specified relation. + * + * The semantics are nearly the same as mdwrite(): write at the + * specified position. However, this is to be used for the case of + * extending a relation (i.e., blocknum is at or beyond the current + * EOF). Note that we assume writing a block beyond current EOF + * causes intervening file space to become filled with zeroes. + */ +void +inmem_extend(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, + char *buffer, bool skipFsync) +{ + /* same as smgwrite() for us */ + inmem_write(reln, forknum, blkno, buffer, skipFsync); +} + +/* + * inmem_open() -- Initialize newly-opened relation. + */ +void +inmem_open(SMgrRelation reln) +{ +} + +/* + * inmem_close() -- Close the specified relation, if it isn't closed already. + */ +void +inmem_close(SMgrRelation reln, ForkNumber forknum) +{ +} + +/* + * inmem_prefetch() -- Initiate asynchronous read of the specified block of a relation + */ +bool +inmem_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) +{ + return true; +} + +/* + * inmem_writeback() -- Tell the kernel to write pages back to storage. + */ +void +inmem_writeback(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks) +{ +} + +/* + * inmem_read() -- Read the specified block from a relation. + */ +void +inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, + char *buffer) +{ + int pg; + + pg = locate_page(reln, forknum, blkno); + if (pg < 0) + memset(buffer, 0, BLCKSZ); + else + memcpy(buffer, page_body[pg], BLCKSZ); +} + +/* + * inmem_write() -- Write the supplied block at the appropriate location. + * + * This is to be used only for updating already-existing blocks of a + * relation (ie, those before the current EOF). To extend a relation, + * use mdextend(). + */ +void +inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + char *buffer, bool skipFsync) +{ + int pg; + + pg = locate_page(reln, forknum, blocknum); + if (pg < 0) + { + /* + * We assume the buffer cache is large enough to hold all the buffers + * needed for most operations. Overflowing to this "in-mem smgr" in rare + * cases is OK. But if we find that we're using more than WARN_PAGES, + * print a warning so that we get alerted and get to investigate why + * we're accessing so many buffers. + */ + elog(used_pages >= WARN_PAGES ? WARNING : DEBUG1, + "inmem_write() called for %u/%u/%u.%u blk %u: used_pages %u", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum, + blocknum, + used_pages); + if (used_pages == MAX_PAGES) + elog(ERROR, "Inmem storage overflow"); + + pg = used_pages; + used_pages++; + INIT_BUFFERTAG(page_tag[pg], reln->smgr_rnode.node, forknum, blocknum); + } else { + elog(DEBUG1, "inmem_write() called for %u/%u/%u.%u blk %u: found at %u", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum, + blocknum, + used_pages); + } + memcpy(page_body[pg], buffer, BLCKSZ); +} + +/* + * inmem_nblocks() -- Get the number of blocks stored in a relation. + */ +BlockNumber +inmem_nblocks(SMgrRelation reln, ForkNumber forknum) +{ + /* + * It's not clear why a WAL redo function would call smgrnblocks(). 
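+ * (smgrnblocks() is what RelationGetNumberOfBlocks() resolves to, so any
+ * redo routine that checks a relation's size ends up here.)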
+ * During recovery, at least before reaching consistency, the size of a + * relation could be arbitrarily small, if it was truncated after the + * record being replayed, or arbitrarily large if it was extended + * afterwards. But one place where it's called is in + * XLogReadBufferExtended(): it extends the relation, if it's smaller than + * the requested page. That's a waste of time in the WAL redo + * process. Pretend that all relations are maximally sized to avoid it. + */ + return MaxBlockNumber; +} + +/* + * inmem_truncate() -- Truncate relation to specified number of blocks. + */ +void +inmem_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) +{ +} + +/* + * inmem_immedsync() -- Immediately sync a relation to stable storage. + */ +void +inmem_immedsync(SMgrRelation reln, ForkNumber forknum) +{ +} + +static const struct f_smgr inmem_smgr = +{ + .smgr_init = inmem_init, + .smgr_shutdown = NULL, + .smgr_open = inmem_open, + .smgr_close = inmem_close, + .smgr_create = inmem_create, + .smgr_exists = inmem_exists, + .smgr_unlink = inmem_unlink, + .smgr_extend = inmem_extend, + .smgr_prefetch = inmem_prefetch, + .smgr_read = inmem_read, + .smgr_write = inmem_write, + .smgr_writeback = inmem_writeback, + .smgr_nblocks = inmem_nblocks, + .smgr_truncate = inmem_truncate, + .smgr_immedsync = inmem_immedsync, +}; + +const f_smgr * +smgr_inmem(BackendId backend, RelFileNode rnode) +{ + Assert(InRecovery); + if (backend != InvalidBackendId) + return smgr_standard(backend, rnode); + else + return &inmem_smgr; +} + +void +smgr_init_inmem() +{ + inmem_init(); +} diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c new file mode 100644 index 0000000000..649fc1037e --- /dev/null +++ b/pgxn/neon/libpagestore.c @@ -0,0 +1,432 @@ +/*------------------------------------------------------------------------- + * + * libpagestore.c + * Handles network communications with the remote pagestore. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * contrib/neon/libpqpagestore.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "pagestore_client.h" +#include "fmgr.h" +#include "access/xlog.h" + +#include "libpq-fe.h" +#include "libpq/pqformat.h" +#include "libpq/libpq.h" + +#include "miscadmin.h" +#include "pgstat.h" +#include "utils/guc.h" + +#include "neon.h" +#include "walproposer.h" +#include "walproposer_utils.h" + + +#define PageStoreTrace DEBUG5 + +#define NEON_TAG "[NEON_SMGR] " +#define neon_log(tag, fmt, ...) 
ereport(tag, \ + (errmsg(NEON_TAG fmt, ## __VA_ARGS__), \ + errhidestmt(true), errhidecontext(true))) + +bool connected = false; +PGconn *pageserver_conn = NULL; + +char *page_server_connstring_raw; + +static ZenithResponse *pageserver_call(ZenithRequest *request); +page_server_api api = { + .request = pageserver_call +}; + +static void +pageserver_connect() +{ + char *query; + int ret; + + Assert(!connected); + + pageserver_conn = PQconnectdb(page_server_connstring); + + if (PQstatus(pageserver_conn) == CONNECTION_BAD) + { + char *msg = pchomp(PQerrorMessage(pageserver_conn)); + + PQfinish(pageserver_conn); + pageserver_conn = NULL; + ereport(ERROR, + (errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION), + errmsg(NEON_TAG "could not establish connection to pageserver"), + errdetail_internal("%s", msg))); + } + + query = psprintf("pagestream %s %s", zenith_tenant, zenith_timeline); + ret = PQsendQuery(pageserver_conn, query); + if (ret != 1) + { + PQfinish(pageserver_conn); + pageserver_conn = NULL; + neon_log(ERROR, "could not send pagestream command to pageserver"); + } + + while (PQisBusy(pageserver_conn)) + { + int wc; + + /* Sleep until there's something to do */ + wc = WaitLatchOrSocket(MyLatch, + WL_LATCH_SET | WL_SOCKET_READABLE | + WL_EXIT_ON_PM_DEATH, + PQsocket(pageserver_conn), + -1L, PG_WAIT_EXTENSION); + ResetLatch(MyLatch); + + CHECK_FOR_INTERRUPTS(); + + /* Data available in socket? */ + if (wc & WL_SOCKET_READABLE) + { + if (!PQconsumeInput(pageserver_conn)) + { + char *msg = pchomp(PQerrorMessage(pageserver_conn)); + + PQfinish(pageserver_conn); + pageserver_conn = NULL; + + neon_log(ERROR, "could not complete handshake with pageserver: %s", + msg); + } + } + } + + neon_log(LOG, "libpagestore: connected to '%s'", page_server_connstring_raw); + + connected = true; +} + +/* + * A wrapper around PQgetCopyData that checks for interrupts while sleeping. + */ +static int +call_PQgetCopyData(PGconn *conn, char **buffer) +{ + int ret; + +retry: + ret = PQgetCopyData(conn, buffer, 1 /* async */ ); + + if (ret == 0) + { + int wc; + + /* Sleep until there's something to do */ + wc = WaitLatchOrSocket(MyLatch, + WL_LATCH_SET | WL_SOCKET_READABLE | + WL_EXIT_ON_PM_DEATH, + PQsocket(conn), + -1L, PG_WAIT_EXTENSION); + ResetLatch(MyLatch); + + CHECK_FOR_INTERRUPTS(); + + /* Data available in socket? */ + if (wc & WL_SOCKET_READABLE) + { + if (!PQconsumeInput(conn)) + neon_log(ERROR, "could not get response from pageserver: %s", + PQerrorMessage(conn)); + } + + goto retry; + } + + return ret; +} + + +static ZenithResponse * +pageserver_call(ZenithRequest *request) +{ + StringInfoData req_buff; + StringInfoData resp_buff; + ZenithResponse *resp; + + PG_TRY(); + { + /* If the connection was lost for some reason, reconnect */ + if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD) + { + PQfinish(pageserver_conn); + pageserver_conn = NULL; + connected = false; + } + + if (!connected) + pageserver_connect(); + + req_buff = zm_pack_request(request); + + /* + * Send request. + * + * In principle, this could block if the output buffer is full, and we + * should use async mode and check for interrupts while waiting. In + * practice, our requests are small enough to always fit in the output + * and TCP buffer. 
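+		 * (If that assumption ever breaks, this backend would block in
+		 * send(); the fix would be a non-blocking write loop around a
+		 * WaitLatchOrSocket call, as call_PQgetCopyData does for reads.)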
+ */ + if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0 || PQflush(pageserver_conn)) + { + neon_log(ERROR, "failed to send page request: %s", + PQerrorMessage(pageserver_conn)); + } + pfree(req_buff.data); + + if (message_level_is_interesting(PageStoreTrace)) + { + char *msg = zm_to_string((ZenithMessage *) request); + + neon_log(PageStoreTrace, "sent request: %s", msg); + pfree(msg); + } + + /* read response */ + resp_buff.len = call_PQgetCopyData(pageserver_conn, &resp_buff.data); + resp_buff.cursor = 0; + + if (resp_buff.len == -1) + neon_log(ERROR, "end of COPY"); + else if (resp_buff.len == -2) + neon_log(ERROR, "could not read COPY data: %s", PQerrorMessage(pageserver_conn)); + + resp = zm_unpack_response(&resp_buff); + PQfreemem(resp_buff.data); + + if (message_level_is_interesting(PageStoreTrace)) + { + char *msg = zm_to_string((ZenithMessage *) resp); + + neon_log(PageStoreTrace, "got response: %s", msg); + pfree(msg); + } + } + PG_CATCH(); + { + /* + * If anything goes wrong while we were sending a request, it's not + * clear what state the connection is in. For example, if we sent the + * request but didn't receive a response yet, we might receive the + * response some time later after we have already sent a new unrelated + * request. Close the connection to avoid getting confused. + */ + if (connected) + { + neon_log(LOG, "dropping connection to page server due to error"); + PQfinish(pageserver_conn); + pageserver_conn = NULL; + connected = false; + } + PG_RE_THROW(); + } + PG_END_TRY(); + + return (ZenithResponse *) resp; +} + + +static bool +check_zenith_id(char **newval, void **extra, GucSource source) +{ + uint8 zid[16]; + + return **newval == '\0' || HexDecodeString(zid, *newval, 16); +} + +static char * +substitute_pageserver_password(const char *page_server_connstring_raw) +{ + char *host = NULL; + char *port = NULL; + char *user = NULL; + char *auth_token = NULL; + char *err = NULL; + char *page_server_connstring = NULL; + PQconninfoOption *conn_options; + PQconninfoOption *conn_option; + MemoryContext oldcontext; + + /* + * Here we substitute password in connection string with an environment + * variable. To simplify things we construct a connection string back with + * only known options. In particular: host port user and password. We do + * not currently use other options and constructing full connstring in an + * URI shape is quite messy. + */ + + if (page_server_connstring_raw == NULL || page_server_connstring_raw[0] == '\0') + return NULL; + + /* extract the auth token from the connection string */ + conn_options = PQconninfoParse(page_server_connstring_raw, &err); + if (conn_options == NULL) + { + /* The error string is malloc'd, so we must free it explicitly */ + char *errcopy = err ? pstrdup(err) : "out of memory"; + + PQfreemem(err); + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("invalid connection string syntax: %s", errcopy))); + } + + /* + * Trying to populate pageserver connection string with auth token from + * environment. We are looking for password in with placeholder value like + * $ENV_VAR_NAME, so if password field is present and starts with $ we try + * to fetch environment variable value and fail loudly if it is not set. 
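+	 *
+	 * For example, with "password=$NEON_AUTH_TOKEN" in the raw connection
+	 * string (the variable name here is illustrative), the value of the
+	 * NEON_AUTH_TOKEN environment variable is spliced into the connection
+	 * string built below.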
+	 */
+	for (conn_option = conn_options; conn_option->keyword != NULL; conn_option++)
+	{
+		if (strcmp(conn_option->keyword, "host") == 0)
+		{
+			if (conn_option->val != NULL && conn_option->val[0] != '\0')
+				host = conn_option->val;
+		}
+		else if (strcmp(conn_option->keyword, "port") == 0)
+		{
+			if (conn_option->val != NULL && conn_option->val[0] != '\0')
+				port = conn_option->val;
+		}
+		else if (strcmp(conn_option->keyword, "user") == 0)
+		{
+			if (conn_option->val != NULL && conn_option->val[0] != '\0')
+				user = conn_option->val;
+		}
+		else if (strcmp(conn_option->keyword, "password") == 0)
+		{
+			if (conn_option->val != NULL && conn_option->val[0] != '\0')
+			{
+				/* ensure that this is a template */
+				if (strncmp(conn_option->val, "$", 1) != 0)
+					ereport(ERROR,
+							(errcode(ERRCODE_CONNECTION_EXCEPTION),
+							 errmsg("expected placeholder value in pageserver password starting with $ but found: %s", conn_option->val)));
+
+				neon_log(LOG, "found auth token placeholder in pageserver conn string '%s'", &conn_option->val[1]);
+				auth_token = getenv(&conn_option->val[1]);
+				if (!auth_token)
+				{
+					ereport(ERROR,
+							(errcode(ERRCODE_CONNECTION_EXCEPTION),
+							 errmsg("cannot get auth token, environment variable %s is not set", &conn_option->val[1])));
+				}
+				else
+				{
+					neon_log(LOG, "using auth token from environment variable");
+				}
+			}
+		}
+	}
+
+	/*
+	 * Allocate the connection string in TopMemoryContext to make sure it is
+	 * never freed.
+	 */
+	oldcontext = CurrentMemoryContext;
+	MemoryContextSwitchTo(TopMemoryContext);
+	page_server_connstring = psprintf("postgresql://%s:%s@%s:%s", user, auth_token ? auth_token : "", host, port);
+	MemoryContextSwitchTo(oldcontext);
+
+	PQconninfoFree(conn_options);
+	return page_server_connstring;
+}
+
+/*
+ * Module initialization function
+ */
+void
+pg_init_libpagestore(void)
+{
+	DefineCustomStringVariable("neon.pageserver_connstring",
+							   "connection string to the page server",
+							   NULL,
+							   &page_server_connstring_raw,
+							   "",
+							   PGC_POSTMASTER,
+							   0,	/* no flags required */
+							   NULL, NULL, NULL);
+
+	DefineCustomStringVariable("neon.timeline_id",
+							   "Neon timeline id the server is running on",
+							   NULL,
+							   &zenith_timeline,
+							   "",
+							   PGC_POSTMASTER,
+							   0,	/* no flags required */
+							   check_zenith_id, NULL, NULL);
+
+	DefineCustomStringVariable("neon.tenant_id",
+							   "Neon tenant id the server is running on",
+							   NULL,
+							   &zenith_tenant,
+							   "",
+							   PGC_POSTMASTER,
+							   0,	/* no flags required */
+							   check_zenith_id, NULL, NULL);
+
+	DefineCustomBoolVariable("neon.wal_redo",
+							 "start in wal-redo mode",
+							 NULL,
+							 &wal_redo,
+							 false,
+							 PGC_POSTMASTER,
+							 0,
+							 NULL, NULL, NULL);
+
+	DefineCustomIntVariable("neon.max_cluster_size",
+							"cluster size limit",
+							NULL,
+							&max_cluster_size,
+							-1, -1, INT_MAX,
+							PGC_SIGHUP,
+							GUC_UNIT_MB,
+							NULL, NULL, NULL);
+
+	relsize_hash_init();
+
+	if (page_server != NULL)
+		neon_log(ERROR, "libpagestore already loaded");
+
+	neon_log(PageStoreTrace, "libpagestore loaded");
+	page_server = &api;
+
+	/* substitute password in pageserver_connstring */
+	page_server_connstring = substitute_pageserver_password(page_server_connstring_raw);
+
+	/* Is there a more correct way to pass a custom GUC to postgres code?
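+	 * (GetConfigOption("neon.timeline_id", false, false) would be an
+	 * alternative, but plain globals keep the walproposer path simple.)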
*/ + zenith_timeline_walproposer = zenith_timeline; + zenith_tenant_walproposer = zenith_tenant; + + if (wal_redo) + { + neon_log(PageStoreTrace, "set inmem_smgr hook"); + smgr_hook = smgr_inmem; + smgr_init_hook = smgr_init_inmem; + } + else if (page_server_connstring && page_server_connstring[0]) + { + neon_log(PageStoreTrace, "set neon_smgr hook"); + smgr_hook = smgr_zenith; + smgr_init_hook = smgr_init_zenith; + dbsize_hook = zenith_dbsize; + } +} diff --git a/pgxn/neon/libpqwalproposer.c b/pgxn/neon/libpqwalproposer.c new file mode 100644 index 0000000000..2b2b7a1a6a --- /dev/null +++ b/pgxn/neon/libpqwalproposer.c @@ -0,0 +1,413 @@ +#include "postgres.h" + +#include "libpq-fe.h" +#include "neon.h" +#include "walproposer.h" + +/* Header in walproposer.h -- Wrapper struct to abstract away the libpq connection */ +struct WalProposerConn +{ + PGconn* pg_conn; + bool is_nonblocking; /* whether the connection is non-blocking */ + char *recvbuf; /* last received data from libpqprop_async_read */ +}; + +/* Prototypes for exported functions */ +static char* libpqprop_error_message(WalProposerConn* conn); +static WalProposerConnStatusType libpqprop_status(WalProposerConn* conn); +static WalProposerConn* libpqprop_connect_start(char* conninfo); +static WalProposerConnectPollStatusType libpqprop_connect_poll(WalProposerConn* conn); +static bool libpqprop_send_query(WalProposerConn* conn, char* query); +static WalProposerExecStatusType libpqprop_get_query_result(WalProposerConn* conn); +static pgsocket libpqprop_socket(WalProposerConn* conn); +static int libpqprop_flush(WalProposerConn* conn); +static void libpqprop_finish(WalProposerConn* conn); +static PGAsyncReadResult libpqprop_async_read(WalProposerConn* conn, char** buf, int* amount); +static PGAsyncWriteResult libpqprop_async_write(WalProposerConn* conn, void const* buf, size_t size); +static bool libpqprop_blocking_write(WalProposerConn* conn, void const* buf, size_t size); + +static WalProposerFunctionsType PQWalProposerFunctions = { + libpqprop_error_message, + libpqprop_status, + libpqprop_connect_start, + libpqprop_connect_poll, + libpqprop_send_query, + libpqprop_get_query_result, + libpqprop_socket, + libpqprop_flush, + libpqprop_finish, + libpqprop_async_read, + libpqprop_async_write, + libpqprop_blocking_write, +}; + +/* Module initialization */ +void +pg_init_libpqwalproposer(void) +{ + if (WalProposerFunctions != NULL) + elog(ERROR, "libpqwalproposer already loaded"); + WalProposerFunctions = &PQWalProposerFunctions; +} + +/* Helper function */ +static bool +ensure_nonblocking_status(WalProposerConn* conn, bool is_nonblocking) +{ + /* If we're already correctly blocking or nonblocking, all good */ + if (is_nonblocking == conn->is_nonblocking) + return true; + + /* Otherwise, set it appropriately */ + if (PQsetnonblocking(conn->pg_conn, is_nonblocking) == -1) + return false; + + conn->is_nonblocking = is_nonblocking; + return true; +} + +/* Exported function definitions */ +static char* +libpqprop_error_message(WalProposerConn* conn) +{ + return PQerrorMessage(conn->pg_conn); +} + +static WalProposerConnStatusType +libpqprop_status(WalProposerConn* conn) +{ + switch (PQstatus(conn->pg_conn)) + { + case CONNECTION_OK: + return WP_CONNECTION_OK; + case CONNECTION_BAD: + return WP_CONNECTION_BAD; + default: + return WP_CONNECTION_IN_PROGRESS; + } +} + +static WalProposerConn* +libpqprop_connect_start(char* conninfo) +{ + WalProposerConn* conn; + PGconn* pg_conn; + + pg_conn = PQconnectStart(conninfo); + /* + * Allocation of a PQconn 
can fail, and will return NULL. We want to fully replicate the + * behavior of PQconnectStart here. + */ + if (!pg_conn) + return NULL; + + /* + * And in theory this allocation can fail as well, but it's incredibly unlikely if we just + * successfully allocated a PGconn. + * + * palloc will exit on failure though, so there's not much we could do if it *did* fail. + */ + conn = palloc(sizeof(WalProposerConn)); + conn->pg_conn = pg_conn; + conn->is_nonblocking = false; /* connections always start in blocking mode */ + conn->recvbuf = NULL; + return conn; +} + +static WalProposerConnectPollStatusType +libpqprop_connect_poll(WalProposerConn* conn) +{ + WalProposerConnectPollStatusType return_val; + + switch (PQconnectPoll(conn->pg_conn)) + { + case PGRES_POLLING_FAILED: + return_val = WP_CONN_POLLING_FAILED; + break; + case PGRES_POLLING_READING: + return_val = WP_CONN_POLLING_READING; + break; + case PGRES_POLLING_WRITING: + return_val = WP_CONN_POLLING_WRITING; + break; + case PGRES_POLLING_OK: + return_val = WP_CONN_POLLING_OK; + break; + + /* There's a comment at its source about this constant being unused. We'll expect it's never + * returned. */ + case PGRES_POLLING_ACTIVE: + elog(FATAL, "Unexpected PGRES_POLLING_ACTIVE returned from PQconnectPoll"); + /* This return is never actually reached, but it's here to make the compiler happy */ + return WP_CONN_POLLING_FAILED; + + default: + Assert(false); + return_val = WP_CONN_POLLING_FAILED; /* keep the compiler quiet */ + } + + return return_val; +} + +static bool +libpqprop_send_query(WalProposerConn* conn, char* query) +{ + /* We need to be in blocking mode for sending the query to run without + * requiring a call to PQflush */ + if (!ensure_nonblocking_status(conn, false)) + return false; + + /* PQsendQuery returns 1 on success, 0 on failure */ + if (!PQsendQuery(conn->pg_conn, query)) + return false; + + return true; +} + +static WalProposerExecStatusType +libpqprop_get_query_result(WalProposerConn* conn) +{ + PGresult* result; + WalProposerExecStatusType return_val; + + /* Marker variable if we need to log an unexpected success result */ + char* unexpected_success = NULL; + + /* Consume any input that we might be missing */ + if (!PQconsumeInput(conn->pg_conn)) + return WP_EXEC_FAILED; + + if (PQisBusy(conn->pg_conn)) + return WP_EXEC_NEEDS_INPUT; + + + result = PQgetResult(conn->pg_conn); + /* PQgetResult returns NULL only if getting the result was successful & there's no more of the + * result to get. 
*/ + if (!result) + { + elog(WARNING, "[libpqwalproposer] Unexpected successful end of command results"); + return WP_EXEC_UNEXPECTED_SUCCESS; + } + + /* Helper macro to reduce boilerplate */ + #define UNEXPECTED_SUCCESS(msg) \ + return_val = WP_EXEC_UNEXPECTED_SUCCESS; \ + unexpected_success = msg; \ + break; + + + switch (PQresultStatus(result)) + { + /* "true" success case */ + case PGRES_COPY_BOTH: + return_val = WP_EXEC_SUCCESS_COPYBOTH; + break; + + /* Unexpected success case */ + case PGRES_EMPTY_QUERY: + UNEXPECTED_SUCCESS("empty query return"); + case PGRES_COMMAND_OK: + UNEXPECTED_SUCCESS("data-less command end"); + case PGRES_TUPLES_OK: + UNEXPECTED_SUCCESS("tuples return"); + case PGRES_COPY_OUT: + UNEXPECTED_SUCCESS("'Copy Out' response"); + case PGRES_COPY_IN: + UNEXPECTED_SUCCESS("'Copy In' response"); + case PGRES_SINGLE_TUPLE: + UNEXPECTED_SUCCESS("single tuple return"); + case PGRES_PIPELINE_SYNC: + UNEXPECTED_SUCCESS("pipeline sync point"); + + /* Failure cases */ + case PGRES_BAD_RESPONSE: + case PGRES_NONFATAL_ERROR: + case PGRES_FATAL_ERROR: + case PGRES_PIPELINE_ABORTED: + return_val = WP_EXEC_FAILED; + break; + + default: + Assert(false); + return_val = WP_EXEC_FAILED; /* keep the compiler quiet */ + } + + if (unexpected_success) + elog(WARNING, "[libpqwalproposer] Unexpected successful %s", unexpected_success); + + return return_val; +} + +static pgsocket +libpqprop_socket(WalProposerConn* conn) +{ + return PQsocket(conn->pg_conn); +} + +static int +libpqprop_flush(WalProposerConn* conn) +{ + return (PQflush(conn->pg_conn)); +} + +static void +libpqprop_finish(WalProposerConn* conn) +{ + if (conn->recvbuf != NULL) + PQfreemem(conn->recvbuf); + PQfinish(conn->pg_conn); + pfree(conn); +} + +/* + * Receive a message from the safekeeper. + * + * On success, the data is placed in *buf. It is valid until the next call + * to this function. + */ +static PGAsyncReadResult +libpqprop_async_read(WalProposerConn* conn, char** buf, int* amount) +{ + int result; + + if (conn->recvbuf != NULL) + { + PQfreemem(conn->recvbuf); + conn->recvbuf = NULL; + } + + /* Call PQconsumeInput so that we have the data we need */ + if (!PQconsumeInput(conn->pg_conn)) + { + *amount = 0; + *buf = NULL; + return PG_ASYNC_READ_FAIL; + } + + /* The docs for PQgetCopyData list the return values as: + * 0 if the copy is still in progress, but no "complete row" is + * available + * -1 if the copy is done + * -2 if an error occured + * (> 0) if it was successful; that value is the amount transferred. + * + * The protocol we use between walproposer and safekeeper means that we + * *usually* wouldn't expect to see that the copy is done, but this can + * sometimes be triggered by the server returning an ErrorResponse (which + * also happens to have the effect that the copy is done). + */ + switch (result = PQgetCopyData(conn->pg_conn, &conn->recvbuf, true)) + { + case 0: + *amount = 0; + *buf = NULL; + return PG_ASYNC_READ_TRY_AGAIN; + case -1: + { + /* + * If we get -1, it's probably because of a server error; the + * safekeeper won't normally send a CopyDone message. 
+			 *
+			 * We can check PQgetResult to make sure that the server failed;
+			 * it'll always result in PGRES_FATAL_ERROR.
+			 */
+			ExecStatusType status = PQresultStatus(PQgetResult(conn->pg_conn));
+
+			if (status != PGRES_FATAL_ERROR)
+				elog(FATAL, "unexpected result status %d after failed PQgetCopyData", status);
+
+			/*
+			 * If there was actually an error, it'll be properly reported by
+			 * calls to PQerrorMessage -- we don't have to do anything else
+			 */
+			*amount = 0;
+			*buf = NULL;
+			return PG_ASYNC_READ_FAIL;
+		}
+		case -2:
+			*amount = 0;
+			*buf = NULL;
+			return PG_ASYNC_READ_FAIL;
+		default:
+			/* Positive values indicate the size of the returned result */
+			*amount = result;
+			*buf = conn->recvbuf;
+			return PG_ASYNC_READ_SUCCESS;
+	}
+}
+
+static PGAsyncWriteResult
+libpqprop_async_write(WalProposerConn* conn, void const* buf, size_t size)
+{
+	int result;
+
+	/* If we aren't in non-blocking mode, switch to it. */
+	if (!ensure_nonblocking_status(conn, true))
+		return PG_ASYNC_WRITE_FAIL;
+
+	/*
+	 * The docs for PQputCopyData list the return values as:
+	 *   1 if the data was queued,
+	 *   0 if it was not queued because of full buffers, or
+	 *  -1 if an error occurred
+	 */
+	result = PQputCopyData(conn->pg_conn, buf, size);
+
+	/*
+	 * We won't get a result of zero because walproposer always empties the
+	 * connection's buffers before sending more
+	 */
+	Assert(result != 0);
+
+	switch (result)
+	{
+		case 1:
+			/* good -- continue */
+			break;
+		case -1:
+			return PG_ASYNC_WRITE_FAIL;
+		default:
+			elog(FATAL, "invalid return %d from PQputCopyData", result);
+	}
+
+	/*
+	 * After queueing the data, we still need to flush to get it to send.
+	 * This might take multiple tries, but we don't want to wait around
+	 * until it's done.
+	 *
+	 * PQflush has the following returns (directly quoting the docs):
+	 *   0 if successful,
+	 *   1 if it was unable to send all the data in the send queue yet
+	 *  -1 if it failed for some reason
+	 */
+	switch (result = PQflush(conn->pg_conn))
+	{
+		case 0:
+			return PG_ASYNC_WRITE_SUCCESS;
+		case 1:
+			return PG_ASYNC_WRITE_TRY_FLUSH;
+		case -1:
+			return PG_ASYNC_WRITE_FAIL;
+		default:
+			elog(FATAL, "invalid return %d from PQflush", result);
+	}
+}
+
+static bool
+libpqprop_blocking_write(WalProposerConn* conn, void const* buf, size_t size)
+{
+	int result;
+
+	/* If we are in non-blocking mode, switch out of it. */
+	if (!ensure_nonblocking_status(conn, false))
+		return false;
+
+	/*
+	 * This function is very similar to libpqprop_async_write. For more
+	 * information, refer to the comments there.
+	 */
+	if ((result = PQputCopyData(conn->pg_conn, buf, size)) == -1)
+		return false;
+
+	Assert(result == 1);
+
+	/* Because the connection is blocking, PQflush returns only 0 or -1 */
+	if ((result = PQflush(conn->pg_conn)) == -1)
+		return false;
+
+	Assert(result == 0);
+	return true;
+}
diff --git a/pgxn/neon/neon--1.0.sql b/pgxn/neon/neon--1.0.sql
new file mode 100644
index 0000000000..34f1ba78d4
--- /dev/null
+++ b/pgxn/neon/neon--1.0.sql
@@ -0,0 +1,17 @@
+\echo Use "CREATE EXTENSION neon" to load this file.
\quit + +CREATE FUNCTION pg_cluster_size() +RETURNS bigint +AS 'MODULE_PATHNAME', 'pg_cluster_size' +LANGUAGE C STRICT +PARALLEL UNSAFE; + +CREATE FUNCTION backpressure_lsns( + OUT received_lsn pg_lsn, + OUT disk_consistent_lsn pg_lsn, + OUT remote_consistent_lsn pg_lsn +) +RETURNS record +AS 'MODULE_PATHNAME', 'backpressure_lsns' +LANGUAGE C STRICT +PARALLEL UNSAFE; diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c new file mode 100644 index 0000000000..595a126f04 --- /dev/null +++ b/pgxn/neon/neon.c @@ -0,0 +1,82 @@ +/*------------------------------------------------------------------------- + * + * neon.c + * Utility functions to expose neon specific information to user + * + * IDENTIFICATION + * contrib/neon/neon.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" +#include "fmgr.h" + +#include "access/xact.h" +#include "access/xlog.h" +#include "storage/buf_internals.h" +#include "storage/bufmgr.h" +#include "catalog/pg_type.h" +#include "replication/walsender.h" +#include "funcapi.h" +#include "access/htup_details.h" +#include "utils/pg_lsn.h" +#include "utils/guc.h" + +#include "neon.h" +#include "walproposer.h" + +PG_MODULE_MAGIC; +void _PG_init(void); + + +void _PG_init(void) +{ + pg_init_libpagestore(); + pg_init_libpqwalproposer(); + pg_init_walproposer(); + + EmitWarningsOnPlaceholders("neon"); +} + +PG_FUNCTION_INFO_V1(pg_cluster_size); +PG_FUNCTION_INFO_V1(backpressure_lsns); + +Datum +pg_cluster_size(PG_FUNCTION_ARGS) +{ + int64 size; + + size = GetZenithCurrentClusterSize(); + + if (size == 0) + PG_RETURN_NULL(); + + PG_RETURN_INT64(size); +} + + +Datum +backpressure_lsns(PG_FUNCTION_ARGS) +{ + XLogRecPtr writePtr; + XLogRecPtr flushPtr; + XLogRecPtr applyPtr; + Datum values[3]; + bool nulls[3]; + TupleDesc tupdesc; + + replication_feedback_get_lsns(&writePtr, &flushPtr, &applyPtr); + + tupdesc = CreateTemplateTupleDesc(3); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "received_lsn", PG_LSNOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "disk_consistent_lsn", PG_LSNOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 3, "remote_consistent_lsn", PG_LSNOID, -1, 0); + tupdesc = BlessTupleDesc(tupdesc); + + MemSet(nulls, 0, sizeof(nulls)); + values[0] = LSNGetDatum(writePtr); + values[1] = LSNGetDatum(flushPtr); + values[2] = LSNGetDatum(applyPtr); + + PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls))); +} diff --git a/pgxn/neon/neon.control b/pgxn/neon/neon.control new file mode 100644 index 0000000000..84f79881c1 --- /dev/null +++ b/pgxn/neon/neon.control @@ -0,0 +1,4 @@ +# neon extension +comment = 'cloud storage for PostgreSQL' +default_version = '1.0' +module_pathname = '$libdir/neon' diff --git a/pgxn/neon/neon.h b/pgxn/neon/neon.h new file mode 100644 index 0000000000..2c66bc7bf0 --- /dev/null +++ b/pgxn/neon/neon.h @@ -0,0 +1,19 @@ +/*------------------------------------------------------------------------- + * + * neon.h + * Functions used in the initialization of this extension. 
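+ *
+ * These are called from _PG_init() in neon.c when the library is loaded,
+ * e.g. via shared_preload_libraries.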
+ * + * IDENTIFICATION + * contrib/neon/neon.h + * + *------------------------------------------------------------------------- + */ + +#ifndef NEON_H +#define NEON_H + +extern void pg_init_libpagestore(void); +extern void pg_init_libpqwalproposer(void); +extern void pg_init_walproposer(void); + +#endif /* NEON_H */ diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h new file mode 100644 index 0000000000..f79a3c9142 --- /dev/null +++ b/pgxn/neon/pagestore_client.h @@ -0,0 +1,221 @@ +/*------------------------------------------------------------------------- + * + * pagestore_client.h + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * contrib/neon/pagestore_client.h + * + *------------------------------------------------------------------------- + */ +#ifndef pageserver_h +#define pageserver_h + +#include "postgres.h" + +#include "access/xlogdefs.h" +#include "storage/relfilenode.h" +#include "storage/block.h" +#include "storage/smgr.h" +#include "lib/stringinfo.h" +#include "libpq/pqformat.h" +#include "utils/memutils.h" + +#include "pg_config.h" + +typedef enum +{ + /* pagestore_client -> pagestore */ + T_ZenithExistsRequest = 0, + T_ZenithNblocksRequest, + T_ZenithGetPageRequest, + T_ZenithDbSizeRequest, + + /* pagestore -> pagestore_client */ + T_ZenithExistsResponse = 100, + T_ZenithNblocksResponse, + T_ZenithGetPageResponse, + T_ZenithErrorResponse, + T_ZenithDbSizeResponse, +} ZenithMessageTag; + + + +/* base struct for c-style inheritance */ +typedef struct +{ + ZenithMessageTag tag; +} ZenithMessage; + +#define messageTag(m) (((const ZenithMessage *)(m))->tag) + +/* + * supertype of all the Zenith*Request structs below + * + * If 'latest' is true, we are requesting the latest page version, and 'lsn' + * is just a hint to the server that we know there are no versions of the page + * (or relation size, for exists/nblocks requests) later than the 'lsn'. 
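+ *
+ * For example, (latest=true, lsn=X) asks for the newest version of the
+ * page and promises that it has not changed after X, so the server does
+ * not need to wait for WAL beyond X before answering.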
+ */ +typedef struct +{ + ZenithMessageTag tag; + bool latest; /* if true, request latest page version */ + XLogRecPtr lsn; /* request page version @ this LSN */ +} ZenithRequest; + +typedef struct +{ + ZenithRequest req; + RelFileNode rnode; + ForkNumber forknum; +} ZenithExistsRequest; + +typedef struct +{ + ZenithRequest req; + RelFileNode rnode; + ForkNumber forknum; +} ZenithNblocksRequest; + + +typedef struct +{ + ZenithRequest req; + Oid dbNode; +} ZenithDbSizeRequest; + + +typedef struct +{ + ZenithRequest req; + RelFileNode rnode; + ForkNumber forknum; + BlockNumber blkno; +} ZenithGetPageRequest; + +/* supertype of all the Zenith*Response structs below */ +typedef struct +{ + ZenithMessageTag tag; +} ZenithResponse; + +typedef struct +{ + ZenithMessageTag tag; + bool exists; +} ZenithExistsResponse; + +typedef struct +{ + ZenithMessageTag tag; + uint32 n_blocks; +} ZenithNblocksResponse; + +typedef struct +{ + ZenithMessageTag tag; + char page[FLEXIBLE_ARRAY_MEMBER]; +} ZenithGetPageResponse; + +typedef struct +{ + ZenithMessageTag tag; + int64 db_size; +} ZenithDbSizeResponse; + +typedef struct +{ + ZenithMessageTag tag; + char message[FLEXIBLE_ARRAY_MEMBER]; /* null-terminated error message */ +} ZenithErrorResponse; + +extern StringInfoData zm_pack_request(ZenithRequest *msg); +extern ZenithResponse *zm_unpack_response(StringInfo s); +extern char *zm_to_string(ZenithMessage *msg); + +/* + * API + */ + +typedef struct +{ + ZenithResponse *(*request) (ZenithRequest *request); +} page_server_api; + +extern page_server_api *page_server; + +extern char *page_server_connstring; +extern char *zenith_timeline; +extern char *zenith_tenant; +extern bool wal_redo; +extern int32 max_cluster_size; + +extern const f_smgr *smgr_zenith(BackendId backend, RelFileNode rnode); +extern void smgr_init_zenith(void); + +extern const f_smgr *smgr_inmem(BackendId backend, RelFileNode rnode); +extern void smgr_init_inmem(void); +extern void smgr_shutdown_inmem(void); + +/* zenith storage manager functionality */ + +extern void zenith_init(void); +extern void zenith_open(SMgrRelation reln); +extern void zenith_close(SMgrRelation reln, ForkNumber forknum); +extern void zenith_create(SMgrRelation reln, ForkNumber forknum, bool isRedo); +extern bool zenith_exists(SMgrRelation reln, ForkNumber forknum); +extern void zenith_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo); +extern void zenith_extend(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, bool skipFsync); +extern bool zenith_prefetch(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum); +extern void zenith_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + char *buffer); + +extern void zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, + XLogRecPtr request_lsn, bool request_latest, char *buffer); + +extern void zenith_write(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, bool skipFsync); +extern void zenith_writeback(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks); +extern BlockNumber zenith_nblocks(SMgrRelation reln, ForkNumber forknum); +extern const int64 zenith_dbsize(Oid dbNode); +extern void zenith_truncate(SMgrRelation reln, ForkNumber forknum, + BlockNumber nblocks); +extern void zenith_immedsync(SMgrRelation reln, ForkNumber forknum); + +/* zenith wal-redo storage manager functionality */ + +extern void inmem_init(void); +extern void inmem_open(SMgrRelation reln); +extern void 
inmem_close(SMgrRelation reln, ForkNumber forknum); +extern void inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo); +extern bool inmem_exists(SMgrRelation reln, ForkNumber forknum); +extern void inmem_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo); +extern void inmem_extend(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, bool skipFsync); +extern bool inmem_prefetch(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum); +extern void inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + char *buffer); +extern void inmem_write(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, bool skipFsync); +extern void inmem_writeback(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks); +extern BlockNumber inmem_nblocks(SMgrRelation reln, ForkNumber forknum); +extern void inmem_truncate(SMgrRelation reln, ForkNumber forknum, + BlockNumber nblocks); +extern void inmem_immedsync(SMgrRelation reln, ForkNumber forknum); + + +/* utils for zenith relsize cache */ +extern void relsize_hash_init(void); +extern bool get_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber* size); +extern void set_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size); +extern void update_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size); +extern void forget_cached_relsize(RelFileNode rnode, ForkNumber forknum); + +#endif diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c new file mode 100644 index 0000000000..3e1b74dba7 --- /dev/null +++ b/pgxn/neon/pagestore_smgr.c @@ -0,0 +1,1696 @@ +/*------------------------------------------------------------------------- + * + * pagestore_smgr.c + * + * + * + * Temporary and unlogged rels + * --------------------------- + * + * Temporary and unlogged tables are stored locally, by md.c. The functions + * here just pass the calls through to corresponding md.c functions. + * + * Index build operations that use the buffer cache are also handled locally, + * just like unlogged tables. Such operations must be marked by calling + * smgr_start_unlogged_build() and friends. + * + * In order to know what relations are permanent and which ones are not, we + * have added a 'smgr_relpersistence' field to SmgrRelationData, and it is set + * by smgropen() callers, when they have the relcache entry at hand. However, + * sometimes we need to open an SmgrRelation for a relation without the + * relcache. That is needed when we evict a buffer; we might not have the + * SmgrRelation for that relation open yet. To deal with that, the + * 'relpersistence' can be left to zero, meaning we don't know if it's + * permanent or not. Most operations are not allowed with relpersistence==0, + * but smgrwrite() does work, which is what we need for buffer eviction. and + * smgrunlink() so that a backend doesn't need to have the relcache entry at + * transaction commit, where relations that were dropped in the transaction + * are unlinked. + * + * If smgrwrite() is called and smgr_relpersistence == 0, we check if the + * relation file exists locally or not. If it does exist, we assume it's an + * unlogged relation and write the page there. Otherwise it must be a + * permanent relation, WAL-logged and stored on the page server, and we ignore + * the write like we do for permanent relations. 
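+ *
+ * In short: permanent relations (RELPERSISTENCE_PERMANENT) are served by
+ * the page server, temporary and unlogged ones by md.c, and
+ * relpersistence == 0 means "infer it from context" as described above.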
+ * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * contrib/neon/pagestore_smgr.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/xact.h" +#include "access/xlog.h" +#include "access/xloginsert.h" +#include "access/xlog_internal.h" +#include "catalog/pg_class.h" +#include "pagestore_client.h" +#include "pagestore_client.h" +#include "storage/smgr.h" +#include "access/xlogdefs.h" +#include "postmaster/interrupt.h" +#include "replication/walsender.h" +#include "storage/bufmgr.h" +#include "storage/md.h" +#include "fmgr.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "catalog/pg_tablespace_d.h" +#include "postmaster/autovacuum.h" + +/* + * If DEBUG_COMPARE_LOCAL is defined, we pass through all the SMGR API + * calls to md.c, and *also* do the calls to the Page Server. On every + * read, compare the versions we read from local disk and Page Server, + * and Assert that they are identical. + */ +/* #define DEBUG_COMPARE_LOCAL */ + +#ifdef DEBUG_COMPARE_LOCAL +#include "access/nbtree.h" +#include "storage/bufpage.h" +#include "access/xlog_internal.h" + +static char *hexdump_page(char *page); +#endif + +#define IS_LOCAL_REL(reln) (reln->smgr_rnode.node.dbNode != 0 && reln->smgr_rnode.node.relNode > FirstNormalObjectId) + +const int SmgrTrace = DEBUG5; + +page_server_api *page_server; + +/* GUCs */ +char *page_server_connstring; // with substituted password +char *zenith_timeline; +char *zenith_tenant; +bool wal_redo = false; +int32 max_cluster_size; + +/* unlogged relation build states */ +typedef enum +{ + UNLOGGED_BUILD_NOT_IN_PROGRESS = 0, + UNLOGGED_BUILD_PHASE_1, + UNLOGGED_BUILD_PHASE_2, + UNLOGGED_BUILD_NOT_PERMANENT +} UnloggedBuildPhase; + +static SMgrRelation unlogged_build_rel = NULL; +static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; + +StringInfoData +zm_pack_request(ZenithRequest *msg) +{ + StringInfoData s; + + initStringInfo(&s); + pq_sendbyte(&s, msg->tag); + + switch (messageTag(msg)) + { + /* pagestore_client -> pagestore */ + case T_ZenithExistsRequest: + { + ZenithExistsRequest *msg_req = (ZenithExistsRequest *) msg; + + pq_sendbyte(&s, msg_req->req.latest); + pq_sendint64(&s, msg_req->req.lsn); + pq_sendint32(&s, msg_req->rnode.spcNode); + pq_sendint32(&s, msg_req->rnode.dbNode); + pq_sendint32(&s, msg_req->rnode.relNode); + pq_sendbyte(&s, msg_req->forknum); + + break; + } + case T_ZenithNblocksRequest: + { + ZenithNblocksRequest *msg_req = (ZenithNblocksRequest *) msg; + + pq_sendbyte(&s, msg_req->req.latest); + pq_sendint64(&s, msg_req->req.lsn); + pq_sendint32(&s, msg_req->rnode.spcNode); + pq_sendint32(&s, msg_req->rnode.dbNode); + pq_sendint32(&s, msg_req->rnode.relNode); + pq_sendbyte(&s, msg_req->forknum); + + break; + } + case T_ZenithDbSizeRequest: + { + ZenithDbSizeRequest *msg_req = (ZenithDbSizeRequest *) msg; + + pq_sendbyte(&s, msg_req->req.latest); + pq_sendint64(&s, msg_req->req.lsn); + pq_sendint32(&s, msg_req->dbNode); + + break; + } + case T_ZenithGetPageRequest: + { + ZenithGetPageRequest *msg_req = (ZenithGetPageRequest *) msg; + + pq_sendbyte(&s, msg_req->req.latest); + pq_sendint64(&s, msg_req->req.lsn); + pq_sendint32(&s, msg_req->rnode.spcNode); + pq_sendint32(&s, msg_req->rnode.dbNode); + pq_sendint32(&s, msg_req->rnode.relNode); + pq_sendbyte(&s, msg_req->forknum); + pq_sendint32(&s, 
msg_req->blkno); + + break; + } + + /* pagestore -> pagestore_client. We never need to create these. */ + case T_ZenithExistsResponse: + case T_ZenithNblocksResponse: + case T_ZenithGetPageResponse: + case T_ZenithErrorResponse: + case T_ZenithDbSizeResponse: + default: + elog(ERROR, "unexpected zenith message tag 0x%02x", msg->tag); + break; + } + return s; +} + +ZenithResponse * +zm_unpack_response(StringInfo s) +{ + ZenithMessageTag tag = pq_getmsgbyte(s); + ZenithResponse *resp = NULL; + + switch (tag) + { + /* pagestore -> pagestore_client */ + case T_ZenithExistsResponse: + { + ZenithExistsResponse *msg_resp = palloc0(sizeof(ZenithExistsResponse)); + + msg_resp->tag = tag; + msg_resp->exists = pq_getmsgbyte(s); + pq_getmsgend(s); + + resp = (ZenithResponse *) msg_resp; + break; + } + + case T_ZenithNblocksResponse: + { + ZenithNblocksResponse *msg_resp = palloc0(sizeof(ZenithNblocksResponse)); + + msg_resp->tag = tag; + msg_resp->n_blocks = pq_getmsgint(s, 4); + pq_getmsgend(s); + + resp = (ZenithResponse *) msg_resp; + break; + } + + case T_ZenithGetPageResponse: + { + ZenithGetPageResponse *msg_resp = palloc0(offsetof(ZenithGetPageResponse, page) + BLCKSZ); + + msg_resp->tag = tag; + /* XXX: should be varlena */ + memcpy(msg_resp->page, pq_getmsgbytes(s, BLCKSZ), BLCKSZ); + pq_getmsgend(s); + + resp = (ZenithResponse *) msg_resp; + break; + } + + case T_ZenithDbSizeResponse: + { + ZenithDbSizeResponse *msg_resp = palloc0(sizeof(ZenithDbSizeResponse)); + + msg_resp->tag = tag; + msg_resp->db_size = pq_getmsgint64(s); + pq_getmsgend(s); + + resp = (ZenithResponse *) msg_resp; + break; + } + + case T_ZenithErrorResponse: + { + ZenithErrorResponse *msg_resp; + size_t msglen; + const char *msgtext; + + msgtext = pq_getmsgrawstring(s); + msglen = strlen(msgtext); + + msg_resp = palloc0(sizeof(ZenithErrorResponse) + msglen + 1); + msg_resp->tag = tag; + memcpy(msg_resp->message, msgtext, msglen + 1); + pq_getmsgend(s); + + resp = (ZenithResponse *) msg_resp; + break; + } + + /* + * pagestore_client -> pagestore + * + * We create these ourselves, and don't need to decode them. 
+ */ + case T_ZenithExistsRequest: + case T_ZenithNblocksRequest: + case T_ZenithGetPageRequest: + case T_ZenithDbSizeRequest: + default: + elog(ERROR, "unexpected zenith message tag 0x%02x", tag); + break; + } + + return resp; +} + +/* dump to json for debugging / error reporting purposes */ +char * +zm_to_string(ZenithMessage *msg) +{ + StringInfoData s; + + initStringInfo(&s); + + switch (messageTag(msg)) + { + /* pagestore_client -> pagestore */ + case T_ZenithExistsRequest: + { + ZenithExistsRequest *msg_req = (ZenithExistsRequest *) msg; + + appendStringInfoString(&s, "{\"type\": \"ZenithExistsRequest\""); + appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"", + msg_req->rnode.spcNode, + msg_req->rnode.dbNode, + msg_req->rnode.relNode); + appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); + appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfoChar(&s, '}'); + break; + } + + case T_ZenithNblocksRequest: + { + ZenithNblocksRequest *msg_req = (ZenithNblocksRequest *) msg; + + appendStringInfoString(&s, "{\"type\": \"ZenithNblocksRequest\""); + appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"", + msg_req->rnode.spcNode, + msg_req->rnode.dbNode, + msg_req->rnode.relNode); + appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); + appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfoChar(&s, '}'); + break; + } + + case T_ZenithGetPageRequest: + { + ZenithGetPageRequest *msg_req = (ZenithGetPageRequest *) msg; + + appendStringInfoString(&s, "{\"type\": \"ZenithGetPageRequest\""); + appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"", + msg_req->rnode.spcNode, + msg_req->rnode.dbNode, + msg_req->rnode.relNode); + appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); + appendStringInfo(&s, ", \"blkno\": %u", msg_req->blkno); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); + appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfoChar(&s, '}'); + break; + } + case T_ZenithDbSizeRequest: + { + ZenithDbSizeRequest *msg_req = (ZenithDbSizeRequest *) msg; + + appendStringInfoString(&s, "{\"type\": \"ZenithDbSizeRequest\""); + appendStringInfo(&s, ", \"dbnode\": \"%u\"", msg_req->dbNode); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); + appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfoChar(&s, '}'); + break; + } + + + /* pagestore -> pagestore_client */ + case T_ZenithExistsResponse: + { + ZenithExistsResponse *msg_resp = (ZenithExistsResponse *) msg; + + appendStringInfoString(&s, "{\"type\": \"ZenithExistsResponse\""); + appendStringInfo(&s, ", \"exists\": %d}", + msg_resp->exists + ); + appendStringInfoChar(&s, '}'); + + break; + } + case T_ZenithNblocksResponse: + { + ZenithNblocksResponse *msg_resp = (ZenithNblocksResponse *) msg; + + appendStringInfoString(&s, "{\"type\": \"ZenithNblocksResponse\""); + appendStringInfo(&s, ", \"n_blocks\": %u}", + msg_resp->n_blocks + ); + appendStringInfoChar(&s, '}'); + + break; + } + case T_ZenithGetPageResponse: + { +#if 0 + ZenithGetPageResponse *msg_resp = (ZenithGetPageResponse *) msg; +#endif + + appendStringInfoString(&s, "{\"type\": \"ZenithGetPageResponse\""); + appendStringInfo(&s, ", \"page\": \"XXX\"}"); + appendStringInfoChar(&s, '}'); + break; + } + case T_ZenithErrorResponse: + { + 
+				ZenithErrorResponse *msg_resp = (ZenithErrorResponse *) msg;
+
+				/* FIXME: escape double-quotes in the message */
+				appendStringInfoString(&s, "{\"type\": \"ZenithErrorResponse\"");
+				appendStringInfo(&s, ", \"message\": \"%s\"", msg_resp->message);
+				appendStringInfoChar(&s, '}');
+				break;
+			}
+		case T_ZenithDbSizeResponse:
+			{
+				ZenithDbSizeResponse *msg_resp = (ZenithDbSizeResponse *) msg;
+
+				appendStringInfoString(&s, "{\"type\": \"ZenithDbSizeResponse\"");
+				appendStringInfo(&s, ", \"db_size\": %ld",
+								 msg_resp->db_size);
+				appendStringInfoChar(&s, '}');
+
+				break;
+			}
+
+		default:
+			appendStringInfo(&s, "{\"type\": \"unknown 0x%02x\"}", msg->tag);
+	}
+	return s.data;
+}
+
+/*
+ * Wrapper around log_newpage() that makes a temporary copy of the block and
+ * WAL-logs that. This makes it safe to use while holding only a shared lock
+ * on the page, see XLogSaveBufferForHint. We don't use XLogSaveBufferForHint
+ * directly because it skips the logging if the LSN is new enough.
+ */
+static XLogRecPtr
+log_newpage_copy(RelFileNode *rnode, ForkNumber forkNum, BlockNumber blkno,
+				 Page page, bool page_std)
+{
+	PGAlignedBlock copied_buffer;
+
+	memcpy(copied_buffer.data, page, BLCKSZ);
+	return log_newpage(rnode, forkNum, blkno, copied_buffer.data, page_std);
+}
+
+/*
+ * Is 'buffer' identical to a freshly initialized empty heap page?
+ */
+static bool
+PageIsEmptyHeapPage(char *buffer)
+{
+	PGAlignedBlock empty_page;
+
+	PageInit((Page) empty_page.data, BLCKSZ, 0);
+
+	return memcmp(buffer, empty_page.data, BLCKSZ) == 0;
+}
+
+static void
+zenith_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer)
+{
+	XLogRecPtr	lsn = PageGetLSN(buffer);
+
+	if (ShutdownRequestPending)
+		return;
+
+	/*
+	 * Whenever a VM or FSM page is evicted, WAL-log it. FSM and (some) VM
+	 * changes are not WAL-logged when the changes are made, so this is our
+	 * last chance to log them, otherwise they're lost. That's OK for
+	 * correctness, the non-logged updates are not critical. But we want to
+	 * have a reasonably up-to-date VM and FSM in the page server.
+	 */
+	if (forknum == FSM_FORKNUM && !RecoveryInProgress())
+	{
+		/* FSM is never WAL-logged and we don't care. */
+		XLogRecPtr	recptr;
+
+		recptr = log_newpage_copy(&reln->smgr_rnode.node, forknum, blocknum, buffer, false);
+		XLogFlush(recptr);
+		lsn = recptr;
+		ereport(SmgrTrace,
+				(errmsg("FSM page %u of relation %u/%u/%u.%u was force logged. Evicted at lsn=%X/%X",
+						blocknum,
+						reln->smgr_rnode.node.spcNode,
+						reln->smgr_rnode.node.dbNode,
+						reln->smgr_rnode.node.relNode,
+						forknum, LSN_FORMAT_ARGS(lsn))));
+	}
+	else if (forknum == VISIBILITYMAP_FORKNUM && !RecoveryInProgress())
+	{
+		/*
+		 * Always WAL-log vm. We should never miss clearing visibility map
+		 * bits.
+		 *
+		 * TODO Is it too bad for performance? Hopefully we do not evict
+		 * actively used vm too often.
+		 */
+		XLogRecPtr	recptr;
+
+		recptr = log_newpage_copy(&reln->smgr_rnode.node, forknum, blocknum, buffer, false);
+		XLogFlush(recptr);
+		lsn = recptr;
+
+		ereport(SmgrTrace,
+				(errmsg("Visibilitymap page %u of relation %u/%u/%u.%u was force logged at lsn=%X/%X",
+						blocknum,
+						reln->smgr_rnode.node.spcNode,
+						reln->smgr_rnode.node.dbNode,
+						reln->smgr_rnode.node.relNode,
+						forknum, LSN_FORMAT_ARGS(lsn))));
+	}
+	else if (lsn == InvalidXLogRecPtr)
+	{
+		/*
+		 * When PostgreSQL extends a relation, it calls smgrextend() with an
+		 * all-zeros page, and we can just ignore that in Zenith.
+		 * We do need to remember the new size, though, so that smgrnblocks()
+		 * returns the right answer after the rel has been extended. We rely
+		 * on the relsize cache for that.
+		 *
+		 * A completely empty heap page doesn't need to be WAL-logged,
+		 * either. The heapam can leave such a page behind, if e.g. an insert
+		 * errors out after initializing the page, but before it has inserted
+		 * the tuple and WAL-logged the change. When we read the page from
+		 * the page server, it will come back as all-zeros. That's OK, the
+		 * heapam will initialize an all-zeros page on first use.
+		 *
+		 * In other scenarios, evicting a dirty page with no LSN is a bad
+		 * sign: it implies that the page was not WAL-logged, and its
+		 * contents will be lost when it's evicted.
+		 */
+		if (PageIsNew(buffer))
+		{
+			ereport(SmgrTrace,
+					(errmsg("Page %u of relation %u/%u/%u.%u is all-zeros",
+							blocknum,
+							reln->smgr_rnode.node.spcNode,
+							reln->smgr_rnode.node.dbNode,
+							reln->smgr_rnode.node.relNode,
+							forknum)));
+		}
+		else if (PageIsEmptyHeapPage(buffer))
+		{
+			ereport(SmgrTrace,
+					(errmsg("Page %u of relation %u/%u/%u.%u is an empty heap page with no LSN",
+							blocknum,
+							reln->smgr_rnode.node.spcNode,
+							reln->smgr_rnode.node.dbNode,
+							reln->smgr_rnode.node.relNode,
+							forknum)));
+		}
+		else
+		{
+			ereport(PANIC,
+					(errmsg("Page %u of relation %u/%u/%u.%u is evicted with zero LSN",
+							blocknum,
+							reln->smgr_rnode.node.spcNode,
+							reln->smgr_rnode.node.dbNode,
+							reln->smgr_rnode.node.relNode,
+							forknum)));
+		}
+	}
+	else
+	{
+		ereport(SmgrTrace,
+				(errmsg("Page %u of relation %u/%u/%u.%u is already wal logged at lsn=%X/%X",
+						blocknum,
+						reln->smgr_rnode.node.spcNode,
+						reln->smgr_rnode.node.dbNode,
+						reln->smgr_rnode.node.relNode,
+						forknum, LSN_FORMAT_ARGS(lsn))));
+	}
+
+	/*
+	 * Remember the LSN on this page. When we read the page again, we must
+	 * read the same or newer version of it.
+	 */
+	SetLastWrittenPageLSN(lsn);
+}
+
+
+/*
+ * zenith_init() -- Initialize private state
+ */
+void
+zenith_init(void)
+{
+	/* noop */
+#ifdef DEBUG_COMPARE_LOCAL
+	mdinit();
+#endif
+}
+
+/*
+ * GetXLogInsertRecPtr uses XLogBytePosToRecPtr to convert the logical insert
+ * (reserved) position to a physical position in WAL. It always adds
+ * SizeOfXLogShortPHD:
+ *		seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
+ * so even if there are no records on the page, the offset will be
+ * SizeOfXLogShortPHD. It may cause problems with XLogFlush. So move the
+ * pointer back to the origin of the page.
+ */
+static XLogRecPtr
+zm_adjust_lsn(XLogRecPtr lsn)
+{
+	/*
+	 * If lsn points to the beginning of the first record on a page or
+	 * segment, "return" it back to the page origin
+	 */
+	if ((lsn & (XLOG_BLCKSZ - 1)) == SizeOfXLogShortPHD)
+	{
+		lsn -= SizeOfXLogShortPHD;
+	}
+	else if ((lsn & (wal_segment_size - 1)) == SizeOfXLogLongPHD)
+	{
+		lsn -= SizeOfXLogLongPHD;
+	}
+	return lsn;
+}
+
+/*
+ * Return LSN for requesting pages and number of blocks from page server
+ */
+static XLogRecPtr
+zenith_get_request_lsn(bool *latest)
+{
+	XLogRecPtr	lsn;
+
+	if (RecoveryInProgress())
+	{
+		*latest = false;
+		lsn = GetXLogReplayRecPtr(NULL);
+		elog(DEBUG1, "zenith_get_request_lsn GetXLogReplayRecPtr %X/%X request lsn 0 ",
+			 (uint32) ((lsn) >> 32), (uint32) (lsn));
+	}
+	else if (am_walsender)
+	{
+		*latest = true;
+		lsn = InvalidXLogRecPtr;
+		elog(DEBUG1, "am walsender zenith_get_request_lsn lsn 0 ");
+	}
+	else
+	{
+		XLogRecPtr	flushlsn;
+
+		/*
+		 * Use the latest LSN that was evicted from the buffer cache.
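+		 * (That LSN is advanced by SetLastWrittenPageLSN() in
+		 * zenith_wallog_page() above whenever a page is evicted.)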
+		 * Any pages modified by later WAL records must still be in the
+		 * buffer cache, so our request cannot concern those.
+		 */
+		*latest = true;
+		lsn = GetLastWrittenPageLSN();
+		Assert(lsn != InvalidXLogRecPtr);
+		elog(DEBUG1, "zenith_get_request_lsn GetLastWrittenPageLSN lsn %X/%X ",
+			 (uint32) ((lsn) >> 32), (uint32) (lsn));
+
+		lsn = zm_adjust_lsn(lsn);
+
+		/*
+		 * Is it possible that the last-written LSN is ahead of the last
+		 * flush LSN? Generally not, we shouldn't evict a page from the
+		 * buffer cache before all its modifications have been safely
+		 * flushed. That's the "WAL before data" rule. However, such a case
+		 * does exist during index building: _bt_blwritepage logs the full
+		 * page without flushing WAL before smgrextend (files are fsynced
+		 * before build ends).
+		 */
+		flushlsn = GetFlushRecPtr();
+		if (lsn > flushlsn)
+		{
+			elog(DEBUG5, "last-written LSN %X/%X is ahead of last flushed LSN %X/%X",
+				 (uint32) (lsn >> 32), (uint32) lsn,
+				 (uint32) (flushlsn >> 32), (uint32) flushlsn);
+			XLogFlush(lsn);
+		}
+	}
+
+	return lsn;
+}
+
+
+/*
+ * zenith_exists() -- Does the physical file exist?
+ */
+bool
+zenith_exists(SMgrRelation reln, ForkNumber forkNum)
+{
+	bool		exists;
+	ZenithResponse *resp;
+	BlockNumber n_blocks;
+	bool		latest;
+	XLogRecPtr	request_lsn;
+
+	switch (reln->smgr_relpersistence)
+	{
+		case 0:
+			/*
+			 * We don't know if it's an unlogged rel stored locally, or a
+			 * permanent rel stored in the page server. First check if it
+			 * exists locally. If it does, great. Otherwise check if it
+			 * exists in the page server.
+			 */
+			if (mdexists(reln, forkNum))
+				return true;
+			break;
+
+		case RELPERSISTENCE_PERMANENT:
+			break;
+
+		case RELPERSISTENCE_TEMP:
+		case RELPERSISTENCE_UNLOGGED:
+			return mdexists(reln, forkNum);
+
+		default:
+			elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
+	}
+
+	if (get_cached_relsize(reln->smgr_rnode.node, forkNum, &n_blocks))
+	{
+		return true;
+	}
+
+	/*
+	 * \d+ on a view calls smgrexists with a 0/0/0 relfilenode. The page
+	 * server will error out if you check that, because the whole dbdir for
+	 * tablespace 0, db 0 doesn't exist. We possibly should change the page
+	 * server to accept that and return 'false', to be consistent with
+	 * mdexists(). But we probably also should fix pg_table_size() to not
+	 * call smgrexists() with a bogus relfilenode.
+	 *
+	 * For now, handle that special case here.
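+	 *
+	 * (Hypothetical repro: "CREATE VIEW v AS SELECT 1" followed by "\d+ v"
+	 * reaches this path with relfilenode 0/0/0 via pg_table_size().)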
+	 */
+	if (reln->smgr_rnode.node.spcNode == 0 &&
+		reln->smgr_rnode.node.dbNode == 0 &&
+		reln->smgr_rnode.node.relNode == 0)
+	{
+		return false;
+	}
+
+	request_lsn = zenith_get_request_lsn(&latest);
+	{
+		ZenithExistsRequest request = {
+			.req.tag = T_ZenithExistsRequest,
+			.req.latest = latest,
+			.req.lsn = request_lsn,
+			.rnode = reln->smgr_rnode.node,
+			.forknum = forkNum
+		};
+
+		resp = page_server->request((ZenithRequest *) &request);
+	}
+
+	switch (resp->tag)
+	{
+		case T_ZenithExistsResponse:
+			exists = ((ZenithExistsResponse *) resp)->exists;
+			break;
+
+		case T_ZenithErrorResponse:
+			ereport(ERROR,
+					(errcode(ERRCODE_IO_ERROR),
+					 errmsg("could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X",
+							reln->smgr_rnode.node.spcNode,
+							reln->smgr_rnode.node.dbNode,
+							reln->smgr_rnode.node.relNode,
+							forkNum,
+							(uint32) (request_lsn >> 32), (uint32) request_lsn),
+					 errdetail("page server returned error: %s",
+							   ((ZenithErrorResponse *) resp)->message)));
+			break;
+
+		default:
+			elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
+	}
+	pfree(resp);
+	return exists;
+}
+
+/*
+ * zenith_create() -- Create a new relation on zenithd storage
+ *
+ * If isRedo is true, it's okay for the relation to exist already.
+ */
+void
+zenith_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
+{
+	switch (reln->smgr_relpersistence)
+	{
+		case 0:
+			elog(ERROR, "cannot call smgrcreate() on rel with unknown persistence");
+
+		case RELPERSISTENCE_PERMANENT:
+			break;
+
+		case RELPERSISTENCE_TEMP:
+		case RELPERSISTENCE_UNLOGGED:
+			mdcreate(reln, forkNum, isRedo);
+			return;
+
+		default:
+			elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
+	}
+
+	elog(SmgrTrace, "Create relation %u/%u/%u.%u",
+		 reln->smgr_rnode.node.spcNode,
+		 reln->smgr_rnode.node.dbNode,
+		 reln->smgr_rnode.node.relNode,
+		 forkNum);
+
+	/*
+	 * A newly created relation is empty, remember that in the relsize cache.
+	 *
+	 * FIXME: This is currently not just an optimization, but required for
+	 * correctness. Postgres can call smgrnblocks() on the newly-created
+	 * relation. Currently, we don't call SetLastWrittenPageLSN() when a new
+	 * relation is created, so if we didn't remember the size in the relsize
+	 * cache, we might call smgrnblocks() on the newly-created relation
+	 * before the creation WAL record has been received by the page server.
+	 */
+	set_cached_relsize(reln->smgr_rnode.node, forkNum, 0);
+
+#ifdef DEBUG_COMPARE_LOCAL
+	if (IS_LOCAL_REL(reln))
+		mdcreate(reln, forkNum, isRedo);
+#endif
+}
+
+/*
+ * zenith_unlink() -- Unlink a relation.
+ *
+ * Note that we're passed a RelFileNodeBackend --- by the time this is called,
+ * there won't be an SMgrRelation hashtable entry anymore.
+ *
+ * forkNum can be a fork number to delete a specific fork, or InvalidForkNumber
+ * to delete all forks.
+ *
+ *
+ * If isRedo is true, it's unsurprising for the relation to be already gone.
+ * Also, we should remove the file immediately instead of queuing a request
+ * for later, since during redo there's no possibility of creating a
+ * conflicting relation.
+ *
+ * Note: any failure should be reported as WARNING not ERROR, because
+ * we are usually not in a transaction anymore when this is called.
+ */
+void
+zenith_unlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo)
+{
+	/*
+	 * Might or might not exist locally, depending on whether it's an
+	 * unlogged or permanent relation (or if DEBUG_COMPARE_LOCAL is set).
+	 * Try to unlink; it won't do any harm if the file doesn't exist.
+	 */
+	mdunlink(rnode, forkNum, isRedo);
+	if (!RelFileNodeBackendIsTemp(rnode)) {
+		forget_cached_relsize(rnode.node, forkNum);
+	}
+}
+
+/*
+ * zenith_extend() -- Add a block to the specified relation.
+ *
+ * The semantics are nearly the same as mdwrite(): write at the
+ * specified position. However, this is to be used for the case of
+ * extending a relation (i.e., blocknum is at or beyond the current
+ * EOF). Note that we assume writing a block beyond current EOF
+ * causes intervening file space to become filled with zeroes.
+ */
+void
+zenith_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
+			  char *buffer, bool skipFsync)
+{
+	XLogRecPtr	lsn;
+
+	switch (reln->smgr_relpersistence)
+	{
+		case 0:
+			elog(ERROR, "cannot call smgrextend() on rel with unknown persistence");
+
+		case RELPERSISTENCE_PERMANENT:
+			break;
+
+		case RELPERSISTENCE_TEMP:
+		case RELPERSISTENCE_UNLOGGED:
+			mdextend(reln, forkNum, blkno, buffer, skipFsync);
+			return;
+
+		default:
+			elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
+	}
+
+	/*
+	 * Check that the cluster size limit has not been exceeded.
+	 *
+	 * Temporary and unlogged relations are not included in the cluster size
+	 * measured by the page server, so ignore those. Autovacuum processes are
+	 * also exempt.
+	 */
+	if (max_cluster_size > 0 &&
+		reln->smgr_relpersistence == RELPERSISTENCE_PERMANENT &&
+		!IsAutoVacuumWorkerProcess())
+	{
+		uint64		current_size = GetZenithCurrentClusterSize();
+
+		if (current_size >= ((uint64) max_cluster_size) * 1024 * 1024)
+			ereport(ERROR,
+					(errcode(ERRCODE_DISK_FULL),
+					 errmsg("could not extend file because cluster size limit (%d MB) has been exceeded",
+							max_cluster_size),
+					 errhint("This limit is defined by the neon.max_cluster_size GUC.")));
+	}
+
+	zenith_wallog_page(reln, forkNum, blkno, buffer);
+	set_cached_relsize(reln->smgr_rnode.node, forkNum, blkno + 1);
+
+	lsn = PageGetLSN(buffer);
+	elog(SmgrTrace, "smgrextend called for %u/%u/%u.%u blk %u, page LSN: %X/%08X",
+		 reln->smgr_rnode.node.spcNode,
+		 reln->smgr_rnode.node.dbNode,
+		 reln->smgr_rnode.node.relNode,
+		 forkNum, blkno,
+		 (uint32) (lsn >> 32), (uint32) lsn);
+
+#ifdef DEBUG_COMPARE_LOCAL
+	if (IS_LOCAL_REL(reln))
+		mdextend(reln, forkNum, blkno, buffer, skipFsync);
+#endif
+}
+
+/*
+ * zenith_open() -- Initialize newly-opened relation.
+ */
+void
+zenith_open(SMgrRelation reln)
+{
+	/*
+	 * We don't have anything special to do here. Call mdopen() to let md.c
+	 * initialize itself. That's only needed for temporary or unlogged
+	 * relations, but it's dirt cheap so do it always to make sure the md
+	 * fields are initialized, for debugging purposes if nothing else.
+	 */
+	mdopen(reln);
+
+	/* no work */
+	elog(SmgrTrace, "[ZENITH_SMGR] open noop");
+}
+
+/*
+ * zenith_close() -- Close the specified relation, if it isn't closed already.
+ */
+void
+zenith_close(SMgrRelation reln, ForkNumber forknum)
+{
+	/*
+	 * Let md.c close it, if it had it open. Doesn't hurt to do this even for
+	 * permanent relations that have no local storage.
+	 */
+	mdclose(reln, forknum);
+}
+
+/*
+ * zenith_prefetch() -- Initiate asynchronous read of the specified block of a relation
+ */
+bool
+zenith_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
+{
+	switch (reln->smgr_relpersistence)
+	{
+		case 0:
+			/* probably shouldn't happen, but ignore it */
+			break;
+
+		case RELPERSISTENCE_PERMANENT:
+			break;
+
+		case RELPERSISTENCE_TEMP:
+		case RELPERSISTENCE_UNLOGGED:
+			return mdprefetch(reln, forknum, blocknum);
+
+		default:
+			elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
+	}
+
+	/* not implemented */
+	elog(SmgrTrace, "[ZENITH_SMGR] prefetch noop");
+	return true;
+}
+
+/*
+ * zenith_writeback() -- Tell the kernel to write pages back to storage.
+ *
+ * This accepts a range of blocks because flushing several pages at once is
+ * considerably more efficient than doing so individually.
+ */
+void
+zenith_writeback(SMgrRelation reln, ForkNumber forknum,
+				 BlockNumber blocknum, BlockNumber nblocks)
+{
+	switch (reln->smgr_relpersistence)
+	{
+		case 0:
+			/* mdwriteback() does nothing if the file doesn't exist */
+			mdwriteback(reln, forknum, blocknum, nblocks);
+			break;
+
+		case RELPERSISTENCE_PERMANENT:
+			break;
+
+		case RELPERSISTENCE_TEMP:
+		case RELPERSISTENCE_UNLOGGED:
+			mdwriteback(reln, forknum, blocknum, nblocks);
+			return;
+
+		default:
+			elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
+	}
+
+	/* not implemented */
+	elog(SmgrTrace, "[ZENITH_SMGR] writeback noop");
+
+#ifdef DEBUG_COMPARE_LOCAL
+	if (IS_LOCAL_REL(reln))
+		mdwriteback(reln, forknum, blocknum, nblocks);
+#endif
+}
+
+/*
+ * While this function is defined in the zenith extension, it is used by
+ * neon_test_utils directly. To avoid breaking tests at runtime, please keep
+ * the function signature in sync.
+ */
+void
+zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
+				   XLogRecPtr request_lsn, bool request_latest, char *buffer)
+{
+	ZenithResponse *resp;
+
+	{
+		ZenithGetPageRequest request = {
+			.req.tag = T_ZenithGetPageRequest,
+			.req.latest = request_latest,
+			.req.lsn = request_lsn,
+			.rnode = rnode,
+			.forknum = forkNum,
+			.blkno = blkno
+		};
+
+		resp = page_server->request((ZenithRequest *) &request);
+	}
+
+	switch (resp->tag)
+	{
+		case T_ZenithGetPageResponse:
+			memcpy(buffer, ((ZenithGetPageResponse *) resp)->page, BLCKSZ);
+			break;
+
+		case T_ZenithErrorResponse:
+			ereport(ERROR,
+					(errcode(ERRCODE_IO_ERROR),
+					 errmsg("could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X",
+							blkno,
+							rnode.spcNode,
+							rnode.dbNode,
+							rnode.relNode,
+							forkNum,
+							(uint32) (request_lsn >> 32), (uint32) request_lsn),
+					 errdetail("page server returned error: %s",
+							   ((ZenithErrorResponse *) resp)->message)));
+			break;
+
+		default:
+			elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
+	}
+
+	pfree(resp);
+}
+
+/*
+ * zenith_read() -- Read the specified block from a relation.
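+ *
+ * (The flow below: pick a request LSN with zenith_get_request_lsn(), then
+ * fetch the page through zenith_read_at_lsn() above.)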
+ */
+void
+zenith_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
+			char *buffer)
+{
+	bool		latest;
+	XLogRecPtr	request_lsn;
+
+	switch (reln->smgr_relpersistence)
+	{
+		case 0:
+			elog(ERROR, "cannot call smgrread() on rel with unknown persistence");
+
+		case RELPERSISTENCE_PERMANENT:
+			break;
+
+		case RELPERSISTENCE_TEMP:
+		case RELPERSISTENCE_UNLOGGED:
+			mdread(reln, forkNum, blkno, buffer);
+			return;
+
+		default:
+			elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
+	}
+
+	request_lsn = zenith_get_request_lsn(&latest);
+	zenith_read_at_lsn(reln->smgr_rnode.node, forkNum, blkno, request_lsn, latest, buffer);
+
+#ifdef DEBUG_COMPARE_LOCAL
+	if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln))
+	{
+		char		pageserver_masked[BLCKSZ];
+		char		mdbuf[BLCKSZ];
+		char		mdbuf_masked[BLCKSZ];
+
+		mdread(reln, forkNum, blkno, mdbuf);
+
+		memcpy(pageserver_masked, buffer, BLCKSZ);
+		memcpy(mdbuf_masked, mdbuf, BLCKSZ);
+
+		if (PageIsNew(mdbuf))
+		{
+			if (!PageIsNew(pageserver_masked))
+			{
+				elog(PANIC, "page is new in MD but not in Page Server at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n",
+					 blkno,
+					 reln->smgr_rnode.node.spcNode,
+					 reln->smgr_rnode.node.dbNode,
+					 reln->smgr_rnode.node.relNode,
+					 forkNum,
+					 (uint32) (request_lsn >> 32), (uint32) request_lsn,
+					 hexdump_page(buffer));
+			}
+		}
+		else if (PageIsNew(buffer))
+		{
+			elog(PANIC, "page is new in Page Server but not in MD at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n",
+				 blkno,
+				 reln->smgr_rnode.node.spcNode,
+				 reln->smgr_rnode.node.dbNode,
+				 reln->smgr_rnode.node.relNode,
+				 forkNum,
+				 (uint32) (request_lsn >> 32), (uint32) request_lsn,
+				 hexdump_page(mdbuf));
+		}
+		else if (PageGetSpecialSize(mdbuf) == 0)
+		{
+			/* assume heap */
+			RmgrTable[RM_HEAP_ID].rm_mask(mdbuf_masked, blkno);
+			RmgrTable[RM_HEAP_ID].rm_mask(pageserver_masked, blkno);
+
+			if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0)
+			{
+				elog(PANIC, "heap buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n",
+					 blkno,
+					 reln->smgr_rnode.node.spcNode,
+					 reln->smgr_rnode.node.dbNode,
+					 reln->smgr_rnode.node.relNode,
+					 forkNum,
+					 (uint32) (request_lsn >> 32), (uint32) request_lsn,
+					 hexdump_page(mdbuf_masked),
+					 hexdump_page(pageserver_masked));
+			}
+		}
+		else if (PageGetSpecialSize(mdbuf) == MAXALIGN(sizeof(BTPageOpaqueData)))
+		{
+			if (((BTPageOpaqueData *) PageGetSpecialPointer(mdbuf))->btpo_cycleid < MAX_BT_CYCLE_ID)
+			{
+				/* assume btree */
+				RmgrTable[RM_BTREE_ID].rm_mask(mdbuf_masked, blkno);
+				RmgrTable[RM_BTREE_ID].rm_mask(pageserver_masked, blkno);
+
+				if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0)
+				{
+					elog(PANIC, "btree buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n",
+						 blkno,
+						 reln->smgr_rnode.node.spcNode,
+						 reln->smgr_rnode.node.dbNode,
+						 reln->smgr_rnode.node.relNode,
+						 forkNum,
+						 (uint32) (request_lsn >> 32), (uint32) request_lsn,
+						 hexdump_page(mdbuf_masked),
+						 hexdump_page(pageserver_masked));
+				}
+			}
+		}
+	}
+#endif
+}
+
+#ifdef DEBUG_COMPARE_LOCAL
+static char *
+hexdump_page(char *page)
+{
+	StringInfoData result;
+
+	initStringInfo(&result);
+
+	for (int i = 0; i < BLCKSZ; i++)
+	{
+		if (i % 8 == 0)
+			appendStringInfo(&result, " ");
+		if (i % 40 == 0)
+			appendStringInfo(&result, "\n");
+		appendStringInfo(&result, "%02x", (unsigned char) (page[i]));
+	}
+
+	return result.data;
+}
+#endif
+
+/*
+ * zenith_write() -- Write the supplied block at the appropriate location.
+ *
+ * This is to be used only for updating already-existing blocks of a
+ * relation (ie, those before the current EOF). To extend a relation,
+ * use mdextend().
+ */
+void
+zenith_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+			 char *buffer, bool skipFsync)
+{
+	XLogRecPtr	lsn;
+
+	switch (reln->smgr_relpersistence)
+	{
+		case 0:
+			/* This is a bit tricky. Check if the relation exists locally */
+			if (mdexists(reln, forknum))
+			{
+				/* It exists locally. Guess it's unlogged then. */
+				mdwrite(reln, forknum, blocknum, buffer, skipFsync);
+
+				/*
+				 * We could set relpersistence now that we have determined
+				 * that it's local. But we don't dare to do it, because that
+				 * would immediately allow reads as well, which shouldn't
+				 * happen. We could cache it with a different
+				 * 'relpersistence' value, but this isn't performance
+				 * critical.
+				 */
+				return;
+			}
+			break;
+
+		case RELPERSISTENCE_PERMANENT:
+			break;
+
+		case RELPERSISTENCE_TEMP:
+		case RELPERSISTENCE_UNLOGGED:
+			mdwrite(reln, forknum, blocknum, buffer, skipFsync);
+			return;
+
+		default:
+			elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
+	}
+
+	zenith_wallog_page(reln, forknum, blocknum, buffer);
+
+	lsn = PageGetLSN(buffer);
+	elog(SmgrTrace, "smgrwrite called for %u/%u/%u.%u blk %u, page LSN: %X/%08X",
+		 reln->smgr_rnode.node.spcNode,
+		 reln->smgr_rnode.node.dbNode,
+		 reln->smgr_rnode.node.relNode,
+		 forknum, blocknum,
+		 (uint32) (lsn >> 32), (uint32) lsn);
+
+#ifdef DEBUG_COMPARE_LOCAL
+	if (IS_LOCAL_REL(reln))
+		mdwrite(reln, forknum, blocknum, buffer, skipFsync);
+#endif
+}
+
+/*
+ * zenith_nblocks() -- Get the number of blocks stored in a relation.
+ */
+BlockNumber
+zenith_nblocks(SMgrRelation reln, ForkNumber forknum)
+{
+	ZenithResponse *resp;
+	BlockNumber n_blocks;
+	bool		latest;
+	XLogRecPtr	request_lsn;
+
+	switch (reln->smgr_relpersistence)
+	{
+		case 0:
+			elog(ERROR, "cannot call smgrnblocks() on rel with unknown persistence");
+			break;
+
+		case RELPERSISTENCE_PERMANENT:
+			break;
+
+		case RELPERSISTENCE_TEMP:
+		case RELPERSISTENCE_UNLOGGED:
+			return mdnblocks(reln, forknum);
+
+		default:
+			elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
+	}
+
+	if (get_cached_relsize(reln->smgr_rnode.node, forknum, &n_blocks))
+	{
+		elog(SmgrTrace, "cached nblocks for %u/%u/%u.%u: %u blocks",
+			 reln->smgr_rnode.node.spcNode,
+			 reln->smgr_rnode.node.dbNode,
+			 reln->smgr_rnode.node.relNode,
+			 forknum, n_blocks);
+		return n_blocks;
+	}
+
+	request_lsn = zenith_get_request_lsn(&latest);
+	{
+		ZenithNblocksRequest request = {
+			.req.tag = T_ZenithNblocksRequest,
+			.req.latest = latest,
+			.req.lsn = request_lsn,
+			.rnode = reln->smgr_rnode.node,
+			.forknum = forknum,
+		};
+
+		resp = page_server->request((ZenithRequest *) &request);
+	}
+
+	switch (resp->tag)
+	{
+		case T_ZenithNblocksResponse:
+			n_blocks = ((ZenithNblocksResponse *) resp)->n_blocks;
+			break;
+
+		case T_ZenithErrorResponse:
+			ereport(ERROR,
+					(errcode(ERRCODE_IO_ERROR),
+					 errmsg("could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X",
+							reln->smgr_rnode.node.spcNode,
+							reln->smgr_rnode.node.dbNode,
+							reln->smgr_rnode.node.relNode,
+							forknum,
+							(uint32) (request_lsn >> 32), (uint32) request_lsn),
+					 errdetail("page server returned error: %s",
+							   ((ZenithErrorResponse *) resp)->message)));
+			break;
+
+		default:
+			elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
+	}
+	update_cached_relsize(reln->smgr_rnode.node, forknum, n_blocks);
+
+	elog(SmgrTrace, "zenith_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks",
+		 reln->smgr_rnode.node.spcNode,
+		 reln->smgr_rnode.node.dbNode,
+		 reln->smgr_rnode.node.relNode,
+		 forknum,
+		 (uint32) (request_lsn >> 32), (uint32) request_lsn,
+		 n_blocks);
+
+	pfree(resp);
+	return n_blocks;
+}
+
+/*
+ * zenith_dbsize() -- Get the size of the database in bytes.
+ */
+int64
+zenith_dbsize(Oid dbNode)
+{
+	ZenithResponse *resp;
+	int64		db_size;
+	XLogRecPtr	request_lsn;
+	bool		latest;
+
+	request_lsn = zenith_get_request_lsn(&latest);
+	{
+		ZenithDbSizeRequest request = {
+			.req.tag = T_ZenithDbSizeRequest,
+			.req.latest = latest,
+			.req.lsn = request_lsn,
+			.dbNode = dbNode,
+		};
+
+		resp = page_server->request((ZenithRequest *) &request);
+	}
+
+	switch (resp->tag)
+	{
+		case T_ZenithDbSizeResponse:
+			db_size = ((ZenithDbSizeResponse *) resp)->db_size;
+			break;
+
+		case T_ZenithErrorResponse:
+			ereport(ERROR,
+					(errcode(ERRCODE_IO_ERROR),
+					 errmsg("could not read db size of db %u from page server at lsn %X/%08X",
+							dbNode,
+							(uint32) (request_lsn >> 32), (uint32) request_lsn),
+					 errdetail("page server returned error: %s",
+							   ((ZenithErrorResponse *) resp)->message)));
+			break;
+
+		default:
+			elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
+	}
+
+	elog(SmgrTrace, "zenith_dbsize: db %u (request LSN %X/%08X): %ld bytes",
+		 dbNode,
+		 (uint32) (request_lsn >> 32), (uint32) request_lsn,
+		 db_size);
+
+	pfree(resp);
+	return db_size;
+}
+
+/*
+ * zenith_truncate() -- Truncate relation to specified number of blocks.
+ */
+void
+zenith_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
+{
+	XLogRecPtr	lsn;
+
+	switch (reln->smgr_relpersistence)
+	{
+		case 0:
+			elog(ERROR, "cannot call smgrtruncate() on rel with unknown persistence");
+			break;
+
+		case RELPERSISTENCE_PERMANENT:
+			break;
+
+		case RELPERSISTENCE_TEMP:
+		case RELPERSISTENCE_UNLOGGED:
+			mdtruncate(reln, forknum, nblocks);
+			return;
+
+		default:
+			elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
+	}
+
+	set_cached_relsize(reln->smgr_rnode.node, forknum, nblocks);
+
+	/*
+	 * Truncating a relation drops all its buffers from the buffer cache
+	 * without calling smgrwrite() on them. But we must account for that in
+	 * our tracking of last-written-LSN all the same: any future
+	 * smgrnblocks() request must return the new size after the truncation.
+	 * We don't know what the LSN of the truncation record was, so be
+	 * conservative and use the most recently inserted WAL record's LSN.
+	 */
+	lsn = GetXLogInsertRecPtr();
+
+	lsn = zm_adjust_lsn(lsn);
+
+	/*
+	 * Flush it, too. We don't actually care about it here, but let's uphold
+	 * the invariant that last-written LSN <= flush LSN.
+	 */
+	XLogFlush(lsn);
+
+	SetLastWrittenPageLSN(lsn);
+
+#ifdef DEBUG_COMPARE_LOCAL
+	if (IS_LOCAL_REL(reln))
+		mdtruncate(reln, forknum, nblocks);
+#endif
+}
+
+/*
+ * zenith_immedsync() -- Immediately sync a relation to stable storage.
+ *
+ * Note that only writes already issued are synced; this routine knows
+ * nothing of dirty buffers that may exist inside the buffer manager. We
+ * sync active and inactive segments; smgrDoPendingSyncs() relies on this.
+ * Consider a relation skipping WAL. Suppose a checkpoint syncs blocks of
+ * some segment, then mdtruncate() renders that segment inactive. If we
+ * crash before the next checkpoint syncs the newly-inactive segment, that
+ * segment may survive recovery, reintroducing unwanted data into the table.
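+ *
+ * (In Zenith this is a no-op for permanent relations, as seen below:
+ * durability comes from the WAL stream and the safekeepers, not from local
+ * files.)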
+ */
+void
+zenith_immedsync(SMgrRelation reln, ForkNumber forknum)
+{
+	switch (reln->smgr_relpersistence)
+	{
+		case 0:
+			elog(ERROR, "cannot call smgrimmedsync() on rel with unknown persistence");
+			break;
+
+		case RELPERSISTENCE_PERMANENT:
+			break;
+
+		case RELPERSISTENCE_TEMP:
+		case RELPERSISTENCE_UNLOGGED:
+			mdimmedsync(reln, forknum);
+			return;
+
+		default:
+			elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
+	}
+
+	elog(SmgrTrace, "[ZENITH_SMGR] immedsync noop");
+
+#ifdef DEBUG_COMPARE_LOCAL
+	if (IS_LOCAL_REL(reln))
+		mdimmedsync(reln, forknum);
+#endif
+}
+
+/*
+ * zenith_start_unlogged_build() -- Starting build operation on a rel.
+ *
+ * Some indexes are built in two phases, by first populating the table with
+ * regular inserts, using the shared buffer cache but skipping WAL-logging,
+ * and WAL-logging the whole relation after it's done. Zenith relies on the
+ * WAL to reconstruct pages, so we cannot use the page server in the
+ * first phase when the changes are not logged.
+ */
+static void
+zenith_start_unlogged_build(SMgrRelation reln)
+{
+	/*
+	 * Currently, there can be only one unlogged relation build operation in
+	 * progress at a time. That's enough for the current usage.
+	 */
+	if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS)
+		elog(ERROR, "unlogged relation build is already in progress");
+	Assert(unlogged_build_rel == NULL);
+
+	ereport(SmgrTrace,
+			(errmsg("starting unlogged build of relation %u/%u/%u",
+					reln->smgr_rnode.node.spcNode,
+					reln->smgr_rnode.node.dbNode,
+					reln->smgr_rnode.node.relNode)));
+
+	switch (reln->smgr_relpersistence)
+	{
+		case 0:
+			elog(ERROR, "cannot call smgr_start_unlogged_build() on rel with unknown persistence");
+			break;
+
+		case RELPERSISTENCE_PERMANENT:
+			break;
+
+		case RELPERSISTENCE_TEMP:
+		case RELPERSISTENCE_UNLOGGED:
+			unlogged_build_rel = reln;
+			unlogged_build_phase = UNLOGGED_BUILD_NOT_PERMANENT;
+			return;
+
+		default:
+			elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
+	}
+
+	if (smgrnblocks(reln, MAIN_FORKNUM) != 0)
+		elog(ERROR, "cannot perform unlogged index build, index is not empty");
+
+	unlogged_build_rel = reln;
+	unlogged_build_phase = UNLOGGED_BUILD_PHASE_1;
+
+	/* Make the relation look like it's unlogged */
+	reln->smgr_relpersistence = RELPERSISTENCE_UNLOGGED;
+
+	/*
+	 * FIXME: should we pass isRedo true to create the tablespace dir if it
+	 * doesn't exist? Is it needed?
+	 */
+	mdcreate(reln, MAIN_FORKNUM, false);
+}
+
+/*
+ * zenith_finish_unlogged_build_phase_1()
+ *
+ * Call this after you have finished populating a relation in unlogged mode,
+ * before you start WAL-logging it.
+ */
+static void
+zenith_finish_unlogged_build_phase_1(SMgrRelation reln)
+{
+	Assert(unlogged_build_rel == reln);
+
+	ereport(SmgrTrace,
+			(errmsg("finishing phase 1 of unlogged build of relation %u/%u/%u",
+					reln->smgr_rnode.node.spcNode,
+					reln->smgr_rnode.node.dbNode,
+					reln->smgr_rnode.node.relNode)));
+
+	if (unlogged_build_phase == UNLOGGED_BUILD_NOT_PERMANENT)
+		return;
+
+	Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_1);
+	Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED);
+
+	unlogged_build_phase = UNLOGGED_BUILD_PHASE_2;
+}
+
+/*
+ * zenith_end_unlogged_build() -- Finish an unlogged rel build.
+ *
+ * Call this after you have finished WAL-logging a relation that was
+ * first populated without WAL-logging.
+ *
+ * This removes the local copy of the rel, since it's now been fully
+ * WAL-logged and is present in the page server.
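+ *
+ * (A sketch of the full protocol, pieced together from the comments here;
+ * the caller is the index build code:
+ *
+ *	smgr_start_unlogged_build(reln);	-- rel switches to local, unlogged storage
+ *	... populate the rel via shared buffers, without WAL ...
+ *	smgr_finish_unlogged_build_phase_1(reln);
+ *	... WAL-log the entire relation ...
+ *	smgr_end_unlogged_build(reln);		-- local copy is removed
+ *
+ * These are the .smgr_* callbacks installed in zenith_smgr below.)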
+ */
+static void
+zenith_end_unlogged_build(SMgrRelation reln)
+{
+	Assert(unlogged_build_rel == reln);
+
+	ereport(SmgrTrace,
+			(errmsg("ending unlogged build of relation %u/%u/%u",
+					reln->smgr_rnode.node.spcNode,
+					reln->smgr_rnode.node.dbNode,
+					reln->smgr_rnode.node.relNode)));
+
+	if (unlogged_build_phase != UNLOGGED_BUILD_NOT_PERMANENT)
+	{
+		RelFileNodeBackend rnode;
+
+		Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_2);
+		Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED);
+
+		/* Make the relation look permanent again */
+		reln->smgr_relpersistence = RELPERSISTENCE_PERMANENT;
+
+		/* Remove local copy */
+		rnode = reln->smgr_rnode;
+		for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+		{
+			elog(SmgrTrace, "forgetting cached relsize for %u/%u/%u.%u",
+				 rnode.node.spcNode,
+				 rnode.node.dbNode,
+				 rnode.node.relNode,
+				 forknum);
+
+			forget_cached_relsize(rnode.node, forknum);
+			mdclose(reln, forknum);
+			/* use isRedo == true, so that we drop it immediately */
+			mdunlink(rnode, forknum, true);
+		}
+	}
+
+	unlogged_build_rel = NULL;
+	unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
+}
+
+static void
+AtEOXact_zenith(XactEvent event, void *arg)
+{
+	switch (event)
+	{
+		case XACT_EVENT_ABORT:
+		case XACT_EVENT_PARALLEL_ABORT:
+
+			/*
+			 * Forget about any build we might have had in progress. The
+			 * local file will be unlinked by smgrDoPendingDeletes()
+			 */
+			unlogged_build_rel = NULL;
+			unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
+			break;
+
+		case XACT_EVENT_COMMIT:
+		case XACT_EVENT_PARALLEL_COMMIT:
+		case XACT_EVENT_PREPARE:
+		case XACT_EVENT_PRE_COMMIT:
+		case XACT_EVENT_PARALLEL_PRE_COMMIT:
+		case XACT_EVENT_PRE_PREPARE:
+			if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS)
+			{
+				unlogged_build_rel = NULL;
+				unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
+				ereport(ERROR,
+						(errcode(ERRCODE_INTERNAL_ERROR),
+						 (errmsg("unlogged index build was not properly finished"))));
+			}
+			break;
+	}
+}
+
+static const struct f_smgr zenith_smgr =
+{
+	.smgr_init = zenith_init,
+	.smgr_shutdown = NULL,
+	.smgr_open = zenith_open,
+	.smgr_close = zenith_close,
+	.smgr_create = zenith_create,
+	.smgr_exists = zenith_exists,
+	.smgr_unlink = zenith_unlink,
+	.smgr_extend = zenith_extend,
+	.smgr_prefetch = zenith_prefetch,
+	.smgr_read = zenith_read,
+	.smgr_write = zenith_write,
+	.smgr_writeback = zenith_writeback,
+	.smgr_nblocks = zenith_nblocks,
+	.smgr_truncate = zenith_truncate,
+	.smgr_immedsync = zenith_immedsync,
+
+	.smgr_start_unlogged_build = zenith_start_unlogged_build,
+	.smgr_finish_unlogged_build_phase_1 = zenith_finish_unlogged_build_phase_1,
+	.smgr_end_unlogged_build = zenith_end_unlogged_build,
+};
+
+
+const f_smgr *
+smgr_zenith(BackendId backend, RelFileNode rnode)
+{
+	/* Don't use page server for temp relations */
+	if (backend != InvalidBackendId)
+		return smgr_standard(backend, rnode);
+	else
+		return &zenith_smgr;
+}
+
+void
+smgr_init_zenith(void)
+{
+	RegisterXactCallback(AtEOXact_zenith, NULL);
+
+	smgr_init_standard();
+	zenith_init();
+}
diff --git a/pgxn/neon/relsize_cache.c b/pgxn/neon/relsize_cache.c
new file mode 100644
index 0000000000..8dfcffe1d1
--- /dev/null
+++ b/pgxn/neon/relsize_cache.c
@@ -0,0 +1,167 @@
+/*-------------------------------------------------------------------------
+ *
+ * relsize_cache.c
+ *      Relation size cache for better zenith performance.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	 pgxn/neon/relsize_cache.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "pagestore_client.h"
+#include "storage/relfilenode.h"
+#include "storage/smgr.h"
+#include "storage/lwlock.h"
+#include "storage/ipc.h"
+#include "storage/shmem.h"
+#include "catalog/pg_tablespace_d.h"
+#include "utils/dynahash.h"
+#include "utils/guc.h"
+
+
+typedef struct
+{
+	RelFileNode rnode;
+	ForkNumber	forknum;
+} RelTag;
+
+typedef struct
+{
+	RelTag		tag;
+	BlockNumber size;
+} RelSizeEntry;
+
+static HTAB *relsize_hash;
+static LWLockId relsize_lock;
+static int	relsize_hash_size;
+static shmem_startup_hook_type prev_shmem_startup_hook = NULL;
+
+/*
+ * Size of a cache entry is 20 bytes. So this default will take about 1.2 MB,
+ * which seems reasonable.
+ */
+#define DEFAULT_RELSIZE_HASH_SIZE (64 * 1024)
+
+static void
+zenith_smgr_shmem_startup(void)
+{
+	static HASHCTL info;
+
+	if (prev_shmem_startup_hook)
+		prev_shmem_startup_hook();
+
+	LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
+	relsize_lock = (LWLockId) GetNamedLWLockTranche("neon_relsize");
+	info.keysize = sizeof(RelTag);
+	info.entrysize = sizeof(RelSizeEntry);
+	relsize_hash = ShmemInitHash("neon_relsize",
+								 relsize_hash_size, relsize_hash_size,
+								 &info,
+								 HASH_ELEM | HASH_BLOBS);
+	LWLockRelease(AddinShmemInitLock);
+}
+
+bool
+get_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber *size)
+{
+	bool		found = false;
+
+	if (relsize_hash_size > 0)
+	{
+		RelTag		tag;
+		RelSizeEntry *entry;
+
+		tag.rnode = rnode;
+		tag.forknum = forknum;
+		LWLockAcquire(relsize_lock, LW_SHARED);
+		entry = hash_search(relsize_hash, &tag, HASH_FIND, NULL);
+		if (entry != NULL)
+		{
+			*size = entry->size;
+			found = true;
+		}
+		LWLockRelease(relsize_lock);
+	}
+	return found;
+}
+
+void
+set_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size)
+{
+	if (relsize_hash_size > 0)
+	{
+		RelTag		tag;
+		RelSizeEntry *entry;
+
+		tag.rnode = rnode;
+		tag.forknum = forknum;
+		LWLockAcquire(relsize_lock, LW_EXCLUSIVE);
+		entry = hash_search(relsize_hash, &tag, HASH_ENTER, NULL);
+		entry->size = size;
+		LWLockRelease(relsize_lock);
+	}
+}
+
+void
+update_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size)
+{
+	if (relsize_hash_size > 0)
+	{
+		RelTag		tag;
+		RelSizeEntry *entry;
+		bool		found;
+
+		tag.rnode = rnode;
+		tag.forknum = forknum;
+		LWLockAcquire(relsize_lock, LW_EXCLUSIVE);
+		entry = hash_search(relsize_hash, &tag, HASH_ENTER, &found);
+		if (!found || entry->size < size)
+			entry->size = size;
+		LWLockRelease(relsize_lock);
+	}
+}
+
+void
+forget_cached_relsize(RelFileNode rnode, ForkNumber forknum)
+{
+	if (relsize_hash_size > 0)
+	{
+		RelTag		tag;
+
+		tag.rnode = rnode;
+		tag.forknum = forknum;
+		LWLockAcquire(relsize_lock, LW_EXCLUSIVE);
+		hash_search(relsize_hash, &tag, HASH_REMOVE, NULL);
+		LWLockRelease(relsize_lock);
+	}
+}
+
+void
+relsize_hash_init(void)
+{
+	DefineCustomIntVariable("neon.relsize_hash_size",
+							"Sets the maximum number of cached relation sizes for neon",
+							NULL,
+							&relsize_hash_size,
+							DEFAULT_RELSIZE_HASH_SIZE,
+							0,
+							INT_MAX,
+							PGC_POSTMASTER,
+							0,
+							NULL, NULL, NULL);
+
+	if (relsize_hash_size > 0)
+	{
+		RequestAddinShmemSpace(hash_estimate_size(relsize_hash_size, sizeof(RelSizeEntry)));
+		RequestNamedLWLockTranche("neon_relsize", 1);
+
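+		/*
+		 * Chain our shmem startup hook after any previously installed one;
+		 * the usual pattern for libraries loaded via
+		 * shared_preload_libraries.
+		 */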
+		prev_shmem_startup_hook = shmem_startup_hook;
+		shmem_startup_hook = zenith_smgr_shmem_startup;
+	}
+}
diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c
new file mode 100644
index 0000000000..9625325c0a
--- /dev/null
+++ b/pgxn/neon/walproposer.c
@@ -0,0 +1,2403 @@
+/*-------------------------------------------------------------------------
+ *
+ * walproposer.c
+ *
+ * Proposer/leader part of the total order broadcast protocol between postgres
+ * and WAL safekeepers.
+ *
+ * We have two ways of launching WalProposer:
+ *
+ *   1. As a background worker which will run physical WalSender with
+ *      am_wal_proposer flag set to true. WalSender in turn would handle the
+ *      WAL reading part and call WalProposer when ready to scatter WAL.
+ *
+ *   2. As a standalone utility by running `postgres --sync-safekeepers`. That
+ *      is needed to create an LSN from which it is safe to start postgres.
+ *      More specifically, it addresses the following problems:
+ *
+ *      a) Chicken-or-the-egg problem: compute postgres needs a data directory
+ *         with non-rel files that are downloaded from pageserver by calling
+ *         basebackup@LSN. This LSN is not arbitrary, it must include all
+ *         previously committed transactions and is defined through consensus
+ *         voting, which happens... in walproposer, a part of compute node.
+ *
+ *      b) Just warranting such an LSN is not enough, we must also actually
+ *         commit it and make sure there is a safekeeper who knows this LSN is
+ *         committed so WAL before it can be streamed to pageserver --
+ *         otherwise basebackup will hang waiting for WAL. Advancing
+ *         commit_lsn without playing the consensus game is impossible, so the
+ *         speculative "let's just poll safekeepers, learn the start LSN of
+ *         the future epoch and run basebackup" approach won't work.
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include
+#include
+#include
+#include "access/xlogdefs.h"
+#include "access/xlogutils.h"
+#include "storage/latch.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "access/xlog.h"
+#include "libpq/pqformat.h"
+#include "replication/slot.h"
+#include "replication/walreceiver.h"
+#include "postmaster/bgworker.h"
+#include "postmaster/interrupt.h"
+#include "postmaster/postmaster.h"
+#include "storage/pmsignal.h"
+#include "storage/proc.h"
+#include "storage/ipc.h"
+#include "storage/lwlock.h"
+#include "storage/shmem.h"
+#include "storage/spin.h"
+#include "tcop/tcopprot.h"
+#include "utils/builtins.h"
+#include "utils/guc.h"
+#include "utils/memutils.h"
+#include "utils/timestamp.h"
+
+#include "neon.h"
+#include "walproposer.h"
+#include "walproposer_utils.h"
+#include "replication/walpropshim.h"
+
+
+char	   *wal_acceptors_list;
+int			wal_acceptor_reconnect_timeout;
+int			wal_acceptor_connect_timeout;
+bool		am_wal_proposer;
+
+char	   *zenith_timeline_walproposer = NULL;
+char	   *zenith_tenant_walproposer = NULL;
+
+/* Declared in walproposer.h, defined here, initialized in libpqwalproposer.c */
+WalProposerFunctionsType *WalProposerFunctions = NULL;
+
+#define WAL_PROPOSER_SLOT_NAME "wal_proposer_slot"
+
+static int	n_safekeepers = 0;
+static int	quorum = 0;
+static Safekeeper safekeeper[MAX_SAFEKEEPERS];
+static XLogRecPtr availableLsn;	/* WAL has been generated up to this point */
+static XLogRecPtr lastSentCommitLsn;	/* last commitLsn broadcast to
+										 * safekeepers */
+static ProposerGreeting greetRequest;
+static VoteRequest voteRequest; /* Vote request for safekeeper */
+static WaitEventSet *waitEvents;
+static AppendResponse quorumFeedback;
+/*
+ * Minimal LSN which may be needed for recovery of some safekeeper,
+ * record-aligned (the first record which might not yet have been received by
+ * someone).
+ */
+static XLogRecPtr truncateLsn;
+/*
+ * Term of the proposer. We want our term to be the highest and unique, so we
+ * collect terms from the safekeepers' quorum, choose the max and +1. After
+ * that our term is fixed and must not change. If we observe that some
+ * safekeeper has a higher term, it means that we have another running
+ * compute, so we must stop immediately.
+ */
+static term_t propTerm;
+static TermHistory propTermHistory;	/* term history of the proposer */
+static XLogRecPtr propEpochStartLsn;	/* epoch start lsn of the proposer */
+static term_t donorEpoch;		/* Most advanced acceptor epoch */
+static int	donor;				/* Most advanced acceptor */
+static XLogRecPtr timelineStartLsn;	/* timeline globally starts at this LSN */
+static int	n_votes = 0;
+static int	n_connected = 0;
+static TimestampTz last_reconnect_attempt;
+
+static WalproposerShmemState *walprop_shared;
+
+/* Prototypes for private functions */
+static void WalProposerInitImpl(XLogRecPtr flushRecPtr, uint64 systemId);
+static void WalProposerStartImpl(void);
+static void WalProposerLoop(void);
+static void InitEventSet(void);
+static void UpdateEventSet(Safekeeper *sk, uint32 events);
+static void HackyRemoveWalProposerEvent(Safekeeper *to_remove);
+static void ShutdownConnection(Safekeeper *sk);
+static void ResetConnection(Safekeeper *sk);
+static long TimeToReconnect(TimestampTz now);
+static void ReconnectSafekeepers(void);
+static void AdvancePollState(Safekeeper *sk, uint32 events);
+static void HandleConnectionEvent(Safekeeper *sk);
+static void SendStartWALPush(Safekeeper *sk);
+static void RecvStartWALPushResult(Safekeeper *sk);
+static void SendProposerGreeting(Safekeeper *sk);
+static void RecvAcceptorGreeting(Safekeeper *sk);
+static void SendVoteRequest(Safekeeper *sk);
+static void RecvVoteResponse(Safekeeper *sk);
+static void HandleElectedProposer(void);
+static term_t GetHighestTerm(TermHistory *th);
+static term_t GetEpoch(Safekeeper *sk);
+static void DetermineEpochStartLsn(void);
+static bool WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos);
+static void SendProposerElected(Safekeeper *sk);
+static void WalProposerStartStreaming(XLogRecPtr startpos);
+static void StartStreaming(Safekeeper *sk);
+static void SendMessageToNode(Safekeeper *sk);
+static void BroadcastAppendRequest(void);
+static void HandleActiveState(Safekeeper *sk, uint32 events);
+static bool SendAppendRequests(Safekeeper *sk);
+static bool RecvAppendResponses(Safekeeper *sk);
+static void CombineHotStanbyFeedbacks(HotStandbyFeedback * hs);
+static XLogRecPtr CalculateMinFlushLsn(void);
+static XLogRecPtr GetAcknowledgedByQuorumWALPosition(void);
+static void HandleSafekeeperResponse(void);
+static bool AsyncRead(Safekeeper *sk, char **buf, int *buf_size);
+static bool AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg);
+static bool BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState success_state);
+static bool AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_state);
+static bool AsyncFlush(Safekeeper *sk);
+
+
+static void nwp_shmem_startup_hook(void);
+static void nwp_register_gucs(void);
+static void nwp_prepare_shmem(void);
+static uint64 backpressure_lag_impl(void);
+
+
+static shmem_startup_hook_type prev_shmem_startup_hook_type;
+
+
+
+void
+pg_init_walproposer(void)
+{
+	if (!process_shared_preload_libraries_in_progress)
+		return;
+
+	nwp_register_gucs();
+
+	nwp_prepare_shmem();
+
+	delay_backend_us = &backpressure_lag_impl;
+
+	WalProposerRegister();
+
+	WalProposerInit = &WalProposerInitImpl;
+	WalProposerStart = &WalProposerStartImpl;
+}
+
+static void
+nwp_register_gucs(void)
+{
+	DefineCustomStringVariable(
+							   "neon.safekeepers",
+							   "List of Neon WAL acceptors (host:port)",
+							   NULL,	/* long_desc */
+							   &wal_acceptors_list, /* valueAddr */
+							   "",	/* bootValue */
+							   PGC_POSTMASTER,
+							   GUC_LIST_INPUT,	/* extensions can't use GUC_LIST_QUOTE */
+							   NULL, NULL, NULL
+		);
+
+	DefineCustomIntVariable(
+							"neon.safekeeper_reconnect_timeout",
+							"Timeout for reconnecting to an offline WAL acceptor.",
+							NULL,
+							&wal_acceptor_reconnect_timeout,
+							1000, 0, INT_MAX,	/* default, min, max */
+							PGC_SIGHUP, /* context */
+							GUC_UNIT_MS,	/* flags */
+							NULL, NULL, NULL
+		);
+
+	DefineCustomIntVariable(
+							"neon.safekeeper_connect_timeout",
+							"Timeout after which to give up a connection attempt to a safekeeper.",
+							NULL,
+							&wal_acceptor_connect_timeout,
+							5000, 0, INT_MAX,
+							PGC_SIGHUP,
+							GUC_UNIT_MS,
+							NULL, NULL, NULL
+		);
+}
+
+/* shmem handling */
+
+static void
+nwp_prepare_shmem(void)
+{
+	RequestAddinShmemSpace(WalproposerShmemSize());
+
+	prev_shmem_startup_hook_type = shmem_startup_hook;
+	shmem_startup_hook = nwp_shmem_startup_hook;
+}
+
+static void
+nwp_shmem_startup_hook(void)
+{
+	if (prev_shmem_startup_hook_type)
+		prev_shmem_startup_hook_type();
+
+	WalproposerShmemInit();
+}
+
+/*
+ * WAL proposer bgworker entry point.
+ */
+void
+WalProposerMain(Datum main_arg)
+{
+	/* Establish signal handlers. */
+	pqsignal(SIGUSR1, procsignal_sigusr1_handler);
+	pqsignal(SIGHUP, SignalHandlerForConfigReload);
+	pqsignal(SIGTERM, die);
+
+	BackgroundWorkerUnblockSignals();
+
+	GetXLogReplayRecPtr(&ThisTimeLineID);
+
+	WalProposerInit(GetFlushRecPtr(), GetSystemIdentifier());
+
+	last_reconnect_attempt = GetCurrentTimestamp();
+
+	application_name = (char *) "walproposer";	/* for
+												 * synchronous_standby_names */
+	am_wal_proposer = true;
+	am_walsender = true;
+	InitWalSender();
+	InitProcessPhase2();
+
+	/* Create a replication slot for the WAL proposer if it doesn't exist */
+	if (SearchNamedReplicationSlot(WAL_PROPOSER_SLOT_NAME, false) == NULL)
+	{
+		ReplicationSlotCreate(WAL_PROPOSER_SLOT_NAME, false, RS_PERSISTENT, false);
+		ReplicationSlotReserveWal();
+		/* Write this slot to disk */
+		ReplicationSlotMarkDirty();
+		ReplicationSlotSave();
+		ReplicationSlotRelease();
+	}
+
+	WalProposerStart();
+}
+
+/*
+ * Create a new AppendRequest message and start sending it. This function is
+ * called from walsender every time new WAL is available.
+ */
+void
+WalProposerBroadcast(XLogRecPtr startpos, XLogRecPtr endpos)
+{
+	Assert(startpos == availableLsn && endpos >= availableLsn);
+	availableLsn = endpos;
+	BroadcastAppendRequest();
+}
+
+/*
+ * Advance the WAL proposer state machine, waiting each time for events to
+ * occur. Will exit only when the latch is set, i.e. new WAL should be pushed
+ * from walsender to walproposer.
+ */
+void
+WalProposerPoll(void)
+{
+	while (true)
+	{
+		Safekeeper *sk;
+		int			rc;
+		WaitEvent	event;
+		TimestampTz now = GetCurrentTimestamp();
+
+		rc = WaitEventSetWait(waitEvents, TimeToReconnect(now),
+							  &event, 1, WAIT_EVENT_WAL_SENDER_MAIN);
+		sk = (Safekeeper *) event.user_data;
+
+		/*
+		 * If the event contains something that one of our safekeeper states
+		 * was waiting for, we'll advance its state.
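+		 * (For example, a safekeeper in SS_WAIT_VERDICT advances when its
+		 * vote response arrives and its socket becomes readable; see
+		 * AdvancePollState() below.)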
+		 */
+		if (rc != 0 && (event.events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)))
+			AdvancePollState(sk, event.events);
+
+		/*
+		 * If the timeout expired, attempt to reconnect to any safekeepers
+		 * that we dropped
+		 */
+		ReconnectSafekeepers();
+
+		/*
+		 * If the wait was terminated by a latch set (walsenders' latch is
+		 * set on each wal flush), then exit the loop. (no need for a pm
+		 * death check due to WL_EXIT_ON_PM_DEATH)
+		 */
+		if (rc != 0 && (event.events & WL_LATCH_SET))
+		{
+			ResetLatch(MyLatch);
+			break;
+		}
+		if (rc == 0)			/* timeout expired: poll state */
+		{
+			TimestampTz now;
+
+			/*
+			 * If no WAL was generated during the timeout (and we have
+			 * already collected the quorum), then send a keepalive message
+			 */
+			if (availableLsn != InvalidXLogRecPtr)
+			{
+				BroadcastAppendRequest();
+			}
+
+			/*
+			 * Abandon connection attempts which take too long.
+			 */
+			now = GetCurrentTimestamp();
+			for (int i = 0; i < n_safekeepers; i++)
+			{
+				Safekeeper *sk = &safekeeper[i];
+
+				if ((sk->state == SS_CONNECTING_WRITE ||
+					 sk->state == SS_CONNECTING_READ) &&
+					TimestampDifferenceExceeds(sk->startedConnAt, now,
+											   wal_acceptor_connect_timeout))
+				{
+					elog(WARNING, "failed to connect to node '%s:%s': exceeded connection timeout %dms",
+						 sk->host, sk->port, wal_acceptor_connect_timeout);
+					ShutdownConnection(sk);
+				}
+			}
+		}
+	}
+}
+
+/*
+ * Register a background worker proposing WAL to wal acceptors.
+ */
+void
+WalProposerRegister(void)
+{
+	BackgroundWorker bgw;
+
+	if (*wal_acceptors_list == '\0')
+		return;
+
+	memset(&bgw, 0, sizeof(bgw));
+	bgw.bgw_flags = BGWORKER_SHMEM_ACCESS;
+	bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
+	snprintf(bgw.bgw_library_name, BGW_MAXLEN, "neon");
+	snprintf(bgw.bgw_function_name, BGW_MAXLEN, "WalProposerMain");
+	snprintf(bgw.bgw_name, BGW_MAXLEN, "WAL proposer");
+	snprintf(bgw.bgw_type, BGW_MAXLEN, "WAL proposer");
+	bgw.bgw_restart_time = 5;
+	bgw.bgw_notify_pid = 0;
+	bgw.bgw_main_arg = (Datum) 0;
+
+	RegisterBackgroundWorker(&bgw);
+}
+
+static void
+WalProposerInitImpl(XLogRecPtr flushRecPtr, uint64 systemId)
+{
+	char	   *host;
+	char	   *sep;
+	char	   *port;
+
+	/* Load the libpq-specific functions */
+	if (WalProposerFunctions == NULL)
+		elog(ERROR, "libpqwalproposer didn't initialize correctly");
+
+	load_file("libpqwalreceiver", false);
+	if (WalReceiverFunctions == NULL)
+		elog(ERROR, "libpqwalreceiver didn't initialize correctly");
+
+	for (host = wal_acceptors_list; host != NULL && *host != '\0'; host = sep)
+	{
+		port = strchr(host, ':');
+		if (port == NULL)
+		{
+			elog(FATAL, "port is not specified");
+		}
+		*port++ = '\0';
+		sep = strchr(port, ',');
+		if (sep != NULL)
+			*sep++ = '\0';
+		if (n_safekeepers + 1 >= MAX_SAFEKEEPERS)
+		{
+			elog(FATAL, "Too many safekeepers");
+		}
+		safekeeper[n_safekeepers].host = host;
+		safekeeper[n_safekeepers].port = port;
+		safekeeper[n_safekeepers].state = SS_OFFLINE;
+		safekeeper[n_safekeepers].conn = NULL;
+
+		/*
+		 * Set conninfo to empty. We'll fill it in later, in
+		 * `ResetConnection`, as needed
+		 */
+		safekeeper[n_safekeepers].conninfo[0] = '\0';
+		initStringInfo(&safekeeper[n_safekeepers].outbuf);
+		safekeeper[n_safekeepers].xlogreader = XLogReaderAllocate(wal_segment_size, NULL, XL_ROUTINE(.segment_open = wal_segment_open, .segment_close = wal_segment_close), NULL);
+		if (safekeeper[n_safekeepers].xlogreader == NULL)
+			elog(FATAL, "Failed to allocate xlog reader");
+		safekeeper[n_safekeepers].flushWrite = false;
+		safekeeper[n_safekeepers].startStreamingAt = InvalidXLogRecPtr;
+		safekeeper[n_safekeepers].streamingAt = InvalidXLogRecPtr;
+		n_safekeepers += 1;
+	}
+	if (n_safekeepers < 1)
+	{
+		elog(FATAL, "Safekeeper addresses are not specified");
+	}
+	quorum = n_safekeepers / 2 + 1;
+
+	/* Fill the greeting package */
+	greetRequest.tag = 'g';
+	greetRequest.protocolVersion = SK_PROTOCOL_VERSION;
+	greetRequest.pgVersion = PG_VERSION_NUM;
+	pg_strong_random(&greetRequest.proposerId, sizeof(greetRequest.proposerId));
+	greetRequest.systemId = systemId;
+	if (!zenith_timeline_walproposer)
+		elog(FATAL, "neon.timeline_id is not provided");
+	if (*zenith_timeline_walproposer != '\0' &&
+		!HexDecodeString(greetRequest.ztimelineid, zenith_timeline_walproposer, 16))
+		elog(FATAL, "Could not parse neon.timeline_id, %s", zenith_timeline_walproposer);
+	if (!zenith_tenant_walproposer)
+		elog(FATAL, "neon.tenant_id is not provided");
+	if (*zenith_tenant_walproposer != '\0' &&
+		!HexDecodeString(greetRequest.ztenantid, zenith_tenant_walproposer, 16))
+		elog(FATAL, "Could not parse neon.tenant_id, %s", zenith_tenant_walproposer);
+
+	greetRequest.timeline = ThisTimeLineID;
+	greetRequest.walSegSize = wal_segment_size;
+
+	InitEventSet();
+}
+
+static void
+WalProposerStartImpl(void)
+{
+	/* Initiate connections to all safekeeper nodes */
+	for (int i = 0; i < n_safekeepers; i++)
+	{
+		ResetConnection(&safekeeper[i]);
+	}
+
+	WalProposerLoop();
+}
+
+static void
+WalProposerLoop(void)
+{
+	while (true)
+		WalProposerPoll();
+}
+
+/* Initializes the internal event set, provided that it is currently null */
+static void
+InitEventSet(void)
+{
+	if (waitEvents)
+		elog(FATAL, "double-initialization of event set");
+
+	waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + n_safekeepers);
+	AddWaitEventToSet(waitEvents, WL_LATCH_SET, PGINVALID_SOCKET,
+					  MyLatch, NULL);
+	AddWaitEventToSet(waitEvents, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
+					  NULL, NULL);
+}
+
+/*
+ * Updates the events we're already waiting on for the safekeeper, setting it
+ * to the provided `events`
+ *
+ * This function is called any time the safekeeper's state switches to one
+ * where it has to wait to continue. This includes the full body of
+ * AdvancePollState and calls to IO helper functions.
+ */
+static void
+UpdateEventSet(Safekeeper *sk, uint32 events)
+{
+	/* eventPos = -1 when we don't have an event */
+	Assert(sk->eventPos != -1);
+
+	ModifyWaitEvent(waitEvents, sk->eventPos, events, NULL);
+}
+
+/*
+ * Hack: provides a way to remove the event corresponding to an individual
+ * walproposer from the set.
+ *
+ * Note: Internally, this completely reconstructs the event set. It should be
+ * avoided if possible.
+ */
+static void
+HackyRemoveWalProposerEvent(Safekeeper *to_remove)
+{
+	/* Remove the existing event set */
+	if (waitEvents)
+	{
+		FreeWaitEventSet(waitEvents);
+		waitEvents = NULL;
+	}
+	/* Re-initialize it without adding any safekeeper events */
+	InitEventSet();
+
+	/*
+	 * loop through the existing safekeepers.
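+	 * (A full rebuild is the only way: WaitEventSet offers no API for
+	 * removing a single event once it has been added.)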
If they aren't the one we're + * removing, and if they have a socket we can use, re-add the applicable + * events. + */ + for (int i = 0; i < n_safekeepers; i++) + { + uint32 desired_events = WL_NO_EVENTS; + Safekeeper *sk = &safekeeper[i]; + + sk->eventPos = -1; + + if (sk == to_remove) + continue; + + /* If this safekeeper isn't offline, add an event for it! */ + if (sk->conn != NULL) + { + desired_events = SafekeeperStateDesiredEvents(sk->state); + sk->eventPos = AddWaitEventToSet(waitEvents, desired_events, walprop_socket(sk->conn), NULL, sk); + } + } +} + +/* Shuts down and cleans up the connection for a safekeeper. Sets its state to SS_OFFLINE */ +static void +ShutdownConnection(Safekeeper *sk) +{ + if (sk->conn) + walprop_finish(sk->conn); + sk->conn = NULL; + sk->state = SS_OFFLINE; + sk->flushWrite = false; + sk->streamingAt = InvalidXLogRecPtr; + + if (sk->voteResponse.termHistory.entries) + pfree(sk->voteResponse.termHistory.entries); + sk->voteResponse.termHistory.entries = NULL; + + HackyRemoveWalProposerEvent(sk); +} + +/* + * This function is called to establish new connection or to reestablish + * connection in case of connection failure. + * + * On success, sets the state to SS_CONNECTING_WRITE. + */ +static void +ResetConnection(Safekeeper *sk) +{ + pgsocket sock; /* socket of the new connection */ + + if (sk->state != SS_OFFLINE) + { + ShutdownConnection(sk); + } + + /* + * Try to establish new connection + * + * If the connection information hasn't been filled out, we need to do + * that here. + */ + if (sk->conninfo[0] == '\0') + { + int written = 0; + written = snprintf((char *) &sk->conninfo, MAXCONNINFO, + "host=%s port=%s dbname=replication options='-c ztimelineid=%s ztenantid=%s'", + sk->host, sk->port, zenith_timeline_walproposer, zenith_tenant_walproposer); + // currently connection string is not that long, but once we pass something like jwt we might overflow the buffer, + // so it is better to be defensive and check that everything aligns well + if (written > MAXCONNINFO || written < 0) + elog(FATAL, "could not create connection string for safekeeper %s:%s", sk->host, sk->port); + } + + sk->conn = walprop_connect_start((char *) &sk->conninfo); + + /* + * "If the result is null, then libpq has been unable to allocate a new + * PGconn structure" + */ + if (!sk->conn) + elog(FATAL, "failed to allocate new PGconn object"); + + /* + * PQconnectStart won't actually start connecting until we run + * PQconnectPoll. Before we do that though, we need to check that it + * didn't immediately fail. + */ + if (walprop_status(sk->conn) == WP_CONNECTION_BAD) + { + /*--- + * According to libpq docs: + * "If the result is CONNECTION_BAD, the connection attempt has already failed, + * typically because of invalid connection parameters." + * We should report this failure. + * + * https://www.postgresql.org/docs/devel/libpq-connect.html#LIBPQ-PQCONNECTSTARTPARAMS + */ + elog(WARNING, "Immediate failure to connect with node:\n\t%s\n\terror: %s", + sk->conninfo, walprop_error_message(sk->conn)); + + /* + * Even though the connection failed, we still need to clean up the + * object + */ + walprop_finish(sk->conn); + sk->conn = NULL; + return; + } + + /* + * The documentation for PQconnectStart states that we should call + * PQconnectPoll in a loop until it returns PGRES_POLLING_OK or + * PGRES_POLLING_FAILED. The other two possible returns indicate whether + * we should wait for reading or writing on the socket. 
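One nuance in the truncation check above: snprintf returns the number of characters it *would* have written, so truncation is signalled by a return value greater than or equal to the buffer size; `written > MAXCONNINFO` misses the exact-fit case. The conventional idiom, as a sketch with a hypothetical helper:

```c
#include <stdbool.h>
#include <stdio.h>

/* Returns true iff the formatted string fit into dst (hypothetical sketch). */
static bool
format_conninfo(char *dst, size_t size, const char *host, const char *port)
{
	int			written = snprintf(dst, size, "host=%s port=%s dbname=replication",
								   host, port);

	/* snprintf reports the length it wanted to write; >= size means truncated */
	return written >= 0 && (size_t) written < size;
}
```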
+	 * For the first iteration of the loop, we're expected to wait until the
+	 * socket becomes writable.
+	 *
+	 * The wording of the documentation is a little ambiguous; thankfully
+	 * there's an example in the postgres source itself showing this behavior.
+	 * (see libpqrcv_connect, defined in
+	 * src/backend/replication/libpqwalreceiver/libpqwalreceiver.c)
+	 */
+	elog(LOG, "connecting with node %s:%s", sk->host, sk->port);
+
+	sk->state = SS_CONNECTING_WRITE;
+	sk->startedConnAt = GetCurrentTimestamp();
+
+	sock = walprop_socket(sk->conn);
+	sk->eventPos = AddWaitEventToSet(waitEvents, WL_SOCKET_WRITEABLE, sock, NULL, sk);
+	return;
+}
+
+/*
+ * How many milliseconds are left until we should attempt reconnection to
+ * safekeepers? Returns 0 if it is already high time, -1 if we never reconnect
+ * (do we actually need this?).
+ */
+static long
+TimeToReconnect(TimestampTz now)
+{
+	TimestampTz passed;
+	TimestampTz till_reconnect;
+
+	if (wal_acceptor_reconnect_timeout <= 0)
+		return -1;
+
+	passed = now - last_reconnect_attempt;
+	till_reconnect = wal_acceptor_reconnect_timeout * 1000 - passed;
+	if (till_reconnect <= 0)
+		return 0;
+	return (long) (till_reconnect / 1000);
+}
+
+/* If the timeout has expired, attempt to reconnect to all offline safekeepers */
+static void
+ReconnectSafekeepers(void)
+{
+	TimestampTz now = GetCurrentTimestamp();
+
+	if (TimeToReconnect(now) == 0)
+	{
+		last_reconnect_attempt = now;
+		for (int i = 0; i < n_safekeepers; i++)
+		{
+			if (safekeeper[i].state == SS_OFFLINE)
+				ResetConnection(&safekeeper[i]);
+		}
+	}
+}
+
+/*
+ * Performs the logic for advancing the state machine of the specified
+ * safekeeper, given that a certain set of events has occurred.
+ */
+static void
+AdvancePollState(Safekeeper *sk, uint32 events)
+{
+	/*
+	 * Sanity check. We assume further down that the operations don't
+	 * block because the socket is ready.
+	 */
+	AssertEventsOkForState(events, sk);
+
+	/* Execute the code corresponding to the current state */
+	switch (sk->state)
+	{
+			/*
+			 * safekeepers are only taken out of SS_OFFLINE by calls to
+			 * ResetConnection
+			 */
+		case SS_OFFLINE:
+			elog(FATAL, "Unexpected safekeeper %s:%s state advancement: is offline",
+				 sk->host, sk->port);
+			break;				/* actually unreachable, but prevents
+								 * -Wimplicit-fallthrough */
+
+			/*
+			 * Both connecting states run the same logic. The only
+			 * difference is the events they're expecting
+			 */
+		case SS_CONNECTING_READ:
+		case SS_CONNECTING_WRITE:
+			HandleConnectionEvent(sk);
+			break;
+
+			/*
+			 * Waiting for a successful CopyBoth response.
+			 */
+		case SS_WAIT_EXEC_RESULT:
+			RecvStartWALPushResult(sk);
+			break;
+
+			/*
+			 * Finish handshake comms: receive information about the safekeeper.
+			 */
+		case SS_HANDSHAKE_RECV:
+			RecvAcceptorGreeting(sk);
+			break;
+
+			/*
+			 * Voting is an idle state - we don't expect any events to trigger.
+			 * Refer to the execution of SS_HANDSHAKE_RECV to see how nodes are
+			 * transferred from SS_VOTING to sending actual vote requests.
+			 */
+		case SS_VOTING:
+			elog(WARNING, "EOF from node %s:%s in %s state", sk->host,
+				 sk->port, FormatSafekeeperState(sk->state));
+			ResetConnection(sk);
+			return;
+
+			/* Read the safekeeper response for our candidate */
+		case SS_WAIT_VERDICT:
+			RecvVoteResponse(sk);
+			break;
+
+			/* Flush proposer announcement message */
+		case SS_SEND_ELECTED_FLUSH:
+
+			/*
+			 * AsyncFlush ensures we only move on to SS_ACTIVE once the flush
+			 * completes. If we still have more to do, we'll wait until the
+			 * next poll comes along.
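TimeToReconnect above mixes units: TimestampTz is in microseconds while wal_acceptor_reconnect_timeout is in milliseconds, hence the `* 1000` and `/ 1000`. A self-contained restatement (hypothetical standalone types):

```c
#include <stdint.h>

typedef int64_t TimestampTz;	/* microseconds, as in PostgreSQL */

/* Milliseconds until the next reconnect attempt: 0 if due, -1 if disabled. */
static long
time_to_reconnect(TimestampTz now, TimestampTz last_attempt, int timeout_ms)
{
	int64_t		remaining_us;

	if (timeout_ms <= 0)
		return -1;				/* reconnection disabled */

	remaining_us = (int64_t) timeout_ms * 1000 - (now - last_attempt);
	return remaining_us <= 0 ? 0 : (long) (remaining_us / 1000);
}
```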
+ */ + if (!AsyncFlush(sk)) + return; + + /* flush is done, event set and state will be updated later */ + StartStreaming(sk); + break; + + /* + * Idle state for waiting votes from quorum. + */ + case SS_IDLE: + elog(WARNING, "EOF from node %s:%s in %s state", sk->host, + sk->port, FormatSafekeeperState(sk->state)); + ResetConnection(sk); + return; + + /* + * Active state is used for streaming WAL and receiving feedback. + */ + case SS_ACTIVE: + HandleActiveState(sk, events); + break; + } +} + +static void +HandleConnectionEvent(Safekeeper *sk) +{ + WalProposerConnectPollStatusType result = walprop_connect_poll(sk->conn); + + /* The new set of events we'll wait on, after updating */ + uint32 new_events = WL_NO_EVENTS; + + switch (result) + { + case WP_CONN_POLLING_OK: + elog(LOG, "connected with node %s:%s", sk->host, + sk->port); + + /* + * We have to pick some event to update event set. + * We'll eventually need the socket to be readable, + * so we go with that. + */ + new_events = WL_SOCKET_READABLE; + break; + + /* + * If we need to poll to finish connecting, + * continue doing that + */ + case WP_CONN_POLLING_READING: + sk->state = SS_CONNECTING_READ; + new_events = WL_SOCKET_READABLE; + break; + case WP_CONN_POLLING_WRITING: + sk->state = SS_CONNECTING_WRITE; + new_events = WL_SOCKET_WRITEABLE; + break; + + case WP_CONN_POLLING_FAILED: + elog(WARNING, "failed to connect to node '%s:%s': %s", + sk->host, sk->port, walprop_error_message(sk->conn)); + + /* + * If connecting failed, we don't want to restart + * the connection because that might run us into a + * loop. Instead, shut it down -- it'll naturally + * restart at a slower interval on calls to + * ReconnectSafekeepers. + */ + ShutdownConnection(sk); + return; + } + + /* + * Because PQconnectPoll can change the socket, we have to + * un-register the old event and re-register an event on + * the new socket. + */ + HackyRemoveWalProposerEvent(sk); + sk->eventPos = AddWaitEventToSet(waitEvents, new_events, walprop_socket(sk->conn), NULL, sk); + + /* If we successfully connected, send START_WAL_PUSH query */ + if (result == WP_CONN_POLLING_OK) + SendStartWALPush(sk); +} + +/* + * Send "START_WAL_PUSH" message as an empty query to the safekeeper. Performs + * a blocking send, then immediately moves to SS_WAIT_EXEC_RESULT. If something + * goes wrong, change state to SS_OFFLINE and shutdown the connection. + */ +static void +SendStartWALPush(Safekeeper *sk) +{ + if (!walprop_send_query(sk->conn, "START_WAL_PUSH")) + { + elog(WARNING, "Failed to send 'START_WAL_PUSH' query to safekeeper %s:%s: %s", + sk->host, sk->port, walprop_error_message(sk->conn)); + ShutdownConnection(sk); + return; + } + sk->state = SS_WAIT_EXEC_RESULT; + UpdateEventSet(sk, WL_SOCKET_READABLE); +} + +static void +RecvStartWALPushResult(Safekeeper *sk) +{ + switch (walprop_get_query_result(sk->conn)) + { + /* + * Successful result, move on to starting the + * handshake + */ + case WP_EXEC_SUCCESS_COPYBOTH: + + SendProposerGreeting(sk); + break; + + /* + * Needs repeated calls to finish. 
+			 * Wait until the socket is readable
+			 */
+		case WP_EXEC_NEEDS_INPUT:
+
+			/*
+			 * SS_WAIT_EXEC_RESULT is always reached through an
+			 * event, so we don't need to update the event set
+			 */
+			break;
+
+		case WP_EXEC_FAILED:
+			elog(WARNING, "Failed to send query to safekeeper %s:%s: %s",
+				 sk->host, sk->port, walprop_error_message(sk->conn));
+			ShutdownConnection(sk);
+			return;
+
+			/*
+			 * Unexpected result -- fundamentally an error, but we
+			 * want to produce a custom message, rather than a
+			 * generic "something went wrong"
+			 */
+		case WP_EXEC_UNEXPECTED_SUCCESS:
+			elog(WARNING, "Received bad response from safekeeper %s:%s query execution",
+				 sk->host, sk->port);
+			ShutdownConnection(sk);
+			return;
+	}
+}
+
+/*
+ * Start handshake: first of all, send our information to the
+ * safekeeper. After sending, we wait on SS_HANDSHAKE_RECV for
+ * a response to finish the handshake.
+ */
+static void
+SendProposerGreeting(Safekeeper *sk)
+{
+	/*
+	 * On failure, logging & resetting the connection is handled.
+	 * We just need to handle the control flow.
+	 */
+	BlockingWrite(sk, &greetRequest, sizeof(greetRequest), SS_HANDSHAKE_RECV);
+}
+
+static void
+RecvAcceptorGreeting(Safekeeper *sk)
+{
+	/*
+	 * If our reading doesn't immediately succeed, any necessary
+	 * error handling or state setting is taken care of. We can
+	 * leave any other work until later.
+	 */
+	sk->greetResponse.apm.tag = 'g';
+	if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->greetResponse))
+		return;
+
+	/* Protocol is all good, move to voting. */
+	sk->state = SS_VOTING;
+
+	++n_connected;
+	if (n_connected <= quorum)
+	{
+		/* We're still collecting terms from the majority. */
+		propTerm = Max(sk->greetResponse.term, propTerm);
+
+		/* Quorum is acquired, prepare the vote request. */
+		if (n_connected == quorum)
+		{
+			propTerm++;
+			elog(LOG, "proposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT, quorum, propTerm);
+
+			voteRequest = (VoteRequest)
+			{
+				.tag = 'v',
+				.term = propTerm
+			};
+			memcpy(voteRequest.proposerId.data, greetRequest.proposerId.data, UUID_LEN);
+		}
+	}
+	else if (sk->greetResponse.term > propTerm)
+	{
+		/* Another compute with higher term is running. */
+		elog(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "",
+			 sk->host, sk->port,
+			 sk->greetResponse.term, propTerm);
+	}
+
+	/*
+	 * Check if we have quorum. If there aren't enough safekeepers,
+	 * wait and do nothing. We'll eventually get a task when the
+	 * election starts.
+	 *
+	 * If we do have quorum, we can start an election.
+	 */
+	if (n_connected < quorum)
+	{
+		/*
+		 * SS_VOTING is an idle state; read-ready indicates the
+		 * connection closed.
+		 */
+		UpdateEventSet(sk, WL_SOCKET_READABLE);
+	}
+	else
+	{
+		/*
+		 * Now send the voting request to the cohort and wait for
+		 * responses
+		 */
+		for (int j = 0; j < n_safekeepers; j++)
+		{
+			/*
+			 * Remember: SS_VOTING indicates that the safekeeper is
+			 * participating in voting, but hasn't sent anything
+			 * yet.
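The election arithmetic above, condensed: the proposer collects terms from the first quorum of greetings and then picks a term strictly greater than any of them. A standalone sketch (hypothetical names):

```c
#include <stdint.h>

/* Pick a proposer term strictly greater than every term seen so far. */
static uint64_t
choose_proposer_term(const uint64_t *seen_terms, int n)
{
	uint64_t	term = 0;

	for (int i = 0; i < n; i++)
		if (seen_terms[i] > term)
			term = seen_terms[i];
	return term + 1;
}
```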
+ */ + if (safekeeper[j].state == SS_VOTING) + SendVoteRequest(&safekeeper[j]); + } + } +} + +static void +SendVoteRequest(Safekeeper *sk) +{ + /* We have quorum for voting, send our vote request */ + elog(LOG, "requesting vote from %s:%s for term " UINT64_FORMAT, sk->host, sk->port, voteRequest.term); + /* On failure, logging & resetting is handled */ + if (!BlockingWrite(sk, &voteRequest, sizeof(voteRequest), SS_WAIT_VERDICT)) + return; + + /* If successful, wait for read-ready with SS_WAIT_VERDICT */ +} + +static void +RecvVoteResponse(Safekeeper *sk) +{ + sk->voteResponse.apm.tag = 'v'; + if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->voteResponse)) + return; + + elog(LOG, + "got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X, timelineStartLsn=%X/%X", + sk->host, sk->port, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory), + LSN_FORMAT_ARGS(sk->voteResponse.flushLsn), + LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn), + LSN_FORMAT_ARGS(sk->voteResponse.timelineStartLsn)); + + /* + * In case of acceptor rejecting our vote, bail out, but only + * if either it already lives in strictly higher term + * (concurrent compute spotted) or we are not elected yet and + * thus need the vote. + */ + if ((!sk->voteResponse.voteGiven) && + (sk->voteResponse.term > propTerm || n_votes < quorum)) + { + elog(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", + sk->host, sk->port, + sk->voteResponse.term, propTerm); + } + Assert(sk->voteResponse.term == propTerm); + + /* Handshake completed, do we have quorum? */ + n_votes++; + if (n_votes < quorum) + { + sk->state = SS_IDLE; /* can't do much yet, no quorum */ + } + else if (n_votes > quorum) + { + /* recovery already performed, just start streaming */ + SendProposerElected(sk); + } + else + { + sk->state = SS_IDLE; + UpdateEventSet(sk, WL_SOCKET_READABLE); /* Idle states wait for + * read-ready */ + + HandleElectedProposer(); + } +} + +/* + * Called once a majority of acceptors have voted for us and current proposer + * has been elected. + * + * Sends ProposerElected message to all acceptors in SS_IDLE state and starts + * replication from walsender. + */ +static void +HandleElectedProposer(void) +{ + DetermineEpochStartLsn(); + + /* + * Check if not all safekeepers are up-to-date, we need to + * download WAL needed to synchronize them + */ + if (truncateLsn < propEpochStartLsn) + { + elog(LOG, + "start recovery because truncateLsn=%X/%X is not " + "equal to epochStartLsn=%X/%X", + LSN_FORMAT_ARGS(truncateLsn), + LSN_FORMAT_ARGS(propEpochStartLsn)); + /* Perform recovery */ + if (!WalProposerRecovery(donor, greetRequest.timeline, truncateLsn, propEpochStartLsn)) + elog(FATAL, "Failed to recover state"); + } + else if (syncSafekeepers) + { + /* Sync is not needed: just exit */ + fprintf(stdout, "%X/%X\n", LSN_FORMAT_ARGS(propEpochStartLsn)); + exit(0); + } + + for (int i = 0; i < n_safekeepers; i++) + { + if (safekeeper[i].state == SS_IDLE) + SendProposerElected(&safekeeper[i]); + } + + /* + * The proposer has been elected, and there will be no quorum waiting + * after this point. There will be no safekeeper with state SS_IDLE + * also, because that state is used only for quorum waiting. 
+	 */
+
+	if (syncSafekeepers)
+	{
+		/*
+		 * Send an empty message to enforce receiving feedback even from
+		 * nodes that are fully recovered; this is required to learn that
+		 * they switched the epoch, which finishes sync-safekeepers runs
+		 * that don't generate any real new records. Will go away once we
+		 * switch to async acks.
+		 */
+		BroadcastAppendRequest();
+
+		/* keep polling until all safekeepers are synced */
+		return;
+	}
+
+	WalProposerStartStreaming(propEpochStartLsn);
+	/* Should not return here */
+}
+
+/* latest term in TermHistory, or 0 if there are no entries */
+static term_t
+GetHighestTerm(TermHistory *th)
+{
+	return th->n_entries > 0 ? th->entries[th->n_entries - 1].term : 0;
+}
+
+/* safekeeper's epoch is the term of the highest entry in the log */
+static term_t
+GetEpoch(Safekeeper *sk)
+{
+	return GetHighestTerm(&sk->voteResponse.termHistory);
+}
+
+/* If LSN points to the page header, skip it */
+static XLogRecPtr
+SkipXLogPageHeader(XLogRecPtr lsn)
+{
+	if (XLogSegmentOffset(lsn, wal_segment_size) == 0)
+	{
+		lsn += SizeOfXLogLongPHD;
+	}
+	else if (lsn % XLOG_BLCKSZ == 0)
+	{
+		lsn += SizeOfXLogShortPHD;
+	}
+	return lsn;
+}
+
+/*
+ * Called after a majority of acceptors have voted. It calculates the most
+ * advanced safekeeper (who will be the donor) and epochStartLsn -- the LSN
+ * since which we'll write WAL in our term.
+ *
+ * Sets truncateLsn along the way (though it is not of much use at this point --
+ * only for skipping recovery).
+ */
+static void
+DetermineEpochStartLsn(void)
+{
+	TermHistory *dth;
+
+	propEpochStartLsn = InvalidXLogRecPtr;
+	donorEpoch = 0;
+	truncateLsn = InvalidXLogRecPtr;
+	timelineStartLsn = InvalidXLogRecPtr;
+
+	for (int i = 0; i < n_safekeepers; i++)
+	{
+		if (safekeeper[i].state == SS_IDLE)
+		{
+			if (GetEpoch(&safekeeper[i]) > donorEpoch ||
+				(GetEpoch(&safekeeper[i]) == donorEpoch &&
+				 safekeeper[i].voteResponse.flushLsn > propEpochStartLsn))
+			{
+				donorEpoch = GetEpoch(&safekeeper[i]);
+				propEpochStartLsn = safekeeper[i].voteResponse.flushLsn;
+				donor = i;
+			}
+			truncateLsn = Max(safekeeper[i].voteResponse.truncateLsn, truncateLsn);
+
+			if (safekeeper[i].voteResponse.timelineStartLsn != InvalidXLogRecPtr)
+			{
+				/* timelineStartLsn should be the same everywhere or unknown */
+				if (timelineStartLsn != InvalidXLogRecPtr &&
+					timelineStartLsn != safekeeper[i].voteResponse.timelineStartLsn)
+				{
+					elog(WARNING,
+						 "inconsistent timelineStartLsn: current %X/%X, received %X/%X",
+						 LSN_FORMAT_ARGS(timelineStartLsn),
+						 LSN_FORMAT_ARGS(safekeeper[i].voteResponse.timelineStartLsn));
+				}
+				timelineStartLsn = safekeeper[i].voteResponse.timelineStartLsn;
+			}
+		}
+	}
+
+	/*
+	 * If propEpochStartLsn is 0 everywhere, we are bootstrapping -- nothing
+	 * was committed yet. Then start streaming from the basebackup LSN.
+	 */
+	if (propEpochStartLsn == InvalidXLogRecPtr && !syncSafekeepers)
+	{
+		propEpochStartLsn = truncateLsn = GetRedoStartLsn();
+		if (timelineStartLsn == InvalidXLogRecPtr)
+		{
+			timelineStartLsn = GetRedoStartLsn();
+		}
+		elog(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(propEpochStartLsn));
+	}
+
+	/*
+	 * If propEpochStartLsn is not 0, at least one msg with WAL was sent to
+	 * some connected safekeeper; it must have carried truncateLsn pointing to
+	 * the first record.
+	 */
+	Assert((truncateLsn != InvalidXLogRecPtr) ||
+		   (syncSafekeepers && truncateLsn == propEpochStartLsn));
+
+	/*
+	 * We will be generating WAL since propEpochStartLsn, so we should set
+	 * availableLsn to mark this LSN as the latest available position.
+	 */
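The donor choice in DetermineEpochStartLsn is a lexicographic maximum over (epoch, flushLsn), taken only over safekeepers that voted. Isolated as a sketch (hypothetical struct):

```c
#include <stdint.h>

typedef struct
{
	uint64_t	epoch;			/* term of the last entry in the term history */
	uint64_t	flush_lsn;		/* end of WAL on this safekeeper */
} SkVote;

/* Index of the most advanced safekeeper: highest epoch, then highest flushLsn. */
static int
choose_donor(const SkVote *votes, int n)
{
	int			donor = 0;

	for (int i = 1; i < n; i++)
		if (votes[i].epoch > votes[donor].epoch ||
			(votes[i].epoch == votes[donor].epoch &&
			 votes[i].flush_lsn > votes[donor].flush_lsn))
			donor = i;
	return donor;
}
```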
+	availableLsn = propEpochStartLsn;
+
+	/*
+	 * Proposer's term history is the donor's + its own entry.
+	 */
+	dth = &safekeeper[donor].voteResponse.termHistory;
+	propTermHistory.n_entries = dth->n_entries + 1;
+	propTermHistory.entries = palloc(sizeof(TermSwitchEntry) * propTermHistory.n_entries);
+	memcpy(propTermHistory.entries, dth->entries, sizeof(TermSwitchEntry) * dth->n_entries);
+	propTermHistory.entries[propTermHistory.n_entries - 1].term = propTerm;
+	propTermHistory.entries[propTermHistory.n_entries - 1].lsn = propEpochStartLsn;
+
+	elog(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X",
+		 quorum,
+		 propTerm,
+		 LSN_FORMAT_ARGS(propEpochStartLsn),
+		 safekeeper[donor].host, safekeeper[donor].port,
+		 LSN_FORMAT_ARGS(truncateLsn));
+
+	/*
+	 * Ensure the basebackup we are running (at RedoStartLsn) matches the LSN
+	 * since which we are going to write according to the consensus. If not,
+	 * we must bail out, as clog and other non-rel data is inconsistent.
+	 */
+	if (!syncSafekeepers)
+	{
+		/*
+		 * Basebackup LSN always points to the beginning of the record (not
+		 * the page), as StartupXLOG most probably wants it this way.
+		 * Safekeepers don't skip the header as they need a continuous stream
+		 * of data, so correct the LSN for comparison.
+		 */
+		if (SkipXLogPageHeader(propEpochStartLsn) != GetRedoStartLsn())
+		{
+			/*
+			 * However, allow proceeding if the previously elected leader was
+			 * me; a plain restart of walproposer without an intervening
+			 * concurrent compute (who could generate WAL) is ok.
+			 */
+			if (!((dth->n_entries >= 1) && (dth->entries[dth->n_entries - 1].term ==
+											walprop_shared->mineLastElectedTerm)))
+			{
+				elog(PANIC,
+					 "collected propEpochStartLsn %X/%X, but basebackup LSN %X/%X",
+					 LSN_FORMAT_ARGS(propEpochStartLsn),
+					 LSN_FORMAT_ARGS(GetRedoStartLsn()));
+			}
+		}
+		walprop_shared->mineLastElectedTerm = propTerm;
+	}
+}
+
+/*
+ * Receive WAL from the most advanced safekeeper
+ */
+static bool
+WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos)
+{
+	char		conninfo[MAXCONNINFO];
+	char	   *err;
+	WalReceiverConn *wrconn;
+	WalRcvStreamOptions options;
+
+	sprintf(conninfo, "host=%s port=%s dbname=replication options='-c ztimelineid=%s ztenantid=%s'",
+			safekeeper[donor].host, safekeeper[donor].port, zenith_timeline_walproposer, zenith_tenant_walproposer);
+	wrconn = walrcv_connect(conninfo, false, "wal_proposer_recovery", &err);
+	if (!wrconn)
+	{
+		ereport(WARNING,
+				(errmsg("could not connect to WAL acceptor %s:%s: %s",
+						safekeeper[donor].host, safekeeper[donor].port,
+						err)));
+		return false;
+	}
+	elog(LOG,
+		 "start recovery from %s:%s starting from %X/%08X till %X/%08X timeline "
+		 "%d",
+		 safekeeper[donor].host, safekeeper[donor].port, (uint32) (startpos >> 32),
+		 (uint32) startpos, (uint32) (endpos >> 32), (uint32) endpos, timeline);
+
+	options.logical = false;
+	options.startpoint = startpos;
+	options.slotname = NULL;
+	options.proto.physical.startpointTLI = timeline;
+
+	if (walrcv_startstreaming(wrconn, &options))
+	{
+		XLogRecPtr	rec_start_lsn;
+		XLogRecPtr	rec_end_lsn = 0;
+		int			len;
+		char	   *buf;
+		pgsocket	wait_fd = PGINVALID_SOCKET;
+
+		while ((len = walrcv_receive(wrconn, &buf, &wait_fd)) >= 0)
+		{
+			if (len == 0)
+			{
+				(void) WaitLatchOrSocket(
+										 MyLatch, WL_EXIT_ON_PM_DEATH | WL_SOCKET_READABLE, wait_fd,
+										 -1, WAIT_EVENT_WAL_RECEIVER_MAIN);
+			}
+			else
+			{
+				Assert(buf[0] == 'w' || buf[0] == 'k');
+				if (buf[0] == 'k')
+					continue;	/* keepalive */
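+				/*
+				 * 'w' (XLogData) messages carry the WAL start position in
+				 * network byte order right after the one-byte type; the
+				 * payload itself begins at XLOG_HDR_SIZE.
+				 */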
+				memcpy(&rec_start_lsn, &buf[XLOG_HDR_START_POS],
+					   sizeof rec_start_lsn);
+				rec_start_lsn = pg_ntoh64(rec_start_lsn);
+				rec_end_lsn = rec_start_lsn + len - XLOG_HDR_SIZE;
+
+				/* write WAL to disk */
+				XLogWalPropWrite(&buf[XLOG_HDR_SIZE], len - XLOG_HDR_SIZE, rec_start_lsn);
+
+				ereport(DEBUG1,
+						(errmsg("Recover message %X/%X length %d",
+								LSN_FORMAT_ARGS(rec_start_lsn), len)));
+				if (rec_end_lsn >= endpos)
+					break;
+			}
+		}
+		ereport(LOG,
+				(errmsg("end of replication stream at %X/%X: %m",
+						LSN_FORMAT_ARGS(rec_end_lsn))));
+		walrcv_disconnect(wrconn);
+
+		/* failed to receive all WAL till endpos */
+		if (rec_end_lsn < endpos)
+			return false;
+	}
+	else
+	{
+		ereport(LOG,
+				(errmsg("primary server contains no more WAL on requested timeline %u LSN %X/%08X",
+						timeline, (uint32) (startpos >> 32), (uint32) startpos)));
+		return false;
+	}
+
+	return true;
+}
+
+/*
+ * Determine for sk the starting streaming point and send it a message
+ * 1) announcing that we are the elected proposer (which immediately advances
+ *    the epoch if the safekeeper is synced, being important for
+ *    sync-safekeepers)
+ * 2) communicating the starting streaming point -- the safekeeper must
+ *    truncate its WAL beyond it -- and the history of term switches.
+ *
+ * Sets sk->startStreamingAt.
+ */
+static void
+SendProposerElected(Safekeeper *sk)
+{
+	ProposerElected msg;
+	TermHistory *th;
+	term_t		lastCommonTerm;
+	int			i;
+
+	/*
+	 * Determine the start LSN by comparing the safekeeper's log term switch
+	 * history and the proposer's, searching for the divergence point.
+	 *
+	 * Note: there is a vanishingly small chance of no common point even if
+	 * there is some WAL on the safekeeper, if immediately after bootstrap
+	 * the compute wrote some WAL on a single sk and died; we stream since
+	 * the beginning then.
+	 */
+	th = &sk->voteResponse.termHistory;
+
+	/*
+	 * If any WAL is present on the sk, it must be authorized by some term.
+	 * OTOH, without any WAL there are no term switches in the log.
+	 */
+	Assert((th->n_entries == 0) ==
+		   (sk->voteResponse.flushLsn == InvalidXLogRecPtr));
+	/* We must start somewhere. */
+	Assert(propTermHistory.n_entries >= 1);
+
+	for (i = 0; i < Min(propTermHistory.n_entries, th->n_entries); i++)
+	{
+		if (propTermHistory.entries[i].term != th->entries[i].term)
+			break;
+		/* term must begin everywhere at the same point */
+		Assert(propTermHistory.entries[i].lsn == th->entries[i].lsn);
+	}
+	i--;						/* step back to the last common term */
+	if (i < 0)
+	{
+		/* safekeeper is empty or no common point, start from the beginning */
+		sk->startStreamingAt = propTermHistory.entries[0].lsn;
+
+		if (sk->startStreamingAt < truncateLsn)
+		{
+			/*
+			 * There's a gap between the WAL starting point and truncateLsn,
+			 * which can't appear in a normal working cluster. That gap means
+			 * that all safekeepers reported that they had persisted WAL up
+			 * to truncateLsn before, but now the current safekeeper tells
+			 * otherwise.
+			 *
+			 * Also we have a special condition here, which is an empty
+			 * safekeeper with no history. In combination with a gap, that
+			 * can happen when we introduce a new safekeeper to the cluster.
+			 * This is a rare case, which is triggered manually for now, and
+			 * should be treated with care.
+			 */
+
+			/*
+			 * truncateLsn will not change without an ack from the current
+			 * safekeeper, and it's aligned to the WAL record, so we can
+			 * safely start streaming from this point.
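The divergence search above reduces to: walk both term histories in lockstep and return the index of the last common entry. As an isolated sketch (hypothetical standalone types):

```c
#include <stdint.h>

typedef struct
{
	uint64_t	term;
	uint64_t	lsn;
} Entry;

/* Index of the last entry common to both histories, or -1 if none. */
static int
last_common_index(const Entry *a, int na, const Entry *b, int nb)
{
	int			i;
	int			n = na < nb ? na : nb;

	for (i = 0; i < n; i++)
		if (a[i].term != b[i].term)
			break;
	return i - 1;
}
```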
+ */ + sk->startStreamingAt = truncateLsn; + + elog(WARNING, "empty safekeeper joined cluster as %s:%s, historyStart=%X/%X, sk->startStreamingAt=%X/%X", + sk->host, sk->port, LSN_FORMAT_ARGS(propTermHistory.entries[0].lsn), + LSN_FORMAT_ARGS(sk->startStreamingAt)); + } + } + else + { + /* + * End of (common) term is the start of the next except it is the last + * one; there it is flush_lsn in case of safekeeper or, in case of + * proposer, LSN it is currently writing, but then we just pick + * safekeeper pos as it obviously can't be higher. + */ + if (propTermHistory.entries[i].term == propTerm) + { + sk->startStreamingAt = sk->voteResponse.flushLsn; + } + else + { + XLogRecPtr propEndLsn = propTermHistory.entries[i + 1].lsn; + XLogRecPtr skEndLsn = (i + 1 < th->n_entries ? th->entries[i + 1].lsn : + sk->voteResponse.flushLsn); + sk->startStreamingAt = Min(propEndLsn, skEndLsn); + } + } + + Assert(sk->startStreamingAt >= truncateLsn && sk->startStreamingAt <= availableLsn); + + msg.tag = 'e'; + msg.term = propTerm; + msg.startStreamingAt = sk->startStreamingAt; + msg.termHistory = &propTermHistory; + msg.timelineStartLsn = timelineStartLsn; + + lastCommonTerm = i >= 0 ? propTermHistory.entries[i].term : 0; + elog(LOG, + "sending elected msg to node " UINT64_FORMAT " term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s, timelineStartLsn=%X/%X", + sk->greetResponse.nodeId, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port, LSN_FORMAT_ARGS(msg.timelineStartLsn)); + + resetStringInfo(&sk->outbuf); + pq_sendint64_le(&sk->outbuf, msg.tag); + pq_sendint64_le(&sk->outbuf, msg.term); + pq_sendint64_le(&sk->outbuf, msg.startStreamingAt); + pq_sendint32_le(&sk->outbuf, msg.termHistory->n_entries); + for (int i = 0; i < msg.termHistory->n_entries; i++) + { + pq_sendint64_le(&sk->outbuf, msg.termHistory->entries[i].term); + pq_sendint64_le(&sk->outbuf, msg.termHistory->entries[i].lsn); + } + pq_sendint64_le(&sk->outbuf, msg.timelineStartLsn); + + if (!AsyncWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_SEND_ELECTED_FLUSH)) + return; + + StartStreaming(sk); +} + +/* + * Start walsender streaming replication + */ +static void +WalProposerStartStreaming(XLogRecPtr startpos) +{ + StartReplicationCmd cmd; + + elog(LOG, "WAL proposer starts streaming at %X/%X", + LSN_FORMAT_ARGS(startpos)); + cmd.slotname = WAL_PROPOSER_SLOT_NAME; + cmd.timeline = greetRequest.timeline; + cmd.startpoint = startpos; + StartProposerReplication(&cmd); +} + +/* + * Start streaming to safekeeper sk, always updates state to SS_ACTIVE and sets + * correct event set. + */ +static void +StartStreaming(Safekeeper *sk) +{ + /* + * This is the only entrypoint to state SS_ACTIVE. It's executed + * exactly once for a connection. + */ + sk->state = SS_ACTIVE; + sk->streamingAt = sk->startStreamingAt; + + /* event set will be updated inside SendMessageToNode */ + SendMessageToNode(sk); +} + +/* + * Try to send message to the particular node. Always updates event set. Will + * send at least one message, if socket is ready. + * + * Can be used only for safekeepers in SS_ACTIVE state. State can be changed + * in case of errors. 
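For reference, the little-endian serialization above produces a fixed-size frame plus 16 bytes per term switch; a sketch of the resulting wire size (hypothetical helper, sizes taken from the pq_sendint*_le calls):

```c
#include <stddef.h>
#include <stdint.h>

/*
 * Wire size of a ProposerElected message as serialized above:
 * tag(8) + term(8) + startStreamingAt(8) + n_entries(4)
 * + n_entries * (term(8) + lsn(8)) + timelineStartLsn(8).
 */
static size_t
proposer_elected_wire_size(uint32_t n_entries)
{
	return 8 + 8 + 8 + 4 + (size_t) n_entries * 16 + 8;
}
```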
+ */ +static void +SendMessageToNode(Safekeeper *sk) +{ + Assert(sk->state == SS_ACTIVE); + + /* Note: we always send everything to the safekeeper until WOULDBLOCK or nothing left to send */ + HandleActiveState(sk, WL_SOCKET_WRITEABLE); +} + +/* + * Broadcast new message to all caught-up safekeepers + */ +static void +BroadcastAppendRequest() +{ + for (int i = 0; i < n_safekeepers; i++) + if (safekeeper[i].state == SS_ACTIVE) + SendMessageToNode(&safekeeper[i]); +} + +static void +PrepareAppendRequest(AppendRequestHeader *req, XLogRecPtr beginLsn, XLogRecPtr endLsn) +{ + Assert(endLsn >= beginLsn); + req->tag = 'a'; + req->term = propTerm; + req->epochStartLsn = propEpochStartLsn; + req->beginLsn = beginLsn; + req->endLsn = endLsn; + req->commitLsn = GetAcknowledgedByQuorumWALPosition(); + req->truncateLsn = truncateLsn; + req->proposerId = greetRequest.proposerId; +} + +/* + * Process all events happened in SS_ACTIVE state, update event set after that. + */ +static void +HandleActiveState(Safekeeper *sk, uint32 events) +{ + uint32 newEvents = WL_SOCKET_READABLE; + + if (events & WL_SOCKET_WRITEABLE) + if (!SendAppendRequests(sk)) + return; + + if (events & WL_SOCKET_READABLE) + if (!RecvAppendResponses(sk)) + return; + + /* + * We should wait for WL_SOCKET_WRITEABLE event if we have unflushed data + * in the buffer. + * + * LSN comparison checks if we have pending unsent messages. This check isn't + * necessary now, because we always send append messages immediately after + * arrival. But it's good to have it here in case we change this behavior + * in the future. + */ + if (sk->streamingAt != availableLsn || sk->flushWrite) + newEvents |= WL_SOCKET_WRITEABLE; + + UpdateEventSet(sk, newEvents); +} + +/* + * Send WAL messages starting from sk->streamingAt until the end or non-writable + * socket, whichever comes first. Caller should take care of updating event set. + * Even if no unsent WAL is available, at least one empty message will be sent + * as a heartbeat, if socket is ready. + * + * Can change state if Async* functions encounter errors and reset connection. + * Returns false in this case, true otherwise. + */ +static bool +SendAppendRequests(Safekeeper *sk) +{ + XLogRecPtr endLsn; + AppendRequestHeader *req; + PGAsyncWriteResult writeResult; + WALReadError errinfo; + bool sentAnything = false; + + if (sk->flushWrite) + { + if (!AsyncFlush(sk)) + /* + * AsyncFlush failed, that could happen if the socket is closed or + * we have nothing to write and should wait for writeable socket. 
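HandleActiveState recomputes the wait mask from scratch on every pass: read interest is permanent (feedback can arrive at any time), write interest only while there is unsent WAL or an unflushed buffer. Condensed as a sketch (WL_* values as in PostgreSQL's storage/latch.h):

```c
#include <stdbool.h>
#include <stdint.h>

#define WL_SOCKET_READABLE	(1 << 1)	/* values as in storage/latch.h */
#define WL_SOCKET_WRITEABLE (1 << 2)

/* Wait-event mask for an SS_ACTIVE safekeeper. */
static uint32_t
active_event_mask(uint64_t streaming_at, uint64_t available_lsn, bool flush_pending)
{
	uint32_t	events = WL_SOCKET_READABLE;	/* always watch for feedback */

	if (streaming_at != available_lsn || flush_pending)
		events |= WL_SOCKET_WRITEABLE;	/* unsent WAL or unflushed bytes */
	return events;
}
```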
+ */ + return sk->state == SS_ACTIVE; + + /* Event set will be updated in the end of HandleActiveState */ + sk->flushWrite = false; + } + + while (sk->streamingAt != availableLsn || !sentAnything) + { + sentAnything = true; + + endLsn = sk->streamingAt; + endLsn += MAX_SEND_SIZE; + + /* if we went beyond available WAL, back off */ + if (endLsn > availableLsn) { + endLsn = availableLsn; + } + + req = &sk->appendRequest; + PrepareAppendRequest(&sk->appendRequest, sk->streamingAt, endLsn); + + ereport(DEBUG2, + (errmsg("sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s", + req->endLsn - req->beginLsn, + LSN_FORMAT_ARGS(req->beginLsn), + LSN_FORMAT_ARGS(req->endLsn), + LSN_FORMAT_ARGS(req->commitLsn), + LSN_FORMAT_ARGS(truncateLsn), sk->host, sk->port))); + + resetStringInfo(&sk->outbuf); + + /* write AppendRequest header */ + appendBinaryStringInfo(&sk->outbuf, (char*) req, sizeof(AppendRequestHeader)); + + /* write the WAL itself */ + enlargeStringInfo(&sk->outbuf, req->endLsn - req->beginLsn); + if (!WALRead(sk->xlogreader, + &sk->outbuf.data[sk->outbuf.len], + req->beginLsn, + req->endLsn - req->beginLsn, + ThisTimeLineID, + &errinfo)) + { + WALReadRaiseError(&errinfo); + } + sk->outbuf.len += req->endLsn - req->beginLsn; + + writeResult = walprop_async_write(sk->conn, sk->outbuf.data, sk->outbuf.len); + + /* Mark current message as sent, whatever the result is */ + sk->streamingAt = endLsn; + + switch (writeResult) + { + case PG_ASYNC_WRITE_SUCCESS: + /* Continue writing the next message */ + break; + + case PG_ASYNC_WRITE_TRY_FLUSH: + /* + * We still need to call PQflush some more to finish the job. + * Caller function will handle this by setting right event set. + */ + sk->flushWrite = true; + return true; + + case PG_ASYNC_WRITE_FAIL: + elog(WARNING, "Failed to send to node %s:%s in %s state: %s", + sk->host, sk->port, FormatSafekeeperState(sk->state), + walprop_error_message(sk->conn)); + ShutdownConnection(sk); + return false; + default: + Assert(false); + return false; + } + } + + return true; +} + +/* + * Receive and process all available feedback. + * + * Can change state if Async* functions encounter errors and reset connection. + * Returns false in this case, true otherwise. + * + * NB: This function can call SendMessageToNode and produce new messages. + */ +static bool +RecvAppendResponses(Safekeeper *sk) +{ + XLogRecPtr minQuorumLsn; + bool readAnything = false; + + while (true) + { + /* + * If our reading doesn't immediately succeed, any + * necessary error handling or state setting is taken care + * of. We can leave any other work until later. + */ + sk->appendResponse.apm.tag = 'a'; + if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->appendResponse)) + break; + + ereport(DEBUG2, + (errmsg("received message term=" INT64_FORMAT " flushLsn=%X/%X commitLsn=%X/%X from %s:%s", + sk->appendResponse.term, + LSN_FORMAT_ARGS(sk->appendResponse.flushLsn), + LSN_FORMAT_ARGS(sk->appendResponse.commitLsn), + sk->host, sk->port))); + + if (sk->appendResponse.term > propTerm) + { + /* Another compute with higher term is running. */ + elog(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT "", + sk->host, sk->port, + sk->appendResponse.term, propTerm); + } + + readAnything = true; + } + + if (!readAnything) + return sk->state == SS_ACTIVE; + + HandleSafekeeperResponse(); + + /* + * Also send the new commit lsn to all the safekeepers. 
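SendAppendRequests slices the outstanding WAL into MAX_SEND_SIZE chunks; the bound computation isolated (constants mirroring walproposer.h, with the default 8 kB XLOG_BLCKSZ assumed):

```c
#include <stdint.h>

#define XLOG_BLCKSZ		8192				/* default WAL block size */
#define MAX_SEND_SIZE	(XLOG_BLCKSZ * 16)	/* as defined in walproposer.h */

/* End LSN of the next append chunk, clamped to the available WAL. */
static uint64_t
next_chunk_end(uint64_t streaming_at, uint64_t available_lsn)
{
	uint64_t	end = streaming_at + MAX_SEND_SIZE;

	return end > available_lsn ? available_lsn : end;
}
```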
+ */ + minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); + if (minQuorumLsn > lastSentCommitLsn) + { + BroadcastAppendRequest(); + lastSentCommitLsn = minQuorumLsn; + } + + return sk->state == SS_ACTIVE; +} + +/* Parse a ReplicationFeedback message, or the ReplicationFeedback part of an AppendResponse */ +void +ParseReplicationFeedbackMessage(StringInfo reply_message, ReplicationFeedback *rf) +{ + uint8 nkeys; + int i; + int32 len; + + /* get number of custom keys */ + nkeys = pq_getmsgbyte(reply_message); + + for (i = 0; i < nkeys; i++) + { + const char *key = pq_getmsgstring(reply_message); + if (strcmp(key, "current_timeline_size") == 0) + { + pq_getmsgint(reply_message, sizeof(int32)); // read value length + rf->currentClusterSize = pq_getmsgint64(reply_message); + elog(DEBUG2, "ParseReplicationFeedbackMessage: current_timeline_size %lu", + rf->currentClusterSize); + } + else if (strcmp(key, "ps_writelsn") == 0) + { + pq_getmsgint(reply_message, sizeof(int32)); // read value length + rf->ps_writelsn = pq_getmsgint64(reply_message); + elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_writelsn %X/%X", + LSN_FORMAT_ARGS(rf->ps_writelsn)); + } + else if (strcmp(key, "ps_flushlsn") == 0) + { + pq_getmsgint(reply_message, sizeof(int32)); // read value length + rf->ps_flushlsn = pq_getmsgint64(reply_message); + elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_flushlsn %X/%X", + LSN_FORMAT_ARGS(rf->ps_flushlsn)); + } + else if (strcmp(key, "ps_applylsn") == 0) + { + pq_getmsgint(reply_message, sizeof(int32)); // read value length + rf->ps_applylsn = pq_getmsgint64(reply_message); + elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_applylsn %X/%X", + LSN_FORMAT_ARGS(rf->ps_applylsn)); + } + else if (strcmp(key, "ps_replytime") == 0) + { + pq_getmsgint(reply_message, sizeof(int32)); // read value length + rf->ps_replytime = pq_getmsgint64(reply_message); + { + char *replyTimeStr; + + /* Copy because timestamptz_to_str returns a static buffer */ + replyTimeStr = pstrdup(timestamptz_to_str(rf->ps_replytime)); + elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_replytime %lu reply_time: %s", + rf->ps_replytime, replyTimeStr); + + pfree(replyTimeStr); + } + } + else + { + len = pq_getmsgint(reply_message, sizeof(int32)); // read value length + // Skip unknown keys to support backward compatibile protocol changes + elog(LOG, "ParseReplicationFeedbackMessage: unknown key: %s len %d", key, len); + pq_getmsgbytes(reply_message, len); + }; + } +} + +/* + * Combine hot standby feedbacks from all safekeepers. + */ +static void +CombineHotStanbyFeedbacks(HotStandbyFeedback * hs) +{ + hs->ts = 0; + hs->xmin.value = ~0; /* largest unsigned value */ + hs->catalog_xmin.value = ~0; /* largest unsigned value */ + + for (int i = 0; i < n_safekeepers; i++) + { + if (safekeeper[i].appendResponse.hs.ts != 0) + { + if (FullTransactionIdPrecedes(safekeeper[i].appendResponse.hs.xmin, hs->xmin)) + { + hs->xmin = safekeeper[i].appendResponse.hs.xmin; + hs->ts = safekeeper[i].appendResponse.hs.ts; + } + if (FullTransactionIdPrecedes(safekeeper[i].appendResponse.hs.catalog_xmin, hs->catalog_xmin)) + { + hs->catalog_xmin = safekeeper[i].appendResponse.hs.catalog_xmin; + hs->ts = safekeeper[i].appendResponse.hs.ts; + } + } + } +} + + +/* + * Get minimum of flushed LSNs of all safekeepers, which is the LSN of the + * last WAL record that can be safely discarded. + */ +static XLogRecPtr +CalculateMinFlushLsn(void) +{ + XLogRecPtr lsn = n_safekeepers > 0 + ? 
safekeeper[0].appendResponse.flushLsn + : InvalidXLogRecPtr; + for (int i = 1; i < n_safekeepers; i++) + { + lsn = Min(lsn, safekeeper[i].appendResponse.flushLsn); + } + return lsn; +} + +/* + * Calculate WAL position acknowledged by quorum + */ +static XLogRecPtr +GetAcknowledgedByQuorumWALPosition(void) +{ + XLogRecPtr responses[MAX_SAFEKEEPERS]; + + /* + * Sort acknowledged LSNs + */ + for (int i = 0; i < n_safekeepers; i++) + { + /* + * Like in Raft, we aren't allowed to commit entries from previous + * terms, so ignore reported LSN until it gets to epochStartLsn. + */ + responses[i] = safekeeper[i].appendResponse.flushLsn >= propEpochStartLsn ? + safekeeper[i].appendResponse.flushLsn : 0; + } + qsort(responses, n_safekeepers, sizeof(XLogRecPtr), CompareLsn); + + /* + * Get the smallest LSN committed by quorum + */ + return responses[n_safekeepers - quorum]; +} + +/* + * ReplicationFeedbackShmemSize --- report amount of shared memory space needed + */ +Size +WalproposerShmemSize(void) +{ + return sizeof(WalproposerShmemState); +} + +bool +WalproposerShmemInit(void) +{ + bool found; + + LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); + walprop_shared = ShmemInitStruct("Walproposer shared state", + sizeof(WalproposerShmemState), + &found); + + if (!found) + { + memset(walprop_shared, 0, WalproposerShmemSize()); + SpinLockInit(&walprop_shared->mutex); + } + LWLockRelease(AddinShmemInitLock); + + return found; +} + +void +replication_feedback_set(ReplicationFeedback *rf) +{ + SpinLockAcquire(&walprop_shared->mutex); + memcpy(&walprop_shared->feedback, rf, sizeof(ReplicationFeedback)); + SpinLockRelease(&walprop_shared->mutex); +} + + +void +replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn) +{ + SpinLockAcquire(&walprop_shared->mutex); + *writeLsn = walprop_shared->feedback.ps_writelsn; + *flushLsn = walprop_shared->feedback.ps_flushlsn; + *applyLsn = walprop_shared->feedback.ps_applylsn; + SpinLockRelease(&walprop_shared->mutex); +} + + +/* + * Get ReplicationFeedback fields from the most advanced safekeeper + */ +static void +GetLatestZentihFeedback(ReplicationFeedback *rf) +{ + int latest_safekeeper = 0; + XLogRecPtr ps_writelsn = InvalidXLogRecPtr; + for (int i = 0; i < n_safekeepers; i++) + { + if (safekeeper[i].appendResponse.rf.ps_writelsn > ps_writelsn) + { + latest_safekeeper = i; + ps_writelsn = safekeeper[i].appendResponse.rf.ps_writelsn; + } + } + + rf->currentClusterSize = safekeeper[latest_safekeeper].appendResponse.rf.currentClusterSize; + rf->ps_writelsn = safekeeper[latest_safekeeper].appendResponse.rf.ps_writelsn; + rf->ps_flushlsn = safekeeper[latest_safekeeper].appendResponse.rf.ps_flushlsn; + rf->ps_applylsn = safekeeper[latest_safekeeper].appendResponse.rf.ps_applylsn; + rf->ps_replytime = safekeeper[latest_safekeeper].appendResponse.rf.ps_replytime; + + elog(DEBUG2, "GetLatestZentihFeedback: currentClusterSize %lu," + " ps_writelsn %X/%X, ps_flushlsn %X/%X, ps_applylsn %X/%X, ps_replytime %lu", + rf->currentClusterSize, + LSN_FORMAT_ARGS(rf->ps_writelsn), + LSN_FORMAT_ARGS(rf->ps_flushlsn), + LSN_FORMAT_ARGS(rf->ps_applylsn), + rf->ps_replytime); + + replication_feedback_set(rf); +} + +static void +HandleSafekeeperResponse(void) +{ + HotStandbyFeedback hsFeedback; + XLogRecPtr minQuorumLsn; + XLogRecPtr diskConsistentLsn; + XLogRecPtr minFlushLsn; + + + minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); + diskConsistentLsn = quorumFeedback.rf.ps_flushlsn; + + if (!syncSafekeepers) + { + // Get ReplicationFeedback fields 
from the most advanced safekeeper + GetLatestZentihFeedback(&quorumFeedback.rf); + SetZenithCurrentClusterSize(quorumFeedback.rf.currentClusterSize); + } + + if (minQuorumLsn > quorumFeedback.flushLsn || diskConsistentLsn != quorumFeedback.rf.ps_flushlsn) + { + + if (minQuorumLsn > quorumFeedback.flushLsn) + quorumFeedback.flushLsn = minQuorumLsn; + + /* advance the replication slot */ + if (!syncSafekeepers) + ProcessStandbyReply( + // write_lsn - This is what durably stored in WAL service. + quorumFeedback.flushLsn, + //flush_lsn - This is what durably stored in WAL service. + quorumFeedback.flushLsn, + //apply_lsn - This is what processed and durably saved at pageserver. + quorumFeedback.rf.ps_flushlsn, + GetCurrentTimestamp(), false); + } + + CombineHotStanbyFeedbacks(&hsFeedback); + if (hsFeedback.ts != 0 && memcmp(&hsFeedback, &quorumFeedback.hs, sizeof hsFeedback) != 0) + { + quorumFeedback.hs = hsFeedback; + if (!syncSafekeepers) + ProcessStandbyHSFeedback(hsFeedback.ts, + XidFromFullTransactionId(hsFeedback.xmin), + EpochFromFullTransactionId(hsFeedback.xmin), + XidFromFullTransactionId(hsFeedback.catalog_xmin), + EpochFromFullTransactionId(hsFeedback.catalog_xmin)); + } + + /* + * Try to advance truncateLsn to minFlushLsn, which is the last record + * flushed to all safekeepers. We must always start streaming from the + * beginning of the record, which simplifies decoding on the far end. + * + * Advanced truncateLsn should be not further than nearest commitLsn. + * This prevents surprising violation of truncateLsn <= commitLsn + * invariant which might occur because 1) truncateLsn can be advanced + * immediately once chunk is broadcast to all safekeepers, and + * commitLsn generally can't be advanced based on feedback from + * safekeeper who is still in the previous epoch (similar to 'leader + * can't commit entries from previous term' in Raft); 2) chunks we + * read from WAL and send are plain sheets of bytes, but safekeepers + * ack only on record boundaries. + */ + minFlushLsn = CalculateMinFlushLsn(); + if (minFlushLsn > truncateLsn) + { + truncateLsn = minFlushLsn; + + /* + * Advance the replication slot to free up old WAL files. Note + * that slot doesn't exist if we are in syncSafekeepers mode. + */ + if (MyReplicationSlot) + PhysicalConfirmReceivedLocation(truncateLsn); + } + + /* + * Generally sync is done when majority switched the epoch so we committed + * epochStartLsn and made the majority aware of it, ensuring they are + * ready to give all WAL to pageserver. It would mean whichever majority + * is alive, there will be at least one safekeeper who is able to stream + * WAL to pageserver to make basebackup possible. However, since at the + * moment we don't have any good mechanism of defining the healthy and + * most advanced safekeeper who should push the wal into pageserver and + * basically the random one gets connected, to prevent hanging basebackup + * (due to pageserver connecting to not-synced-safekeeper) we currently + * wait for all seemingly alive safekeepers to get synced. + */ + if (syncSafekeepers) + { + int n_synced; + + n_synced = 0; + for (int i = 0; i < n_safekeepers; i++) + { + Safekeeper *sk = &safekeeper[i]; + bool synced = sk->appendResponse.commitLsn >= propEpochStartLsn; + + /* alive safekeeper which is not synced yet; wait for it */ + if (sk->state != SS_OFFLINE && !synced) + return; + if (synced) + n_synced++; + } + if (n_synced >= quorum) + { + /* All safekeepers synced! 
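GetAcknowledgedByQuorumWALPosition above is the classic sorted-ack trick: after sorting ascending, the element at index n - quorum is acknowledged by at least quorum nodes (the real code additionally zeroes out LSNs below epochStartLsn, per the Raft rule against committing entries from previous terms). A standalone sketch:

```c
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

static int
cmp_u64(const void *a, const void *b)
{
	uint64_t	x = *(const uint64_t *) a;
	uint64_t	y = *(const uint64_t *) b;

	return (x > y) - (x < y);
}

/* LSN acknowledged by at least `quorum` of `n` safekeepers (n <= 32). */
static uint64_t
quorum_ack_lsn(const uint64_t *acked, int n, int quorum)
{
	uint64_t	sorted[32];		/* MAX_SAFEKEEPERS in the real code */

	memcpy(sorted, acked, n * sizeof(uint64_t));
	qsort(sorted, n, sizeof(uint64_t), cmp_u64);
	return sorted[n - quorum];	/* at least `quorum` entries are >= this */
}
```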
*/ + fprintf(stdout, "%X/%X\n", LSN_FORMAT_ARGS(propEpochStartLsn)); + exit(0); + } + } +} + +/* + * Try to read CopyData message from i'th safekeeper, resetting connection on + * failure. + */ +static bool +AsyncRead(Safekeeper *sk, char **buf, int *buf_size) +{ + switch (walprop_async_read(sk->conn, buf, buf_size)) + { + case PG_ASYNC_READ_SUCCESS: + return true; + + case PG_ASYNC_READ_TRY_AGAIN: + /* WL_SOCKET_READABLE is always set during copyboth */ + return false; + + case PG_ASYNC_READ_FAIL: + elog(WARNING, "Failed to read from node %s:%s in %s state: %s", sk->host, + sk->port, FormatSafekeeperState(sk->state), + walprop_error_message(sk->conn)); + ShutdownConnection(sk); + return false; + } + Assert(false); + return false; +} + +/* + * Read next message with known type into provided struct, by reading a CopyData + * block from the safekeeper's postgres connection, returning whether the read + * was successful. + * + * If the read needs more polling, we return 'false' and keep the state + * unmodified, waiting until it becomes read-ready to try again. If it fully + * failed, a warning is emitted and the connection is reset. + */ +static bool +AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg) +{ + char *buf; + int buf_size; + uint64 tag; + StringInfoData s; + + if (!(AsyncRead(sk, &buf, &buf_size))) + return false; + + /* parse it */ + s.data = buf; + s.len = buf_size; + s.cursor = 0; + + tag = pq_getmsgint64_le(&s); + if (tag != anymsg->tag) + { + elog(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host, + sk->port, FormatSafekeeperState(sk->state)); + ResetConnection(sk); + return false; + } + + switch (tag) + { + case 'g': + { + AcceptorGreeting *msg = (AcceptorGreeting *) anymsg; + msg->term = pq_getmsgint64_le(&s); + msg->nodeId = pq_getmsgint64_le(&s); + pq_getmsgend(&s); + return true; + } + + case 'v': + { + VoteResponse *msg = (VoteResponse *) anymsg; + + msg->term = pq_getmsgint64_le(&s); + msg->voteGiven = pq_getmsgint64_le(&s); + msg->flushLsn = pq_getmsgint64_le(&s); + msg->truncateLsn = pq_getmsgint64_le(&s); + msg->termHistory.n_entries = pq_getmsgint32_le(&s); + msg->termHistory.entries = palloc(sizeof(TermSwitchEntry) * msg->termHistory.n_entries); + for (int i = 0; i < msg->termHistory.n_entries; i++) + { + msg->termHistory.entries[i].term = pq_getmsgint64_le(&s); + msg->termHistory.entries[i].lsn = pq_getmsgint64_le(&s); + } + msg->timelineStartLsn = pq_getmsgint64_le(&s); + pq_getmsgend(&s); + return true; + } + + case 'a': + { + AppendResponse *msg = (AppendResponse *) anymsg; + msg->term = pq_getmsgint64_le(&s); + msg->flushLsn = pq_getmsgint64_le(&s); + msg->commitLsn = pq_getmsgint64_le(&s); + msg->hs.ts = pq_getmsgint64_le(&s); + msg->hs.xmin.value = pq_getmsgint64_le(&s); + msg->hs.catalog_xmin.value = pq_getmsgint64_le(&s); + if (buf_size > APPENDRESPONSE_FIXEDPART_SIZE) + ParseReplicationFeedbackMessage(&s, &msg->rf); + pq_getmsgend(&s); + return true; + } + + default: + { + Assert(false); + return false; + } + } +} + +/* + * Blocking equivalent to AsyncWrite. + * + * We use this everywhere messages are small enough that they should fit in a + * single packet. 
+ */ +static bool +BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState success_state) +{ + uint32 events; + + if (!walprop_blocking_write(sk->conn, msg, msg_size)) + { + elog(WARNING, "Failed to send to node %s:%s in %s state: %s", + sk->host, sk->port, FormatSafekeeperState(sk->state), + walprop_error_message(sk->conn)); + ShutdownConnection(sk); + return false; + } + + sk->state = success_state; + + /* + * If the new state will be waiting for events to happen, update the event + * set to wait for those + */ + events = SafekeeperStateDesiredEvents(success_state); + if (events) + UpdateEventSet(sk, events); + + return true; +} + +/* + * Starts a write into the 'i'th safekeeper's postgres connection, moving to + * flush_state (adjusting eventset) if write still needs flushing. + * + * Returns false if sending is unfinished (requires flushing or conn failed). + * Upon failure, a warning is emitted and the connection is reset. + */ +static bool +AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_state) +{ + switch (walprop_async_write(sk->conn, msg, msg_size)) + { + case PG_ASYNC_WRITE_SUCCESS: + return true; + case PG_ASYNC_WRITE_TRY_FLUSH: + + /* + * We still need to call PQflush some more to finish the job; go + * to the appropriate state. Update the event set at the bottom of + * this function + */ + sk->state = flush_state; + UpdateEventSet(sk, WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE); + return false; + case PG_ASYNC_WRITE_FAIL: + elog(WARNING, "Failed to send to node %s:%s in %s state: %s", + sk->host, sk->port, FormatSafekeeperState(sk->state), + walprop_error_message(sk->conn)); + ShutdownConnection(sk); + return false; + default: + Assert(false); + return false; + } +} + +/* + * Flushes a previous call to AsyncWrite. This only needs to be called when the + * socket becomes read or write ready *after* calling AsyncWrite. + * + * If flushing successfully completes returns true, otherwise false. Event set + * is updated only if connection fails, otherwise caller should manually unset + * WL_SOCKET_WRITEABLE. + */ +static bool +AsyncFlush(Safekeeper *sk) +{ + /*--- + * PQflush returns: + * 0 if successful [we're good to move on] + * 1 if unable to send everything yet [call PQflush again] + * -1 if it failed [emit an error] + */ + switch (walprop_flush(sk->conn)) + { + case 0: + /* flush is done */ + return true; + case 1: + /* Nothing to do; try again when the socket's ready */ + return false; + case -1: + elog(WARNING, "Failed to flush write to node %s:%s in %s state: %s", + sk->host, sk->port, FormatSafekeeperState(sk->state), + walprop_error_message(sk->conn)); + ResetConnection(sk); + return false; + default: + Assert(false); + return false; + } +} + +// Check if we need to suspend inserts because of lagging replication. 
+static uint64 +backpressure_lag_impl(void) +{ + if (max_replication_apply_lag > 0 || max_replication_flush_lag > 0 || max_replication_write_lag > 0) + { + XLogRecPtr writePtr; + XLogRecPtr flushPtr; + XLogRecPtr applyPtr; + XLogRecPtr myFlushLsn = GetFlushRecPtr(); + + replication_feedback_get_lsns(&writePtr, &flushPtr, &applyPtr); +#define MB ((XLogRecPtr)1024*1024) + + elog(DEBUG2, "current flushLsn %X/%X ReplicationFeedback: write %X/%X flush %X/%X apply %X/%X", + LSN_FORMAT_ARGS(myFlushLsn), + LSN_FORMAT_ARGS(writePtr), + LSN_FORMAT_ARGS(flushPtr), + LSN_FORMAT_ARGS(applyPtr)); + + if ((writePtr != InvalidXLogRecPtr + && max_replication_write_lag > 0 + && myFlushLsn > writePtr + max_replication_write_lag*MB)) + { + return (myFlushLsn - writePtr - max_replication_write_lag*MB); + } + + if ((flushPtr != InvalidXLogRecPtr + && max_replication_flush_lag > 0 + && myFlushLsn > flushPtr + max_replication_flush_lag*MB)) + { + return (myFlushLsn - flushPtr - max_replication_flush_lag*MB); + } + + if ((applyPtr != InvalidXLogRecPtr + && max_replication_apply_lag > 0 + && myFlushLsn > applyPtr + max_replication_apply_lag*MB)) + { + return (myFlushLsn - applyPtr - max_replication_apply_lag*MB); + } + } + return 0; +} diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h new file mode 100644 index 0000000000..b684d5264f --- /dev/null +++ b/pgxn/neon/walproposer.h @@ -0,0 +1,540 @@ +#ifndef __NEON_WALPROPOSER_H__ +#define __NEON_WALPROPOSER_H__ + +#include "access/xlogdefs.h" +#include "postgres.h" +#include "port.h" +#include "access/xlog_internal.h" +#include "access/transam.h" +#include "nodes/replnodes.h" +#include "utils/uuid.h" +#include "replication/walreceiver.h" + +#define SK_MAGIC 0xCafeCeefu +#define SK_PROTOCOL_VERSION 2 + +#define MAX_SAFEKEEPERS 32 +#define MAX_SEND_SIZE (XLOG_BLCKSZ * 16) /* max size of a single WAL message */ +#define XLOG_HDR_SIZE (1+8*3) /* 'w' + startPos + walEnd + timestamp */ +#define XLOG_HDR_START_POS 1 /* offset of start position in wal sender message header */ +#define XLOG_HDR_END_POS (1+8) /* offset of end position in wal sender message header */ + +/* + * In the spirit of WL_SOCKET_READABLE and others, this corresponds to no events having occured, + * because all WL_* events are given flags equal to some (1 << i), starting from i = 0 + */ +#define WL_NO_EVENTS 0 + +extern char* wal_acceptors_list; +extern int wal_acceptor_reconnect_timeout; +extern int wal_acceptor_connect_timeout; +extern bool am_wal_proposer; + +struct WalProposerConn; /* Defined in libpqwalproposer */ +typedef struct WalProposerConn WalProposerConn; + +struct WalMessage; +typedef struct WalMessage WalMessage; + +extern char *zenith_timeline_walproposer; +extern char *zenith_tenant_walproposer; + +/* Possible return values from ReadPGAsync */ +typedef enum +{ + /* The full read was successful. buf now points to the data */ + PG_ASYNC_READ_SUCCESS, + /* The read is ongoing. Wait until the connection is read-ready, then try + * again. */ + PG_ASYNC_READ_TRY_AGAIN, + /* Reading failed. Check PQerrorMessage(conn) */ + PG_ASYNC_READ_FAIL, +} PGAsyncReadResult; + +/* Possible return values from WritePGAsync */ +typedef enum +{ + /* The write fully completed */ + PG_ASYNC_WRITE_SUCCESS, + /* The write started, but you'll need to call PQflush some more times + * to finish it off. We just tried, so it's best to wait until the + * connection is read- or write-ready to try again. + * + * If it becomes read-ready, call PQconsumeInput and flush again. 
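backpressure_lag_impl above returns how far the local flush LSN has run ahead of a feedback LSN beyond the configured window (the GUCs are in megabytes, the result in bytes); the per-limit check isolated as a sketch:

```c
#include <stdint.h>

#define MB	((uint64_t) 1024 * 1024)

/* Bytes by which my_flush_lsn exceeds remote_lsn + limit_mb MB; 0 if within. */
static uint64_t
lag_over_limit(uint64_t my_flush_lsn, uint64_t remote_lsn, int limit_mb)
{
	if (remote_lsn == 0 || limit_mb <= 0)
		return 0;				/* no feedback yet, or the check is disabled */
	if (my_flush_lsn > remote_lsn + (uint64_t) limit_mb * MB)
		return my_flush_lsn - remote_lsn - (uint64_t) limit_mb * MB;
	return 0;
}
```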
If it + * becomes write-ready, just call PQflush. + */ + PG_ASYNC_WRITE_TRY_FLUSH, + /* Writing failed. Check PQerrorMessage(conn) */ + PG_ASYNC_WRITE_FAIL, +} PGAsyncWriteResult; + +/* + * WAL safekeeper state, which is used to wait for some event. + * + * States are listed here in the order that they're executed. + * + * Most states, upon failure, will move back to SS_OFFLINE by calls to + * ResetConnection or ShutdownConnection. + */ +typedef enum +{ + /* + * Does not have an active connection and will stay that way until + * further notice. + * + * Moves to SS_CONNECTING_WRITE by calls to ResetConnection. + */ + SS_OFFLINE, + + /* + * Connecting states. "_READ" waits for the socket to be available for + * reading, "_WRITE" waits for writing. There's no difference in the code + * they execute when polled, but we have this distinction in order to + * recreate the event set in HackyRemoveWalProposerEvent. + * + * After the connection is made, "START_WAL_PUSH" query is sent. + */ + SS_CONNECTING_WRITE, + SS_CONNECTING_READ, + + /* + * Waiting for the result of the "START_WAL_PUSH" command. + * + * After we get a successful result, sends handshake to safekeeper. + */ + SS_WAIT_EXEC_RESULT, + + /* + * Executing the receiving half of the handshake. After receiving, moves to + * SS_VOTING. + */ + SS_HANDSHAKE_RECV, + + /* + * Waiting to participate in voting, but a quorum hasn't yet been reached. + * This is an idle state - we do not expect AdvancePollState to be called. + * + * Moved externally by execution of SS_HANDSHAKE_RECV, when we received a + * quorum of handshakes. + */ + SS_VOTING, + + /* + * Already sent voting information, waiting to receive confirmation from the + * node. After receiving, moves to SS_IDLE, if the quorum isn't reached yet. + */ + SS_WAIT_VERDICT, + + /* Need to flush ProposerElected message. */ + SS_SEND_ELECTED_FLUSH, + + /* + * Waiting for quorum to send WAL. Idle state. If the socket becomes + * read-ready, the connection has been closed. + * + * Moves to SS_ACTIVE only by call to StartStreaming. + */ + SS_IDLE, + + /* + * Active phase, when we acquired quorum and have WAL to send or feedback + * to read. + */ + SS_ACTIVE, +} SafekeeperState; + +/* Consensus logical timestamp. */ +typedef uint64 term_t; + +/* neon storage node id */ +typedef uint64 NNodeId; + +/* + * Proposer <-> Acceptor messaging. + */ + +/* Initial Proposer -> Acceptor message */ +typedef struct ProposerGreeting +{ + uint64 tag; /* message tag */ + uint32 protocolVersion; /* proposer-safekeeper protocol version */ + uint32 pgVersion; + pg_uuid_t proposerId; + uint64 systemId; /* Postgres system identifier */ + uint8 ztimelineid[16]; /* Zenith timeline id */ + uint8 ztenantid[16]; + TimeLineID timeline; + uint32 walSegSize; +} ProposerGreeting; + +typedef struct AcceptorProposerMessage +{ + uint64 tag; +} AcceptorProposerMessage; + +/* + * Acceptor -> Proposer initial response: the highest term acceptor voted for. + */ +typedef struct AcceptorGreeting +{ + AcceptorProposerMessage apm; + term_t term; + NNodeId nodeId; +} AcceptorGreeting; + +/* + * Proposer -> Acceptor vote request. + */ +typedef struct VoteRequest +{ + uint64 tag; + term_t term; + pg_uuid_t proposerId; /* for monitoring/debugging */ +} VoteRequest; + +/* Element of term switching chain. 
+
+/* Element of term switching chain. */
+typedef struct TermSwitchEntry
+{
+    term_t      term;
+    XLogRecPtr  lsn;
+} TermSwitchEntry;
+
+typedef struct TermHistory
+{
+    uint32      n_entries;
+    TermSwitchEntry *entries;
+} TermHistory;
+
+/* Vote itself, sent from safekeeper to proposer */
+typedef struct VoteResponse {
+    AcceptorProposerMessage apm;
+    term_t      term;
+    uint64      voteGiven;
+    /*
+     * Safekeeper flush_lsn (end of WAL) + history of term switches allow
+     * proposer to choose the most advanced one.
+     */
+    XLogRecPtr  flushLsn;
+    XLogRecPtr  truncateLsn;    /* minimal LSN which may be needed for recovery of some safekeeper */
+    TermHistory termHistory;
+    XLogRecPtr  timelineStartLsn;   /* timeline globally starts at this LSN */
+} VoteResponse;
+
+/*
+ * Proposer -> Acceptor message announcing proposer is elected and communicating
+ * epoch history to it.
+ */
+typedef struct ProposerElected
+{
+    uint64      tag;
+    term_t      term;
+    /* proposer will send since this point */
+    XLogRecPtr  startStreamingAt;
+    /* history of term switches up to this proposer */
+    TermHistory *termHistory;
+    /* timeline globally starts at this LSN */
+    XLogRecPtr  timelineStartLsn;
+} ProposerElected;
+
+/*
+ * Header of request with WAL message sent from proposer to safekeeper.
+ */
+typedef struct AppendRequestHeader
+{
+    uint64      tag;
+    term_t      term;           /* term of the proposer */
+    /*
+     * LSN since which current proposer appends WAL (begin_lsn of its first
+     * record); determines epoch switch point.
+     */
+    XLogRecPtr  epochStartLsn;
+    XLogRecPtr  beginLsn;       /* start position of message in WAL */
+    XLogRecPtr  endLsn;         /* end position of message in WAL */
+    XLogRecPtr  commitLsn;      /* LSN committed by quorum of safekeepers */
+    /*
+     * minimal LSN which may be needed for recovery of some safekeeper (end
+     * lsn + 1 of last chunk streamed to everyone)
+     */
+    XLogRecPtr  truncateLsn;
+    pg_uuid_t   proposerId;     /* for monitoring/debugging */
+} AppendRequestHeader;
+
+/*
+ * Hot standby feedback received from replica
+ */
+typedef struct HotStandbyFeedback
+{
+    TimestampTz ts;
+    FullTransactionId xmin;
+    FullTransactionId catalog_xmin;
+} HotStandbyFeedback;
+
+
+typedef struct ReplicationFeedback
+{
+    // current size of the timeline on pageserver
+    uint64      currentClusterSize;
+    // standby_status_update fields that safekeeper received from pageserver
+    XLogRecPtr  ps_writelsn;
+    XLogRecPtr  ps_flushlsn;
+    XLogRecPtr  ps_applylsn;
+    TimestampTz ps_replytime;
+} ReplicationFeedback;
+
+
+typedef struct WalproposerShmemState
+{
+    slock_t     mutex;
+    ReplicationFeedback feedback;
+    term_t      mineLastElectedTerm;
+} WalproposerShmemState;
+
+/*
+ * Report safekeeper state to proposer
+ */
+typedef struct AppendResponse
+{
+    AcceptorProposerMessage apm;
+    /*
+     * Current term of the safekeeper; if it is higher than proposer's, the
+     * compute is out of date.
+     */
+    term_t      term;
+    // TODO: add comment
+    XLogRecPtr  flushLsn;
+    // Safekeeper reports back its awareness about which WAL is committed, as
+    // this is a criterion for walproposer --sync mode exit
+    XLogRecPtr  commitLsn;
+    HotStandbyFeedback hs;
+    // Feedback received from pageserver includes standby_status_update fields
+    // and custom zenith feedback.
+    // This part of the message is extensible.
+    ReplicationFeedback rf;
+} AppendResponse;
+
+// ReplicationFeedback is the extensible part of the message, parsed separately;
+// the other fields are the fixed part.
+#define APPENDRESPONSE_FIXEDPART_SIZE offsetof(AppendResponse, rf)
+
+
+/*
+ * Descriptor of safekeeper
+ */
+typedef struct Safekeeper
+{
+    char const* host;
+    char const* port;
+    char        conninfo[MAXCONNINFO];  /* connection info for connecting/reconnecting */
+
+    /*
+     * postgres protocol connection to the WAL acceptor
+     *
+     * Equals NULL only when state = SS_OFFLINE. Nonblocking is set once we
+     * reach SS_ACTIVE; not before.
+     */
+    WalProposerConn* conn;
+    /*
+     * Temporary buffer for the message being sent to the safekeeper.
+     */
+    StringInfoData outbuf;
+    /*
+     * WAL reader, allocated for each safekeeper.
+     */
+    XLogReaderState* xlogreader;
+
+    /*
+     * Streaming will start here; must be record boundary.
+     */
+    XLogRecPtr  startStreamingAt;
+
+    bool        flushWrite;     /* set to true if we need to call AsyncFlush, to flush pending messages */
+    XLogRecPtr  streamingAt;    /* current streaming position */
+    AppendRequestHeader appendRequest;  /* request for sending to safekeeper */
+
+    int         eventPos;       /* position in wait event set. Equal to -1 if no event */
+    SafekeeperState state;      /* safekeeper state machine state */
+    TimestampTz startedConnAt;  /* when connection attempt started */
+    AcceptorGreeting greetResponse; /* acceptor greeting */
+    VoteResponse voteResponse;  /* the vote */
+    AppendResponse appendResponse;  /* feedback for master */
+} Safekeeper;
+
+
+extern PGDLLIMPORT void WalProposerMain(Datum main_arg);
+void WalProposerBroadcast(XLogRecPtr startpos, XLogRecPtr endpos);
+void WalProposerPoll(void);
+void WalProposerRegister(void);
+void ParseReplicationFeedbackMessage(StringInfo reply_message,
+                                     ReplicationFeedback *rf);
+extern void StartProposerReplication(StartReplicationCmd *cmd);
+
+Size WalproposerShmemSize(void);
+bool WalproposerShmemInit(void);
+void replication_feedback_set(ReplicationFeedback *rf);
+void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn);
+
+/* libpqwalproposer hooks & helper type */
+
+/* Re-exported PostgresPollingStatusType */
+typedef enum
+{
+    WP_CONN_POLLING_FAILED = 0,
+    WP_CONN_POLLING_READING,
+    WP_CONN_POLLING_WRITING,
+    WP_CONN_POLLING_OK,
+    /*
+     * 'libpq-fe.h' still has PGRES_POLLING_ACTIVE, but says it's unused.
+     * We've removed it here to avoid clutter.
+     */
+} WalProposerConnectPollStatusType;
+
+/* Re-exported and modified ExecStatusType */
+typedef enum
+{
+    /* We received a single CopyBoth result */
+    WP_EXEC_SUCCESS_COPYBOTH,
+    /*
+     * Any success result other than a single CopyBoth was received. The
+     * specifics of the result were already logged, but it may be useful to
+     * provide an error message indicating which safekeeper messed up.
+     *
+     * Do not expect PQerrorMessage to be appropriately set.
+     */
+    WP_EXEC_UNEXPECTED_SUCCESS,
+    /*
+     * No result available at this time. Wait until read-ready, then call
+     * again. Internally, this is returned when PQisBusy indicates that
+     * PQgetResult would block.
+     */
+    WP_EXEC_NEEDS_INPUT,
+    /* Catch-all failure. Check PQerrorMessage. */
+    WP_EXEC_FAILED,
+} WalProposerExecStatusType;
+
+/* Re-exported ConnStatusType */
+typedef enum
+{
+    WP_CONNECTION_OK,
+    WP_CONNECTION_BAD,
+
+    /*
+     * The original ConnStatusType has many more tags, but requests that
+     * they not be relied upon (except for displaying to the user). We
+     * don't need that extra functionality, so we collect them into a
+     * single tag here.
+     */
+    WP_CONNECTION_IN_PROGRESS,
+} WalProposerConnStatusType;
+
+/* Re-exported PQerrorMessage */
+typedef char* (*walprop_error_message_fn) (WalProposerConn* conn);
+
+/* Re-exported PQstatus */
+typedef WalProposerConnStatusType (*walprop_status_fn) (WalProposerConn* conn);
+
+/* Re-exported PQconnectStart */
+typedef WalProposerConn* (*walprop_connect_start_fn) (char* conninfo);
+
+/* Re-exported PQconnectPoll */
+typedef WalProposerConnectPollStatusType (*walprop_connect_poll_fn) (WalProposerConn* conn);
+
+/* Blocking wrapper around PQsendQuery */
+typedef bool (*walprop_send_query_fn) (WalProposerConn* conn, char* query);
+
+/* Wrapper around PQconsumeInput + PQisBusy + PQgetResult */
+typedef WalProposerExecStatusType (*walprop_get_query_result_fn) (WalProposerConn* conn);
+
+/* Re-exported PQsocket */
+typedef pgsocket (*walprop_socket_fn) (WalProposerConn* conn);
+
+/* Wrapper around PQconsumeInput (if socket's read-ready) + PQflush */
+typedef int (*walprop_flush_fn) (WalProposerConn* conn);
+
+/* Re-exported PQfinish */
+typedef void (*walprop_finish_fn) (WalProposerConn* conn);
+
+/*
+ * Ergonomic wrapper around PQgetCopyData
+ *
+ * Reads a CopyData block from a safekeeper, setting *amount to the number
+ * of bytes returned.
+ *
+ * This function is allowed to assume certain properties specific to the
+ * protocol with the safekeepers, so it should not be used as-is for any
+ * other purpose.
+ *
+ * Note: where possible, a wrapper around this function is generally
+ * preferred, because it performs a bit of extra checking work that's
+ * always required and is normally somewhat verbose.
+ */
+typedef PGAsyncReadResult (*walprop_async_read_fn) (WalProposerConn* conn,
+                                                    char** buf,
+                                                    int* amount);
+
+/*
+ * Ergonomic wrapper around PQputCopyData + PQflush
+ *
+ * Starts to write a CopyData block to a safekeeper.
+ *
+ * For information on the meaning of return codes, refer to PGAsyncWriteResult.
+ */
+typedef PGAsyncWriteResult (*walprop_async_write_fn) (WalProposerConn* conn,
+                                                      void const* buf,
+                                                      size_t size);
+
+/*
+ * Blocking equivalent to walprop_async_write_fn
+ *
+ * Returns 'true' if successful, 'false' on failure.
+ */
+typedef bool (*walprop_blocking_write_fn) (WalProposerConn* conn, void const* buf, size_t size);
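(Editor's illustration, not part of the patch: the function table defined just below is filled in by libpqwalproposer at load time. A minimal sketch of how its initializer might publish the table; the *_impl statics are hypothetical names for the callback implementations:)

    /* Hypothetical callback implementations inside libpqwalproposer.c */
    static char *walprop_error_message_impl(WalProposerConn *conn);
    static WalProposerConnStatusType walprop_status_impl(WalProposerConn *conn);
    /* ... one static per callback ... */

    static WalProposerFunctionsType PQWalProposerAPI = {
        .walprop_error_message = walprop_error_message_impl,
        .walprop_status = walprop_status_impl,
        /* ... remaining callbacks filled in the same way ... */
    };

    void
    _PG_init(void)
    {
        if (WalProposerFunctions != NULL)
            elog(ERROR, "libpqwalproposer already loaded");
        WalProposerFunctions = &PQWalProposerAPI;
    }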
+
+/* All libpqwalproposer exported functions collected together. */
+typedef struct WalProposerFunctionsType
+{
+    walprop_error_message_fn    walprop_error_message;
+    walprop_status_fn           walprop_status;
+    walprop_connect_start_fn    walprop_connect_start;
+    walprop_connect_poll_fn     walprop_connect_poll;
+    walprop_send_query_fn       walprop_send_query;
+    walprop_get_query_result_fn walprop_get_query_result;
+    walprop_socket_fn           walprop_socket;
+    walprop_flush_fn            walprop_flush;
+    walprop_finish_fn           walprop_finish;
+    walprop_async_read_fn       walprop_async_read;
+    walprop_async_write_fn      walprop_async_write;
+    walprop_blocking_write_fn   walprop_blocking_write;
+} WalProposerFunctionsType;
+
+/* Allow the above functions to be "called" with normal syntax */
+#define walprop_error_message(conn) \
+    WalProposerFunctions->walprop_error_message(conn)
+#define walprop_status(conn) \
+    WalProposerFunctions->walprop_status(conn)
+#define walprop_connect_start(conninfo) \
+    WalProposerFunctions->walprop_connect_start(conninfo)
+#define walprop_connect_poll(conn) \
+    WalProposerFunctions->walprop_connect_poll(conn)
+#define walprop_send_query(conn, query) \
+    WalProposerFunctions->walprop_send_query(conn, query)
+#define walprop_get_query_result(conn) \
+    WalProposerFunctions->walprop_get_query_result(conn)
+#define walprop_socket(conn) \
+    WalProposerFunctions->walprop_socket(conn)
+#define walprop_flush(conn) \
+    WalProposerFunctions->walprop_flush(conn)
+#define walprop_finish(conn) \
+    WalProposerFunctions->walprop_finish(conn)
+#define walprop_async_read(conn, buf, amount) \
+    WalProposerFunctions->walprop_async_read(conn, buf, amount)
+#define walprop_async_write(conn, buf, size) \
+    WalProposerFunctions->walprop_async_write(conn, buf, size)
+#define walprop_blocking_write(conn, buf, size) \
+    WalProposerFunctions->walprop_blocking_write(conn, buf, size)
+
+/*
+ * The runtime location of the libpqwalproposer functions.
+ *
+ * This pointer is set by the initializer in libpqwalproposer, so that we
+ * can use it later.
+ */
+extern PGDLLIMPORT WalProposerFunctionsType *WalProposerFunctions;
+
+#endif                          /* __NEON_WALPROPOSER_H__ */
diff --git a/pgxn/neon/walproposer_utils.c b/pgxn/neon/walproposer_utils.c
new file mode 100644
index 0000000000..7b96fd580c
--- /dev/null
+++ b/pgxn/neon/walproposer_utils.c
@@ -0,0 +1,1110 @@
+#include "postgres.h"
+
+#include "access/timeline.h"
+#include "access/xlogutils.h"
+#include "common/logging.h"
+#include "common/ip.h"
+#include "funcapi.h"
+#include "libpq/libpq.h"
+#include "libpq/pqformat.h"
+#include "miscadmin.h"
+#include "postmaster/interrupt.h"
+#include "replication/slot.h"
+#include "walproposer_utils.h"
+#include "replication/walsender_private.h"
+
+#include "storage/ipc.h"
+#include "utils/builtins.h"
+#include "utils/ps_status.h"
+
+#include "libpq-fe.h"
+#include <netinet/tcp.h>
+#include <unistd.h>
+
+/*
+ * These variables are used similarly to openLogFile/SegNo,
+ * but for walproposer to write the XLOG during recovery. walpropFileTLI is
+ * the TimeLineID corresponding to the filename of walpropFile.
+ */
+static int  walpropFile = -1;
+static TimeLineID walpropFileTLI = 0;
+static XLogSegNo walpropSegNo = 0;
+
+/* START cloned file-local variables and functions from walsender.c */
+
+/*
+ * xlogreader used for replication. Note that a WAL sender doing physical
+ * replication does not need xlogreader to read WAL, but it needs one to
+ * keep a state of its work.
+ */
+static XLogReaderState *xlogreader = NULL;
+
+/*
+ * These variables keep track of the state of the timeline we're currently
+ * sending. sendTimeLine identifies the timeline. If sendTimeLineIsHistoric,
+ * the timeline is not the latest timeline on this server, and the server's
+ * history forked off from that timeline at sendTimeLineValidUpto.
+ */
+static TimeLineID sendTimeLine = 0;
+static TimeLineID sendTimeLineNextTLI = 0;
+static bool sendTimeLineIsHistoric = false;
+static XLogRecPtr sendTimeLineValidUpto = InvalidXLogRecPtr;
+
+/*
+ * Timestamp of last ProcessRepliesIfAny() that saw a reply from the
+ * standby. Set to 0 if wal_sender_timeout doesn't need to be active.
+ */
+static TimestampTz last_reply_timestamp = 0;
+
+/* Have we sent a heartbeat message asking for reply, since last reply? */
+static bool waiting_for_ping_response = false;
+
+static bool streamingDoneSending;
+static bool streamingDoneReceiving;
+
+/* Are we there yet? */
+static bool WalSndCaughtUp = false;
+
+/* Flags set by signal handlers for later service in main loop */
+static volatile sig_atomic_t got_STOPPING = false;
+
+/*
+ * How far have we sent WAL already? This is also advertised in
+ * MyWalSnd->sentPtr. (Actually, this is the next WAL location to send.)
+ */
+static XLogRecPtr sentPtr = InvalidXLogRecPtr;
+
+/*
+ * This is set while we are streaming. When not set
+ * PROCSIG_WALSND_INIT_STOPPING signal will be handled like SIGTERM. When set,
+ * the main loop is responsible for checking got_STOPPING and terminating when
+ * it's set (after streaming any remaining WAL).
+ */
+static volatile sig_atomic_t replication_active = false;
+
+typedef void (*WalSndSendDataCallback) (void);
+static void WalSndLoop(WalSndSendDataCallback send_data);
+static void XLogSendPhysical(void);
+static XLogRecPtr GetStandbyFlushRecPtr(void);
+
+static void WalSndSegmentOpen(XLogReaderState *state, XLogSegNo nextSegNo,
+                              TimeLineID *tli_p);
+
+/* END cloned file-local variables and functions from walsender.c */
+
+int
+CompareLsn(const void *a, const void *b)
+{
+    XLogRecPtr  lsn1 = *((const XLogRecPtr *) a);
+    XLogRecPtr  lsn2 = *((const XLogRecPtr *) b);
+
+    if (lsn1 < lsn2)
+        return -1;
+    else if (lsn1 == lsn2)
+        return 0;
+    else
+        return 1;
+}
+
+/*
+ * Returns a human-readable string corresponding to the SafekeeperState
+ *
+ * The string should not be freed.
+ *
+ * The strings are intended to be used as a prefix to "state", e.g.:
+ *
+ *   elog(LOG, "currently in %s state", FormatSafekeeperState(sk->state));
+ *
+ * If this sort of phrasing doesn't fit the message, instead use something like:
+ *
+ *   elog(LOG, "currently in state [%s]", FormatSafekeeperState(sk->state));
+ */
+char*
+FormatSafekeeperState(SafekeeperState state)
+{
+    char       *return_val = NULL;
+
+    switch (state)
+    {
+        case SS_OFFLINE:
+            return_val = "offline";
+            break;
+        case SS_CONNECTING_READ:
+        case SS_CONNECTING_WRITE:
+            return_val = "connecting";
+            break;
+        case SS_WAIT_EXEC_RESULT:
+            return_val = "receiving query result";
+            break;
+        case SS_HANDSHAKE_RECV:
+            return_val = "handshake (receiving)";
+            break;
+        case SS_VOTING:
+            return_val = "voting";
+            break;
+        case SS_WAIT_VERDICT:
+            return_val = "wait-for-verdict";
+            break;
+        case SS_SEND_ELECTED_FLUSH:
+            return_val = "send-announcement-flush";
+            break;
+        case SS_IDLE:
+            return_val = "idle";
+            break;
+        case SS_ACTIVE:
+            return_val = "active";
+            break;
+    }
+
+    Assert(return_val != NULL);
+
+    return return_val;
+}
+
+/* Asserts that the provided events are expected for given safekeeper's state */
+void
+AssertEventsOkForState(uint32 events, Safekeeper* sk)
+{
+    uint32      expected = SafekeeperStateDesiredEvents(sk->state);
+
+    /*
+     * The events are in-line with what we're expecting, under two conditions:
+     *   (a) if we aren't expecting anything, `events` has no read- or
+     *       write-ready component.
+     *   (b) if we are expecting something, there's overlap
+     *       (i.e. `events & expected != 0`)
+     */
+    bool        events_ok_for_state;    /* long name so the `Assert` is more clear later */
+
+    if (expected == WL_NO_EVENTS)
+        events_ok_for_state = ((events & (WL_SOCKET_READABLE|WL_SOCKET_WRITEABLE)) == 0);
+    else
+        events_ok_for_state = ((events & expected) != 0);
+
+    if (!events_ok_for_state)
+    {
+        /*
+         * To give a descriptive message in the case of failure, we use elog
+         * and then an assertion that's guaranteed to fail.
+         */
+        elog(WARNING, "events %s mismatched for safekeeper %s:%s in state [%s]",
+             FormatEvents(events), sk->host, sk->port, FormatSafekeeperState(sk->state));
+        Assert(events_ok_for_state);
+    }
+}
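(Editor's illustration, not part of the patch: SafekeeperStateDesiredEvents, defined next, is typically consumed by keeping a WaitEventSet entry in sync with a safekeeper's state. A sketch with a hypothetical UpdateEventSet() helper; ModifyWaitEvent is the standard latch API:)

    static void
    UpdateEventSet(WaitEventSet *wait_event_set, Safekeeper *sk)
    {
        uint32      events = SafekeeperStateDesiredEvents(sk->state);

        /* eventPos == -1 means the socket is not registered in the set. */
        if (sk->eventPos >= 0)
            ModifyWaitEvent(wait_event_set, sk->eventPos, events, NULL);
    }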
+
+/*
+ * Returns the set of events a safekeeper in this state should be waiting on
+ *
+ * This will return WL_NO_EVENTS (= 0) for some states.
+ */
+uint32
+SafekeeperStateDesiredEvents(SafekeeperState state)
+{
+    uint32      result = WL_NO_EVENTS;
+
+    /* If the state doesn't have a modifier, we can check the base state */
+    switch (state)
+    {
+            /* Connecting states say what they want in the name */
+        case SS_CONNECTING_READ:
+            result = WL_SOCKET_READABLE;
+            break;
+        case SS_CONNECTING_WRITE:
+            result = WL_SOCKET_WRITEABLE;
+            break;
+
+            /* Reading states need the socket to be read-ready to continue */
+        case SS_WAIT_EXEC_RESULT:
+        case SS_HANDSHAKE_RECV:
+        case SS_WAIT_VERDICT:
+            result = WL_SOCKET_READABLE;
+            break;
+
+            /*
+             * Idle states use read-readiness as a sign that the connection
+             * has been disconnected.
+             */
+        case SS_VOTING:
+        case SS_IDLE:
+            result = WL_SOCKET_READABLE;
+            break;
+
+            /*
+             * Flush states require write-ready for flushing.
+             * Active state does both reading and writing.
+             *
+             * TODO: SS_ACTIVE sometimes doesn't need to be write-ready. We
+             * should check sk->flushWrite here to set WL_SOCKET_WRITEABLE.
+             */
+        case SS_SEND_ELECTED_FLUSH:
+        case SS_ACTIVE:
+            result = WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE;
+            break;
+
+            /* The offline state expects no events. */
+        case SS_OFFLINE:
+            result = WL_NO_EVENTS;
+            break;
+
+        default:
+            Assert(false);
+            break;
+    }
+
+    return result;
+}
+
+/*
+ * Returns a human-readable string corresponding to the event set
+ *
+ * If the events do not correspond to something set as the `events` field of
+ * a `WaitEvent`, the returned string may be meaningless.
+ *
+ * The string should not be freed. It should also not be expected to remain
+ * the same between function calls.
+ */
+char*
+FormatEvents(uint32 events)
+{
+    static char return_str[9];
+
+    /* Helper variable to check if there are extra bits */
+    uint32      all_flags = WL_LATCH_SET
+        | WL_SOCKET_READABLE
+        | WL_SOCKET_WRITEABLE
+        | WL_TIMEOUT
+        | WL_POSTMASTER_DEATH
+        | WL_EXIT_ON_PM_DEATH
+        | WL_SOCKET_CONNECTED;
+
+    /*
+     * The formatting here isn't supposed to be *particularly* useful -- it's
+     * just to give a sense of what events have been triggered without
+     * needing to remember your powers of two.
+     */
+    return_str[0] = (events & WL_LATCH_SET       ) ? 'L' : '_';
+    return_str[1] = (events & WL_SOCKET_READABLE ) ? 'R' : '_';
+    return_str[2] = (events & WL_SOCKET_WRITEABLE) ? 'W' : '_';
+    return_str[3] = (events & WL_TIMEOUT         ) ? 'T' : '_';
+    return_str[4] = (events & WL_POSTMASTER_DEATH) ? 'D' : '_';
+    return_str[5] = (events & WL_EXIT_ON_PM_DEATH) ? 'E' : '_';
+    return_str[6] = (events & WL_SOCKET_CONNECTED) ? 'C' : '_';
+
+    if (events & (~all_flags))
+    {
+        elog(WARNING, "Event formatting found unexpected component %d",
+             events & (~all_flags));
+        return_str[7] = '*';
+        return_str[8] = '\0';
+    }
+    else
+        return_str[7] = '\0';
+
+    return return_str;
+}
+
+/*
+ * Convert a character which represents a hexadecimal digit to an integer.
+ *
+ * Returns -1 if the character is not a hexadecimal digit.
+ */
+static int
+HexDecodeChar(char c)
+{
+    if (c >= '0' && c <= '9')
+        return c - '0';
+    if (c >= 'a' && c <= 'f')
+        return c - 'a' + 10;
+    if (c >= 'A' && c <= 'F')
+        return c - 'A' + 10;
+
+    return -1;
+}
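(Editor's illustration, not part of the patch: HexDecodeString, defined next, is the natural tool for turning the 32-hex-character timeline/tenant GUCs into the 16-byte ids carried in ProposerGreeting. A sketch; the error messages are illustrative:)

    ProposerGreeting greet;

    if (!HexDecodeString(greet.ztimelineid, zenith_timeline_walproposer, 16))
        elog(FATAL, "could not parse zenith.timeline_id: %s", zenith_timeline_walproposer);
    if (!HexDecodeString(greet.ztenantid, zenith_tenant_walproposer, 16))
        elog(FATAL, "could not parse zenith.tenant_id: %s", zenith_tenant_walproposer);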
+
+/*
+ * Decode a hex string into a byte string, 2 hex chars per byte.
+ *
+ * Returns false if invalid characters are encountered; otherwise true.
+ */
+bool
+HexDecodeString(uint8 *result, char *input, int nbytes)
+{
+    int         i;
+
+    for (i = 0; i < nbytes; ++i)
+    {
+        int         n1 = HexDecodeChar(input[i * 2]);
+        int         n2 = HexDecodeChar(input[i * 2 + 1]);
+
+        if (n1 < 0 || n2 < 0)
+            return false;
+        result[i] = n1 * 16 + n2;
+    }
+
+    return true;
+}
+
+/* --------------------------------
+ *  pq_getmsgint32_le   - get a binary 4-byte int from a message buffer in native (LE) order
+ * --------------------------------
+ */
+uint32
+pq_getmsgint32_le(StringInfo msg)
+{
+    uint32      n32;
+
+    pq_copymsgbytes(msg, (char *) &n32, sizeof(n32));
+
+    return n32;
+}
+
+/* --------------------------------
+ *  pq_getmsgint64_le   - get a binary 8-byte int from a message buffer in native (LE) order
+ * --------------------------------
+ */
+uint64
+pq_getmsgint64_le(StringInfo msg)
+{
+    uint64      n64;
+
+    pq_copymsgbytes(msg, (char *) &n64, sizeof(n64));
+
+    return n64;
+}
+
+/* append a binary [u]int32 to a StringInfo buffer in native (LE) order */
+void
+pq_sendint32_le(StringInfo buf, uint32 i)
+{
+    enlargeStringInfo(buf, sizeof(uint32));
+    memcpy(buf->data + buf->len, &i, sizeof(uint32));
+    buf->len += sizeof(uint32);
+}
+
+/* append a binary [u]int64 to a StringInfo buffer in native (LE) order */
+void
+pq_sendint64_le(StringInfo buf, uint64 i)
+{
+    enlargeStringInfo(buf, sizeof(uint64));
+    memcpy(buf->data + buf->len, &i, sizeof(uint64));
+    buf->len += sizeof(uint64);
+}
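(Editor's illustration, not part of the patch: these little-endian helpers exist because the safekeeper wire format is little-endian, unlike the big-endian order used by the standard pq_sendint32/pq_getmsgint. A sketch of both directions; `greet` is an assumed ProposerGreeting and `reply` an assumed StringInfo positioned at a message start:)

    /* Send side: serialize fields in wire order. */
    StringInfoData s;

    initStringInfo(&s);
    pq_sendint64_le(&s, greet.tag);
    pq_sendint32_le(&s, greet.protocolVersion);
    pq_sendint32_le(&s, greet.pgVersion);

    /* Receive side mirrors it field by field: */
    uint64      tag = pq_getmsgint64_le(reply);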
+
+/*
+ * Write XLOG data to disk.
+ */
+void
+XLogWalPropWrite(char *buf, Size nbytes, XLogRecPtr recptr)
+{
+    int         startoff;
+    int         byteswritten;
+
+    while (nbytes > 0)
+    {
+        int         segbytes;
+
+        /* Close the current segment if it's completed */
+        if (walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size))
+            XLogWalPropClose(recptr);
+
+        if (walpropFile < 0)
+        {
+            bool        use_existent = true;
+
+            /* Create/use new log file */
+            XLByteToSeg(recptr, walpropSegNo, wal_segment_size);
+            walpropFile = XLogFileInit(walpropSegNo, &use_existent, false);
+            walpropFileTLI = ThisTimeLineID;
+        }
+
+        /* Calculate the start offset of the received logs */
+        startoff = XLogSegmentOffset(recptr, wal_segment_size);
+
+        if (startoff + nbytes > wal_segment_size)
+            segbytes = wal_segment_size - startoff;
+        else
+            segbytes = nbytes;
+
+        /* OK to write the logs */
+        errno = 0;
+
+        byteswritten = pg_pwrite(walpropFile, buf, segbytes, (off_t) startoff);
+        if (byteswritten <= 0)
+        {
+            char        xlogfname[MAXFNAMELEN];
+            int         save_errno;
+
+            /* if write didn't set errno, assume no disk space */
+            if (errno == 0)
+                errno = ENOSPC;
+
+            save_errno = errno;
+            XLogFileName(xlogfname, walpropFileTLI, walpropSegNo, wal_segment_size);
+            errno = save_errno;
+            ereport(PANIC,
+                    (errcode_for_file_access(),
+                     errmsg("could not write to log segment %s "
+                            "at offset %u, length %lu: %m",
+                            xlogfname, startoff, (unsigned long) segbytes)));
+        }
+
+        /* Update state for write */
+        recptr += byteswritten;
+
+        nbytes -= byteswritten;
+        buf += byteswritten;
+    }
+
+    /*
+     * Close the current segment if it's fully written up in the last cycle
+     * of the loop.
+     */
+    if (walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size))
+    {
+        XLogWalPropClose(recptr);
+    }
+}
+
+/*
+ * Close the current segment.
+ */
+void
+XLogWalPropClose(XLogRecPtr recptr)
+{
+    Assert(walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size));
+
+    if (close(walpropFile) != 0)
+    {
+        char        xlogfname[MAXFNAMELEN];
+
+        XLogFileName(xlogfname, walpropFileTLI, walpropSegNo, wal_segment_size);
+
+        ereport(PANIC,
+                (errcode_for_file_access(),
+                 errmsg("could not close log segment %s: %m",
+                        xlogfname)));
+    }
+
+    walpropFile = -1;
+}
+
+/* START of cloned functions from walsender.c */
+
+/*
+ * Handle START_REPLICATION command.
+ *
+ * At the moment, this never returns, but an ereport(ERROR) will take us back
+ * to the main loop.
+ */
+void
+StartProposerReplication(StartReplicationCmd *cmd)
+{
+    XLogRecPtr  FlushPtr;
+
+    if (ThisTimeLineID == 0)
+        ereport(ERROR,
+                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                 errmsg("IDENTIFY_SYSTEM has not been run before START_REPLICATION")));
+
+    /* create xlogreader for physical replication */
+    xlogreader =
+        XLogReaderAllocate(wal_segment_size, NULL,
+                           XL_ROUTINE(.segment_open = WalSndSegmentOpen,
+                                      .segment_close = wal_segment_close),
+                           NULL);
+
+    if (!xlogreader)
+        ereport(ERROR,
+                (errcode(ERRCODE_OUT_OF_MEMORY),
+                 errmsg("out of memory")));
+
+    /*
+     * We assume here that we're logging enough information in the WAL for
+     * log-shipping, since this is checked in PostmasterMain().
+     *
+     * NOTE: wal_level can only change at shutdown, so in most cases it is
+     * difficult for there to be WAL data that we can still see that was
+     * written at wal_level='minimal'.
+     */
+
+    if (cmd->slotname)
+    {
+        ReplicationSlotAcquire(cmd->slotname, true);
+        if (SlotIsLogical(MyReplicationSlot))
+            ereport(ERROR,
+                    (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                     errmsg("cannot use a logical replication slot for physical replication")));
+
+        /*
+         * We don't need to verify the slot's restart_lsn here; instead we
+         * rely on the caller requesting the starting point to use. If the
+         * WAL segment doesn't exist, we'll fail later.
+         */
+    }
+
+    /*
+     * Select the timeline. If it was given explicitly by the client, use
+     * that. Otherwise use the timeline of the last replayed record, which is
+     * kept in ThisTimeLineID.
+     *
+     * Neon doesn't currently use PG Timelines, but it may in the future, so
+     * we keep this code around to lighten the load for when we need it.
+     */
+    if (am_cascading_walsender)
+    {
+        /* this also updates ThisTimeLineID */
+        FlushPtr = GetStandbyFlushRecPtr();
+    }
+    else
+        FlushPtr = GetFlushRecPtr();
+
+    if (cmd->timeline != 0)
+    {
+        XLogRecPtr  switchpoint;
+
+        sendTimeLine = cmd->timeline;
+        if (sendTimeLine == ThisTimeLineID)
+        {
+            sendTimeLineIsHistoric = false;
+            sendTimeLineValidUpto = InvalidXLogRecPtr;
+        }
+        else
+        {
+            List       *timeLineHistory;
+
+            sendTimeLineIsHistoric = true;
+
+            /*
+             * Check that the timeline the client requested exists, and the
+             * requested start location is on that timeline.
+             */
+            timeLineHistory = readTimeLineHistory(ThisTimeLineID);
+            switchpoint = tliSwitchPoint(cmd->timeline, timeLineHistory,
+                                         &sendTimeLineNextTLI);
+            list_free_deep(timeLineHistory);
+
+            /*
+             * Found the requested timeline in the history. Check that
+             * requested startpoint is on that timeline in our history.
+             *
+             * This is quite loose on purpose. We only check that we didn't
+             * fork off the requested timeline before the switchpoint. We
+             * don't check that we switched *to* it before the requested
+             * starting point. This is because the client can legitimately
+             * request to start replication from the beginning of the WAL
+             * segment that contains switchpoint, but on the new timeline, so
+             * that it doesn't end up with a partial segment. If you ask for
+             * too old a starting point, you'll get an error later when we
+             * fail to find the requested WAL segment in pg_wal.
+             *
+             * XXX: we could be more strict here and only allow a startpoint
+             * that's older than the switchpoint, if it's still in the same
+             * WAL segment.
+             */
+            if (!XLogRecPtrIsInvalid(switchpoint) &&
+                switchpoint < cmd->startpoint)
+            {
+                ereport(ERROR,
+                        (errmsg("requested starting point %X/%X on timeline %u is not in this server's history",
+                                LSN_FORMAT_ARGS(cmd->startpoint),
+                                cmd->timeline),
+                         errdetail("This server's history forked from timeline %u at %X/%X.",
+                                   cmd->timeline,
+                                   LSN_FORMAT_ARGS(switchpoint))));
+            }
+            sendTimeLineValidUpto = switchpoint;
+        }
+    }
+    else
+    {
+        sendTimeLine = ThisTimeLineID;
+        sendTimeLineValidUpto = InvalidXLogRecPtr;
+        sendTimeLineIsHistoric = false;
+    }
+
+    streamingDoneSending = streamingDoneReceiving = false;
+
+    /* If there is nothing to stream, don't even enter COPY mode */
+    if (!sendTimeLineIsHistoric || cmd->startpoint < sendTimeLineValidUpto)
+    {
+        /*
+         * When we first start replication the standby will be behind the
+         * primary. For some applications, for example synchronous
+         * replication, it is important to have a clear state for this
+         * initial catchup mode, so we can trigger actions when we change
+         * streaming state later. We may stay in this state for a long time,
+         * which is exactly why we want to be able to monitor whether or not
+         * we are still here.
+         */
+        WalSndSetState(WALSNDSTATE_CATCHUP);
+
+        /*
+         * Don't allow a request to stream from a future point in WAL that
+         * hasn't been flushed to disk in this server yet.
+         */
+        if (FlushPtr < cmd->startpoint)
+        {
+            ereport(ERROR,
+                    (errmsg("requested starting point %X/%X is ahead of the WAL flush position of this server %X/%X",
+                            LSN_FORMAT_ARGS(cmd->startpoint),
+                            LSN_FORMAT_ARGS(FlushPtr))));
+        }
+
+        /* Start streaming from the requested point */
+        sentPtr = cmd->startpoint;
+
+        /* Initialize shared memory status, too */
+        SpinLockAcquire(&MyWalSnd->mutex);
+        MyWalSnd->sentPtr = sentPtr;
+        SpinLockRelease(&MyWalSnd->mutex);
+
+        SyncRepInitConfig();
+
+        /* Main loop of walsender */
+        replication_active = true;
+
+        WalSndLoop(XLogSendPhysical);
+
+        replication_active = false;
+        if (got_STOPPING)
+            proc_exit(0);
+        WalSndSetState(WALSNDSTATE_STARTUP);
+
+        Assert(streamingDoneSending && streamingDoneReceiving);
+    }
+
+    if (cmd->slotname)
+        ReplicationSlotRelease();
+
+    /*
+     * Copy is finished now. Send a single-row result set indicating the next
+     * timeline.
+     */
+    if (sendTimeLineIsHistoric)
+    {
+        char        startpos_str[8 + 1 + 8 + 1];
+        DestReceiver *dest;
+        TupOutputState *tstate;
+        TupleDesc   tupdesc;
+        Datum       values[2];
+        bool        nulls[2];
+
+        snprintf(startpos_str, sizeof(startpos_str), "%X/%X",
+                 LSN_FORMAT_ARGS(sendTimeLineValidUpto));
+
+        dest = CreateDestReceiver(DestRemoteSimple);
+        MemSet(nulls, false, sizeof(nulls));
+
+        /*
+         * Need a tuple descriptor representing two columns. int8 may seem
+         * like a surprising data type for this, but in theory int4 would not
+         * be wide enough for this, as TimeLineID is unsigned.
+         */
+        tupdesc = CreateTemplateTupleDesc(2);
+        TupleDescInitBuiltinEntry(tupdesc, (AttrNumber) 1, "next_tli",
+                                  INT8OID, -1, 0);
+        TupleDescInitBuiltinEntry(tupdesc, (AttrNumber) 2, "next_tli_startpos",
+                                  TEXTOID, -1, 0);
+
+        /* prepare for projection of tuple */
+        tstate = begin_tup_output_tupdesc(dest, tupdesc, &TTSOpsVirtual);
+
+        values[0] = Int64GetDatum((int64) sendTimeLineNextTLI);
+        values[1] = CStringGetTextDatum(startpos_str);
+
+        /* send it to dest */
+        do_tup_output(tstate, values, nulls);
+
+        end_tup_output(tstate);
+    }
+
+    /* Send CommandComplete message */
+    EndReplicationCommand("START_STREAMING");
+}
+
+/*
+ * Returns the latest point in WAL that has been safely flushed to disk, and
+ * can be sent to the standby. This should only be called when in recovery,
+ * ie. we're streaming to a cascaded standby.
+ *
+ * As a side-effect, ThisTimeLineID is updated to the TLI of the last
+ * replayed WAL record.
+ */
+static XLogRecPtr
+GetStandbyFlushRecPtr(void)
+{
+    XLogRecPtr  replayPtr;
+    TimeLineID  replayTLI;
+    XLogRecPtr  receivePtr;
+    TimeLineID  receiveTLI;
+    XLogRecPtr  result;
+
+    /*
+     * We can safely send what's already been replayed. Also, if walreceiver
+     * is streaming WAL from the same timeline, we can send anything that it
+     * has streamed, but hasn't been replayed yet.
+     */
+
+    receivePtr = GetWalRcvFlushRecPtr(NULL, &receiveTLI);
+    replayPtr = GetXLogReplayRecPtr(&replayTLI);
+
+    ThisTimeLineID = replayTLI;
+
+    result = replayPtr;
+    if (receiveTLI == ThisTimeLineID && receivePtr > replayPtr)
+        result = receivePtr;
+
+    return result;
+}
+
+/* XLogReaderRoutine->segment_open callback */
+static void
+WalSndSegmentOpen(XLogReaderState *state, XLogSegNo nextSegNo,
+                  TimeLineID *tli_p)
+{
+    char        path[MAXPGPATH];
+
+    /*-------
+     * When reading from a historic timeline, and there is a timeline switch
+     * within this segment, read from the WAL segment belonging to the new
+     * timeline.
+     *
+     * For example, imagine that this server is currently on timeline 5, and
+     * we're streaming timeline 4. The switch from timeline 4 to 5 happened at
+     * 0/13002088. In pg_wal, we have these files:
+     *
+     * ...
+     * 000000040000000000000012
+     * 000000040000000000000013
+     * 000000050000000000000013
+     * 000000050000000000000014
+     * ...
+     *
+     * In this situation, when requested to send the WAL from segment 0x13, on
+     * timeline 4, we read the WAL from file 000000050000000000000013. Archive
+     * recovery prefers files from newer timelines, so if the segment was
+     * restored from the archive on this server, the file belonging to the old
+     * timeline, 000000040000000000000013, might not exist. Their contents are
+     * equal up to the switchpoint, because at a timeline switch, the used
+     * portion of the old segment is copied to the new file.
+     *-------
+     */
+    *tli_p = sendTimeLine;
+    if (sendTimeLineIsHistoric)
+    {
+        XLogSegNo   endSegNo;
+
+        XLByteToSeg(sendTimeLineValidUpto, endSegNo, state->segcxt.ws_segsize);
+        if (nextSegNo == endSegNo)
+            *tli_p = sendTimeLineNextTLI;
+    }
+
+    XLogFilePath(path, *tli_p, nextSegNo, state->segcxt.ws_segsize);
+    state->seg.ws_file = BasicOpenFile(path, O_RDONLY | PG_BINARY);
+    if (state->seg.ws_file >= 0)
+        return;
+
+    /*
+     * If the file is not found, assume it's because the standby asked for a
+     * too old WAL segment that has already been removed or recycled.
+     */
+    if (errno == ENOENT)
+    {
+        char        xlogfname[MAXFNAMELEN];
+        int         save_errno = errno;
+
+        XLogFileName(xlogfname, *tli_p, nextSegNo, wal_segment_size);
+        errno = save_errno;
+        ereport(ERROR,
+                (errcode_for_file_access(),
+                 errmsg("requested WAL segment %s has already been removed",
+                        xlogfname)));
+    }
+    else
+        ereport(ERROR,
+                (errcode_for_file_access(),
+                 errmsg("could not open file \"%s\": %m",
+                        path)));
+}
+
+
+/* Main loop of walsender process that streams the WAL over Copy messages. */
+static void
+WalSndLoop(WalSndSendDataCallback send_data)
+{
+    /*
+     * Initialize the last reply timestamp. That enables timeout processing
+     * from hereon.
+     */
+    last_reply_timestamp = GetCurrentTimestamp();
+    waiting_for_ping_response = false;
+
+    /*
+     * Loop until we reach the end of this timeline or the client requests to
+     * stop streaming.
+     */
+    for (;;)
+    {
+        /* Clear any already-pending wakeups */
+        ResetLatch(MyLatch);
+
+        CHECK_FOR_INTERRUPTS();
+
+        /* Process any requests or signals received recently */
+        if (ConfigReloadPending)
+        {
+            ConfigReloadPending = false;
+            ProcessConfigFile(PGC_SIGHUP);
+            SyncRepInitConfig();
+        }
+
+        /* always true */
+        if (am_wal_proposer)
+        {
+            send_data();
+            if (WalSndCaughtUp)
+            {
+                if (MyWalSnd->state == WALSNDSTATE_CATCHUP)
+                    WalSndSetState(WALSNDSTATE_STREAMING);
+                WalProposerPoll();
+                WalSndCaughtUp = false;
+            }
+            continue;
+        }
+    }
+}
+
+/*
+ * Send out the WAL in its normal physical/stored form.
+ *
+ * Read up to MAX_SEND_SIZE bytes of WAL that's been flushed to disk,
+ * but not yet sent to the client, and buffer it in the libpq output
+ * buffer.
+ *
+ * If there is no unsent WAL remaining, WalSndCaughtUp is set to true,
+ * otherwise WalSndCaughtUp is set to false.
+ */
+static void
+XLogSendPhysical(void)
+{
+    XLogRecPtr  SendRqstPtr;
+    XLogRecPtr  startptr;
+    XLogRecPtr  endptr;
+    Size        nbytes PG_USED_FOR_ASSERTS_ONLY;
+
+    /* If requested, switch the WAL sender to the stopping state. */
+    if (got_STOPPING)
+        WalSndSetState(WALSNDSTATE_STOPPING);
+
+    if (streamingDoneSending)
+    {
+        WalSndCaughtUp = true;
+        return;
+    }
+
+    /* Figure out how far we can safely send the WAL. */
+    if (sendTimeLineIsHistoric)
+    {
+        /*
+         * Streaming an old timeline that's in this server's history, but is
+         * not the one we're currently inserting or replaying. It can be
+         * streamed up to the point where we switched off that timeline.
+         */
+        SendRqstPtr = sendTimeLineValidUpto;
+    }
+    else if (am_cascading_walsender)
+    {
+        /*
+         * Streaming the latest timeline on a standby.
+         *
+         * Attempt to send all WAL that has already been replayed, so that we
+         * know it's valid. If we're receiving WAL through streaming
+         * replication, it's also OK to send any WAL that has been received
+         * but not replayed.
+         *
+         * The timeline we're recovering from can change, or we can be
+         * promoted. In either case, the current timeline becomes historic.
+         * We need to detect that so that we don't try to stream past the
+         * point where we switched to another timeline. We check for
+         * promotion or timeline switch after calculating FlushPtr, to avoid
+         * a race condition: if the timeline becomes historic just after we
+         * checked that it was still current, it's still OK to stream it up
+         * to the FlushPtr that was calculated before it became historic.
+         */
+        bool        becameHistoric = false;
+
+        SendRqstPtr = GetStandbyFlushRecPtr();
+
+        if (!RecoveryInProgress())
+        {
+            /*
+             * We have been promoted. RecoveryInProgress() updated
+             * ThisTimeLineID to the new current timeline.
+             */
+            am_cascading_walsender = false;
+            becameHistoric = true;
+        }
+        else
+        {
+            /*
+             * Still a cascading standby. But is the timeline we're sending
+             * still the one recovery is recovering from? ThisTimeLineID was
+             * updated by the GetStandbyFlushRecPtr() call above.
+             */
+            if (sendTimeLine != ThisTimeLineID)
+                becameHistoric = true;
+        }
+
+        if (becameHistoric)
+        {
+            /*
+             * The timeline we were sending has become historic. Read the
+             * timeline history file of the new timeline to see where exactly
+             * we forked off from the timeline we were sending.
+             */
+            List       *history;
+
+            history = readTimeLineHistory(ThisTimeLineID);
+            sendTimeLineValidUpto = tliSwitchPoint(sendTimeLine, history, &sendTimeLineNextTLI);
+
+            Assert(sendTimeLine < sendTimeLineNextTLI);
+            list_free_deep(history);
+
+            sendTimeLineIsHistoric = true;
+
+            SendRqstPtr = sendTimeLineValidUpto;
+        }
+    }
+    else
+    {
+        /*
+         * Streaming the current timeline on a primary.
+         *
+         * Attempt to send all data that's already been written out and
+         * fsync'd to disk. We cannot go further than what's been written out
+         * given the current implementation of WALRead(). And in any case
+         * it's unsafe to send WAL that is not securely down to disk on the
+         * primary: if the primary subsequently crashes and restarts,
+         * standbys must not have applied any WAL that got lost on the
+         * primary.
+         */
+        SendRqstPtr = GetFlushRecPtr();
+    }
+
+    /*
+     * Record the current system time as an approximation of the time at
+     * which this WAL location was written for the purposes of lag tracking.
+     *
+     * In theory we could make XLogFlush() record a time in shmem whenever
+     * WAL is flushed and we could get that time as well as the LSN when we
+     * call GetFlushRecPtr() above (and likewise for the cascading standby
+     * equivalent), but rather than putting any new code into the hot WAL
+     * path it seems good enough to capture the time here. We should reach
+     * this after XLogFlush() runs WalSndWakeupProcessRequests(), and
+     * although that may take some time, we read the WAL flush pointer and
+     * take the time very close together here so that we'll get a later
+     * position if it is still moving.
+     *
+     * Because LagTrackerWrite ignores samples when the LSN hasn't advanced,
+     * this gives us a cheap approximation for the WAL flush time for this
+     * LSN.
+     *
+     * Note that the LSN is not necessarily the LSN for the data contained in
+     * the present message; it's the end of the WAL, which might be further
+     * ahead. All the lag tracking machinery cares about is finding out when
+     * that arbitrary LSN is eventually reported as written, flushed and
+     * applied, so that it can measure the elapsed time.
+     */
+    LagTrackerWrite(SendRqstPtr, GetCurrentTimestamp());
+
+    /*
+     * If this is a historic timeline and we've reached the point where we
+     * forked to the next timeline, stop streaming.
+     *
+     * Note: We might already have sent WAL > sendTimeLineValidUpto. The
+     * startup process will normally replay all WAL that has been received
+     * from the primary, before promoting, but if the WAL streaming is
+     * terminated at a WAL page boundary, the valid portion of the timeline
+     * might end in the middle of a WAL record. We might've already sent the
+     * first half of that partial WAL record to the cascading standby, so
+     * that sentPtr > sendTimeLineValidUpto. That's OK; the cascading standby
+     * can't replay the partial WAL record either, so it can still follow our
+     * timeline switch.
+     */
+    if (sendTimeLineIsHistoric && sendTimeLineValidUpto <= sentPtr)
+    {
+        /* close the current file. */
+        if (xlogreader->seg.ws_file >= 0)
+            wal_segment_close(xlogreader);
+
+        /* Send CopyDone */
+        pq_putmessage_noblock('c', NULL, 0);
+        streamingDoneSending = true;
+
+        WalSndCaughtUp = true;
+
+        elog(DEBUG1, "walsender reached end of timeline at %X/%X (sent up to %X/%X)",
+             LSN_FORMAT_ARGS(sendTimeLineValidUpto),
+             LSN_FORMAT_ARGS(sentPtr));
+        return;
+    }
+
+    /* Do we have any work to do? */
+    Assert(sentPtr <= SendRqstPtr);
+    if (SendRqstPtr <= sentPtr)
+    {
+        WalSndCaughtUp = true;
+        return;
+    }
+
+    /*
+     * Figure out how much to send in one message. If there's no more than
+     * MAX_SEND_SIZE bytes to send, send everything. Otherwise send
+     * MAX_SEND_SIZE bytes, but round back to logfile or page boundary.
+     *
+     * The rounding is not only for performance reasons. Walreceiver relies
+     * on the fact that we never split a WAL record across two messages.
+     * Since a long WAL record is split at page boundary into continuation
+     * records, page boundary is always a safe cut-off point. We also assume
+     * that SendRqstPtr never points to the middle of a WAL record.
+     */
+    startptr = sentPtr;
+    endptr = startptr;
+    endptr += MAX_SEND_SIZE;
+
+    /* if we went beyond SendRqstPtr, back off */
+    if (SendRqstPtr <= endptr)
+    {
+        endptr = SendRqstPtr;
+        if (sendTimeLineIsHistoric)
+            WalSndCaughtUp = false;
+        else
+            WalSndCaughtUp = true;
+    }
+    else
+    {
+        /* round down to page boundary. */
+        endptr -= (endptr % XLOG_BLCKSZ);
+        WalSndCaughtUp = false;
+    }
+
+    nbytes = endptr - startptr;
+    Assert(nbytes <= MAX_SEND_SIZE);
+
+    /* always true */
+    if (am_wal_proposer)
+    {
+        WalProposerBroadcast(startptr, endptr);
+    }
+    else
+    {
+        /* code removed for brevity */
+    }
+    sentPtr = endptr;
+
+    /* Update shared memory status */
+    {
+        WalSnd     *walsnd = MyWalSnd;
+
+        SpinLockAcquire(&walsnd->mutex);
+        walsnd->sentPtr = sentPtr;
+        SpinLockRelease(&walsnd->mutex);
+    }
+
+    /* Report progress of XLOG streaming in PS display */
+    if (update_process_title)
+    {
+        char        activitymsg[50];
+
+        snprintf(activitymsg, sizeof(activitymsg), "streaming %X/%X",
+                 LSN_FORMAT_ARGS(sentPtr));
+        set_ps_display(activitymsg);
+    }
+}
+
diff --git a/pgxn/neon/walproposer_utils.h b/pgxn/neon/walproposer_utils.h
new file mode 100644
index 0000000000..4771d3ff82
--- /dev/null
+++ b/pgxn/neon/walproposer_utils.h
@@ -0,0 +1,19 @@
+#ifndef __NEON_WALPROPOSER_UTILS_H__
+#define __NEON_WALPROPOSER_UTILS_H__
+
+#include "walproposer.h"
+
+int CompareLsn(const void *a, const void *b);
+char* FormatSafekeeperState(SafekeeperState state);
+void AssertEventsOkForState(uint32 events, Safekeeper* sk);
+uint32 SafekeeperStateDesiredEvents(SafekeeperState state);
+char* FormatEvents(uint32 events);
+bool HexDecodeString(uint8 *result, char *input, int nbytes);
+uint32 pq_getmsgint32_le(StringInfo msg);
+uint64 pq_getmsgint64_le(StringInfo msg);
+void pq_sendint32_le(StringInfo buf, uint32 i);
+void pq_sendint64_le(StringInfo buf, uint64 i);
+void XLogWalPropWrite(char *buf, Size nbytes, XLogRecPtr recptr);
+void XLogWalPropClose(XLogRecPtr recptr);
+
+#endif                          /* __NEON_WALPROPOSER_UTILS_H__ */
diff --git a/pgxn/neon_test_utils/Makefile b/pgxn/neon_test_utils/Makefile
new file mode 100644
index 0000000000..9c774ec185
--- /dev/null
+++ b/pgxn/neon_test_utils/Makefile
@@ -0,0 +1,15 @@
+# pgxn/neon_test_utils/Makefile
+
+
+MODULE_big = neon_test_utils
+OBJS = \
+	$(WIN32RES) \
+	neontest.o
+
+EXTENSION = neon_test_utils
+DATA = neon_test_utils--1.0.sql
+PGFILEDESC = "neon_test_utils - helpers for neon testing and debugging"
+
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
= "neon_test_utils - helpers for neon testing and debugging" + +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) diff --git a/pgxn/neon_test_utils/neon_test_utils--1.0.sql b/pgxn/neon_test_utils/neon_test_utils--1.0.sql new file mode 100644 index 0000000000..402981a9a6 --- /dev/null +++ b/pgxn/neon_test_utils/neon_test_utils--1.0.sql @@ -0,0 +1,29 @@ +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION neon_test_utils" to load this file. \quit + +CREATE FUNCTION test_consume_xids(nxids int) +RETURNS VOID +AS 'MODULE_PATHNAME', 'test_consume_xids' +LANGUAGE C STRICT +PARALLEL UNSAFE; + +CREATE FUNCTION clear_buffer_cache() +RETURNS VOID +AS 'MODULE_PATHNAME', 'clear_buffer_cache' +LANGUAGE C STRICT +PARALLEL UNSAFE; + +CREATE FUNCTION get_raw_page_at_lsn(relname text, forkname text, blocknum int8, lsn pg_lsn) +RETURNS bytea +AS 'MODULE_PATHNAME', 'get_raw_page_at_lsn' +LANGUAGE C PARALLEL UNSAFE; + +CREATE FUNCTION get_raw_page_at_lsn(tbspc oid, db oid, relfilenode oid, forknum int8, blocknum int8, lsn pg_lsn) +RETURNS bytea +AS 'MODULE_PATHNAME', 'get_raw_page_at_lsn_ex' +LANGUAGE C PARALLEL UNSAFE; + +CREATE FUNCTION neon_xlogflush(lsn pg_lsn) +RETURNS VOID +AS 'MODULE_PATHNAME', 'neon_xlogflush' +LANGUAGE C PARALLEL UNSAFE; diff --git a/pgxn/neon_test_utils/neon_test_utils.control b/pgxn/neon_test_utils/neon_test_utils.control new file mode 100644 index 0000000000..94e6720503 --- /dev/null +++ b/pgxn/neon_test_utils/neon_test_utils.control @@ -0,0 +1,5 @@ +# neon_test_utils extension +comment = 'helpers for neon testing and debugging' +default_version = '1.0' +module_pathname = '$libdir/neon_test_utils' +relocatable = true diff --git a/pgxn/neon_test_utils/neontest.c b/pgxn/neon_test_utils/neontest.c new file mode 100644 index 0000000000..3e30065cd3 --- /dev/null +++ b/pgxn/neon_test_utils/neontest.c @@ -0,0 +1,304 @@ +/*------------------------------------------------------------------------- + * + * neontest.c + * Helpers for neon testing and debugging + * + * IDENTIFICATION + * contrib/neon_test_utils/neontest.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/relation.h" +#include "access/xact.h" +#include "access/xlog.h" +#include "catalog/namespace.h" +#include "fmgr.h" +#include "funcapi.h" +#include "miscadmin.h" +#include "storage/buf_internals.h" +#include "storage/bufmgr.h" +#include "utils/builtins.h" +#include "utils/pg_lsn.h" +#include "utils/rel.h" +#include "utils/varlena.h" +#include "../neon/pagestore_client.h" + +PG_MODULE_MAGIC; + +extern void _PG_init(void); + +PG_FUNCTION_INFO_V1(test_consume_xids); +PG_FUNCTION_INFO_V1(clear_buffer_cache); +PG_FUNCTION_INFO_V1(get_raw_page_at_lsn); +PG_FUNCTION_INFO_V1(get_raw_page_at_lsn_ex); +PG_FUNCTION_INFO_V1(neon_xlogflush); + +/* + * Linkage to functions in zenith module. + * The signature here would need to be updated whenever function parameters change in pagestore_smgr.c + */ +typedef void (*zenith_read_at_lsn_type)(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, + XLogRecPtr request_lsn, bool request_latest, char *buffer); + +static zenith_read_at_lsn_type zenith_read_at_lsn_ptr; + +/* + * Module initialize function: fetch function pointers for cross-module calls. 
+void
+_PG_init(void)
+{
+	/* Asserts verify that typedefs above match original declarations */
+	AssertVariableIsOfType(&zenith_read_at_lsn, zenith_read_at_lsn_type);
+	zenith_read_at_lsn_ptr = (zenith_read_at_lsn_type)
+		load_external_function("$libdir/neon", "zenith_read_at_lsn",
+							   true, NULL);
+}
+
+#define zenith_read_at_lsn zenith_read_at_lsn_ptr
+
+/*
+ * test_consume_xids(int4), for rapidly consuming XIDs, to test wraparound.
+ */
+Datum
+test_consume_xids(PG_FUNCTION_ARGS)
+{
+	int32		nxids = PG_GETARG_INT32(0);
+	TransactionId topxid;
+	FullTransactionId fullxid;
+	TransactionId xid;
+	TransactionId targetxid;
+
+	/* make sure we have a top-XID first */
+	topxid = GetTopTransactionId();
+
+	xid = ReadNextTransactionId();
+
+	targetxid = xid + nxids;
+	while (targetxid < FirstNormalTransactionId)
+		targetxid++;
+
+	while (TransactionIdPrecedes(xid, targetxid))
+	{
+		fullxid = GetNewTransactionId(true);
+		xid = XidFromFullTransactionId(fullxid);
+		elog(DEBUG1, "topxid: %u xid: %u", topxid, xid);
+	}
+
+	PG_RETURN_VOID();
+}
+
+/*
+ * Flush the buffer cache, evicting all pages that are not currently pinned.
+ */
+Datum
+clear_buffer_cache(PG_FUNCTION_ARGS)
+{
+	bool		save_zenith_test_evict;
+
+	/*
+	 * Temporarily set the zenith_test_evict GUC, so that when we pin and
+	 * unpin a buffer, the buffer is evicted. We use that hack to evict all
+	 * buffers, as there is no explicit "evict this buffer" function in the
+	 * buffer manager.
+	 */
+	save_zenith_test_evict = zenith_test_evict;
+	zenith_test_evict = true;
+	PG_TRY();
+	{
+		/* Scan through all the buffers */
+		for (int i = 0; i < NBuffers; i++)
+		{
+			BufferDesc *bufHdr;
+			uint32		buf_state;
+			Buffer		bufferid;
+			bool		isvalid;
+			RelFileNode rnode;
+			ForkNumber	forknum;
+			BlockNumber blocknum;
+
+			/* Peek into the buffer header to see what page it holds. */
+			bufHdr = GetBufferDescriptor(i);
+			buf_state = LockBufHdr(bufHdr);
+
+			if ((buf_state & BM_VALID) && (buf_state & BM_TAG_VALID))
+				isvalid = true;
+			else
+				isvalid = false;
+			bufferid = BufferDescriptorGetBuffer(bufHdr);
+			rnode = bufHdr->tag.rnode;
+			forknum = bufHdr->tag.forkNum;
+			blocknum = bufHdr->tag.blockNum;
+
+			UnlockBufHdr(bufHdr, buf_state);
+
+			/*
+			 * Pin the buffer, and release it again. Because we have
+			 * zenith_test_evict==true, this will evict the page from the
+			 * buffer cache if no one else is holding a pin on it.
+			 */
+			if (isvalid)
+			{
+				if (ReadRecentBuffer(rnode, forknum, blocknum, bufferid))
+					ReleaseBuffer(bufferid);
+			}
+		}
+	}
+	PG_FINALLY();
+	{
+		/* restore the GUC */
+		zenith_test_evict = save_zenith_test_evict;
+	}
+	PG_END_TRY();
+
+	PG_RETURN_VOID();
+}
+
+
+/*
+ * Reads the page from the page server without going through the buffer
+ * cache. Mimics get_raw_page() in pageinspect, but allows reading a page
+ * version at a specific LSN. A NULL read LSN results in reading the latest
+ * version.
+ *
+ * Note: reading the latest version will wait for the latest changes to
+ * reach the page server. If this is undesirable, use pageinspect's
+ * get_raw_page, which uses buffered access to the latest page.
+ */
+Datum
+get_raw_page_at_lsn(PG_FUNCTION_ARGS)
+{
+	bytea	   *raw_page;
+	ForkNumber	forknum;
+	RangeVar   *relrv;
+	Relation	rel;
+	char	   *raw_page_data;
+	text	   *relname;
+	text	   *forkname;
+	uint32		blkno;
+
+	bool		request_latest = PG_ARGISNULL(3);
+	uint64		read_lsn = request_latest ? GetXLogInsertRecPtr() : PG_GETARG_INT64(3);
+
+	if (PG_ARGISNULL(0) || PG_ARGISNULL(1) || PG_ARGISNULL(2))
+		PG_RETURN_NULL();
+
+	relname = PG_GETARG_TEXT_PP(0);
+	forkname = PG_GETARG_TEXT_PP(1);
+	blkno = PG_GETARG_UINT32(2);
+
+	if (!superuser())
+		ereport(ERROR,
+				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+				 errmsg("must be superuser to use raw page functions")));
+
+	relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname));
+	rel = relation_openrv(relrv, AccessShareLock);
+
+	/* Check that this relation has storage */
+	if (rel->rd_rel->relkind == RELKIND_VIEW)
+		ereport(ERROR,
+				(errcode(ERRCODE_WRONG_OBJECT_TYPE),
+				 errmsg("cannot get raw page from view \"%s\"",
+						RelationGetRelationName(rel))));
+	if (rel->rd_rel->relkind == RELKIND_COMPOSITE_TYPE)
+		ereport(ERROR,
+				(errcode(ERRCODE_WRONG_OBJECT_TYPE),
+				 errmsg("cannot get raw page from composite type \"%s\"",
+						RelationGetRelationName(rel))));
+	if (rel->rd_rel->relkind == RELKIND_FOREIGN_TABLE)
+		ereport(ERROR,
+				(errcode(ERRCODE_WRONG_OBJECT_TYPE),
+				 errmsg("cannot get raw page from foreign table \"%s\"",
+						RelationGetRelationName(rel))));
+	if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
+		ereport(ERROR,
+				(errcode(ERRCODE_WRONG_OBJECT_TYPE),
+				 errmsg("cannot get raw page from partitioned table \"%s\"",
+						RelationGetRelationName(rel))));
+	if (rel->rd_rel->relkind == RELKIND_PARTITIONED_INDEX)
+		ereport(ERROR,
+				(errcode(ERRCODE_WRONG_OBJECT_TYPE),
+				 errmsg("cannot get raw page from partitioned index \"%s\"",
+						RelationGetRelationName(rel))));
+
+	/*
+	 * Reject attempts to read non-local temporary relations; we would be
+	 * likely to get wrong data since we have no visibility into the owning
+	 * session's local buffers.
+	 */
+	if (RELATION_IS_OTHER_TEMP(rel))
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("cannot access temporary tables of other sessions")));
+
+
+	forknum = forkname_to_number(text_to_cstring(forkname));
+
+	/* Initialize buffer to copy to */
+	raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ);
+	SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ);
+	raw_page_data = VARDATA(raw_page);
+
+	zenith_read_at_lsn(rel->rd_node, forknum, blkno, read_lsn, request_latest, raw_page_data);
+
+	relation_close(rel, AccessShareLock);
+
+	PG_RETURN_BYTEA_P(raw_page);
+}
+
+/*
+ * Another way to read a relation page from the page server without the
+ * buffer cache. This version doesn't validate input and allows reading
+ * blocks of dropped relations.
+ *
+ * Note: reading the latest version will wait for the latest changes to
+ * reach the page server. If this is undesirable, use pageinspect's
+ * get_raw_page, which uses buffered access to the latest page.
+ */
+Datum
+get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS)
+{
+	char	   *raw_page_data;
+
+	if (!superuser())
+		ereport(ERROR,
+				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+				 errmsg("must be superuser to use raw page functions")));
+
+	if (PG_ARGISNULL(0) || PG_ARGISNULL(1) || PG_ARGISNULL(2) ||
+		PG_ARGISNULL(3) || PG_ARGISNULL(4))
+		PG_RETURN_NULL();
+
+	{
+		RelFileNode rnode = {
+			.spcNode = PG_GETARG_OID(0),
+			.dbNode = PG_GETARG_OID(1),
+			.relNode = PG_GETARG_OID(2)
+		};
+
+		ForkNumber	forknum = PG_GETARG_UINT32(3);
+
+		uint32		blkno = PG_GETARG_UINT32(4);
+		bool		request_latest = PG_ARGISNULL(5);
+		uint64		read_lsn = request_latest ? GetXLogInsertRecPtr() : PG_GETARG_INT64(5);
+
+
+		/* Initialize buffer to copy to */
+		bytea	   *raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ);
+
+		SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ);
+		raw_page_data = VARDATA(raw_page);
+
+		zenith_read_at_lsn(rnode, forknum, blkno, read_lsn, request_latest, raw_page_data);
+		PG_RETURN_BYTEA_P(raw_page);
+	}
+}
+
+/*
+ * Directly calls XLogFlush(lsn) to flush WAL buffers.
+ */
+Datum
+neon_xlogflush(PG_FUNCTION_ARGS)
+{
+	XLogRecPtr	lsn = PG_GETARG_LSN(0);
+
+	XLogFlush(lsn);
+	PG_RETURN_VOID();
+}
diff --git a/vendor/postgres b/vendor/postgres
index a479855158..8f132d968c 160000
--- a/vendor/postgres
+++ b/vendor/postgres
@@ -1 +1 @@
-Subproject commit a4798551587fb5a52740687a341af83b28733dc6
+Subproject commit 8f132d968cd44068fc6f72e4047f7d3d6320f4bb
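(Editor's illustration, not part of the patch: backpressure_lag_impl() at the top of this change depends on replication_feedback_get_lsns(), declared in walproposer.h. A minimal sketch of how the shared-memory accessors are plausibly implemented; the `walprop_shared` static is an assumed pointer to the WalproposerShmemState set up by WalproposerShmemInit():)

    static WalproposerShmemState *walprop_shared;

    void
    replication_feedback_set(ReplicationFeedback *rf)
    {
        SpinLockAcquire(&walprop_shared->mutex);
        memcpy(&walprop_shared->feedback, rf, sizeof(ReplicationFeedback));
        SpinLockRelease(&walprop_shared->mutex);
    }

    void
    replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn)
    {
        SpinLockAcquire(&walprop_shared->mutex);
        *writeLsn = walprop_shared->feedback.ps_writelsn;
        *flushLsn = walprop_shared->feedback.ps_flushlsn;
        *applyLsn = walprop_shared->feedback.ps_applylsn;
        SpinLockRelease(&walprop_shared->mutex);
    }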