diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c
index 4361c74905..41546eae85 100644
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -1349,6 +1349,10 @@ PageIsEmptyHeapPage(char *buffer)
 	return memcmp(buffer, empty_page.data, BLCKSZ) == 0;
 }
 
+/*
+ * A page is being evicted from the shared buffer cache. Update the
+ * last-written LSN of the page, and WAL-log it if needed.
+ */
 static void
 #if PG_MAJORVERSION_NUM < 16
 neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool force)
@@ -1357,12 +1361,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
 #endif
 {
 	XLogRecPtr	lsn = PageGetLSN((Page) buffer);
-
-	if (ShutdownRequestPending)
-		return;
-	/* Don't log any pages if we're not allowed to do so. */
-	if (!XLogInsertAllowed())
-		return;
+	bool		log_page;
 
 	/*
 	 * Whenever a VM or FSM page is evicted, WAL-log it. FSM and (some) VM
@@ -1371,9 +1370,21 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
 	 * correctness, the non-logged updates are not critical. But we want to
 	 * have a reasonably up-to-date VM and FSM in the page server.
 	 */
-	if ((force || forknum == FSM_FORKNUM || forknum == VISIBILITYMAP_FORKNUM) && !RecoveryInProgress())
+	log_page = false;
+	if (force)
+	{
+		Assert(XLogInsertAllowed());
+		log_page = true;
+	}
+	else if (XLogInsertAllowed() &&
+			 !ShutdownRequestPending &&
+			 (forknum == FSM_FORKNUM || forknum == VISIBILITYMAP_FORKNUM))
+	{
+		log_page = true;
+	}
+
+	if (log_page)
 	{
-		/* FSM is never WAL-logged and we don't care. */
 		XLogRecPtr	recptr;
 
 		recptr = log_newpage_copy(&InfoFromSMgrRel(reln), forknum, blocknum,
@@ -1386,7 +1397,8 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
 						  RelFileInfoFmt(InfoFromSMgrRel(reln)),
 						  forknum, LSN_FORMAT_ARGS(lsn))));
 	}
-	else if (lsn == InvalidXLogRecPtr)
+
+	if (lsn == InvalidXLogRecPtr)
 	{
 		/*
 		 * When PostgreSQL extends a relation, it calls smgrextend() with an
@@ -1422,19 +1434,31 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
 							RelFileInfoFmt(InfoFromSMgrRel(reln)),
 							forknum)));
 		}
-		else
+		else if (forknum != FSM_FORKNUM && forknum != VISIBILITYMAP_FORKNUM)
 		{
-			ereport(PANIC,
+			/*
+			 * It's a bad sign if there is a page with zero LSN in the buffer
+			 * cache in a standby, too. However, PANICing seems like a cure
+			 * worse than the disease, as the damage has likely already been
+			 * done in the primary. So in a standby, make this an assertion,
+			 * and in a release build just LOG the error and soldier on. We
+			 * update the last-written LSN of the page with a conservative
+			 * value in that case, which is the last replayed LSN.
+			 */
+			ereport(RecoveryInProgress() ? LOG : PANIC,
 					(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is evicted with zero LSN",
 							blocknum,
 							RelFileInfoFmt(InfoFromSMgrRel(reln)),
 							forknum)));
+			Assert(false);
+
+			lsn = GetXLogReplayRecPtr(NULL); /* in standby mode, soldier on */
 		}
 	}
 	else
 	{
 		ereport(SmgrTrace,
-				(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is already wal logged at lsn=%X/%X",
+				(errmsg(NEON_TAG "Evicting page %u of relation %u/%u/%u.%u with lsn=%X/%X",
						blocknum,
						RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum,
						LSN_FORMAT_ARGS(lsn))));
@@ -1527,8 +1551,92 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno)
 
 	if (RecoveryInProgress())
 	{
-		/* Request the page at the last replayed LSN. */
-		result.request_lsn = GetXLogReplayRecPtr(NULL);
+		/*---
+		 * In broad strokes, a replica always requests the page at the current
+		 * replay LSN. But looking closer, what exactly is the replay LSN? Is
+		 * it the last replayed record, or the record being replayed? And does
+		 * the startup process performing the replay need to do something
+		 * differently than backends running queries? Let's take a closer look
+		 * at the different scenarios:
+		 *
+		 * 1. Startup process reads a page, last_written_lsn is old.
+		 *
+		 * Read the old version of the page. We will apply the WAL record to
+		 * it to bring it up-to-date.
+		 *
+		 * We could read the new version, with the changes from this WAL
+		 * record already applied, to offload the work of replaying the record
+		 * to the pageserver. The pageserver might not have received the WAL
+		 * record yet, though, so a read of the old page version and applying
+		 * the record ourselves is likely faster. Also, the redo function
+		 * might be surprised if the changes have already been applied. That's
+		 * normal during crash recovery, but not in hot standby.
+		 *
+		 * 2. Startup process reads a page, last_written_lsn == record we're
+		 * replaying.
+		 *
+		 * Can this happen? There are a few theoretical cases when it might:
+		 *
+		 * A) The redo function reads the same page twice. We had already read
+		 * and applied the changes once, and now we're reading it for the
+		 * second time. That would be a rather silly thing for a redo
+		 * function to do, and I'm not aware of any that would do it.
+		 *
+		 * B) The redo function modifies multiple pages, and it already
+		 * applied the changes to one of the pages, released the lock on
+		 * it, and is now reading a second page. Furthermore, the first
+		 * page was already evicted from the buffer cache, and also from
+		 * the last-written LSN cache, so that the per-relation or global
+		 * last-written LSN was already updated. All the WAL redo functions
+		 * hold the locks on pages that they modify, until all the changes
+		 * have been applied (?), which would make that impossible.
+		 * However, we skip the locking if the page isn't currently in the
+		 * page cache (see neon_redo_read_buffer_filter below).
+		 *
+		 * Even if one of the above cases were possible in theory, it
+		 * would also require the pages being modified by the redo function to
+		 * be immediately evicted from the page cache.
+		 *
+		 * So this probably does not happen in practice. But if it does, we
+		 * request the new version, including the changes from the record
+		 * being replayed. That seems like the correct behavior in any case.
+		 *
+		 * 3. Backend process reads a page with old last-written LSN
+		 *
+		 * Nothing special here. Read the old version.
+		 *
+		 * 4. Backend process reads a page with last_written_lsn == record being replayed
+		 *
+		 * This can happen if the redo function has started to run and saw
+		 * that the page isn't present in the page cache (see
+		 * neon_redo_read_buffer_filter below). In an unmodified Postgres
+		 * server, the redo function would hold a lock on the page, so we
+		 * would get blocked waiting for the redo function to release the
+		 * lock. To emulate that, wait for the WAL replay of the record to
+		 * finish.
+		 */
+		/* By default, request the page at the end of the last fully replayed record */
+		XLogRecPtr	replay_lsn = GetXLogReplayRecPtr(NULL);
+
+		if (last_written_lsn > replay_lsn)
+		{
+			/* GetCurrentReplayRecPtr was introduced in v15 */
+#if PG_VERSION_NUM >= 150000
+			Assert(last_written_lsn == GetCurrentReplayRecPtr(NULL));
+#endif
+
+			/*
+			 * Cases 2 and 4. If this is a backend (case 4), the
+			 * neon_read_at_lsn() call later will wait for the WAL record to be
+			 * fully replayed.
+			 */
+			result.request_lsn = last_written_lsn;
+		}
+		else
+		{
+			/* cases 1 and 3 */
+			result.request_lsn = replay_lsn;
+		}
 		result.not_modified_since = last_written_lsn;
 		result.effective_request_lsn = result.request_lsn;
 		Assert(last_written_lsn <= result.request_lsn);
@@ -3258,16 +3366,16 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
 		no_redo_needed = buf_id < 0;
 	}
 
-	/* In both cases st lwlsn past this WAL record */
-	SetLastWrittenLSNForBlock(end_recptr, rinfo, forknum, blkno);
 
 	/*
 	 * we don't have the buffer in memory, update lwLsn past this record, also
 	 * evict page from file cache
 	 */
 	if (no_redo_needed)
+	{
+		SetLastWrittenLSNForBlock(end_recptr, rinfo, forknum, blkno);
 		lfc_evict(rinfo, forknum, blkno);
-
+	}
 	LWLockRelease(partitionLock);
 
diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py
index 244d482c18..cf7a1c56ee 100644
--- a/test_runner/regress/test_hot_standby.py
+++ b/test_runner/regress/test_hot_standby.py
@@ -1,3 +1,4 @@
+import asyncio
 import os
 import re
 import threading
@@ -292,3 +293,62 @@ def test_hot_standby_feedback(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
         assert slot_xmin is None
 
     wait_until(10, 1.0, xmin_is_null)
+
+
+# Test race condition between WAL replay and backends performing queries
+# https://github.com/neondatabase/neon/issues/7791
+def test_replica_query_race(neon_simple_env: NeonEnv):
+    env = neon_simple_env
+
+    primary_ep = env.endpoints.create_start(
+        branch_name="main",
+        endpoint_id="primary",
+    )
+
+    with primary_ep.connect() as p_con:
+        with p_con.cursor() as p_cur:
+            p_cur.execute("CREATE EXTENSION neon_test_utils")
+            p_cur.execute("CREATE TABLE test AS SELECT 0 AS counter")
+
+    standby_ep = env.endpoints.new_replica_start(origin=primary_ep, endpoint_id="standby")
+    time.sleep(1)
+
+    # In the primary, run a lot of UPDATEs on a single page
+    finished = False
+    writecounter = 1
+
+    async def primary_workload():
+        nonlocal writecounter, finished
+        conn = await primary_ep.connect_async()
+        while writecounter < 10000:
+            writecounter += 1
+            await conn.execute(f"UPDATE test SET counter = {writecounter}")
+        finished = True
+
+    # In the standby, run queries at the same time, and repeatedly drop its buffer cache
+    async def standby_workload():
+        nonlocal writecounter, finished
+        conn = await standby_ep.connect_async()
+        reads = 0
+        while not finished:
+            readcounter = await conn.fetchval("SELECT counter FROM test")
+
+            # Check that the replica is keeping up with the primary. In local
+            # testing, the lag between primary and standby is in the ballpark
+            # of 2-3 counter values, but be generous in case there's some
+            # hiccup.
+            # assert(writecounter - readcounter < 1000)
+            assert readcounter <= writecounter
+            if reads % 100 == 0:
+                log.info(f"read {reads}: counter {readcounter}, last update {writecounter}")
+            reads += 1
+
+            await conn.execute("SELECT clear_buffer_cache()")
+
+    async def both():
+        await asyncio.gather(
+            primary_workload(),
+            standby_workload(),
+        )
+
+    asyncio.run(both())
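
For illustration only, outside the patch itself: the request-LSN decision that the scenario comment in neon_get_request_lsns() walks through reduces to the small sketch below. It assumes the types and helpers already available in pagestore_smgr.c (XLogRecPtr, GetXLogReplayRecPtr()); choose_replica_request_lsn is a hypothetical name used only here, not a function added by this change.

    /*
     * Hedged sketch of the standby-side request-LSN choice. last_written_lsn
     * comes from the last-written LSN cache; GetXLogReplayRecPtr() returns
     * the end of the last fully replayed record.
     */
    static XLogRecPtr
    choose_replica_request_lsn(XLogRecPtr last_written_lsn)
    {
        XLogRecPtr  replay_lsn = GetXLogReplayRecPtr(NULL);

        if (last_written_lsn > replay_lsn)
        {
            /*
             * Cases 2 and 4: the page was touched by the record currently
             * being replayed. Request that LSN; a backend will wait for the
             * record to be fully replayed before the read is satisfied.
             */
            return last_written_lsn;
        }

        /* Cases 1 and 3: request the last fully replayed position. */
        return replay_lsn;
    }

The new test in test_hot_standby.py exercises the case 4 path: the primary repeatedly updates a single page while the standby reads it and clears its buffer cache between reads.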