From cb8782a51a5fca52363c946a2484b2c6319e8d63 Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Wed, 11 Jun 2025 22:03:54 +0200 Subject: [PATCH] Fix recovery IO deadlock Previously, it was possible for backends to request a page with the LSN of the record currently being replayed. This could cause a deadlock when the redo process wanted to read that same page at the same time. This LSN could only appear when the page was not present in the LwLSN cache, and the highest evicted LSN also was the LSN of the currently-replayed WAL record. The issue is fixed by splitting maxLastWrittenLsn into two: one for data pages, and one for metadata. This allows us to keep track of metadata changes separately, removing the implicit dependency of page IO on metadata LSNs where appropriate. Additionally, we stop evicting LwLSNs for pages with an LSN that is yet to be replayed. This means the global data page LwLsn will never return an LSN of a record that has yet to be replayed, *unless* the startup process has already determined that it won't access that page again, making page IO and Replay waits by other backends using that LSN safe for those pages. --- pgxn/neon/neon_lwlsncache.c | 156 +++++++++++++++++++++++++----------- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/postgres-v17 | 2 +- 5 files changed, 111 insertions(+), 53 deletions(-) diff --git a/pgxn/neon/neon_lwlsncache.c b/pgxn/neon/neon_lwlsncache.c index a8cfa0f825..a0b09ac5fb 100644 --- a/pgxn/neon/neon_lwlsncache.c +++ b/pgxn/neon/neon_lwlsncache.c @@ -11,6 +11,9 @@ #include "utils/guc.h" #include "utils/hsearch.h" +#if PG_MAJORVERSION_NUM > 14 +#include "access/xlogrecovery.h" +#endif typedef struct LastWrittenLsnCacheEntry @@ -24,14 +27,20 @@ typedef struct LastWrittenLsnCacheEntry typedef struct LwLsnCacheCtl { int lastWrittenLsnCacheSize; /* - * Maximal last written LSN for pages not present in lastWrittenLsnCache - */ - XLogRecPtr maxLastWrittenLsn; + * Highest (most recent) last written LSN, for pages not present in + * lastWrittenLsnCache + */ + XLogRecPtr maxLastWrittenLsnData; /* - * Double linked list to implement LRU replacement policy for last written LSN cache. - * Access to this list as well as to last written LSN cache is protected by 'LastWrittenLsnLock'. - */ + * Maximal last written LSN for metadata, not present in lastWrittenLsnCache + */ + XLogRecPtr maxLastWrittenLsnMetadata; + + /* + * Double linked list to implement LRU replacement policy for last written LSN cache. + * Access to this list as well as to last written LSN cache is protected by 'LastWrittenLsnLock'. + */ dlist_head lastWrittenLsnLRU; } LwLsnCacheCtl; @@ -108,19 +117,20 @@ init_lwlsncache(void) #else shmemrequest(); #endif - - prev_set_lwlsn_block_range_hook = set_lwlsn_block_range_hook; - set_lwlsn_block_range_hook = neon_set_lwlsn_block_range; - prev_set_lwlsn_block_v_hook = set_lwlsn_block_v_hook; - set_lwlsn_block_v_hook = neon_set_lwlsn_block_v; - prev_set_lwlsn_block_hook = set_lwlsn_block_hook; - set_lwlsn_block_hook = neon_set_lwlsn_block; - prev_set_max_lwlsn_hook = set_max_lwlsn_hook; - set_max_lwlsn_hook = neon_set_max_lwlsn; - prev_set_lwlsn_relation_hook = set_lwlsn_relation_hook; - set_lwlsn_relation_hook = neon_set_lwlsn_relation; - prev_set_lwlsn_db_hook = set_lwlsn_db_hook; - set_lwlsn_db_hook = neon_set_lwlsn_db; + +#define SET_HOOK(name) do { \ + prev_##name##_hook = name##_hook; \ + name##_hook = neon_##name; \ +} while (false) + + SET_HOOK(set_lwlsn_block_range); + SET_HOOK(set_lwlsn_block_v); + SET_HOOK(set_lwlsn_block); + SET_HOOK(set_max_lwlsn); + SET_HOOK(set_lwlsn_relation); + SET_HOOK(set_lwlsn_db); + +#undef SET_HOOK } @@ -139,24 +149,34 @@ static void shmemrequest(void) { static void shmeminit(void) { static HASHCTL info; - bool found; + bool found = true; + if (lwlsn_cache_size > 0) { info.keysize = sizeof(BufferTag); info.entrysize = sizeof(LastWrittenLsnCacheEntry); + lastWrittenLsnCache = ShmemInitHash("last_written_lsn_cache", - lwlsn_cache_size, lwlsn_cache_size, - &info, - HASH_ELEM | HASH_BLOBS); - LwLsnCache = ShmemInitStruct("neon/LwLsnCacheCtl", sizeof(LwLsnCacheCtl), &found); - // Now set the size in the struct - LwLsnCache->lastWrittenLsnCacheSize = lwlsn_cache_size; - if (found) { - return; - } + lwlsn_cache_size, lwlsn_cache_size, + &info, + HASH_ELEM | HASH_BLOBS); + LwLsnCache = ShmemInitStruct("neon/LwLsnCacheCtl", + sizeof(LwLsnCacheCtl), &found); } - dlist_init(&LwLsnCache->lastWrittenLsnLRU); - LwLsnCache->maxLastWrittenLsn = GetRedoRecPtr(); + + /* initialize the shmem struct if we allocated it */ + if (!found) { + XLogRecPtr redoPtr; + LwLsnCache->lastWrittenLsnCacheSize = lwlsn_cache_size; + + dlist_init(&LwLsnCache->lastWrittenLsnLRU); + + redoPtr = GetRedoRecPtr(); + + LwLsnCache->maxLastWrittenLsnMetadata = redoPtr; + LwLsnCache->maxLastWrittenLsnData = redoPtr; + } + if (prev_shmem_startup_hook) { prev_shmem_startup_hook(); } @@ -180,17 +200,18 @@ neon_get_lwlsn(NRelFileInfo rlocator, ForkNumber forknum, BlockNumber blkno) LWLockAcquire(LastWrittenLsnLock, LW_SHARED); - /* Maximal last written LSN among all non-cached pages */ - lsn = LwLsnCache->maxLastWrittenLsn; - - if (NInfoGetRelNumber(rlocator) != InvalidOid) + if (NInfoGetRelNumber(rlocator) != InvalidOid) /* data page*/ { BufferTag key; Oid spcOid = NInfoGetSpcOid(rlocator); Oid dbOid = NInfoGetDbOid(rlocator); Oid relNumber = NInfoGetRelNumber(rlocator); + BufTagInit(key, relNumber, forknum, blkno, spcOid, dbOid); - + + /* Maximal last written LSN among all non-cached data pages */ + lsn = LwLsnCache->maxLastWrittenLsnData; + entry = hash_search(lastWrittenLsnCache, &key, HASH_FIND, NULL); if (entry != NULL) lsn = entry->lsn; @@ -212,9 +233,13 @@ neon_get_lwlsn(NRelFileInfo rlocator, ForkNumber forknum, BlockNumber blkno) lsn = SetLastWrittenLSNForBlockRangeInternal(lsn, rlocator, forknum, blkno, 1); } } - else + else /* metadata */ { HASH_SEQ_STATUS seq; + /* Maximal last written LSN for metadata */ + lsn = Max(LwLsnCache->maxLastWrittenLsnMetadata, + LwLsnCache->maxLastWrittenLsnData); + /* Find maximum of all cached LSNs */ hash_seq_init(&seq, lastWrittenLsnCache); while ((entry = (LastWrittenLsnCacheEntry *) hash_seq_search(&seq)) != NULL) @@ -230,7 +255,8 @@ neon_get_lwlsn(NRelFileInfo rlocator, ForkNumber forknum, BlockNumber blkno) static void neon_set_max_lwlsn(XLogRecPtr lsn) { LWLockAcquire(LastWrittenLsnLock, LW_EXCLUSIVE); - LwLsnCache->maxLastWrittenLsn = lsn; + LwLsnCache->maxLastWrittenLsnMetadata = lsn; + LwLsnCache->maxLastWrittenLsnData = lsn; LWLockRelease(LastWrittenLsnLock); } @@ -291,7 +317,7 @@ neon_get_lwlsn_v(NRelFileInfo relfilenode, ForkNumber forknum, LWLockRelease(LastWrittenLsnLock); LWLockAcquire(LastWrittenLsnLock, LW_EXCLUSIVE); - lsn = LwLsnCache->maxLastWrittenLsn; + lsn = LwLsnCache->maxLastWrittenLsnData; for (int i = 0; i < nblocks; i++) { @@ -306,7 +332,8 @@ neon_get_lwlsn_v(NRelFileInfo relfilenode, ForkNumber forknum, else { HASH_SEQ_STATUS seq; - lsn = LwLsnCache->maxLastWrittenLsn; + Assert(nblocks == 1); + lsn = LwLsnCache->maxLastWrittenLsnMetadata; /* Find maximum of all cached LSNs */ hash_seq_init(&seq, lastWrittenLsnCache); while ((entry = (LastWrittenLsnCacheEntry *) hash_seq_search(&seq)) != NULL) @@ -334,10 +361,10 @@ SetLastWrittenLSNForBlockRangeInternal(XLogRecPtr lsn, { if (NInfoGetRelNumber(rlocator) == InvalidOid) { - if (lsn > LwLsnCache->maxLastWrittenLsn) - LwLsnCache->maxLastWrittenLsn = lsn; + if (lsn > LwLsnCache->maxLastWrittenLsnMetadata) + LwLsnCache->maxLastWrittenLsnMetadata = lsn; else - lsn = LwLsnCache->maxLastWrittenLsn; + lsn = LwLsnCache->maxLastWrittenLsnMetadata; } else { @@ -369,10 +396,19 @@ SetLastWrittenLSNForBlockRangeInternal(XLogRecPtr lsn, if (hash_get_num_entries(lastWrittenLsnCache) > LwLsnCache->lastWrittenLsnCacheSize) { /* Replace least recently used entry */ - LastWrittenLsnCacheEntry* victim = dlist_container(LastWrittenLsnCacheEntry, lru_node, dlist_pop_head_node(&LwLsnCache->lastWrittenLsnLRU)); + LastWrittenLsnCacheEntry* victim = NULL; + victim = dlist_container(LastWrittenLsnCacheEntry, lru_node, dlist_pop_head_node(&LwLsnCache->lastWrittenLsnLRU)); + + while (!XLogRecordReplayFinished(victim->lsn)) + { + /* in recovery, we don't allow eviction of entries with the LSN of a record that has yet to be returned */ + dlist_push_tail(&LwLsnCache->lastWrittenLsnLRU, &entry->lru_node); + victim = dlist_container(LastWrittenLsnCacheEntry, lru_node, dlist_pop_head_node(&LwLsnCache->lastWrittenLsnLRU)); + } + /* Adjust max LSN for not cached relations/chunks if needed */ - if (victim->lsn > LwLsnCache->maxLastWrittenLsn) - LwLsnCache->maxLastWrittenLsn = victim->lsn; + if (victim->lsn > LwLsnCache->maxLastWrittenLsnMetadata) + LwLsnCache->maxLastWrittenLsnMetadata = victim->lsn; hash_search(lastWrittenLsnCache, victim, HASH_REMOVE, NULL); } @@ -433,6 +469,13 @@ neon_set_lwlsn_block_v(const XLogRecPtr *lsns, NRelFileInfo relfilenode, Oid dbOid = NInfoGetDbOid(relfilenode); Oid relNumber = NInfoGetRelNumber(relfilenode); + /* + * We ignore the operation when the input is invalid: + * - we must have gotten LSNs to set + * - we must have pages to write + * - the cache must be enabled + * - we must be processing a data page, not a metadata request + */ if (lsns == NULL || nblocks == 0 || LwLsnCache->lastWrittenLsnCacheSize == 0 || NInfoGetRelNumber(relfilenode) == InvalidOid) return InvalidXLogRecPtr; @@ -466,10 +509,25 @@ neon_set_lwlsn_block_v(const XLogRecPtr *lsns, NRelFileInfo relfilenode, if (hash_get_num_entries(lastWrittenLsnCache) > LwLsnCache->lastWrittenLsnCacheSize) { /* Replace least recently used entry */ - LastWrittenLsnCacheEntry* victim = dlist_container(LastWrittenLsnCacheEntry, lru_node, dlist_pop_head_node(&LwLsnCache->lastWrittenLsnLRU)); + LastWrittenLsnCacheEntry* victim = dlist_container(LastWrittenLsnCacheEntry, lru_node, + dlist_pop_head_node(&LwLsnCache->lastWrittenLsnLRU)); + + /* + * If replay is still working on this LSN, we can't evict the + * page. Therefore, we must find a different victim, and return + * the one we just found to the pool. + */ + while (!XLogRecordReplayFinished(victim->lsn)) + { + dlist_push_tail(&LwLsnCache->lastWrittenLsnLRU, + &entry->lru_node); + victim = dlist_container(LastWrittenLsnCacheEntry, lru_node, + dlist_pop_head_node(&LwLsnCache->lastWrittenLsnLRU)); + } + /* Adjust max LSN for not cached relations/chunks if needed */ - if (victim->lsn > LwLsnCache->maxLastWrittenLsn) - LwLsnCache->maxLastWrittenLsn = victim->lsn; + if (victim->lsn > LwLsnCache->maxLastWrittenLsnData) + LwLsnCache->maxLastWrittenLsnData = victim->lsn; hash_search(lastWrittenLsnCache, victim, HASH_REMOVE, NULL); } diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 6770bc2513..9f8dd2fa12 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 6770bc251301ef40c66f7ecb731741dc435b5051 +Subproject commit 9f8dd2fa127f1113c236cb6d90ca357906a91873 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 8c3249f36c..d49bea7d54 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 8c3249f36c7df6ac0efb8ee9f1baf4aa1b83e5c9 +Subproject commit d49bea7d54bf40c3050bf111bae6eb69fd032dac diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 7a4c0eacae..ec96c1ce86 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 7a4c0eacaeb9b97416542fa19103061c166460b1 +Subproject commit ec96c1ce86a1c9b43ca273fe62d4f88569d8d74e diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index db424d42d7..876755144d 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit db424d42d748f8ad91ac00e28db2c7f2efa42f7f +Subproject commit 876755144d3c4661880b0f50024e3a80e61e9810