From ae15acdee7d435d8fc61036227dde02ca7fa7462 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 4 Apr 2024 13:28:22 +0300 Subject: [PATCH] Fix bug in prefetch cleanup (#7277) ## Problem Running test_pageserver_restarts_under_workload in POR #7275 I get the following assertion failure in prefetch: ``` #5 0x00005587220d4bf0 in ExceptionalCondition ( conditionName=0x7fbf24d003c8 "(ring_index) < MyPState->ring_unused && (ring_index) >= MyPState->ring_last", fileName=0x7fbf24d00240 "/home/knizhnik/neon.main//pgxn/neon/pagestore_smgr.c", lineNumber=644) at /home/knizhnik/neon.main//vendor/postgres-v16/src/backend/utils/error/assert.c:66 #6 0x00007fbf24cebc9b in prefetch_set_unused (ring_index=1509) at /home/knizhnik/neon.main//pgxn/neon/pagestore_smgr.c:644 #7 0x00007fbf24cec613 in prefetch_register_buffer (tag=..., force_latest=0x0, force_lsn=0x0) at /home/knizhnik/neon.main//pgxn/neon/pagestore_smgr.c:891 #8 0x00007fbf24cef21e in neon_prefetch (reln=0x5587233b7388, forknum=MAIN_FORKNUM, blocknum=14110) at /home/knizhnik/neon.main//pgxn/neon/pagestore_smgr.c:2055 (gdb) p ring_index $1 = 1509 (gdb) p MyPState->ring_unused $2 = 1636 (gdb) p MyPState->ring_last $3 = 1636 ``` ## Summary of changes Check status of `prefetch_wait_for` ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik --- pgxn/neon/libpagestore.c | 21 +++++++++++---------- pgxn/neon/pagestore_smgr.c | 18 +++++++++++------- 2 files changed, 22 insertions(+), 17 deletions(-) diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 1bc8a2e87c..2276b4e807 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -495,16 +495,17 @@ retry: static void pageserver_disconnect(shardno_t shard_no) { - if (page_servers[shard_no].conn) - { - /* - * If the connection to any pageserver is lost, we throw away the - * whole prefetch queue, even for other pageservers. It should not - * cause big problems, because connection loss is supposed to be a - * rare event. - */ - prefetch_on_ps_disconnect(); - } + /* + * If the connection to any pageserver is lost, we throw away the + * whole prefetch queue, even for other pageservers. It should not + * cause big problems, because connection loss is supposed to be a + * rare event. + * + * Prefetch state should be reset even if page_servers[shard_no].conn == NULL, + * because prefetch request may be registered before connection is established. + */ + prefetch_on_ps_disconnect(); + pageserver_disconnect_shard(shard_no); } diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index b33cfab2bb..57a16e00ca 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -641,13 +641,12 @@ prefetch_on_ps_disconnect(void) static inline void prefetch_set_unused(uint64 ring_index) { - PrefetchRequest *slot = GetPrfSlot(ring_index); + PrefetchRequest *slot; if (ring_index < MyPState->ring_last) return; /* Should already be unused */ - Assert(MyPState->ring_unused > ring_index); - + slot = GetPrfSlot(ring_index); if (slot->status == PRFS_UNUSED) return; @@ -806,7 +805,8 @@ Retry: { if (*force_lsn > slot->effective_request_lsn) { - prefetch_wait_for(ring_index); + if (!prefetch_wait_for(ring_index)) + goto Retry; prefetch_set_unused(ring_index); entry = NULL; } @@ -821,7 +821,8 @@ Retry: { if (*force_lsn != slot->effective_request_lsn) { - prefetch_wait_for(ring_index); + if (!prefetch_wait_for(ring_index)) + goto Retry; prefetch_set_unused(ring_index); entry = NULL; } @@ -887,7 +888,8 @@ Retry: { case PRFS_REQUESTED: Assert(MyPState->ring_receive == cleanup_index); - prefetch_wait_for(cleanup_index); + if (!prefetch_wait_for(cleanup_index)) + goto Retry; prefetch_set_unused(cleanup_index); break; case PRFS_RECEIVED: @@ -2140,6 +2142,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, /* * Try to find prefetched page in the list of received pages. */ + Retry: entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &buftag); if (entry != NULL) @@ -2161,7 +2164,8 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, */ if (slot->status == PRFS_REQUESTED) { - prefetch_wait_for(slot->my_ring_index); + if (!prefetch_wait_for(slot->my_ring_index)) + goto Retry; } /* drop caches */ prefetch_set_unused(slot->my_ring_index);