From 3293e4685ee668e6fe29210ab476eaf671813c58 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 3 Jul 2025 16:12:41 +0300 Subject: [PATCH] Fix cases where pageserver gets stuck waiting for LSN The compute might make a request with an LSN that it hasn't even flushed yet. --- .../communicator/src/backend_interface.rs | 4 +--- .../src/worker_process/main_loop.rs | 22 +++++++++++++++---- pgxn/neon/communicator_new.c | 9 ++++++++ 3 files changed, 28 insertions(+), 7 deletions(-) diff --git a/pgxn/neon/communicator/src/backend_interface.rs b/pgxn/neon/communicator/src/backend_interface.rs index 3aa0fc673f..17fee7d000 100644 --- a/pgxn/neon/communicator/src/backend_interface.rs +++ b/pgxn/neon/communicator/src/backend_interface.rs @@ -158,7 +158,6 @@ pub extern "C" fn bcomm_finish_cache_read(bs: &mut CommunicatorBackendStruct) -> } } - /// Check if the local file cache contians the given block #[unsafe(no_mangle)] pub extern "C" fn bcomm_cache_contains( @@ -176,11 +175,10 @@ pub extern "C" fn bcomm_cache_contains( relnode: rel_number, forknum: fork_number, }, - block_number + block_number, ) } - impl<'t> CommunicatorBackendStruct<'t> { /// Send a wakeup to the communicator process fn submit_request(self: &CommunicatorBackendStruct<'t>, request_idx: i32) { diff --git a/pgxn/neon/communicator/src/worker_process/main_loop.rs b/pgxn/neon/communicator/src/worker_process/main_loop.rs index 0c960249d4..6d72a62131 100644 --- a/pgxn/neon/communicator/src/worker_process/main_loop.rs +++ b/pgxn/neon/communicator/src/worker_process/main_loop.rs @@ -239,12 +239,26 @@ impl<'t> CommunicatorWorkerProcessStruct<'t> { // Is it possible that the last-written LSN is ahead of last flush LSN? Generally not, we // shouldn't evict a page from the buffer cache before all its modifications have been // safely flushed. That's the "WAL before data" rule. However, such case does exist at index - // building, _bt_blwritepage logs the full page without flushing WAL before smgrextend + // building: _bt_blwritepage logs the full page without flushing WAL before smgrextend // (files are fsynced before build ends). // - // FIXME: I'm seeing some other cases of this too in the regression tests. - // Maybe it's OK? Would be nice to dig a little deeper. - // See the old logic in neon_get_request_lsns() C function + // XXX: If we make a request LSN greater than the current WAL flush LSN, the pageserver would + // block waiting for the WAL arrive, until we flush it and it propagates through the + // safekeepers to the pageserver. If there's nothing that forces the WAL to be flushed, + // the pageserver would get stuck waiting forever. To avoid that, all the write- + // functions in communicator_new.c call XLogSetAsyncXactLSN(). That nudges the WAL writer to + // perform the flush relatively soon. + // + // It would perhaps be nicer to do the WAL flush here, but it's tricky to call back into + // Postgres code to do that from here. That's why we rely on communicator_new.c to do the + // calls "pre-emptively". + // + // FIXME: Because of the above, it can still happen that the flush LSN is ahead of + // not_modified_since, if the WAL writer hasn't done the flush yet. It would be nice to know + // if there are other cases like that that we have mised, but unfortunately we cannot turn + // this into an assertion because of that legit case. + // + // See also the old logic in neon_get_request_lsns() C function if not_modified_since_lsn > request_lsn { tracing::info!( "not_modified_since_lsn {} is ahead of last flushed LSN {}", diff --git a/pgxn/neon/communicator_new.c b/pgxn/neon/communicator_new.c index f71c6d1bac..86ac402c74 100644 --- a/pgxn/neon/communicator_new.c +++ b/pgxn/neon/communicator_new.c @@ -847,6 +847,9 @@ communicator_new_write_page(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber }; NeonIOResult result; + /* FIXME: see `request_lsns` in main_loop.rs for why this is needed */ + XLogSetAsyncXactLSN(lsn); + perform_request(&request, &result); switch (result.tag) { @@ -883,6 +886,9 @@ communicator_new_rel_extend(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber }; NeonIOResult result; + /* FIXME: see `request_lsns` in main_loop.rs for why this is needed */ + XLogSetAsyncXactLSN(lsn); + perform_request(&request, &result); switch (result.tag) { @@ -918,6 +924,9 @@ communicator_new_rel_zeroextend(NRelFileInfo rinfo, ForkNumber forkNum, BlockNum }; NeonIOResult result; + /* FIXME: see `request_lsns` in main_loop.rs for why this is needed */ + XLogSetAsyncXactLSN(lsn); + perform_request(&request, &result); switch (result.tag) {