From df362de0ddef6bd90d014eea12529c59c2ee1b76 Mon Sep 17 00:00:00 2001
From: Arpad Müller
Date: Mon, 24 Feb 2025 17:38:13 +0100
Subject: [PATCH 1/2] Reject basebackup requests for archived timelines (#10828)

For archived timelines, we would like to prevent all getpage requests
that are not issued by the pageserver itself, as users are not supposed
to send these. Instead, they should unarchive a timeline before issuing
any external read traffic. As this is non-trivial to do, at least
prevent launches of new computes by erroring on basebackup requests for
archived timelines.

In #10688, we started issuing a warning instead of an error, because an
error would have meant a stuck project. Now that we can confirm the
warning has not appeared in the logs for about a week, we can turn it
into an error.

Follow-up of #10688

Related: #9548
---
 pageserver/src/page_service.rs | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index 7285697040..b9b8e32753 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -2097,9 +2097,8 @@ impl PageServerHandler {
         set_tracing_field_shard_id(&timeline);
 
         if timeline.is_archived() == Some(true) {
-            // TODO after a grace period, turn this log line into a hard error
-            tracing::warn!("timeline {tenant_id}/{timeline_id} is archived, but got basebackup request for it.");
-            //return Err(QueryError::NotFound("timeline is archived".into()))
+            tracing::info!("timeline {tenant_id}/{timeline_id} is archived, but got basebackup request for it.");
+            return Err(QueryError::NotFound("timeline is archived".into()));
         }
 
         let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn();

From 40acb0c06df769d69e31bb99174a3c4162b164f8 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas
Date: Mon, 24 Feb 2025 19:15:07 +0200
Subject: [PATCH 2/2] Fix usage of WaitEventSetWait() with timeout (#10947)

WaitEventSetWait() returns the number of "events" that happened, and
only that many entries in the WaitEvent array are updated. When the
timeout is reached, it returns 0 and does not modify the WaitEvent
array at all. We were reading 'event.events' without checking the
return value, so the value we read was uninitialized whenever the
timeout was hit.

No test included, as this is harmless at the moment, but it caused the
test I'm including in PR #10882 to fail. That PR changes the logic to
loop back and retry the PQgetCopyData() call if WL_SOCKET_READABLE was
set. Currently the resulting extra call to PQconsumeInput() is
harmless, but with that change in logic it turns into a busy-wait.
---
 pgxn/neon/libpagestore.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c
index fc1aecd340..f5801b379b 100644
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -716,20 +716,21 @@ retry:
 
 		if (ret == 0)
 		{
-			WaitEvent	event;
+			WaitEvent	occurred_event;
+			int			noccurred;
 			long		timeout;
 
 			timeout = Max(0, LOG_INTERVAL_MS - INSTR_TIME_GET_MILLISEC(since_last_log));
 
 			/* Sleep until there's something to do */
-			(void) WaitEventSetWait(shard->wes_read, timeout, &event, 1,
-									WAIT_EVENT_NEON_PS_READ);
+			noccurred = WaitEventSetWait(shard->wes_read, timeout, &occurred_event, 1,
+										 WAIT_EVENT_NEON_PS_READ);
 			ResetLatch(MyLatch);
 
 			CHECK_FOR_INTERRUPTS();
 
 			/* Data available in socket? */
-			if (event.events & WL_SOCKET_READABLE)
+			if (noccurred > 0 && (occurred_event.events & WL_SOCKET_READABLE) != 0)
 			{
 				if (!PQconsumeInput(pageserver_conn))
 				{
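
Usage note: below is a minimal C sketch of the WaitEventSetWait() contract that
patch 2/2 relies on, namely checking the function's return value before reading
the WaitEvent output array. It is an illustration under assumptions, not part of
the patches: wait_for_socket_data, wes, conn, and timeout_ms are hypothetical
names, the wait-event set is assumed to already watch MyLatch and the libpq
socket, and PG_WAIT_EXTENSION stands in for the Neon-specific wait event used in
libpagestore.c.

#include "postgres.h"

#include "libpq-fe.h"          /* PGconn, PQconsumeInput(), PQerrorMessage() */
#include "miscadmin.h"         /* CHECK_FOR_INTERRUPTS(), MyLatch */
#include "pgstat.h"            /* PG_WAIT_EXTENSION */
#include "storage/latch.h"     /* WaitEventSet, WaitEventSetWait(), ResetLatch() */

/*
 * Hypothetical helper: wait up to timeout_ms milliseconds for data on the
 * connection's socket. Assumes 'wes' already watches MyLatch and the libpq
 * socket with WL_SOCKET_READABLE.
 */
static void
wait_for_socket_data(WaitEventSet *wes, PGconn *conn, long timeout_ms)
{
	WaitEvent	occurred_event;
	int			noccurred;

	/*
	 * WaitEventSetWait() reports how many entries of the output array it
	 * filled in; on timeout it returns 0 and leaves the array untouched,
	 * so occurred_event may only be read when noccurred > 0.
	 */
	noccurred = WaitEventSetWait(wes, timeout_ms, &occurred_event, 1,
								 PG_WAIT_EXTENSION);
	ResetLatch(MyLatch);
	CHECK_FOR_INTERRUPTS();

	/* Only touch the socket if it actually became readable. */
	if (noccurred > 0 && (occurred_event.events & WL_SOCKET_READABLE) != 0)
	{
		if (!PQconsumeInput(conn))
			elog(WARNING, "could not read from pageserver connection: %s",
				 PQerrorMessage(conn));
	}
}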