From d9de65ee8f5cdaf50bcaa0070dc47e95670aa223 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 22 Nov 2024 09:24:23 +0000 Subject: [PATCH] pageserver: permit reads behind GC cutoff during LSN grace period (#9833) ## Problem In https://github.com/neondatabase/neon/issues/9754 and the flakiness of `test_readonly_node_gc`, we saw that although our logic for controlling GC was sound, the validation of getpage requests was not, because it could not consider LSN leases when requests arrived shortly after restart. Closes https://github.com/neondatabase/neon/issues/9754 ## Summary of changes This is the "Option 3" discussed verbally -- rather than holding back gc cutoff, we waive the usual validation of request LSN if we are still waiting for leases to be sent after startup - When validating LSN in `wait_or_get_last_lsn`, skip the validation relative to GC cutoff if the timeline is still in its LSN lease grace period - Re-enable test_readonly_node_gc --- pageserver/src/page_service.rs | 23 ++++++++++++++--------- pageserver/src/tenant/timeline.rs | 5 +++++ test_runner/regress/test_readonly_node.py | 1 - 3 files changed, 19 insertions(+), 10 deletions(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index a429dff1fd..5fd02d8749 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1068,21 +1068,26 @@ impl PageServerHandler { )); } - if request_lsn < **latest_gc_cutoff_lsn { + // Check explicitly for INVALID just to get a less scary error message if the request is obviously bogus + if request_lsn == Lsn::INVALID { + return Err(PageStreamError::BadRequest( + "invalid LSN(0) in request".into(), + )); + } + + // Clients should only read from recent LSNs on their timeline, or from locations holding an LSN lease. + // + // We may have older data available, but we make a best effort to detect this case and return an error, + // to distinguish a misbehaving client (asking for old LSN) from a storage issue (data missing at a legitimate LSN). + if request_lsn < **latest_gc_cutoff_lsn && !timeline.is_gc_blocked_by_lsn_lease_deadline() { let gc_info = &timeline.gc_info.read().unwrap(); if !gc_info.leases.contains_key(&request_lsn) { - // The requested LSN is below gc cutoff and is not guarded by a lease. - - // Check explicitly for INVALID just to get a less scary error message if the - // request is obviously bogus - return Err(if request_lsn == Lsn::INVALID { - PageStreamError::BadRequest("invalid LSN(0) in request".into()) - } else { + return Err( PageStreamError::BadRequest(format!( "tried to request a page version that was garbage collected. requested at {} gc cutoff {}", request_lsn, **latest_gc_cutoff_lsn ).into()) - }); + ); } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 95864af4d0..0c7f3204f6 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2085,6 +2085,11 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.lsn_lease_length_for_ts) } + pub(crate) fn is_gc_blocked_by_lsn_lease_deadline(&self) -> bool { + let tenant_conf = self.tenant_conf.load(); + tenant_conf.is_gc_blocked_by_lsn_lease_deadline() + } + pub(crate) fn get_lazy_slru_download(&self) -> bool { let tenant_conf = self.tenant_conf.load(); tenant_conf diff --git a/test_runner/regress/test_readonly_node.py b/test_runner/regress/test_readonly_node.py index 2b274003a3..fcebf8d23a 100644 --- a/test_runner/regress/test_readonly_node.py +++ b/test_runner/regress/test_readonly_node.py @@ -121,7 +121,6 @@ def test_readonly_node(neon_simple_env: NeonEnv): ) -@pytest.mark.skip("See https://github.com/neondatabase/neon/issues/9754") def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder): """ Test static endpoint is protected from GC by acquiring and renewing lsn leases.