diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 3f66960edd..ba13671c93 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -565,6 +565,11 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'basebackup_cache_enabled' as bool")?, + standby_horizon_lease_length: settings + .remove("standby_horizon_lease_length") + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'standby_horizon_lease_length' as duration")?, }; if !settings.is_empty() { bail!("Unrecognized tenant settings: {settings:?}") diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 99d7e0ca3a..9448b9e655 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -638,13 +638,15 @@ impl PageServerConf { ..Default::default() }; - // Test authors tend to forget about the default 10min initial lease deadline + // Test authors tend to forget about the default 10min initial lsn lease deadline // when writing tests, which turns their immediate gc requests via mgmt API // into no-ops. Override the binary default here, such that there is no initial - // lease deadline by default in tests. Tests that care can always override it + // lsn lease deadline by default in tests. Tests that care can always override it // themselves. // Cf https://databricks.atlassian.net/browse/LKB-92?focusedCommentId=6722329 config_toml.tenant_config.lsn_lease_length = Duration::from_secs(0); + // Same argument applies to the initial standby_horizon lease deadline. + config_toml.tenant_config.standby_horizon_lease_length = Duration::from_secs(0); PageServerConf::parse_and_validate(NodeId(0), config_toml, &repo_dir).unwrap() } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 8fb77ae04e..ae9dbdc021 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -179,6 +179,9 @@ pub(super) struct AttachedTenantConf { /// The deadline before which we are blocked from GC so that /// leases have a chance to be renewed. lsn_lease_deadline: Option, + /// The deadline before which we are blocked from GC so that + /// standby horizon leases have a chance to be renewed. + standby_horizon_lease_deadline: Option, } impl AttachedTenantConf { @@ -203,10 +206,27 @@ impl AttachedTenantConf { None }; + // Sets a deadline before which we cannot proceed to GC due to standby horizon lease. + // + // Similar to LSN leases, standby horizon leases are not persisted to disk. By delaying GC by lease + // length, we guarantee that all the standby horizon leases we granted before will have a chance to renew + // when we run GC for the first time after restart / transition from AttachedMulti to AttachedSingle. + let standby_horizon_lease_deadline = if location.attach_mode == AttachmentMode::Single { + Some( + tokio::time::Instant::now() + + TenantShard::get_standby_horizon_lease_length_impl(conf, &tenant_conf), + ) + } else { + // We don't use `standby_horizon_lease_deadline` to delay GC in AttachedMulti and AttachedStale + // because we don't do GC in these modes. + None + }; + Self { tenant_conf, location, lsn_lease_deadline, + standby_horizon_lease_deadline, } } @@ -231,6 +251,12 @@ impl AttachedTenantConf { .map(|d| tokio::time::Instant::now() < d) .unwrap_or(false) } + + fn is_gc_blocked_by_standby_horizon_lease_deadline(&self) -> bool { + self.standby_horizon_lease_deadline + .map(|d| tokio::time::Instant::now() < d) + .unwrap_or(false) + } } struct TimelinePreload { timeline_id: TimelineId, @@ -3111,7 +3137,7 @@ impl TenantShard { return Ok(GcResult::default()); } - if todo!("block for standby horizon lease deadline") { + if conf.is_gc_blocked_by_standby_horizon_lease_deadline() { info!("Skipping GC because standby horizon lease deadline is not reached"); return Ok(GcResult::default()); } @@ -4280,6 +4306,7 @@ impl TenantShard { tenant_conf: update(attached_conf.tenant_conf.clone())?, location: attached_conf.location, lsn_lease_deadline: attached_conf.lsn_lease_deadline, + standby_horizon_lease_deadline: attached_conf.standby_horizon_lease_deadline, })) })?; diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index c867dbe81d..57810eacc4 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1884,8 +1884,14 @@ impl Timeline { .checked_add(Duration::from_secs(5 * 60)) .unwrap()); } - let applied_gc_cutoff_lsn = - todo!("think about boundary conditions? we didn't have any before though"); + let applied_gc_cutoff_lsn_guard = self.get_applied_gc_cutoff_lsn(); + if lsn < *applied_gc_cutoff_lsn_guard { + bail!( + "tried to request a standby horizon lease for an lsn below the applied gc cutoff. requested at {} gc cutoff {}", + lsn, + *applied_gc_cutoff_lsn_guard + ); + } let length = self.get_standby_horizon_lease_length(); self.standby_horizons .upsert_lease(lease_id, lsn, length) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index f54d5be635..d922be55d7 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1322,6 +1322,8 @@ class NeonEnv: # themselves. # Cf https://databricks.atlassian.net/browse/LKB-92?focusedCommentId=6722329 tenant_config["lsn_lease_length"] = "0s" + # Same argument applies to the initial standby_horizon lease deadline. + tenant_config["standby_horizon_lease_length"] = "0s" if self.pageserver_remote_storage is not None: ps_cfg["remote_storage"] = remote_storage_to_toml_dict( diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 7788faceb4..009df34990 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -181,6 +181,7 @@ def test_fully_custom_config(positive_env: NeonEnv): "image_layer_creation_check_threshold": 1, "lsn_lease_length": "1m", "lsn_lease_length_for_ts": "5s", + "standby_horizon_lease_length": "5s", "timeline_offloading": False, "rel_size_v2_enabled": True, "relsize_snapshot_cache_capacity": 10000,