enforce standby_horizon leases are always above applied_gc_cutoff (check against cutoff on upsert + block gc for lease length to allow renewals after attach)

This commit is contained in:
Christian Schwarz
2025-07-25 22:30:04 +02:00
parent bc09df8823
commit 3365c8c648
6 changed files with 48 additions and 5 deletions

View File

@@ -565,6 +565,11 @@ impl PageServerNode {
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'basebackup_cache_enabled' as bool")?,
standby_horizon_lease_length: settings
.remove("standby_horizon_lease_length")
.map(humantime::parse_duration)
.transpose()
.context("Failed to parse 'standby_horizon_lease_length' as duration")?,
};
if !settings.is_empty() {
bail!("Unrecognized tenant settings: {settings:?}")

View File

@@ -638,13 +638,15 @@ impl PageServerConf {
..Default::default()
};
// Test authors tend to forget about the default 10min initial lease deadline
// Test authors tend to forget about the default 10min initial lsn lease deadline
// when writing tests, which turns their immediate gc requests via mgmt API
// into no-ops. Override the binary default here, such that there is no initial
// lease deadline by default in tests. Tests that care can always override it
// lsn lease deadline by default in tests. Tests that care can always override it
// themselves.
// Cf https://databricks.atlassian.net/browse/LKB-92?focusedCommentId=6722329
config_toml.tenant_config.lsn_lease_length = Duration::from_secs(0);
// Same argument applies to the initial standby_horizon lease deadline.
config_toml.tenant_config.standby_horizon_lease_length = Duration::from_secs(0);
PageServerConf::parse_and_validate(NodeId(0), config_toml, &repo_dir).unwrap()
}

View File

@@ -179,6 +179,9 @@ pub(super) struct AttachedTenantConf {
/// The deadline before which we are blocked from GC so that
/// leases have a chance to be renewed.
lsn_lease_deadline: Option<tokio::time::Instant>,
/// The deadline before which we are blocked from GC so that
/// standby horizon leases have a chance to be renewed.
standby_horizon_lease_deadline: Option<tokio::time::Instant>,
}
impl AttachedTenantConf {
@@ -203,10 +206,27 @@ impl AttachedTenantConf {
None
};
// Sets a deadline before which we cannot proceed to GC due to standby horizon lease.
//
// Similar to LSN leases, standby horizon leases are not persisted to disk. By delaying GC by lease
// length, we guarantee that all the standby horizon leases we granted before will have a chance to renew
// when we run GC for the first time after restart / transition from AttachedMulti to AttachedSingle.
let standby_horizon_lease_deadline = if location.attach_mode == AttachmentMode::Single {
Some(
tokio::time::Instant::now()
+ TenantShard::get_standby_horizon_lease_length_impl(conf, &tenant_conf),
)
} else {
// We don't use `standby_horizon_lease_deadline` to delay GC in AttachedMulti and AttachedStale
// because we don't do GC in these modes.
None
};
Self {
tenant_conf,
location,
lsn_lease_deadline,
standby_horizon_lease_deadline,
}
}
@@ -231,6 +251,12 @@ impl AttachedTenantConf {
.map(|d| tokio::time::Instant::now() < d)
.unwrap_or(false)
}
fn is_gc_blocked_by_standby_horizon_lease_deadline(&self) -> bool {
self.standby_horizon_lease_deadline
.map(|d| tokio::time::Instant::now() < d)
.unwrap_or(false)
}
}
struct TimelinePreload {
timeline_id: TimelineId,
@@ -3111,7 +3137,7 @@ impl TenantShard {
return Ok(GcResult::default());
}
if todo!("block for standby horizon lease deadline") {
if conf.is_gc_blocked_by_standby_horizon_lease_deadline() {
info!("Skipping GC because standby horizon lease deadline is not reached");
return Ok(GcResult::default());
}
@@ -4280,6 +4306,7 @@ impl TenantShard {
tenant_conf: update(attached_conf.tenant_conf.clone())?,
location: attached_conf.location,
lsn_lease_deadline: attached_conf.lsn_lease_deadline,
standby_horizon_lease_deadline: attached_conf.standby_horizon_lease_deadline,
}))
})?;

View File

@@ -1884,8 +1884,14 @@ impl Timeline {
.checked_add(Duration::from_secs(5 * 60))
.unwrap());
}
let applied_gc_cutoff_lsn =
todo!("think about boundary conditions? we didn't have any before though");
let applied_gc_cutoff_lsn_guard = self.get_applied_gc_cutoff_lsn();
if lsn < *applied_gc_cutoff_lsn_guard {
bail!(
"tried to request a standby horizon lease for an lsn below the applied gc cutoff. requested at {} gc cutoff {}",
lsn,
*applied_gc_cutoff_lsn_guard
);
}
let length = self.get_standby_horizon_lease_length();
self.standby_horizons
.upsert_lease(lease_id, lsn, length)

View File

@@ -1322,6 +1322,8 @@ class NeonEnv:
# themselves.
# Cf https://databricks.atlassian.net/browse/LKB-92?focusedCommentId=6722329
tenant_config["lsn_lease_length"] = "0s"
# Same argument applies to the initial standby_horizon lease deadline.
tenant_config["standby_horizon_lease_length"] = "0s"
if self.pageserver_remote_storage is not None:
ps_cfg["remote_storage"] = remote_storage_to_toml_dict(

View File

@@ -181,6 +181,7 @@ def test_fully_custom_config(positive_env: NeonEnv):
"image_layer_creation_check_threshold": 1,
"lsn_lease_length": "1m",
"lsn_lease_length_for_ts": "5s",
"standby_horizon_lease_length": "5s",
"timeline_offloading": False,
"rel_size_v2_enabled": True,
"relsize_snapshot_cache_capacity": 10000,