fix(tests): periodic and immediate gc is effectively a no-op in tests (#12431)

The introduction of the default lease deadline feature 9 months ago made
it so
that after PS restart, `.timeline_gc()` calls in Python tests are no-ops
for 10 minute after pageserver startup: the `gc_iteration()` bails with
`Skipping GC because lsn lease deadline is not reached`.

I did some impact analysis in the following PR. About 30 Python tests
are affected:
- https://github.com/neondatabase/neon/pull/12411

Rust tests that don't explicitly enable periodic GC or invoke GC
manually
are unaffected because we disable periodic GC by default in
the `TenantHarness`'s tenant config.
Two tests explicitly did `start_paused=true` + `tokio::time::advance()`,
but it would add cognitive and code bloat to each existing and future
test case that uses TenantHarness if we took that route.

So, this PR sets the default lease deadline feature in both Python
and Rust tests to zero by default. Tests that test the feature were
thus identified by failing the test:
- Python test `test_readonly_node_gc` + `test_lsn_lease_size`
- Rust test `test_lsn_lease`.

To accomplish the above, I changed the code that computes the initial
lease
deadline to respect the pageserver.toml's default tenant config, which
it didn't before (and I would consider a bug). The Python test harness
and the Rust TenantHarness test harness then simply set the default
tenant
config field to zero.

Drive-by:
- `test_lsn_lease_size` was writing a lot of data unnecessarily; reduce
the amount and speed up the test

refs
- PR that introduced default lease deadline:
https://github.com/neondatabase/neon/pull/9055/files
- fixes https://databricks.atlassian.net/browse/LKB-92

---------

Co-authored-by: Christian Schwarz <Christian Schwarz>
This commit is contained in:
Christian Schwarz
2025-07-08 14:56:22 +02:00
committed by GitHub
parent 59e393aef3
commit 2b2a547671
6 changed files with 72 additions and 45 deletions

View File

@@ -635,7 +635,7 @@ impl PageServerConf {
pub fn dummy_conf(repo_dir: Utf8PathBuf) -> Self {
let pg_distrib_dir = Utf8PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../pg_install");
let config_toml = pageserver_api::config::ConfigToml {
let mut config_toml = pageserver_api::config::ConfigToml {
wait_lsn_timeout: Duration::from_secs(60),
wal_redo_timeout: Duration::from_secs(60),
pg_distrib_dir: Some(pg_distrib_dir),
@@ -647,6 +647,15 @@ impl PageServerConf {
control_plane_api: Some(Url::parse("http://localhost:6666").unwrap()),
..Default::default()
};
// Test authors tend to forget about the default 10min initial lease deadline
// when writing tests, which turns their immediate gc requests via mgmt API
// into no-ops. Override the binary default here, such that there is no initial
// lease deadline by default in tests. Tests that care can always override it
// themselves.
// Cf https://databricks.atlassian.net/browse/LKB-92?focusedCommentId=6722329
config_toml.tenant_config.lsn_lease_length = Duration::from_secs(0);
PageServerConf::parse_and_validate(NodeId(0), config_toml, &repo_dir).unwrap()
}
}

View File

@@ -34,7 +34,7 @@ use once_cell::sync::Lazy;
pub use pageserver_api::models::TenantState;
use pageserver_api::models::{self, RelSizeMigration};
use pageserver_api::models::{
CompactInfoResponse, LsnLease, TimelineArchivalState, TimelineState, TopTenantShardItem,
CompactInfoResponse, TimelineArchivalState, TimelineState, TopTenantShardItem,
WalRedoManagerStatus,
};
use pageserver_api::shard::{ShardIdentity, ShardStripeSize, TenantShardId};
@@ -180,6 +180,7 @@ pub(super) struct AttachedTenantConf {
impl AttachedTenantConf {
fn new(
conf: &'static PageServerConf,
tenant_conf: pageserver_api::models::TenantConfig,
location: AttachedLocationConfig,
) -> Self {
@@ -191,9 +192,7 @@ impl AttachedTenantConf {
let lsn_lease_deadline = if location.attach_mode == AttachmentMode::Single {
Some(
tokio::time::Instant::now()
+ tenant_conf
.lsn_lease_length
.unwrap_or(LsnLease::DEFAULT_LENGTH),
+ TenantShard::get_lsn_lease_length_impl(conf, &tenant_conf),
)
} else {
// We don't use `lsn_lease_deadline` to delay GC in AttachedMulti and AttachedStale
@@ -208,10 +207,13 @@ impl AttachedTenantConf {
}
}
fn try_from(location_conf: LocationConf) -> anyhow::Result<Self> {
fn try_from(
conf: &'static PageServerConf,
location_conf: LocationConf,
) -> anyhow::Result<Self> {
match &location_conf.mode {
LocationMode::Attached(attach_conf) => {
Ok(Self::new(location_conf.tenant_conf, *attach_conf))
Ok(Self::new(conf, location_conf.tenant_conf, *attach_conf))
}
LocationMode::Secondary(_) => {
anyhow::bail!(
@@ -4205,10 +4207,16 @@ impl TenantShard {
}
pub fn get_lsn_lease_length(&self) -> Duration {
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
Self::get_lsn_lease_length_impl(self.conf, &self.tenant_conf.load().tenant_conf)
}
pub fn get_lsn_lease_length_impl(
conf: &'static PageServerConf,
tenant_conf: &pageserver_api::models::TenantConfig,
) -> Duration {
tenant_conf
.lsn_lease_length
.unwrap_or(self.conf.default_tenant_conf.lsn_lease_length)
.unwrap_or(conf.default_tenant_conf.lsn_lease_length)
}
pub fn get_timeline_offloading_enabled(&self) -> bool {
@@ -6020,11 +6028,14 @@ pub(crate) mod harness {
let tenant = Arc::new(TenantShard::new(
TenantState::Attaching,
self.conf,
AttachedTenantConf::try_from(LocationConf::attached_single(
AttachedTenantConf::try_from(
self.conf,
LocationConf::attached_single(
self.tenant_conf.clone(),
self.generation,
ShardParameters::default(),
))
),
)
.unwrap(),
self.shard_identity,
Some(walredo_mgr),
@@ -6125,7 +6136,7 @@ mod tests {
use pageserver_api::keyspace::KeySpace;
#[cfg(feature = "testing")]
use pageserver_api::keyspace::KeySpaceRandomAccum;
use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings};
use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings, LsnLease};
use pageserver_compaction::helpers::overlaps_with;
#[cfg(feature = "testing")]
use rand::SeedableRng;
@@ -6675,17 +6686,13 @@ mod tests {
tline.freeze_and_flush().await.map_err(|e| e.into())
}
#[tokio::test(start_paused = true)]
#[tokio::test]
async fn test_prohibit_branch_creation_on_garbage_collected_data() -> anyhow::Result<()> {
let (tenant, ctx) =
TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")
.await?
.load()
.await;
// Advance to the lsn lease deadline so that GC is not blocked by
// initial transition into AttachedSingle.
tokio::time::advance(tenant.get_lsn_lease_length()).await;
tokio::time::resume();
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await?;
@@ -9384,17 +9391,21 @@ mod tests {
Ok(())
}
#[tokio::test(start_paused = true)]
#[tokio::test]
async fn test_lsn_lease() -> anyhow::Result<()> {
let (tenant, ctx) = TenantHarness::create("test_lsn_lease")
.await
.unwrap()
.load()
.await;
// Advance to the lsn lease deadline so that GC is not blocked by
// initial transition into AttachedSingle.
tokio::time::advance(tenant.get_lsn_lease_length()).await;
tokio::time::resume();
// set a non-zero lease length to test the feature
tenant
.update_tenant_config(|mut conf| {
conf.lsn_lease_length = Some(LsnLease::DEFAULT_LENGTH);
Ok(conf)
})
.unwrap();
let key = Key::from_hex("010000000033333333444444445500000000").unwrap();
let end_lsn = Lsn(0x100);

View File

@@ -664,7 +664,7 @@ pub async fn init_tenant_mgr(
tenant_shard_id,
&tenant_dir_path,
resources.clone(),
AttachedTenantConf::new(location_conf.tenant_conf, attached_conf),
AttachedTenantConf::new(conf, location_conf.tenant_conf, attached_conf),
shard_identity,
Some(init_order.clone()),
SpawnMode::Lazy,
@@ -842,7 +842,10 @@ impl TenantManager {
// take our fast path and just provide the updated configuration
// to the tenant.
tenant.set_new_location_config(
AttachedTenantConf::try_from(new_location_config.clone())
AttachedTenantConf::try_from(
self.conf,
new_location_config.clone(),
)
.map_err(UpsertLocationError::BadRequest)?,
);
@@ -1046,7 +1049,7 @@ impl TenantManager {
// Testing hack: if we are configured with no control plane, then drop the generation
// from upserts. This enables creating generation-less tenants even though neon_local
// always uses generations when calling the location conf API.
let attached_conf = AttachedTenantConf::try_from(new_location_config)
let attached_conf = AttachedTenantConf::try_from(self.conf, new_location_config)
.map_err(UpsertLocationError::BadRequest)?;
let tenant = tenant_spawn(
@@ -1250,7 +1253,7 @@ impl TenantManager {
tenant_shard_id,
&tenant_path,
self.resources.clone(),
AttachedTenantConf::try_from(config)?,
AttachedTenantConf::try_from(self.conf, config)?,
shard_identity,
None,
SpawnMode::Eager,
@@ -2131,7 +2134,7 @@ impl TenantManager {
tenant_shard_id,
&tenant_path,
self.resources.clone(),
AttachedTenantConf::try_from(config).map_err(Error::DetachReparent)?,
AttachedTenantConf::try_from(self.conf, config).map_err(Error::DetachReparent)?,
shard_identity,
None,
SpawnMode::Eager,

View File

@@ -1315,6 +1315,14 @@ class NeonEnv:
# This feature is pending rollout.
# tenant_config["rel_size_v2_enabled"] = True
# Test authors tend to forget about the default 10min initial lease deadline
# when writing tests, which turns their immediate gc requests via mgmt API
# into no-ops. Override the binary default here, such that there is no initial
# lease deadline by default in tests. Tests that care can always override it
# themselves.
# Cf https://databricks.atlassian.net/browse/LKB-92?focusedCommentId=6722329
tenant_config["lsn_lease_length"] = "0s"
if self.pageserver_remote_storage is not None:
ps_cfg["remote_storage"] = remote_storage_to_toml_dict(
self.pageserver_remote_storage

View File

@@ -201,11 +201,11 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
for shard, ps in tenant_get_shards(env, env.initial_tenant):
client = ps.http_client()
layers_guarded_before_gc = get_layers_protected_by_lease(
client, shard, env.initial_timeline, lease_lsn=lsn
client, shard, env.initial_timeline, lease_lsn=lease_lsn
)
gc_result = client.timeline_gc(shard, env.initial_timeline, 0)
layers_guarded_after_gc = get_layers_protected_by_lease(
client, shard, env.initial_timeline, lease_lsn=lsn
client, shard, env.initial_timeline, lease_lsn=lease_lsn
)
# Note: cannot assert on `layers_removed` here because it could be layers

View File

@@ -740,6 +740,10 @@ def test_lsn_lease_size(neon_env_builder: NeonEnvBuilder, test_output_dir: Path,
"pitr_interval": "0s" if zero_gc else "3600s",
"gc_period": "0s",
"compaction_period": "0s",
# The test exercises leases API, so we need non-zero lease length.
# If this tests ever does GC, we need to accomodate for the initial lease deadline
# after tenant attach, which is also controlled by this variable.
"lsn_lease_length": "600s",
}
env = neon_env_builder.init_start(initial_tenant_conf=conf)
@@ -824,9 +828,7 @@ def insert_with_action(
log.info(f"initial size: {initial_size}")
with ep.cursor() as cur:
cur.execute(
"CREATE TABLE t0 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)"
)
cur.execute("CREATE TABLE t0 AS SELECT i::bigint n FROM generate_series(0, 10000) s(i)")
last_flush_lsn = wait_for_last_flush_lsn(env, ep, tenant, timeline)
if action == "lease":
@@ -841,15 +843,9 @@ def insert_with_action(
raise AssertionError("Invalid action type, only `lease` and `branch`are accepted")
with ep.cursor() as cur:
cur.execute(
"CREATE TABLE t1 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)"
)
cur.execute(
"CREATE TABLE t2 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)"
)
cur.execute(
"CREATE TABLE t3 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)"
)
cur.execute("CREATE TABLE t1 AS SELECT i::bigint n FROM generate_series(0, 10000) s(i)")
cur.execute("CREATE TABLE t2 AS SELECT i::bigint n FROM generate_series(0, 10000) s(i)")
cur.execute("CREATE TABLE t3 AS SELECT i::bigint n FROM generate_series(0, 10000) s(i)")
last_flush_lsn = wait_for_last_flush_lsn(env, ep, tenant, timeline)