mirror of
https://github.com/neondatabase/neon.git
synced 2025-12-22 21:59:59 +00:00
fix(tests): periodic and immediate gc is effectively a no-op in tests (#12431)
The introduction of the default lease deadline feature 9 months ago made it so that after PS restart, `.timeline_gc()` calls in Python tests are no-ops for 10 minute after pageserver startup: the `gc_iteration()` bails with `Skipping GC because lsn lease deadline is not reached`. I did some impact analysis in the following PR. About 30 Python tests are affected: - https://github.com/neondatabase/neon/pull/12411 Rust tests that don't explicitly enable periodic GC or invoke GC manually are unaffected because we disable periodic GC by default in the `TenantHarness`'s tenant config. Two tests explicitly did `start_paused=true` + `tokio::time::advance()`, but it would add cognitive and code bloat to each existing and future test case that uses TenantHarness if we took that route. So, this PR sets the default lease deadline feature in both Python and Rust tests to zero by default. Tests that test the feature were thus identified by failing the test: - Python test `test_readonly_node_gc` + `test_lsn_lease_size` - Rust test `test_lsn_lease`. To accomplish the above, I changed the code that computes the initial lease deadline to respect the pageserver.toml's default tenant config, which it didn't before (and I would consider a bug). The Python test harness and the Rust TenantHarness test harness then simply set the default tenant config field to zero. Drive-by: - `test_lsn_lease_size` was writing a lot of data unnecessarily; reduce the amount and speed up the test refs - PR that introduced default lease deadline: https://github.com/neondatabase/neon/pull/9055/files - fixes https://databricks.atlassian.net/browse/LKB-92 --------- Co-authored-by: Christian Schwarz <Christian Schwarz>
This commit is contained in:
committed by
GitHub
parent
59e393aef3
commit
2b2a547671
@@ -635,7 +635,7 @@ impl PageServerConf {
|
||||
pub fn dummy_conf(repo_dir: Utf8PathBuf) -> Self {
|
||||
let pg_distrib_dir = Utf8PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../pg_install");
|
||||
|
||||
let config_toml = pageserver_api::config::ConfigToml {
|
||||
let mut config_toml = pageserver_api::config::ConfigToml {
|
||||
wait_lsn_timeout: Duration::from_secs(60),
|
||||
wal_redo_timeout: Duration::from_secs(60),
|
||||
pg_distrib_dir: Some(pg_distrib_dir),
|
||||
@@ -647,6 +647,15 @@ impl PageServerConf {
|
||||
control_plane_api: Some(Url::parse("http://localhost:6666").unwrap()),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
// Test authors tend to forget about the default 10min initial lease deadline
|
||||
// when writing tests, which turns their immediate gc requests via mgmt API
|
||||
// into no-ops. Override the binary default here, such that there is no initial
|
||||
// lease deadline by default in tests. Tests that care can always override it
|
||||
// themselves.
|
||||
// Cf https://databricks.atlassian.net/browse/LKB-92?focusedCommentId=6722329
|
||||
config_toml.tenant_config.lsn_lease_length = Duration::from_secs(0);
|
||||
|
||||
PageServerConf::parse_and_validate(NodeId(0), config_toml, &repo_dir).unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -34,7 +34,7 @@ use once_cell::sync::Lazy;
|
||||
pub use pageserver_api::models::TenantState;
|
||||
use pageserver_api::models::{self, RelSizeMigration};
|
||||
use pageserver_api::models::{
|
||||
CompactInfoResponse, LsnLease, TimelineArchivalState, TimelineState, TopTenantShardItem,
|
||||
CompactInfoResponse, TimelineArchivalState, TimelineState, TopTenantShardItem,
|
||||
WalRedoManagerStatus,
|
||||
};
|
||||
use pageserver_api::shard::{ShardIdentity, ShardStripeSize, TenantShardId};
|
||||
@@ -180,6 +180,7 @@ pub(super) struct AttachedTenantConf {
|
||||
|
||||
impl AttachedTenantConf {
|
||||
fn new(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_conf: pageserver_api::models::TenantConfig,
|
||||
location: AttachedLocationConfig,
|
||||
) -> Self {
|
||||
@@ -191,9 +192,7 @@ impl AttachedTenantConf {
|
||||
let lsn_lease_deadline = if location.attach_mode == AttachmentMode::Single {
|
||||
Some(
|
||||
tokio::time::Instant::now()
|
||||
+ tenant_conf
|
||||
.lsn_lease_length
|
||||
.unwrap_or(LsnLease::DEFAULT_LENGTH),
|
||||
+ TenantShard::get_lsn_lease_length_impl(conf, &tenant_conf),
|
||||
)
|
||||
} else {
|
||||
// We don't use `lsn_lease_deadline` to delay GC in AttachedMulti and AttachedStale
|
||||
@@ -208,10 +207,13 @@ impl AttachedTenantConf {
|
||||
}
|
||||
}
|
||||
|
||||
fn try_from(location_conf: LocationConf) -> anyhow::Result<Self> {
|
||||
fn try_from(
|
||||
conf: &'static PageServerConf,
|
||||
location_conf: LocationConf,
|
||||
) -> anyhow::Result<Self> {
|
||||
match &location_conf.mode {
|
||||
LocationMode::Attached(attach_conf) => {
|
||||
Ok(Self::new(location_conf.tenant_conf, *attach_conf))
|
||||
Ok(Self::new(conf, location_conf.tenant_conf, *attach_conf))
|
||||
}
|
||||
LocationMode::Secondary(_) => {
|
||||
anyhow::bail!(
|
||||
@@ -4205,10 +4207,16 @@ impl TenantShard {
|
||||
}
|
||||
|
||||
pub fn get_lsn_lease_length(&self) -> Duration {
|
||||
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
|
||||
Self::get_lsn_lease_length_impl(self.conf, &self.tenant_conf.load().tenant_conf)
|
||||
}
|
||||
|
||||
pub fn get_lsn_lease_length_impl(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_conf: &pageserver_api::models::TenantConfig,
|
||||
) -> Duration {
|
||||
tenant_conf
|
||||
.lsn_lease_length
|
||||
.unwrap_or(self.conf.default_tenant_conf.lsn_lease_length)
|
||||
.unwrap_or(conf.default_tenant_conf.lsn_lease_length)
|
||||
}
|
||||
|
||||
pub fn get_timeline_offloading_enabled(&self) -> bool {
|
||||
@@ -6020,11 +6028,14 @@ pub(crate) mod harness {
|
||||
let tenant = Arc::new(TenantShard::new(
|
||||
TenantState::Attaching,
|
||||
self.conf,
|
||||
AttachedTenantConf::try_from(LocationConf::attached_single(
|
||||
self.tenant_conf.clone(),
|
||||
self.generation,
|
||||
ShardParameters::default(),
|
||||
))
|
||||
AttachedTenantConf::try_from(
|
||||
self.conf,
|
||||
LocationConf::attached_single(
|
||||
self.tenant_conf.clone(),
|
||||
self.generation,
|
||||
ShardParameters::default(),
|
||||
),
|
||||
)
|
||||
.unwrap(),
|
||||
self.shard_identity,
|
||||
Some(walredo_mgr),
|
||||
@@ -6125,7 +6136,7 @@ mod tests {
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
#[cfg(feature = "testing")]
|
||||
use pageserver_api::keyspace::KeySpaceRandomAccum;
|
||||
use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings};
|
||||
use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings, LsnLease};
|
||||
use pageserver_compaction::helpers::overlaps_with;
|
||||
#[cfg(feature = "testing")]
|
||||
use rand::SeedableRng;
|
||||
@@ -6675,17 +6686,13 @@ mod tests {
|
||||
tline.freeze_and_flush().await.map_err(|e| e.into())
|
||||
}
|
||||
|
||||
#[tokio::test(start_paused = true)]
|
||||
#[tokio::test]
|
||||
async fn test_prohibit_branch_creation_on_garbage_collected_data() -> anyhow::Result<()> {
|
||||
let (tenant, ctx) =
|
||||
TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")
|
||||
.await?
|
||||
.load()
|
||||
.await;
|
||||
// Advance to the lsn lease deadline so that GC is not blocked by
|
||||
// initial transition into AttachedSingle.
|
||||
tokio::time::advance(tenant.get_lsn_lease_length()).await;
|
||||
tokio::time::resume();
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
@@ -9384,17 +9391,21 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test(start_paused = true)]
|
||||
#[tokio::test]
|
||||
async fn test_lsn_lease() -> anyhow::Result<()> {
|
||||
let (tenant, ctx) = TenantHarness::create("test_lsn_lease")
|
||||
.await
|
||||
.unwrap()
|
||||
.load()
|
||||
.await;
|
||||
// Advance to the lsn lease deadline so that GC is not blocked by
|
||||
// initial transition into AttachedSingle.
|
||||
tokio::time::advance(tenant.get_lsn_lease_length()).await;
|
||||
tokio::time::resume();
|
||||
// set a non-zero lease length to test the feature
|
||||
tenant
|
||||
.update_tenant_config(|mut conf| {
|
||||
conf.lsn_lease_length = Some(LsnLease::DEFAULT_LENGTH);
|
||||
Ok(conf)
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
let key = Key::from_hex("010000000033333333444444445500000000").unwrap();
|
||||
|
||||
let end_lsn = Lsn(0x100);
|
||||
|
||||
@@ -664,7 +664,7 @@ pub async fn init_tenant_mgr(
|
||||
tenant_shard_id,
|
||||
&tenant_dir_path,
|
||||
resources.clone(),
|
||||
AttachedTenantConf::new(location_conf.tenant_conf, attached_conf),
|
||||
AttachedTenantConf::new(conf, location_conf.tenant_conf, attached_conf),
|
||||
shard_identity,
|
||||
Some(init_order.clone()),
|
||||
SpawnMode::Lazy,
|
||||
@@ -842,8 +842,11 @@ impl TenantManager {
|
||||
// take our fast path and just provide the updated configuration
|
||||
// to the tenant.
|
||||
tenant.set_new_location_config(
|
||||
AttachedTenantConf::try_from(new_location_config.clone())
|
||||
.map_err(UpsertLocationError::BadRequest)?,
|
||||
AttachedTenantConf::try_from(
|
||||
self.conf,
|
||||
new_location_config.clone(),
|
||||
)
|
||||
.map_err(UpsertLocationError::BadRequest)?,
|
||||
);
|
||||
|
||||
Some(FastPathModified::Attached(tenant.clone()))
|
||||
@@ -1046,7 +1049,7 @@ impl TenantManager {
|
||||
// Testing hack: if we are configured with no control plane, then drop the generation
|
||||
// from upserts. This enables creating generation-less tenants even though neon_local
|
||||
// always uses generations when calling the location conf API.
|
||||
let attached_conf = AttachedTenantConf::try_from(new_location_config)
|
||||
let attached_conf = AttachedTenantConf::try_from(self.conf, new_location_config)
|
||||
.map_err(UpsertLocationError::BadRequest)?;
|
||||
|
||||
let tenant = tenant_spawn(
|
||||
@@ -1250,7 +1253,7 @@ impl TenantManager {
|
||||
tenant_shard_id,
|
||||
&tenant_path,
|
||||
self.resources.clone(),
|
||||
AttachedTenantConf::try_from(config)?,
|
||||
AttachedTenantConf::try_from(self.conf, config)?,
|
||||
shard_identity,
|
||||
None,
|
||||
SpawnMode::Eager,
|
||||
@@ -2131,7 +2134,7 @@ impl TenantManager {
|
||||
tenant_shard_id,
|
||||
&tenant_path,
|
||||
self.resources.clone(),
|
||||
AttachedTenantConf::try_from(config).map_err(Error::DetachReparent)?,
|
||||
AttachedTenantConf::try_from(self.conf, config).map_err(Error::DetachReparent)?,
|
||||
shard_identity,
|
||||
None,
|
||||
SpawnMode::Eager,
|
||||
|
||||
@@ -1315,6 +1315,14 @@ class NeonEnv:
|
||||
# This feature is pending rollout.
|
||||
# tenant_config["rel_size_v2_enabled"] = True
|
||||
|
||||
# Test authors tend to forget about the default 10min initial lease deadline
|
||||
# when writing tests, which turns their immediate gc requests via mgmt API
|
||||
# into no-ops. Override the binary default here, such that there is no initial
|
||||
# lease deadline by default in tests. Tests that care can always override it
|
||||
# themselves.
|
||||
# Cf https://databricks.atlassian.net/browse/LKB-92?focusedCommentId=6722329
|
||||
tenant_config["lsn_lease_length"] = "0s"
|
||||
|
||||
if self.pageserver_remote_storage is not None:
|
||||
ps_cfg["remote_storage"] = remote_storage_to_toml_dict(
|
||||
self.pageserver_remote_storage
|
||||
|
||||
@@ -201,11 +201,11 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
|
||||
for shard, ps in tenant_get_shards(env, env.initial_tenant):
|
||||
client = ps.http_client()
|
||||
layers_guarded_before_gc = get_layers_protected_by_lease(
|
||||
client, shard, env.initial_timeline, lease_lsn=lsn
|
||||
client, shard, env.initial_timeline, lease_lsn=lease_lsn
|
||||
)
|
||||
gc_result = client.timeline_gc(shard, env.initial_timeline, 0)
|
||||
layers_guarded_after_gc = get_layers_protected_by_lease(
|
||||
client, shard, env.initial_timeline, lease_lsn=lsn
|
||||
client, shard, env.initial_timeline, lease_lsn=lease_lsn
|
||||
)
|
||||
|
||||
# Note: cannot assert on `layers_removed` here because it could be layers
|
||||
|
||||
@@ -740,6 +740,10 @@ def test_lsn_lease_size(neon_env_builder: NeonEnvBuilder, test_output_dir: Path,
|
||||
"pitr_interval": "0s" if zero_gc else "3600s",
|
||||
"gc_period": "0s",
|
||||
"compaction_period": "0s",
|
||||
# The test exercises leases API, so we need non-zero lease length.
|
||||
# If this tests ever does GC, we need to accomodate for the initial lease deadline
|
||||
# after tenant attach, which is also controlled by this variable.
|
||||
"lsn_lease_length": "600s",
|
||||
}
|
||||
|
||||
env = neon_env_builder.init_start(initial_tenant_conf=conf)
|
||||
@@ -824,9 +828,7 @@ def insert_with_action(
|
||||
log.info(f"initial size: {initial_size}")
|
||||
|
||||
with ep.cursor() as cur:
|
||||
cur.execute(
|
||||
"CREATE TABLE t0 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)"
|
||||
)
|
||||
cur.execute("CREATE TABLE t0 AS SELECT i::bigint n FROM generate_series(0, 10000) s(i)")
|
||||
last_flush_lsn = wait_for_last_flush_lsn(env, ep, tenant, timeline)
|
||||
|
||||
if action == "lease":
|
||||
@@ -841,15 +843,9 @@ def insert_with_action(
|
||||
raise AssertionError("Invalid action type, only `lease` and `branch`are accepted")
|
||||
|
||||
with ep.cursor() as cur:
|
||||
cur.execute(
|
||||
"CREATE TABLE t1 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)"
|
||||
)
|
||||
cur.execute(
|
||||
"CREATE TABLE t2 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)"
|
||||
)
|
||||
cur.execute(
|
||||
"CREATE TABLE t3 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)"
|
||||
)
|
||||
cur.execute("CREATE TABLE t1 AS SELECT i::bigint n FROM generate_series(0, 10000) s(i)")
|
||||
cur.execute("CREATE TABLE t2 AS SELECT i::bigint n FROM generate_series(0, 10000) s(i)")
|
||||
cur.execute("CREATE TABLE t3 AS SELECT i::bigint n FROM generate_series(0, 10000) s(i)")
|
||||
|
||||
last_flush_lsn = wait_for_last_flush_lsn(env, ep, tenant, timeline)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user