From 15db566420e957dd521af6e2592395b373fa9bf0 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 25 Nov 2022 19:16:11 +0200 Subject: [PATCH] Allow setting gc/compaction_period to 0, to disable automatic GC/compaction Many python tests were setting the GC/compaction period to large values, to effectively disable GC / compaction. Reserve value 0 to mean "explicitly disabled". We also set them to 0 in unit tests now, although currently, unit tests don't launch the background jobs at all, so it won't have any effect. Fixes https://github.com/neondatabase/neon/issues/2917 --- pageserver/src/tenant.rs | 6 +++- pageserver/src/tenant_config.rs | 2 ++ pageserver/src/tenant_tasks.rs | 34 +++++++++++++-------- test_runner/performance/test_layer_map.py | 8 ++--- test_runner/regress/test_ancestor_branch.py | 10 ++---- test_runner/regress/test_branch_and_gc.py | 6 ++-- test_runner/regress/test_tenant_size.py | 8 ++--- test_runner/regress/test_timeline_size.py | 2 +- test_runner/regress/test_truncate.py | 5 +-- 9 files changed, 46 insertions(+), 35 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 1ec9b0dbc7..6486fbe6f5 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1717,7 +1717,11 @@ pub mod harness { // OK in a test. let conf: &'static PageServerConf = Box::leak(Box::new(conf)); - let tenant_conf = TenantConf::dummy_conf(); + // Disable automatic GC and compaction to make the unit tests more deterministic. + // The tests perform them manually if needed. + let mut tenant_conf = TenantConf::dummy_conf(); + tenant_conf.gc_period = Duration::ZERO; + tenant_conf.compaction_period = Duration::ZERO; let tenant_id = TenantId::generate(); fs::create_dir_all(conf.tenant_path(&tenant_id))?; diff --git a/pageserver/src/tenant_config.rs b/pageserver/src/tenant_config.rs index 34ab3b1a26..10b8a589c3 100644 --- a/pageserver/src/tenant_config.rs +++ b/pageserver/src/tenant_config.rs @@ -51,6 +51,7 @@ pub struct TenantConf { // This parameter determines L1 layer file size. pub compaction_target_size: u64, // How often to check if there's compaction work to be done. + // Duration::ZERO means automatic compaction is disabled. #[serde(with = "humantime_serde")] pub compaction_period: Duration, // Level0 delta layer threshold for compaction. @@ -61,6 +62,7 @@ pub struct TenantConf { // Page versions older than this are garbage collected away. pub gc_horizon: u64, // Interval at which garbage collection is triggered. + // Duration::ZERO means automatic GC is disabled #[serde(with = "humantime_serde")] pub gc_period: Duration, // Delta layer churn threshold to create L1 image layers. diff --git a/pageserver/src/tenant_tasks.rs b/pageserver/src/tenant_tasks.rs index d8a4d5521c..1372b35de0 100644 --- a/pageserver/src/tenant_tasks.rs +++ b/pageserver/src/tenant_tasks.rs @@ -65,13 +65,17 @@ async fn compaction_loop(tenant_id: TenantId) { }, }; - // Run blocking part of the task - - // Run compaction let mut sleep_duration = tenant.get_compaction_period(); - if let Err(e) = tenant.compaction_iteration() { - sleep_duration = wait_duration; - error!("Compaction failed, retrying in {:?}: {e:?}", sleep_duration); + if sleep_duration == Duration::ZERO { + info!("automatic compaction is disabled"); + // check again in 10 seconds, in case it's been enabled again. + sleep_duration = Duration::from_secs(10); + } else { + // Run compaction + if let Err(e) = tenant.compaction_iteration() { + sleep_duration = wait_duration; + error!("Compaction failed, retrying in {:?}: {e:?}", sleep_duration); + } } // Sleep @@ -112,15 +116,21 @@ async fn gc_loop(tenant_id: TenantId) { }, }; - // Run gc let gc_period = tenant.get_gc_period(); let gc_horizon = tenant.get_gc_horizon(); let mut sleep_duration = gc_period; - if gc_horizon > 0 { - if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), false).await - { - sleep_duration = wait_duration; - error!("Gc failed, retrying in {:?}: {e:?}", sleep_duration); + if sleep_duration == Duration::ZERO { + info!("automatic GC is disabled"); + // check again in 10 seconds, in case it's been enabled again. + sleep_duration = Duration::from_secs(10); + } else { + // Run gc + if gc_horizon > 0 { + if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), false).await + { + sleep_duration = wait_duration; + error!("Gc failed, retrying in {:?}: {e:?}", sleep_duration); + } } } diff --git a/test_runner/performance/test_layer_map.py b/test_runner/performance/test_layer_map.py index d71fb6d12c..ac49ea9051 100644 --- a/test_runner/performance/test_layer_map.py +++ b/test_runner/performance/test_layer_map.py @@ -12,12 +12,12 @@ def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark): n_iters = 10 n_records = 100000 - # We want to have a lot of lot of layer files to exercise the layer map. Make - # gc_horizon and checkpoint_distance very small, so that we get a lot of small layer files. + # We want to have a lot of lot of layer files to exercise the layer map. Disable + # GC, and make checkpoint_distance very small, so that we get a lot of small layer + # files. tenant, _ = env.neon_cli.create_tenant( conf={ - "gc_period": "100 m", - "gc_horizon": "1048576", + "gc_period": "0s", "checkpoint_distance": "8192", "compaction_period": "1 s", "compaction_threshold": "1", diff --git a/test_runner/regress/test_ancestor_branch.py b/test_runner/regress/test_ancestor_branch.py index d7aebfb938..2406102756 100644 --- a/test_runner/regress/test_ancestor_branch.py +++ b/test_runner/regress/test_ancestor_branch.py @@ -11,16 +11,12 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() pageserver_http = env.pageserver.http_client() - # Override defaults, 1M gc_horizon and 4M checkpoint_distance. - # Extend compaction_period and gc_period to disable background compaction and gc. + # Override defaults: 4M checkpoint_distance, disable background compaction and gc. tenant, _ = env.neon_cli.create_tenant( conf={ - "gc_period": "10 m", - "gc_horizon": "1048576", "checkpoint_distance": "4194304", - "compaction_period": "10 m", - "compaction_threshold": "2", - "compaction_target_size": "4194304", + "gc_period": "0s", + "compaction_period": "0s", } ) diff --git a/test_runner/regress/test_branch_and_gc.py b/test_runner/regress/test_branch_and_gc.py index fad4b4c79e..dfbf956568 100644 --- a/test_runner/regress/test_branch_and_gc.py +++ b/test_runner/regress/test_branch_and_gc.py @@ -52,8 +52,7 @@ def test_branch_and_gc(neon_simple_env: NeonEnv): tenant, _ = env.neon_cli.create_tenant( conf={ # disable background GC - "gc_period": "10 m", - "gc_horizon": f"{10 * 1024 ** 3}", + "gc_period": "0s", # small checkpoint distance to create more delta layer files "checkpoint_distance": f"{1024 ** 2}", # set the target size to be large to allow the image layer to cover the whole key space @@ -127,8 +126,7 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv): tenant, _ = env.neon_cli.create_tenant( conf={ # disable background GC - "gc_period": "10 m", - "gc_horizon": f"{10 * 1024 ** 3}", + "gc_period": "0s", # small checkpoint distance to create more delta layer files "checkpoint_distance": f"{1024 ** 2}", # set the target size to be large to allow the image layer to cover the whole key space diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index f8197696bc..1a2e6d22b7 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -44,8 +44,8 @@ def test_single_branch_get_tenant_size_grows(neon_env_builder: NeonEnvBuilder): Operate on single branch reading the tenants size after each transaction. """ - # gc and compaction is not wanted automatically - # the pitr_interval here is quite problematic, so we cannot really use it. + # Disable automatic gc and compaction. + # The pitr_interval here is quite problematic, so we cannot really use it. # it'd have to be calibrated per test executing env. # there was a bug which was hidden if the create table and first batch of @@ -53,7 +53,7 @@ def test_single_branch_get_tenant_size_grows(neon_env_builder: NeonEnvBuilder): # that there next_gc_cutoff could be smaller than initdb_lsn, which will # obviously lead to issues when calculating the size. gc_horizon = 0x30000 - neon_env_builder.pageserver_config_override = f"tenant_config={{compaction_period='1h', gc_period='1h', pitr_interval='0sec', gc_horizon={gc_horizon}}}" + neon_env_builder.pageserver_config_override = f"tenant_config={{compaction_period='0s', gc_period='0s', pitr_interval='0sec', gc_horizon={gc_horizon}}}" env = neon_env_builder.init_start() @@ -162,7 +162,7 @@ def test_get_tenant_size_with_multiple_branches(neon_env_builder: NeonEnvBuilder gc_horizon = 128 * 1024 - neon_env_builder.pageserver_config_override = f"tenant_config={{compaction_period='1h', gc_period='1h', pitr_interval='0sec', gc_horizon={gc_horizon}}}" + neon_env_builder.pageserver_config_override = f"tenant_config={{compaction_period='0s', gc_period='0s', pitr_interval='0sec', gc_horizon={gc_horizon}}}" env = neon_env_builder.init_start() diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index ec2bed7fee..cef1f365cd 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -295,7 +295,7 @@ def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder): # Disable background compaction and GC as we don't want it to happen after `get_physical_size` request # and before checking the expected size on disk, which makes the assertion failed - neon_env_builder.pageserver_config_override = "tenant_config={checkpoint_distance=100000, compaction_period='10m', gc_period='10m', pitr_interval='1s'}" + neon_env_builder.pageserver_config_override = "tenant_config={checkpoint_distance=100000, compaction_period='0s', gc_period='0s', pitr_interval='1s'}" env = neon_env_builder.init_start() pageserver_http = env.pageserver.http_client() diff --git a/test_runner/regress/test_truncate.py b/test_runner/regress/test_truncate.py index 8a45276f5a..a358f94192 100644 --- a/test_runner/regress/test_truncate.py +++ b/test_runner/regress/test_truncate.py @@ -16,8 +16,9 @@ def test_truncate(neon_env_builder: NeonEnvBuilder, zenbenchmark): # by image layer generation. So adjust default parameters to make it happen more frequently. tenant, _ = env.neon_cli.create_tenant( conf={ - "gc_period": "100 m", - "gc_horizon": "1048576", + # disable automatic GC + "gc_period": "0s", + # Compact and create images aggressively "checkpoint_distance": "1000000", "compaction_period": "1 s", "compaction_threshold": "3",