From d6d78a050f4d3b807718ba2a1f2fbc0d779b5cc8 Mon Sep 17 00:00:00 2001
From: Erik Grinaker
Date: Thu, 13 Mar 2025 22:08:28 +0100
Subject: [PATCH] pageserver: disable `l0_flush_wait_upload` by default
 (#11215)

## Problem

This is already disabled in production, as it is replaced by L0 flush delays.
It will be removed in a later PR, once the config option is no longer
specified in production.

## Summary of changes

Disable `l0_flush_wait_upload` by default.
---
 libs/pageserver_api/src/config.rs             |  4 +-
 .../regress/test_attach_tenant_config.py      |  2 +-
 test_runner/regress/test_branching.py         |  7 +--
 test_runner/regress/test_remote_storage.py    | 55 +------------------
 4 files changed, 8 insertions(+), 60 deletions(-)

diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs
index e112a57c9d..b12ef65780 100644
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -289,6 +289,8 @@ pub struct TenantConfigToml {
     /// If true, Level0 delta layer flushes will wait for S3 upload before flushing the next
     /// layer. This is a temporary backpressure mechanism which should be removed once
     /// l0_flush_{delay,stall}_threshold is fully enabled.
+    ///
+    /// TODO: this is no longer enabled, remove it when the config option is no longer set.
     pub l0_flush_wait_upload: bool,
     // Determines how much history is retained, to allow
     // branching and read replicas at an older point in time.
@@ -576,7 +578,7 @@ pub mod tenant_conf_defaults {
     pub const DEFAULT_COMPACTION_ALGORITHM: crate::models::CompactionAlgorithm =
         crate::models::CompactionAlgorithm::Legacy;
 
-    pub const DEFAULT_L0_FLUSH_WAIT_UPLOAD: bool = true;
+    pub const DEFAULT_L0_FLUSH_WAIT_UPLOAD: bool = false;
 
     pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
 
diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py
index 07600dd911..b56fcd3500 100644
--- a/test_runner/regress/test_attach_tenant_config.py
+++ b/test_runner/regress/test_attach_tenant_config.py
@@ -144,7 +144,7 @@ def test_fully_custom_config(positive_env: NeonEnv):
         "compaction_l0_semaphore": False,
         "l0_flush_delay_threshold": 25,
         "l0_flush_stall_threshold": 42,
-        "l0_flush_wait_upload": False,
+        "l0_flush_wait_upload": True,
         "compaction_target_size": 1048576,
         "checkpoint_distance": 10000,
         "checkpoint_timeout": "13m",
diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py
index 34e4e994cb..85d0cfbf1d 100644
--- a/test_runner/regress/test_branching.py
+++ b/test_runner/regress/test_branching.py
@@ -19,6 +19,7 @@ from fixtures.pageserver.utils import wait_until_tenant_active
 from fixtures.utils import query_scalar
 from performance.test_perf_pgbench import get_scales_matrix
 from requests import RequestException
+from requests.exceptions import RetryError
 
 
 # Test branch creation
@@ -180,7 +181,6 @@ def test_cannot_create_endpoint_on_non_uploaded_timeline(neon_env_builder: NeonE
             env.endpoints.create_start(
                 initial_branch, tenant_id=env.initial_tenant, basebackup_request_tries=2
             )
-        ps_http.configure_failpoints(("before-upload-index-pausable", "off"))
     finally:
         env.pageserver.stop(immediate=True)
 
@@ -221,10 +221,7 @@ def test_cannot_branch_from_non_uploaded_branch(neon_env_builder: NeonEnvBuilder
 
     branch_id = TimelineId.generate()
 
-    with pytest.raises(
-        PageserverApiException,
-        match="Cannot branch off the timeline that's not present in pageserver",
-    ):
+    with pytest.raises(RetryError, match="too many 503 error responses"):
         ps_http.timeline_create(
             env.pg_version,
             env.initial_tenant,
diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py
index c39c74fa2a..e8721f1ea0 100644
--- a/test_runner/regress/test_remote_storage.py
+++ b/test_runner/regress/test_remote_storage.py
@@ -29,7 +29,6 @@ from fixtures.remote_storage import (
 from fixtures.utils import (
     assert_eq,
     assert_ge,
-    assert_gt,
     print_gc_result,
     query_scalar,
     wait_until,
 )
@@ -334,14 +333,12 @@ def test_remote_storage_upload_queue_retries(
 
     # Exponential back-off in upload queue, so, gracious timeouts.
     wait_until(
-        lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="upload"), 0), timeout=30
+        lambda: assert_ge(get_queued_count(file_kind="layer", op_kind="upload"), 1), timeout=30
     )
     wait_until(
         lambda: assert_ge(get_queued_count(file_kind="index", op_kind="upload"), 1), timeout=30
     )
-    wait_until(
-        lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0), timeout=30
-    )
+    # There may or may not be deletes queued up behind conflicting uploads; don't check.
 
     # unblock churn operations
     configure_storage_sync_failpoints("off")
@@ -786,54 +783,6 @@ def test_empty_branch_remote_storage_upload_on_restart(neon_env_builder: NeonEnv
     create_thread.join()
 
 
-def test_paused_upload_stalls_checkpoint(
-    neon_env_builder: NeonEnvBuilder,
-):
-    """
-    This test checks that checkpoints block on uploads to remote storage.
-    """
-    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
-
-    env = neon_env_builder.init_start(
-        initial_tenant_conf={
-            # Set a small compaction threshold
-            "compaction_threshold": "3",
-            # Disable GC
-            "gc_period": "0s",
-            # disable PITR
-            "pitr_interval": "0s",
-        }
-    )
-
-    env.pageserver.allowed_errors.append(
-        f".*PUT.* path=/v1/tenant/{env.initial_tenant}/timeline.* request was dropped before completing"
-    )
-
-    tenant_id = env.initial_tenant
-    timeline_id = env.initial_timeline
-
-    client = env.pageserver.http_client()
-    layers_at_creation = client.layer_map_info(tenant_id, timeline_id)
-    deltas_at_creation = len(layers_at_creation.delta_layers())
-    assert (
-        deltas_at_creation == 1
-    ), "are you fixing #5863? make sure we end up with 2 deltas at the end of endpoint lifecycle"
-
-    # Make new layer uploads get stuck.
-    # Note that timeline creation waits for the initial layers to reach remote storage.
-    # So at this point, the `layers_at_creation` are in remote storage.
-    client.configure_failpoints(("before-upload-layer-pausable", "pause"))
-
-    with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
-        # Build two tables with some data inside
-        endpoint.safe_psql("CREATE TABLE foo AS SELECT x FROM generate_series(1, 10000) g(x)")
-        wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
-
-        with pytest.raises(ReadTimeout):
-            client.timeline_checkpoint(tenant_id, timeline_id, timeout=5)
-        client.configure_failpoints(("before-upload-layer-pausable", "off"))
-
-
 def wait_upload_queue_empty(
     client: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId
 ):
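
This patch only flips the default; while the option still exists, a tenant can opt back in
(or rely on the replacement delay/stall thresholds) through its per-tenant config. Below is
a minimal sketch of how a regress test could do that, reusing the
`init_start(initial_tenant_conf=...)` pattern and the config keys that appear in the tests
touched above. The test name, the fixture import path, and the workload are illustrative
assumptions, not part of this patch.

```python
from fixtures.neon_fixtures import NeonEnvBuilder  # assumed fixture import path


def test_l0_flush_backpressure_opt_in(neon_env_builder: NeonEnvBuilder):
    # DEFAULT_L0_FLUSH_WAIT_UPLOAD is now false, so enable the old wait-for-upload
    # behaviour explicitly, alongside the delay/stall thresholds that replace it.
    env = neon_env_builder.init_start(
        initial_tenant_conf={
            "l0_flush_wait_upload": True,
            "l0_flush_delay_threshold": 25,
            "l0_flush_stall_threshold": 42,
        }
    )

    # Generate some writes so that L0 flushes (and the uploads they wait on) actually happen.
    with env.endpoints.create_start("main", tenant_id=env.initial_tenant) as endpoint:
        endpoint.safe_psql("CREATE TABLE foo AS SELECT x FROM generate_series(1, 10000) g(x)")
```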