pageserver: disable l0_flush_wait_upload by default (#11215)

## Problem

`l0_flush_wait_upload` is already disabled in production, where it has been
replaced by L0 flush delays. The option itself will be removed in a later PR,
once it is no longer specified in the production config.

## Summary of changes

Disable `l0_flush_wait_upload` by default, and update or remove the tests that relied on the previous default.
This commit is contained in:
Erik Grinaker
2025-03-13 22:08:28 +01:00
committed by GitHub
parent 4ff000c042
commit d6d78a050f
4 changed files with 8 additions and 60 deletions

View File

@@ -289,6 +289,8 @@ pub struct TenantConfigToml {
/// If true, Level0 delta layer flushes will wait for S3 upload before flushing the next
/// layer. This is a temporary backpressure mechanism which should be removed once
/// l0_flush_{delay,stall}_threshold is fully enabled.
///
/// TODO: this is no longer enabled, remove it when the config option is no longer set.
pub l0_flush_wait_upload: bool,
// Determines how much history is retained, to allow
// branching and read replicas at an older point in time.
@@ -576,7 +578,7 @@ pub mod tenant_conf_defaults {
pub const DEFAULT_COMPACTION_ALGORITHM: crate::models::CompactionAlgorithm =
crate::models::CompactionAlgorithm::Legacy;
pub const DEFAULT_L0_FLUSH_WAIT_UPLOAD: bool = true;
pub const DEFAULT_L0_FLUSH_WAIT_UPLOAD: bool = false;
pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;

View File

@@ -144,7 +144,7 @@ def test_fully_custom_config(positive_env: NeonEnv):
"compaction_l0_semaphore": False,
"l0_flush_delay_threshold": 25,
"l0_flush_stall_threshold": 42,
"l0_flush_wait_upload": False,
"l0_flush_wait_upload": True,
"compaction_target_size": 1048576,
"checkpoint_distance": 10000,
"checkpoint_timeout": "13m",

View File

@@ -19,6 +19,7 @@ from fixtures.pageserver.utils import wait_until_tenant_active
from fixtures.utils import query_scalar
from performance.test_perf_pgbench import get_scales_matrix
from requests import RequestException
from requests.exceptions import RetryError
# Test branch creation
@@ -180,7 +181,6 @@ def test_cannot_create_endpoint_on_non_uploaded_timeline(neon_env_builder: NeonE
env.endpoints.create_start(
initial_branch, tenant_id=env.initial_tenant, basebackup_request_tries=2
)
ps_http.configure_failpoints(("before-upload-index-pausable", "off"))
finally:
env.pageserver.stop(immediate=True)
@@ -221,10 +221,7 @@ def test_cannot_branch_from_non_uploaded_branch(neon_env_builder: NeonEnvBuilder
branch_id = TimelineId.generate()
with pytest.raises(
PageserverApiException,
match="Cannot branch off the timeline that's not present in pageserver",
):
with pytest.raises(RetryError, match="too many 503 error responses"):
ps_http.timeline_create(
env.pg_version,
env.initial_tenant,

View File

@@ -29,7 +29,6 @@ from fixtures.remote_storage import (
from fixtures.utils import (
assert_eq,
assert_ge,
assert_gt,
print_gc_result,
query_scalar,
wait_until,
@@ -334,14 +333,12 @@ def test_remote_storage_upload_queue_retries(
# Exponential back-off in upload queue, so, gracious timeouts.
wait_until(
lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="upload"), 0), timeout=30
lambda: assert_ge(get_queued_count(file_kind="layer", op_kind="upload"), 1), timeout=30
)
wait_until(
lambda: assert_ge(get_queued_count(file_kind="index", op_kind="upload"), 1), timeout=30
)
wait_until(
lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0), timeout=30
)
# There may or may not be deletes queued up behind conflicting uploads; don't check.
# unblock churn operations
configure_storage_sync_failpoints("off")
@@ -786,54 +783,6 @@ def test_empty_branch_remote_storage_upload_on_restart(neon_env_builder: NeonEnv
create_thread.join()
def test_paused_upload_stalls_checkpoint(
neon_env_builder: NeonEnvBuilder,
):
"""
Check that checkpoints block on layer uploads to remote storage.

Layer uploads are paused with the `before-upload-layer-pausable` failpoint, after which a
`timeline_checkpoint` call with a 5 second timeout is expected to raise `ReadTimeout`.
"""
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
env = neon_env_builder.init_start(
initial_tenant_conf={
# Set a small compaction threshold
"compaction_threshold": "3",
# Disable GC
"gc_period": "0s",
# Disable PITR
"pitr_interval": "0s",
}
)
# NOTE(review): presumably this error is logged when the checkpoint request below is
# abandoned by the client after its timeout -- confirm.
env.pageserver.allowed_errors.append(
f".*PUT.* path=/v1/tenant/{env.initial_tenant}/timeline.* request was dropped before completing"
)
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
client = env.pageserver.http_client()
# Sanity-check the starting state: exactly one delta layer right after creation.
layers_at_creation = client.layer_map_info(tenant_id, timeline_id)
deltas_at_creation = len(layers_at_creation.delta_layers())
assert (
deltas_at_creation == 1
), "are you fixing #5863? make sure we end up with 2 deltas at the end of endpoint lifecycle"
# Make new layer uploads get stuck.
# Note that timeline creation waits for the initial layers to reach remote storage.
# So at this point, the `layers_at_creation` are in remote storage.
client.configure_failpoints(("before-upload-layer-pausable", "pause"))
with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
# Build a table with some data inside
endpoint.safe_psql("CREATE TABLE foo AS SELECT x FROM generate_series(1, 10000) g(x)")
wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
# With layer uploads paused, the checkpoint request is expected to exceed the 5s client timeout.
with pytest.raises(ReadTimeout):
client.timeline_checkpoint(tenant_id, timeline_id, timeout=5)
# Turn the upload failpoint back off.
client.configure_failpoints(("before-upload-layer-pausable", "off"))
def wait_upload_queue_empty(
client: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId
):