mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-18 13:40:37 +00:00
This reapplies #10135. Just removing this flush backpressure without further mitigations caused read amplification increases during bulk ingestion (predictably), so it was reverted. We will replace it with compaction-based backpressure.

## Problem

In #8550, we made the flush loop wait for uploads after every layer. This was to avoid unbounded buildup of uploads, and to reduce compaction debt. However, the approach has several problems:

* It prevents upload parallelism.
* It prevents flush and upload pipelining.
* It slows down ingestion even when there is no need to backpressure.
* It does not directly backpressure based on compaction debt and read amplification.

We will instead implement compaction-based backpressure in a PR immediately following this removal (#5415).

Touches #5415.
Touches #10095.

## Summary of changes

Remove waiting on the upload queue in the flush loop.
This commit is contained in:
@@ -165,7 +165,6 @@ PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = (
|
||||
"pageserver_evictions_with_low_residence_duration_total",
|
||||
"pageserver_aux_file_estimated_size",
|
||||
"pageserver_valid_lsn_lease_count",
|
||||
"pageserver_flush_wait_upload_seconds",
|
||||
counter("pageserver_tenant_throttling_count_accounted_start"),
|
||||
counter("pageserver_tenant_throttling_count_accounted_finish"),
|
||||
counter("pageserver_tenant_throttling_wait_usecs_sum"),
|
||||
|
||||
@@ -19,6 +19,7 @@ from fixtures.pageserver.utils import wait_until_tenant_active
|
||||
from fixtures.utils import query_scalar
|
||||
from performance.test_perf_pgbench import get_scales_matrix
|
||||
from requests import RequestException
|
||||
from requests.exceptions import RetryError
|
||||
|
||||
|
||||
# Test branch creation
|
||||
@@ -176,11 +177,8 @@ def test_cannot_create_endpoint_on_non_uploaded_timeline(neon_env_builder: NeonE
|
||||
|
||||
env.neon_cli.mappings_map_branch(initial_branch, env.initial_tenant, env.initial_timeline)
|
||||
|
||||
with pytest.raises(RuntimeError, match="ERROR: Not found: Timeline"):
|
||||
env.endpoints.create_start(
|
||||
initial_branch, tenant_id=env.initial_tenant, basebackup_request_tries=2
|
||||
)
|
||||
ps_http.configure_failpoints(("before-upload-index-pausable", "off"))
|
||||
with pytest.raises(RuntimeError, match="is not active, state: Loading"):
|
||||
env.endpoints.create_start(initial_branch, tenant_id=env.initial_tenant)
|
||||
finally:
|
||||
env.pageserver.stop(immediate=True)
|
||||
|
||||
@@ -221,10 +219,7 @@ def test_cannot_branch_from_non_uploaded_branch(neon_env_builder: NeonEnvBuilder
|
||||
|
||||
branch_id = TimelineId.generate()
|
||||
|
||||
with pytest.raises(
|
||||
PageserverApiException,
|
||||
match="Cannot branch off the timeline that's not present in pageserver",
|
||||
):
|
||||
with pytest.raises(RetryError, match="too many 503 error responses"):
|
||||
ps_http.timeline_create(
|
||||
env.pg_version,
|
||||
env.initial_tenant,
|
||||
|
||||
@@ -784,54 +784,6 @@ def test_empty_branch_remote_storage_upload_on_restart(neon_env_builder: NeonEnv
|
||||
create_thread.join()
|
||||
|
||||
|
||||
def test_paused_upload_stalls_checkpoint(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
):
|
||||
"""
|
||||
This test checks that checkpoints block on uploads to remote storage.
|
||||
"""
|
||||
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
|
||||
|
||||
env = neon_env_builder.init_start(
|
||||
initial_tenant_conf={
|
||||
# Set a small compaction threshold
|
||||
"compaction_threshold": "3",
|
||||
# Disable GC
|
||||
"gc_period": "0s",
|
||||
# disable PITR
|
||||
"pitr_interval": "0s",
|
||||
}
|
||||
)
|
||||
|
||||
env.pageserver.allowed_errors.append(
|
||||
f".*PUT.* path=/v1/tenant/{env.initial_tenant}/timeline.* request was dropped before completing"
|
||||
)
|
||||
|
||||
tenant_id = env.initial_tenant
|
||||
timeline_id = env.initial_timeline
|
||||
|
||||
client = env.pageserver.http_client()
|
||||
layers_at_creation = client.layer_map_info(tenant_id, timeline_id)
|
||||
deltas_at_creation = len(layers_at_creation.delta_layers())
|
||||
assert (
|
||||
deltas_at_creation == 1
|
||||
), "are you fixing #5863? make sure we end up with 2 deltas at the end of endpoint lifecycle"
|
||||
|
||||
# Make new layer uploads get stuck.
|
||||
# Note that timeline creation waits for the initial layers to reach remote storage.
|
||||
# So at this point, the `layers_at_creation` are in remote storage.
|
||||
client.configure_failpoints(("before-upload-layer-pausable", "pause"))
|
||||
|
||||
with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
|
||||
# Build two tables with some data inside
|
||||
endpoint.safe_psql("CREATE TABLE foo AS SELECT x FROM generate_series(1, 10000) g(x)")
|
||||
wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
|
||||
|
||||
with pytest.raises(ReadTimeout):
|
||||
client.timeline_checkpoint(tenant_id, timeline_id, timeout=5)
|
||||
client.configure_failpoints(("before-upload-layer-pausable", "off"))
|
||||
|
||||
|
||||
def wait_upload_queue_empty(
|
||||
client: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId
|
||||
):
|
||||
|
||||
Reference in New Issue
Block a user