pageserver: do not bump priority of background task for timeline status requests (#6301)

## Problem Previously, `GET /v1/tenant/:tenant_id/timeline` and `GET /v1/tenant/:tenant_id/timeline/:timeline_id` would bump the priority of the background task which computes the initial logical size by cancelling the wait on the synchronisation semaphore. However, the request would still return an approximate logical size. It's undesirable to force background work for a status request. ## Summary of changes This PR updates the priority used by the timeline status request such that they don't do priority boosting by default anymore. An optional query parameter, `force-await-initial-logical-size`, is added for both mentioned endpoints. When set to true, it will skip the concurrency limiting semaphore and wait for the background task to complete before returning the exact logical size. In order to exercise this behaviour in a test I had to add an extra failpoint. If you think it's too intrusive, it can be removed. Also fixed a small bug where the cancellation of a download is reported as an opaque download failure upstream. This caused `test_location_conf_churn` to fail at teardown due to a WARN log line. Closes https://github.com/neondatabase/neon/issues/6168
2026-07-13 17:10:39 +00:00 · 2024-01-11 15:55:32 +00:00
parent 551f0cc097
commit da7a7c867e
9 changed files with 138 additions and 14 deletions
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -441,6 +441,7 @@ class PageserverHttpClient(requests.Session):
        timeline_id: TimelineId,
        include_non_incremental_logical_size: bool = False,
        include_timeline_dir_layer_file_size_sum: bool = False,
+        force_await_initial_logical_size: bool = False,
        **kwargs,
    ) -> Dict[Any, Any]:
        params = {}
@@ -448,6 +449,8 @@ class PageserverHttpClient(requests.Session):
            params["include-non-incremental-logical-size"] = "true"
        if include_timeline_dir_layer_file_size_sum:
            params["include-timeline-dir-layer-file-size-sum"] = "true"
+        if force_await_initial_logical_size:
+            params["force-await-initial-logical-size"] = "true"

        res = self.get(
            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}",
--- a/test_runner/regress/test_timeline_size.py
+++ b/test_runner/regress/test_timeline_size.py
@@ -923,3 +923,68 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
    # Check that all the stuck tenants proceed to active (apart from the one that deletes)
    wait_until(10, 1, all_active)
    assert len(get_tenant_states()) == n_tenants - 1
+
+
+def test_timeline_logical_size_task_priority(neon_env_builder: NeonEnvBuilder):
+    """
+    /v1/tenant/:tenant_shard_id/timeline and
+    /v1/tenant/:tenant_shard_id/timeline/:timeline_id
+    should not bump the priority of the initial logical size computation
+    background task, unless the force-await-initial-logical-size query param
+    is set to true.
+
+    This test verifies the invariant stated above. A couple of tricks are involved:
+    1. Detach the tenant and re-attach it after the page server is restarted. This circumvents
+    the warm-up which forces the initial logical size calculation.
+    2. A fail point (initial-size-calculation-permit-pause) is used to block the initial
+    computation of the logical size until forced.
+    3. A fail point (walreceiver-after-ingest) is used to pause the walreceiver since
+    otherwise it would force the logical size computation.
+    """
+    env = neon_env_builder.init_start()
+    client = env.pageserver.http_client()
+
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    # load in some data
+    endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
+    endpoint.safe_psql_many(
+        [
+            "CREATE TABLE foo (x INTEGER)",
+            "INSERT INTO foo SELECT g FROM generate_series(1, 10000) g",
+        ]
+    )
+    wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
+
+    # restart with failpoint inside initial size calculation task
+    log.info(f"Detaching tenant {tenant_id} and stopping pageserver...")
+
+    endpoint.stop()
+    env.pageserver.tenant_detach(tenant_id)
+    env.pageserver.stop()
+    # Both failpoints are set to "pause" via the environment so they are
+    # already in effect when the pageserver comes up.
+    env.pageserver.start(
+        extra_env_vars={
+            "FAILPOINTS": "initial-size-calculation-permit-pause=pause;walreceiver-after-ingest=pause"
+        }
+    )
+
+    log.info(f"Re-attaching tenant {tenant_id}...")
+    env.pageserver.tenant_attach(tenant_id)
+
+    # kick off initial size calculation task (the response we get here is the estimated size)
+    def assert_initial_logical_size_not_prioritised():
+        # A plain status request (no force flag) must report an inexact size.
+        details = client.timeline_detail(tenant_id, timeline_id)
+        assert details["current_logical_size_is_accurate"] is False
+
+    assert_initial_logical_size_not_prioritised()
+
+    # ensure that's actually the case: after waiting a little, the size must
+    # still be reported as inexact, i.e. the earlier request did not make the
+    # paused background computation proceed
+    time.sleep(2)
+    assert_initial_logical_size_not_prioritised()
+
+    # With the force flag set, the request is expected to return only once the
+    # exact logical size is available.
+    details = client.timeline_detail(tenant_id, timeline_id, force_await_initial_logical_size=True)
+    assert details["current_logical_size_is_accurate"] is True
+
+    # Switch both failpoints off again (presumably so nothing stays paused
+    # during teardown -- TODO confirm).
+    client.configure_failpoints(
+        [("initial-size-calculation-permit-pause", "off"), ("walreceiver-after-ingest", "off")]
+    )