feat(metrics): per-timeline metric for on-demand downloads, remove calls_started histogram (#6834)

refs #6737 

# Problem

Before this PR, on-demand downloads weren't measured per tenant_id.
This made root-cause analysis of latency spikes harder, forcing us to
resort to log scraping with queries like

```
{neon_service="pageserver"} |= `downloading on-demand` |= `$tenant_id`
```

which can be expensive when zooming out in Grafana.

Context: https://neondb.slack.com/archives/C033RQ5SPDH/p1707809037868189

# Solution / Changes

- Remove the `calls_started` histogram.
  - I did the due diligence: only 2 dashboards use this histogram, and in fact only one uses it as a histogram; the other uses it merely as a counter.
    - [Link 1](8115b54d9f/neonprod/dashboards/hkXNF7oVz/dashboard-Z31XmM24k.yaml (L1454)): `Pageserver Thrashing` dashboard, linked from the playbook, will fix.
    - [Link 2](8115b54d9f/neonprod/dashboards/CEllzAO4z/dashboard-sJqfNFL4k.yaml (L599)): one of my personal dashboards, unused for a long time and already broken in other ways, no need to fix.
- Replace the `pageserver_remote_timeline_client_calls_unfinished` gauge
  with a counter pair (see the sketch right after this list).
  - This required a `Clone`-able `IntCounterPair`; the necessary changes
    were made in the `libs/metrics` crate.
- Fix tests to deal with the fallout.
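
For consumers that previously read the removed gauge directly, the same quantity can be derived from the new counter pair. A minimal sketch of that derivation (illustrative only, not code from this PR; the helper name is made up):

```python
# Hypothetical helper: recover the former "calls_unfinished" gauge value from
# the new counter pair. `started` and `finished` are the scraped values of
# pageserver_remote_timeline_client_calls_{started,finished}_total for one
# label set (tenant_id, timeline_id, file_kind, op_kind).
def queue_depth(started: int, finished: int) -> int:
    depth = started - finished
    # finished can never overtake started for a single label set
    assert depth >= 0
    return depth


# e.g. queue_depth(42, 40) == 2 calls still sitting in the upload queue
```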

A subsequent PR will remove a timeline-scoped metric to compensate.

Note that we don't need additional global counters to go with the
per-timeline counters affected by this PR; for the global view we can use
the `remote_storage` histogram. Conveniently, that histogram also covers
the secondary-mode downloads, which aren't covered by the remote timeline
client metrics (should they be?).
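
As a reminder of why a histogram can stand in for a counter: every Prometheus histogram exports a `<name>_count` series, which is itself a cumulative counter of observations. A minimal sketch using the `MetricsGetter`-style fixture touched by this PR (the histogram name below is a placeholder, not necessarily the exact `remote_storage` metric meant above):

```python
# Sketch: read a global operation count from a histogram's _count series
# instead of a dedicated counter. Assumes a MetricsGetter-like fixture whose
# get_metrics().query_all(name, filter) returns samples with a .value field,
# as in the test fixture changes below.
def global_operation_count(metrics_getter, histogram_name: str) -> int:
    # e.g. histogram_name = "pageserver_remote_operation_seconds" (placeholder)
    samples = metrics_getter.get_metrics().query_all(f"{histogram_name}_count", {})
    # sum across all label sets to get the global total
    return int(sum(s.value for s in samples))
```
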
Commit e49602ecf5 (parent eb02f4619e) by Christian Schwarz, 2024-02-20 17:52:23 +01:00, committed via GitHub.
7 changed files with 125 additions and 136 deletions.

@@ -54,7 +54,7 @@ class MetricsGetter:
         return results[0].value
     def get_metrics_values(
-        self, names: list[str], filter: Optional[Dict[str, str]] = None
+        self, names: list[str], filter: Optional[Dict[str, str]] = None, absence_ok=False
     ) -> Dict[str, float]:
         """
         When fetching multiple named metrics, it is more efficient to use this
@@ -63,6 +63,10 @@ class MetricsGetter:
         Throws RuntimeError if no metrics matching `names` are found, or if
         not all of `names` are found: this method is intended for loading sets
         of metrics whose existence is coupled.
+        If it's expected that there may be no results for some of the metrics,
+        specify `absence_ok=True`. The returned dict will then not contain values
+        for these metrics.
         """
         metrics = self.get_metrics()
         samples = []
@@ -75,9 +79,10 @@ class MetricsGetter:
                 raise RuntimeError(f"Multiple values found for {sample.name}")
             result[sample.name] = sample.value
-        if len(result) != len(names):
-            log.info(f"Metrics found: {metrics.metrics}")
-            raise RuntimeError(f"could not find all metrics {' '.join(names)}")
+        if not absence_ok:
+            if len(result) != len(names):
+                log.info(f"Metrics found: {metrics.metrics}")
+                raise RuntimeError(f"could not find all metrics {' '.join(names)}")
         return result
@@ -98,7 +103,8 @@ def histogram(prefix_without_trailing_underscore: str) -> List[str]:
 PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS: Tuple[str, ...] = (
-    "pageserver_remote_timeline_client_calls_unfinished",
+    "pageserver_remote_timeline_client_calls_started_total",
+    "pageserver_remote_timeline_client_calls_finished_total",
     "pageserver_remote_physical_size",
     "pageserver_remote_timeline_client_bytes_started_total",
     "pageserver_remote_timeline_client_bytes_finished_total",
@@ -127,7 +133,6 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = (
     *histogram("pageserver_getpage_get_reconstruct_data_seconds"),
     *histogram("pageserver_wait_lsn_seconds"),
     *histogram("pageserver_remote_operation_seconds"),
-    *histogram("pageserver_remote_timeline_client_calls_started"),
     *histogram("pageserver_io_operations_seconds"),
     "pageserver_tenant_states_count",
 )

@@ -694,32 +694,33 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
             },
         ).value
-    def get_remote_timeline_client_metric(
+    def get_remote_timeline_client_queue_count(
         self,
-        metric_name: str,
         tenant_id: TenantId,
         timeline_id: TimelineId,
         file_kind: str,
         op_kind: str,
-    ) -> Optional[float]:
-        metrics = self.get_metrics()
-        matches = metrics.query_all(
-            name=metric_name,
+    ) -> Optional[int]:
+        metrics = [
+            "pageserver_remote_timeline_client_calls_started_total",
+            "pageserver_remote_timeline_client_calls_finished_total",
+        ]
+        res = self.get_metrics_values(
+            metrics,
             filter={
                 "tenant_id": str(tenant_id),
                 "timeline_id": str(timeline_id),
                 "file_kind": str(file_kind),
                 "op_kind": str(op_kind),
             },
+            absence_ok=True,
         )
-        if len(matches) == 0:
-            value = None
-        elif len(matches) == 1:
-            value = matches[0].value
-            assert value is not None
-        else:
-            assert len(matches) < 2, "above filter should uniquely identify metric"
-        return value
+        if len(res) != 2:
+            return None
+        inc, dec = [res[metric] for metric in metrics]
+        queue_count = int(inc) - int(dec)
+        assert queue_count >= 0
+        return queue_count
     def layer_map_info(
         self,

@@ -1,5 +1,5 @@
 import time
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Tuple, Union
 from mypy_boto3_s3.type_defs import (
     DeleteObjectOutputTypeDef,
@@ -221,16 +221,40 @@ def wait_for_upload_queue_empty(
 ):
     while True:
         all_metrics = pageserver_http.get_metrics()
-        tl = all_metrics.query_all(
-            "pageserver_remote_timeline_client_calls_unfinished",
+        started = all_metrics.query_all(
+            "pageserver_remote_timeline_client_calls_started_total",
             {
                 "tenant_id": str(tenant_id),
                 "timeline_id": str(timeline_id),
             },
         )
-        assert len(tl) > 0
-        log.info(f"upload queue for {tenant_id}/{timeline_id}: {tl}")
-        if all(m.value == 0 for m in tl):
+        finished = all_metrics.query_all(
+            "pageserver_remote_timeline_client_calls_finished_total",
+            {
+                "tenant_id": str(tenant_id),
+                "timeline_id": str(timeline_id),
+            },
+        )
+        assert len(started) == len(finished)
+        # this is `started` left join `finished`; on a match, subtract finished from started to get the queue depth
+        remaining_labels = ["shard_id", "file_kind", "op_kind"]
+        tl: List[Tuple[Any, float]] = []
+        for s in started:
+            found = False
+            for f in finished:
+                if all([s.labels[label] == f.labels[label] for label in remaining_labels]):
+                    assert (
+                        not found
+                    ), "duplicate match, remaining_labels don't uniquely identify sample"
+                    tl.append((s.labels, int(s.value) - int(f.value)))
+                    found = True
+            if not found:
+                tl.append((s.labels, int(s.value)))
+        assert len(tl) == len(started), "something broken with join logic"
+        log.info(f"upload queue for {tenant_id}/{timeline_id}:")
+        for labels, queue_count in tl:
+            log.info(f"  {labels}: {queue_count}")
+        if all(queue_count == 0 for (_, queue_count) in tl):
             return
         time.sleep(0.2)

@@ -274,15 +274,9 @@ def test_remote_storage_upload_queue_retries(
     wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
     def get_queued_count(file_kind, op_kind):
-        val = client.get_remote_timeline_client_metric(
-            "pageserver_remote_timeline_client_calls_unfinished",
-            tenant_id,
-            timeline_id,
-            file_kind,
-            op_kind,
+        return client.get_remote_timeline_client_queue_count(
+            tenant_id, timeline_id, file_kind, op_kind
         )
-        assert val is not None, "expecting metric to be present"
-        return int(val)
     # create some layers & wait for uploads to finish
     overwrite_data_and_wait_for_it_to_arrive_at_pageserver("a")
@@ -434,7 +428,7 @@ def test_remote_timeline_client_calls_started_metric(
     assert timeline_id is not None
     for (file_kind, op_kind), observations in calls_started.items():
         val = client.get_metric_value(
-            name="pageserver_remote_timeline_client_calls_started_count",
+            name="pageserver_remote_timeline_client_calls_started_total",
             filter={
                 "file_kind": str(file_kind),
                 "op_kind": str(op_kind),
@@ -537,16 +531,6 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
     client = env.pageserver.http_client()
-    def get_queued_count(file_kind, op_kind):
-        val = client.get_remote_timeline_client_metric(
-            "pageserver_remote_timeline_client_calls_unfinished",
-            tenant_id,
-            timeline_id,
-            file_kind,
-            op_kind,
-        )
-        return int(val) if val is not None else val
     endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
     client.configure_failpoints(("before-upload-layer", "return"))
@@ -580,7 +564,10 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
     def assert_compacted_and_uploads_queued():
         assert timeline_path.exists()
         assert len(list(timeline_path.glob("*"))) >= 8
-        assert get_queued_count(file_kind="index", op_kind="upload") > 0
+        assert (
+            get_queued_count(client, tenant_id, timeline_id, file_kind="index", op_kind="upload")
+            > 0
+        )
     wait_until(20, 0.1, assert_compacted_and_uploads_queued)
@@ -618,7 +605,10 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
         assert len(filtered) == 0
     # timeline deletion should kill ongoing uploads, so, the metric will be gone
-    assert get_queued_count(file_kind="index", op_kind="upload") is None
+    assert (
+        get_queued_count(client, tenant_id, timeline_id, file_kind="index", op_kind="upload")
+        is None
+    )
     # timeline deletion should be unblocking checkpoint ops
     checkpoint_thread.join(2.0)
@@ -919,16 +909,8 @@ def get_queued_count(
     file_kind: str,
     op_kind: str,
 ):
-    val = client.get_remote_timeline_client_metric(
-        "pageserver_remote_timeline_client_calls_unfinished",
-        tenant_id,
-        timeline_id,
-        file_kind,
-        op_kind,
-    )
-    if val is None:
-        return val
-    return int(val)
+    """The most important aspect of this function is shorter name & no return type so asserts are more concise."""
+    return client.get_remote_timeline_client_queue_count(tenant_id, timeline_id, file_kind, op_kind)
 def assert_nothing_to_upload(