mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-08 22:12:56 +00:00
This patch adds a LaunchTimestamp type to the `metrics` crate, along with a `libmetric_` Prometheus metric. The initial user is pageserver. In addition to exposing the Prometheus metric, it also reproduces the launch timestamp as a header in the API responses. The motivation for this is that we plan to scrape the pageserver's /v1/tenant/:tenant_id/timeline/:timeline_id/layer HTTP endpoint over time. It will soon expose access metrics (#3496) which reset upon process restart. We will use the pageserver's launch ID to identify a restart between two scrape points. However, there are other potential uses. For example, we could use the Prometheus metric to annotate Grafana plots whenever the launch timestamp changes.
82 lines
2.9 KiB
Python
82 lines
2.9 KiB
Python
from collections import defaultdict
|
|
from typing import Dict, List, Optional, Tuple
|
|
|
|
from prometheus_client.parser import text_string_to_metric_families
|
|
from prometheus_client.samples import Sample
|
|
|
|
|
|
class Metrics:
|
|
metrics: Dict[str, List[Sample]]
|
|
name: str
|
|
|
|
def __init__(self, name: str = ""):
|
|
self.metrics = defaultdict(list)
|
|
self.name = name
|
|
|
|
def query_all(self, name: str, filter: Dict[str, str]) -> List[Sample]:
|
|
res = []
|
|
for sample in self.metrics[name]:
|
|
try:
|
|
if all(sample.labels[k] == v for k, v in filter.items()):
|
|
res.append(sample)
|
|
except KeyError:
|
|
pass
|
|
return res
|
|
|
|
def query_one(self, name: str, filter: Optional[Dict[str, str]] = None) -> Sample:
|
|
res = self.query_all(name, filter or {})
|
|
assert len(res) == 1, f"expected single sample for {name} {filter}, found {res}"
|
|
return res[0]
|
|
|
|
|
|
def parse_metrics(text: str, name: str = "") -> Metrics:
|
|
metrics = Metrics(name)
|
|
gen = text_string_to_metric_families(text)
|
|
for family in gen:
|
|
for sample in family.samples:
|
|
metrics.metrics[sample.name].append(sample)
|
|
|
|
return metrics
|
|
|
|
|
|
PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS: Tuple[str, ...] = (
|
|
"pageserver_remote_timeline_client_calls_unfinished",
|
|
*[f"pageserver_remote_timeline_client_calls_started_{x}" for x in ["bucket", "count", "sum"]],
|
|
*[f"pageserver_remote_operation_seconds_{x}" for x in ["bucket", "count", "sum"]],
|
|
"pageserver_remote_physical_size",
|
|
)
|
|
|
|
PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = (
|
|
"pageserver_storage_operations_seconds_global_count",
|
|
"pageserver_storage_operations_seconds_global_sum",
|
|
"pageserver_storage_operations_seconds_global_bucket",
|
|
"libmetrics_launch_timestamp",
|
|
"libmetrics_build_info",
|
|
)
|
|
|
|
PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
|
|
"pageserver_current_logical_size",
|
|
"pageserver_resident_physical_size",
|
|
"pageserver_getpage_reconstruct_seconds_bucket",
|
|
"pageserver_getpage_reconstruct_seconds_count",
|
|
"pageserver_getpage_reconstruct_seconds_sum",
|
|
"pageserver_io_operations_bytes_total",
|
|
"pageserver_io_operations_seconds_bucket",
|
|
"pageserver_io_operations_seconds_count",
|
|
"pageserver_io_operations_seconds_sum",
|
|
"pageserver_last_record_lsn",
|
|
"pageserver_materialized_cache_hits_total",
|
|
"pageserver_smgr_query_seconds_bucket",
|
|
"pageserver_smgr_query_seconds_count",
|
|
"pageserver_smgr_query_seconds_sum",
|
|
"pageserver_storage_operations_seconds_count_total",
|
|
"pageserver_storage_operations_seconds_sum_total",
|
|
"pageserver_wait_lsn_seconds_bucket",
|
|
"pageserver_wait_lsn_seconds_count",
|
|
"pageserver_wait_lsn_seconds_sum",
|
|
"pageserver_created_persistent_files_total",
|
|
"pageserver_written_persistent_bytes_total",
|
|
"pageserver_tenant_states_count",
|
|
*PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
|
|
)
|