mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-16 20:50:37 +00:00
For the "worst-case /storage usage panel", we need to compute ``` remote size + local-only size ``` We currently don't have a metric for local-only layers. The number of in-flight layers in the upload queue is just that, so, let Prometheus scrape it. The metric is two counters (started and finished). The delta is the amount of in-flight uploads in the queue. The metrics are incremented in the respective `call_unfinished_metric_*` functions. These track ongoing operations by file_kind and op_kind. We only need this metric for layer uploads, so, there's the new RemoteTimelineClientMetricsCallTrackSize type that forces all call sites to decide whether they want the size tracked or not. If we find that other file_kinds or op_kinds (metadata uploads, layer downloads, layer deletes) are interesting, we can just enable them, and they'll be just another label combination within the metrics that this PR adds. fixes https://github.com/neondatabase/neon/issues/3922
88 lines
3.2 KiB
Python
88 lines
3.2 KiB
Python
from collections import defaultdict
|
|
from typing import Dict, List, Optional, Tuple
|
|
|
|
from prometheus_client.parser import text_string_to_metric_families
|
|
from prometheus_client.samples import Sample
|
|
|
|
|
|
class Metrics:
    """Collection of Prometheus samples, grouped by metric (sample) name.

    ``metrics`` maps a name to the list of samples stored under it and
    ``name`` is a free-form label for the collection (empty by default).
    """

    metrics: Dict[str, List[Sample]]
    name: str

    def __init__(self, name: str = ""):
        # A defaultdict lets writers append under a new name without having
        # to create the list first.
        self.metrics = defaultdict(list)
        self.name = name

    def query_all(self, name: str, filter: Optional[Dict[str, str]] = None) -> List[Sample]:
        """Return all samples stored under *name* whose labels match *filter*.

        A sample matches when, for every ``label: value`` pair in *filter*,
        it carries that label with exactly that value. Samples that lack one
        of the requested labels are skipped. With no filter (``None`` or an
        empty dict) every sample stored under *name* is returned. An unknown
        *name* yields an empty list.
        """
        wanted = filter or {}
        # Look the name up with .get() instead of indexing: indexing a
        # defaultdict inserts an empty list for every unknown name, so a
        # read-only query would silently add entries to ``self.metrics``.
        return [
            sample
            for sample in self.metrics.get(name, ())
            if all(
                label in sample.labels and sample.labels[label] == value
                for label, value in wanted.items()
            )
        ]

    def query_one(self, name: str, filter: Optional[Dict[str, str]] = None) -> Sample:
        """Return the single sample under *name* that matches *filter*.

        Raises:
            AssertionError: if zero or more than one sample matches.
        """
        res = self.query_all(name, filter or {})
        # Deliberately an assert: callers get an AssertionError whose message
        # lists everything that was found.
        assert len(res) == 1, f"expected single sample for {name} {filter}, found {res}"
        return res[0]
|
|
|
|
|
|
def parse_metrics(text: str, name: str = "") -> Metrics:
    """Parse Prometheus exposition *text* into a ``Metrics`` collection.

    Every sample of every parsed metric family is appended to the
    collection under the sample's own name. *name* is passed through to the
    ``Metrics`` constructor.
    """
    parsed = Metrics(name)
    all_samples = (
        sample
        for family in text_string_to_metric_families(text)
        for sample in family.samples
    )
    for sample in all_samples:
        parsed.metrics[sample.name].append(sample)
    return parsed
|
|
|
|
|
|
# Names of the pageserver remote-timeline-client metrics (per tenant, going
# by the constant's name). The two families in the generator contribute their
# `_bucket`, `_count` and `_sum` series, in that order.
PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS: Tuple[str, ...] = (
    "pageserver_remote_timeline_client_calls_unfinished",
    *(
        f"{family}_{suffix}"
        for family in (
            "pageserver_remote_timeline_client_calls_started",
            "pageserver_remote_operation_seconds",
        )
        for suffix in ("bucket", "count", "sum")
    ),
    "pageserver_remote_physical_size",
    "pageserver_remote_timeline_client_bytes_started_total",
    "pageserver_remote_timeline_client_bytes_finished_total",
)
|
|
|
|
# Names of the pageserver metrics that are global (going by the constant's
# name), followed by the libmetrics process-level metrics.
PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = (
    *(
        f"pageserver_storage_operations_seconds_global_{suffix}"
        for suffix in ("count", "sum", "bucket")
    ),
    *(
        f"libmetrics_{metric}"
        for metric in ("launch_timestamp", "build_info", "tracing_event_count_total")
    ),
)
|
|
|
|
# Names of the pageserver metrics that are per tenant (going by the
# constant's name). Families built with a generator contribute their
# `_bucket`, `_count` and `_sum` series, in that order. The remote timeline
# client metric names are appended at the end.
PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
    "pageserver_current_logical_size",
    "pageserver_resident_physical_size",
    *(
        f"pageserver_getpage_reconstruct_seconds_{suffix}"
        for suffix in ("bucket", "count", "sum")
    ),
    "pageserver_io_operations_bytes_total",
    *(
        f"pageserver_io_operations_seconds_{suffix}"
        for suffix in ("bucket", "count", "sum")
    ),
    "pageserver_last_record_lsn",
    "pageserver_materialized_cache_hits_total",
    *(
        f"pageserver_smgr_query_seconds_{suffix}"
        for suffix in ("bucket", "count", "sum")
    ),
    "pageserver_storage_operations_seconds_count_total",
    "pageserver_storage_operations_seconds_sum_total",
    *(
        f"pageserver_wait_lsn_seconds_{suffix}"
        for suffix in ("bucket", "count", "sum")
    ),
    "pageserver_created_persistent_files_total",
    "pageserver_written_persistent_bytes_total",
    "pageserver_tenant_states_count",
    "pageserver_evictions_total",
    "pageserver_evictions_with_low_residence_duration_total",
    *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
)
|