Files
neon/test_runner/fixtures/metrics.py
Arpad Müller ee7bbdda0e Create new metric for directory counts (#6736)
There is O(n^2) issues due to how we store these directories (#6626), so
it's good to keep an eye on them and ensure the numbers stay low.

The new per-timeline metric `pageserver_directory_entries_count`
isn't perfect, namely we don't calculate it every time we attach
the timeline, but only if there is an actual change.
Also, it is a collective metric over multiple scalars. Lastly,
we only emit the metric if it is above a certain threshold.

However, the metric still give a feel for the general size of the timeline.
We care less for small values as the metric is mainly there to
detect and track tenants with large directory counts.

We also expose the directory counts in `TimelineInfo` so that one can
get the detailed size distribution directly via the pageserver's API.

Related: #6642 , https://github.com/neondatabase/cloud/issues/10273
2024-02-14 02:12:00 +01:00

102 lines
3.9 KiB
Python

from collections import defaultdict
from typing import Dict, List, Optional, Tuple
from prometheus_client.parser import text_string_to_metric_families
from prometheus_client.samples import Sample
class Metrics:
metrics: Dict[str, List[Sample]]
name: str
def __init__(self, name: str = ""):
self.metrics = defaultdict(list)
self.name = name
def query_all(self, name: str, filter: Optional[Dict[str, str]] = None) -> List[Sample]:
filter = filter or {}
res = []
for sample in self.metrics[name]:
try:
if all(sample.labels[k] == v for k, v in filter.items()):
res.append(sample)
except KeyError:
pass
return res
def query_one(self, name: str, filter: Optional[Dict[str, str]] = None) -> Sample:
res = self.query_all(name, filter or {})
assert len(res) == 1, f"expected single sample for {name} {filter}, found {res}"
return res[0]
def parse_metrics(text: str, name: str = "") -> Metrics:
metrics = Metrics(name)
gen = text_string_to_metric_families(text)
for family in gen:
for sample in family.samples:
metrics.metrics[sample.name].append(sample)
return metrics
def histogram(prefix_without_trailing_underscore: str) -> List[str]:
assert not prefix_without_trailing_underscore.endswith("_")
return [f"{prefix_without_trailing_underscore}_{x}" for x in ["bucket", "count", "sum"]]
PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS: Tuple[str, ...] = (
"pageserver_remote_timeline_client_calls_unfinished",
"pageserver_remote_physical_size",
"pageserver_remote_timeline_client_bytes_started_total",
"pageserver_remote_timeline_client_bytes_finished_total",
)
PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = (
"pageserver_storage_operations_seconds_global_count",
"pageserver_storage_operations_seconds_global_sum",
"pageserver_storage_operations_seconds_global_bucket",
"pageserver_unexpected_ondemand_downloads_count_total",
"libmetrics_launch_timestamp",
"libmetrics_build_info",
"libmetrics_tracing_event_count_total",
"pageserver_materialized_cache_hits_total",
"pageserver_materialized_cache_hits_direct_total",
"pageserver_page_cache_read_hits_total",
"pageserver_page_cache_read_accesses_total",
"pageserver_page_cache_size_current_bytes",
"pageserver_page_cache_size_max_bytes",
"pageserver_getpage_reconstruct_seconds_bucket",
"pageserver_getpage_reconstruct_seconds_count",
"pageserver_getpage_reconstruct_seconds_sum",
*[f"pageserver_basebackup_query_seconds_{x}" for x in ["bucket", "count", "sum"]],
*histogram("pageserver_smgr_query_seconds_global"),
*histogram("pageserver_read_num_fs_layers"),
*histogram("pageserver_getpage_get_reconstruct_data_seconds"),
*histogram("pageserver_wait_lsn_seconds"),
*histogram("pageserver_remote_operation_seconds"),
*histogram("pageserver_remote_timeline_client_calls_started"),
*histogram("pageserver_io_operations_seconds"),
"pageserver_tenant_states_count",
)
PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
"pageserver_current_logical_size",
"pageserver_resident_physical_size",
"pageserver_io_operations_bytes_total",
"pageserver_last_record_lsn",
"pageserver_smgr_query_seconds_bucket",
"pageserver_smgr_query_seconds_count",
"pageserver_smgr_query_seconds_sum",
"pageserver_storage_operations_seconds_count_total",
"pageserver_storage_operations_seconds_sum_total",
"pageserver_created_persistent_files_total",
"pageserver_written_persistent_bytes_total",
"pageserver_evictions_total",
"pageserver_evictions_with_low_residence_duration_total",
*PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
# "pageserver_directory_entries_count", -- only used if above a certain threshold
# "pageserver_broken_tenants_count" -- used only for broken
)