Files
neon/test_runner/fixtures/metrics.py
John Spray 1678dea20f pageserver: add layer visibility calculation (#8511)
## Problem

We recently added a "visibility" state to layers, but nothing
initializes it.

Part of:
- #8398 

## Summary of changes

- Add a dependency on `range-set-blaze`, which is used as a fast
incrementally updated alternative to KeySpace. We could also use this to
replace the internals of KeySpaceRandomAccum if we wanted to. Writing a
type that does this kind of "BtreeMap & merge overlapping entries" thing
isn't super complicated, but no reason to write this ourselves when
there's a third party impl available.
- Add a function to layermap to calculate visibilities for each layer
- Add a function to Timeline to call into layermap and then apply these
visibilities to the Layer objects.
- Invoke the calculation during startup, after image layer creations,
and when removing branches. Branch removal and image layer creation are
the two ways that a layer can go from Visible to Covered.
- Add unit test & benchmark for the visibility calculation
- Expose `pageserver_visible_physical_size` metric, which should always
be <= `pageserver_remote_physical_size`.
- This metric will feed into the /v1/utilization endpoint later: the
visible size indicates how much space we would like to use on this
pageserver for this tenant.
- When `pageserver_visible_physical_size` is greater than
`pageserver_resident_physical_size`, this is a sign that the tenant has
long-idle branches, which result in layers that are visible in
principle, but not used in practice.

This does not keep visibility hints up to date in all cases:
particularly, when creating a child timeline, any previously covered
layers will not get marked Visible until they are accessed.

Updates after image layer creation could be implemented as more of a
special case, but this would require more new code: the existing depth
calculation code doesn't maintain+yield the list of deltas that would be
covered by an image layer.

## Performance

This operation is done rarely (at startup and at timeline deletion), so
needs to be efficient but not ultra-fast.

There is a new `visibility` bench that measures runtime for a synthetic
100k layers case (`sequential`) and a real layer map (`real_map`) with
~26k layers.

The benchmark shows runtimes of single digit milliseconds (on a ryzen
7950). This confirms that the runtime shouldn't be a problem at startup
(as we already incur S3-level latencies there), but that it's slow
enough that we definitely shouldn't call it more often than necessary,
and it may be worthwhile to optimize further later (things like: when
removing a branch, only bother scanning layers below the branchpoint)

```
visibility/sequential   time:   [4.5087 ms 4.5894 ms 4.6775 ms]
                        change: [+2.0826% +3.9097% +5.8995%] (p = 0.00 < 0.05)
                        Performance has regressed.
Found 24 outliers among 100 measurements (24.00%)
  2 (2.00%) high mild
  22 (22.00%) high severe
min: 0/1696070, max: 93/1C0887F0
visibility/real_map     time:   [7.0796 ms 7.0832 ms 7.0871 ms]
                        change: [+0.3900% +0.4505% +0.5164%] (p = 0.00 < 0.05)
                        Change within noise threshold.
Found 4 outliers among 100 measurements (4.00%)
  3 (3.00%) high mild
  1 (1.00%) high severe
min: 0/1696070, max: 93/1C0887F0
visibility/real_map_many_branches
                        time:   [4.5285 ms 4.5355 ms 4.5434 ms]
                        change: [-1.0012% -0.8004% -0.5969%] (p = 0.00 < 0.05)
                        Change within noise threshold.
```
2024-08-01 09:25:35 +00:00

164 lines
6.1 KiB
Python

from collections import defaultdict
from typing import Dict, List, Optional, Tuple
from prometheus_client.parser import text_string_to_metric_families
from prometheus_client.samples import Sample
from fixtures.log_helper import log
class Metrics:
metrics: Dict[str, List[Sample]]
name: str
def __init__(self, name: str = ""):
self.metrics = defaultdict(list)
self.name = name
def query_all(self, name: str, filter: Optional[Dict[str, str]] = None) -> List[Sample]:
filter = filter or {}
res = []
for sample in self.metrics[name]:
try:
if all(sample.labels[k] == v for k, v in filter.items()):
res.append(sample)
except KeyError:
pass
return res
def query_one(self, name: str, filter: Optional[Dict[str, str]] = None) -> Sample:
res = self.query_all(name, filter or {})
assert len(res) == 1, f"expected single sample for {name} {filter}, found {res}"
return res[0]
class MetricsGetter:
"""
Mixin for types that implement a `get_metrics` function and would like associated
helpers for querying the metrics
"""
def get_metrics(self) -> Metrics:
raise NotImplementedError()
def get_metric_value(
self, name: str, filter: Optional[Dict[str, str]] = None
) -> Optional[float]:
metrics = self.get_metrics()
results = metrics.query_all(name, filter=filter)
if not results:
log.info(f'could not find metric "{name}"')
return None
assert len(results) == 1, f"metric {name} with given filters is not unique, got: {results}"
return results[0].value
def get_metrics_values(
self, names: list[str], filter: Optional[Dict[str, str]] = None, absence_ok=False
) -> Dict[str, float]:
"""
When fetching multiple named metrics, it is more efficient to use this
than to call `get_metric_value` repeatedly.
Throws RuntimeError if no metrics matching `names` are found, or if
not all of `names` are found: this method is intended for loading sets
of metrics whose existence is coupled.
If it's expected that there may be no results for some of the metrics,
specify `absence_ok=True`. The returned dict will then not contain values
for these metrics.
"""
metrics = self.get_metrics()
samples = []
for name in names:
samples.extend(metrics.query_all(name, filter=filter))
result = {}
for sample in samples:
if sample.name in result:
raise RuntimeError(f"Multiple values found for {sample.name}")
result[sample.name] = sample.value
if not absence_ok:
if len(result) != len(names):
log.info(f"Metrics found: {metrics.metrics}")
raise RuntimeError(f"could not find all metrics {' '.join(names)}")
return result
def parse_metrics(text: str, name: str = "") -> Metrics:
metrics = Metrics(name)
gen = text_string_to_metric_families(text)
for family in gen:
for sample in family.samples:
metrics.metrics[sample.name].append(sample)
return metrics
def histogram(prefix_without_trailing_underscore: str) -> List[str]:
assert not prefix_without_trailing_underscore.endswith("_")
return [f"{prefix_without_trailing_underscore}_{x}" for x in ["bucket", "count", "sum"]]
PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS: Tuple[str, ...] = (
"pageserver_remote_timeline_client_calls_started_total",
"pageserver_remote_timeline_client_calls_finished_total",
"pageserver_remote_physical_size",
"pageserver_remote_timeline_client_bytes_started_total",
"pageserver_remote_timeline_client_bytes_finished_total",
)
PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = (
"pageserver_storage_operations_seconds_global_count",
"pageserver_storage_operations_seconds_global_sum",
"pageserver_storage_operations_seconds_global_bucket",
"pageserver_unexpected_ondemand_downloads_count_total",
"libmetrics_launch_timestamp",
"libmetrics_build_info",
"libmetrics_tracing_event_count_total",
"pageserver_page_cache_read_hits_total",
"pageserver_page_cache_read_accesses_total",
"pageserver_page_cache_size_current_bytes",
"pageserver_page_cache_size_max_bytes",
"pageserver_getpage_reconstruct_seconds_bucket",
"pageserver_getpage_reconstruct_seconds_count",
"pageserver_getpage_reconstruct_seconds_sum",
*[f"pageserver_basebackup_query_seconds_{x}" for x in ["bucket", "count", "sum"]],
*histogram("pageserver_smgr_query_seconds_global"),
*histogram("pageserver_layers_visited_per_read_global"),
*histogram("pageserver_getpage_get_reconstruct_data_seconds"),
*histogram("pageserver_wait_lsn_seconds"),
*histogram("pageserver_remote_operation_seconds"),
*histogram("pageserver_io_operations_seconds"),
"pageserver_tenant_states_count",
"pageserver_circuit_breaker_broken_total",
"pageserver_circuit_breaker_unbroken_total",
)
PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
"pageserver_current_logical_size",
"pageserver_resident_physical_size",
"pageserver_io_operations_bytes_total",
"pageserver_last_record_lsn",
"pageserver_standby_horizon",
"pageserver_smgr_query_seconds_bucket",
"pageserver_smgr_query_seconds_count",
"pageserver_smgr_query_seconds_sum",
"pageserver_archive_size",
"pageserver_pitr_history_size",
"pageserver_layer_bytes",
"pageserver_layer_count",
"pageserver_visible_physical_size",
"pageserver_storage_operations_seconds_count_total",
"pageserver_storage_operations_seconds_sum_total",
"pageserver_evictions_total",
"pageserver_evictions_with_low_residence_duration_total",
"pageserver_aux_file_estimated_size",
"pageserver_valid_lsn_lease_count",
*PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
# "pageserver_directory_entries_count", -- only used if above a certain threshold
# "pageserver_broken_tenants_count" -- used only for broken
)