## Problem

We recently added a "visibility" state to layers, but nothing initializes it.

Part of:
- #8398

## Summary of changes

- Add a dependency on `range-set-blaze`, which is used as a fast, incrementally updated alternative to KeySpace. We could also use it to replace the internals of KeySpaceRandomAccum if we wanted to. Writing a type that does this kind of "BTreeMap & merge overlapping entries" thing isn't very complicated, but there is no reason to write it ourselves when a third-party implementation is available (see the sketch after this section).
- Add a function to the layer map to calculate visibilities for each layer.
- Add a function to Timeline that calls into the layer map and applies these visibilities to the Layer objects.
- Invoke the calculation during startup, after image layer creation, and when removing branches. Branch removal and image layer creation are the two ways a layer can go from Visible to Covered.
- Add a unit test and benchmark for the visibility calculation.
- Expose a `pageserver_visible_physical_size` metric, which should always be <= `pageserver_remote_physical_size`.
  - This metric will feed into the /v1/utilization endpoint later: the visible size indicates how much space we would like to use on this pageserver for this tenant.
  - When `pageserver_visible_physical_size` is greater than `pageserver_resident_physical_size`, this is a sign that the tenant has long-idle branches, which result in layers that are visible in principle but not used in practice.

This does not keep visibility hints up to date in all cases: in particular, when creating a child timeline, any previously covered layers will not be marked Visible until they are accessed. Updates after image layer creation could be implemented as more of a special case (rather than a full recalculation), but this would require more new code: the existing depth-calculation code doesn't maintain and yield the list of deltas that an image layer would cover.

## Performance

This operation is done rarely (at startup and at timeline deletion), so it needs to be efficient but not ultra-fast. There is a new `visibility` bench that measures runtime for a synthetic 100k-layer case (`sequential`) and a real layer map (`real_map`) with ~26k layers.

The benchmark shows runtimes in the single-digit milliseconds (on a Ryzen 7950). This confirms that the runtime shouldn't be a problem at startup (we already incur S3-level latencies there), but it is slow enough that we definitely shouldn't call it more often than necessary, and it may be worthwhile to optimize further later (for example: when removing a branch, only scan layers below the branchpoint).

```
visibility/sequential   time:   [4.5087 ms 4.5894 ms 4.6775 ms]
                        change: [+2.0826% +3.9097% +5.8995%] (p = 0.00 < 0.05)
                        Performance has regressed.
Found 24 outliers among 100 measurements (24.00%)
  2 (2.00%) high mild
  22 (22.00%) high severe

min: 0/1696070, max: 93/1C0887F0
visibility/real_map     time:   [7.0796 ms 7.0832 ms 7.0871 ms]
                        change: [+0.3900% +0.4505% +0.5164%] (p = 0.00 < 0.05)
                        Change within noise threshold.
Found 4 outliers among 100 measurements (4.00%)
  3 (3.00%) high mild
  1 (1.00%) high severe

min: 0/1696070, max: 93/1C0887F0
visibility/real_map_many_branches
                        time:   [4.5285 ms 4.5355 ms 4.5434 ms]
                        change: [-1.0012% -0.8004% -0.5969%] (p = 0.00 < 0.05)
                        Change within noise threshold.
```
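To illustrate the kind of structure `range-set-blaze` provides, here is a minimal Python sketch of a set of half-open ranges that coalesces overlapping or adjacent entries on insert. This is a toy illustration of the merging idea only, not the crate's actual API; in the pageserver the real structure operates over the key space.

```python
import bisect


class RangeSet:
    """Toy "sorted ranges, merge overlapping entries" structure.

    Half-open integer ranges [start, end), kept sorted and disjoint.
    """

    def __init__(self):
        self.ranges: list[tuple[int, int]] = []

    def insert(self, start: int, end: int) -> None:
        # Index of the first existing range whose start is >= the new start.
        i = bisect.bisect_left(self.ranges, (start,))
        # The preceding range may overlap or touch the new one.
        if i > 0 and self.ranges[i - 1][1] >= start:
            i -= 1
        # Absorb every range that overlaps or touches [start, end).
        j = i
        while j < len(self.ranges) and self.ranges[j][0] <= end:
            start = min(start, self.ranges[j][0])
            end = max(end, self.ranges[j][1])
            j += 1
        # Replace the merged run with a single coalesced range.
        self.ranges[i:j] = [(start, end)]


rs = RangeSet()
rs.insert(0, 5)
rs.insert(10, 15)
rs.insert(4, 11)  # bridges the gap between the two existing ranges
assert rs.ranges == [(0, 15)]
```

Keeping the ranges sorted and disjoint is what makes incremental updates cheap: each insert touches only the ranges it overlaps, rather than rebuilding the whole set the way an accumulate-then-normalize structure like KeySpaceRandomAccum does.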
from collections import defaultdict
from typing import Dict, List, Optional, Tuple

from prometheus_client.parser import text_string_to_metric_families
from prometheus_client.samples import Sample

from fixtures.log_helper import log
class Metrics:
    metrics: Dict[str, List[Sample]]
    name: str

    def __init__(self, name: str = ""):
        self.metrics = defaultdict(list)
        self.name = name

    def query_all(self, name: str, filter: Optional[Dict[str, str]] = None) -> List[Sample]:
        filter = filter or {}
        res = []

        for sample in self.metrics[name]:
            try:
                if all(sample.labels[k] == v for k, v in filter.items()):
                    res.append(sample)
            except KeyError:
                pass
        return res

    def query_one(self, name: str, filter: Optional[Dict[str, str]] = None) -> Sample:
        res = self.query_all(name, filter or {})
        assert len(res) == 1, f"expected single sample for {name} {filter}, found {res}"
        return res[0]

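
# Illustration added for this writeup (hypothetical, not part of the original
# fixture): building a couple of samples by hand and querying them with the
# Metrics helpers above. Wrapped in a function so importing the module stays
# side-effect free.
def _example_query_metrics() -> None:
    m = Metrics("pageserver")
    m.metrics["pageserver_layer_count"].append(
        Sample("pageserver_layer_count", {"tenant_id": "t1", "kind": "Delta"}, 3.0)
    )
    m.metrics["pageserver_layer_count"].append(
        Sample("pageserver_layer_count", {"tenant_id": "t1", "kind": "Image"}, 1.0)
    )
    # All samples for a name, optionally narrowed by labels:
    assert len(m.query_all("pageserver_layer_count")) == 2
    # Exactly one matching sample expected, otherwise query_one asserts:
    assert m.query_one("pageserver_layer_count", {"kind": "Image"}).value == 1.0
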
class MetricsGetter:
    """
    Mixin for types that implement a `get_metrics` function and would like associated
    helpers for querying the metrics
    """

    def get_metrics(self) -> Metrics:
        raise NotImplementedError()

    def get_metric_value(
        self, name: str, filter: Optional[Dict[str, str]] = None
    ) -> Optional[float]:
        metrics = self.get_metrics()
        results = metrics.query_all(name, filter=filter)
        if not results:
            log.info(f'could not find metric "{name}"')
            return None
        assert len(results) == 1, f"metric {name} with given filters is not unique, got: {results}"
        return results[0].value

    def get_metrics_values(
        self, names: list[str], filter: Optional[Dict[str, str]] = None, absence_ok=False
    ) -> Dict[str, float]:
        """
        When fetching multiple named metrics, it is more efficient to use this
        than to call `get_metric_value` repeatedly.

        Throws RuntimeError if no metrics matching `names` are found, or if
        not all of `names` are found: this method is intended for loading sets
        of metrics whose existence is coupled.

        If it's expected that there may be no results for some of the metrics,
        specify `absence_ok=True`. The returned dict will then not contain values
        for these metrics.
        """
        metrics = self.get_metrics()
        samples = []
        for name in names:
            samples.extend(metrics.query_all(name, filter=filter))

        result = {}
        for sample in samples:
            if sample.name in result:
                raise RuntimeError(f"Multiple values found for {sample.name}")
            result[sample.name] = sample.value

        if not absence_ok:
            if len(result) != len(names):
                log.info(f"Metrics found: {metrics.metrics}")
                raise RuntimeError(f"could not find all metrics {' '.join(names)}")

        return result

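
# Sketch added for this writeup (hypothetical): the smallest possible
# implementation of the MetricsGetter mixin. Real fixtures would fetch the
# exposition text from a /metrics endpoint; here it is passed in directly.
# parse_metrics is defined below and is only looked up when get_metrics runs.
class _ExamplePageserver(MetricsGetter):
    def __init__(self, metrics_text: str):
        self._text = metrics_text

    def get_metrics(self) -> Metrics:
        return parse_metrics(self._text, "example-pageserver")


def _example_get_metrics_values(ps: _ExamplePageserver) -> None:
    # One get_metrics round trip for a coupled set of metrics; absence_ok
    # tolerates metrics that are not always emitted.
    values = ps.get_metrics_values(
        ["pageserver_resident_physical_size", "pageserver_visible_physical_size"],
        filter={"tenant_id": "t1"},
        absence_ok=True,
    )
    for name, value in values.items():
        log.info(f"{name} = {value}")
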
def parse_metrics(text: str, name: str = "") -> Metrics:
|
|
metrics = Metrics(name)
|
|
gen = text_string_to_metric_families(text)
|
|
for family in gen:
|
|
for sample in family.samples:
|
|
metrics.metrics[sample.name].append(sample)
|
|
|
|
return metrics
|
|
|
|
|
|
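
# Example added for this writeup (hypothetical exposition text): parse a tiny
# Prometheus payload and query it. Untyped metric lines like these are valid
# exposition format.
def _example_parse_metrics() -> None:
    text = (
        'pageserver_layer_count{tenant_id="t1"} 7\n'
        'pageserver_resident_physical_size{tenant_id="t1"} 4096\n'
    )
    m = parse_metrics(text, "example")
    assert m.query_one("pageserver_layer_count").value == 7.0
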
def histogram(prefix_without_trailing_underscore: str) -> List[str]:
    assert not prefix_without_trailing_underscore.endswith("_")
    return [f"{prefix_without_trailing_underscore}_{x}" for x in ["bucket", "count", "sum"]]

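
# For reference (added for this writeup): histogram() expands one histogram
# prefix into the three per-histogram sample names Prometheus exposes.
def _example_histogram_expansion() -> None:
    assert histogram("pageserver_wait_lsn_seconds") == [
        "pageserver_wait_lsn_seconds_bucket",
        "pageserver_wait_lsn_seconds_count",
        "pageserver_wait_lsn_seconds_sum",
    ]
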
PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS: Tuple[str, ...] = (
    "pageserver_remote_timeline_client_calls_started_total",
    "pageserver_remote_timeline_client_calls_finished_total",
    "pageserver_remote_physical_size",
    "pageserver_remote_timeline_client_bytes_started_total",
    "pageserver_remote_timeline_client_bytes_finished_total",
)

PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = (
    "pageserver_storage_operations_seconds_global_count",
    "pageserver_storage_operations_seconds_global_sum",
    "pageserver_storage_operations_seconds_global_bucket",
    "pageserver_unexpected_ondemand_downloads_count_total",
    "libmetrics_launch_timestamp",
    "libmetrics_build_info",
    "libmetrics_tracing_event_count_total",
    "pageserver_page_cache_read_hits_total",
    "pageserver_page_cache_read_accesses_total",
    "pageserver_page_cache_size_current_bytes",
    "pageserver_page_cache_size_max_bytes",
    "pageserver_getpage_reconstruct_seconds_bucket",
    "pageserver_getpage_reconstruct_seconds_count",
    "pageserver_getpage_reconstruct_seconds_sum",
    *[f"pageserver_basebackup_query_seconds_{x}" for x in ["bucket", "count", "sum"]],
    *histogram("pageserver_smgr_query_seconds_global"),
    *histogram("pageserver_layers_visited_per_read_global"),
    *histogram("pageserver_getpage_get_reconstruct_data_seconds"),
    *histogram("pageserver_wait_lsn_seconds"),
    *histogram("pageserver_remote_operation_seconds"),
    *histogram("pageserver_io_operations_seconds"),
    "pageserver_tenant_states_count",
    "pageserver_circuit_breaker_broken_total",
    "pageserver_circuit_breaker_unbroken_total",
)

PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
    "pageserver_current_logical_size",
    "pageserver_resident_physical_size",
    "pageserver_io_operations_bytes_total",
    "pageserver_last_record_lsn",
    "pageserver_standby_horizon",
    "pageserver_smgr_query_seconds_bucket",
    "pageserver_smgr_query_seconds_count",
    "pageserver_smgr_query_seconds_sum",
    "pageserver_archive_size",
    "pageserver_pitr_history_size",
    "pageserver_layer_bytes",
    "pageserver_layer_count",
    "pageserver_visible_physical_size",
    "pageserver_storage_operations_seconds_count_total",
    "pageserver_storage_operations_seconds_sum_total",
    "pageserver_evictions_total",
    "pageserver_evictions_with_low_residence_duration_total",
    "pageserver_aux_file_estimated_size",
    "pageserver_valid_lsn_lease_count",
    *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
    # "pageserver_directory_entries_count", -- only used if above a certain threshold
    # "pageserver_broken_tenants_count" -- used only for broken
)
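
# Hypothetical check added for this writeup: every expected per-tenant metric
# should have at least one sample for a given tenant, and per the PR described
# above the visible size should never exceed the remote size. A real tenant
# with several timelines would also need a timeline_id filter for the
# query_one calls.
def _example_per_tenant_metrics(ps: _ExamplePageserver, tenant_id: str) -> None:
    m = ps.get_metrics()
    for name in PAGESERVER_PER_TENANT_METRICS:
        assert m.query_all(name, filter={"tenant_id": tenant_id}), f"missing {name}"
    visible = m.query_one("pageserver_visible_physical_size", {"tenant_id": tenant_id})
    remote = m.query_one("pageserver_remote_physical_size", {"tenant_id": tenant_id})
    assert visible.value <= remote.value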