mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-06 04:52:55 +00:00
Add more common storage metrics (#1722)
- Enabled process exporter for storage services
- Changed the `zenith_proxy` metric prefix to just `proxy`
- Removed the old `monitoring` directory
- Removed the common prefix for metrics; our common metrics now use the `libmetrics_` prefix, for example `libmetrics_serve_metrics_count`
- Added `test_metrics_normal_work`
This commit is contained in:
committed by
GitHub
parent
55ea3f262e
commit
134eeeb096
@@ -1,8 +1,12 @@
|
||||
import os
from contextlib import closing
from datetime import datetime, timezone

import pytest

from fixtures.log_helper import log
from fixtures.metrics import parse_metrics
from fixtures.utils import lsn_to_hex
from fixtures.zenith_fixtures import ZenithEnvBuilder
|
||||
|
||||
|
||||
@pytest.mark.parametrize('with_safekeepers', [False, True])
|
||||
@@ -38,3 +42,79 @@ def test_tenants_normal_work(zenith_env_builder: ZenithEnvBuilder, with_safekeep
|
||||
cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'")
|
||||
cur.execute("SELECT sum(key) FROM t")
|
||||
assert cur.fetchone() == (5000050000, )
|
||||
|
||||
|
||||
def test_metrics_normal_work(zenith_env_builder: ZenithEnvBuilder):
    """
    Smoke-test the metrics endpoints of the pageserver and safekeepers.

    Starts an env with 3 safekeepers and two tenants, generates some WAL on
    each timeline, then checks the per-timeline LSN metrics for consistency
    and logs the common process metrics exported by every storage service.
    """
    zenith_env_builder.num_safekeepers = 3

    env = zenith_env_builder.init_start()
    tenant_1, _ = env.zenith_cli.create_tenant()
    tenant_2, _ = env.zenith_cli.create_tenant()

    timeline_1 = env.zenith_cli.create_timeline('test_metrics_normal_work', tenant_id=tenant_1)
    timeline_2 = env.zenith_cli.create_timeline('test_metrics_normal_work', tenant_id=tenant_2)

    pg_tenant1 = env.postgres.create_start('test_metrics_normal_work', tenant_id=tenant_1)
    pg_tenant2 = env.postgres.create_start('test_metrics_normal_work', tenant_id=tenant_2)

    # Generate some WAL on both timelines so the LSN metrics advance past zero.
    for pg in [pg_tenant1, pg_tenant2]:
        with closing(pg.connect()) as conn, conn.cursor() as cur:
            cur.execute("CREATE TABLE t(key int primary key, value text)")
            cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'")
            cur.execute("SELECT sum(key) FROM t")
            assert cur.fetchone() == (5000050000, )

    collected_metrics = {
        "pageserver": env.pageserver.http_client().get_metrics(),
    }
    for sk in env.safekeepers:
        collected_metrics[f'safekeeper{sk.id}'] = sk.http_client().get_metrics_str()

    # Dump the raw metrics text to files for post-mortem debugging.
    for name, metrics_text in collected_metrics.items():
        basepath = os.path.join(zenith_env_builder.repo_dir, f'{name}.metrics')
        with open(basepath, 'w') as stdout_f:
            print(metrics_text, file=stdout_f, flush=True)

    all_metrics = [parse_metrics(m, name) for name, m in collected_metrics.items()]
    ps_metrics = all_metrics[0]
    sk_metrics = all_metrics[1:]

    ttids = [{
        'tenant_id': tenant_1.hex, 'timeline_id': timeline_1.hex
    }, {
        'tenant_id': tenant_2.hex, 'timeline_id': timeline_2.hex
    }]

    # Test metrics per timeline
    for tt in ttids:
        log.info(f"Checking metrics for {tt}")

        ps_lsn = int(ps_metrics.query_one("pageserver_last_record_lsn", filter=tt).value)
        sk_lsns = [int(sk.query_one("safekeeper_commit_lsn", filter=tt).value) for sk in sk_metrics]

        log.info(f"ps_lsn: {lsn_to_hex(ps_lsn)}")
        log.info(f"sk_lsns: {list(map(lsn_to_hex, sk_lsns))}")

        # The pageserver can only have consumed WAL that some safekeeper has
        # committed, so it must not be ahead of the most advanced safekeeper.
        assert ps_lsn <= max(sk_lsns)
        assert ps_lsn > 0

    # Test common metrics
    for metrics in all_metrics:
        log.info(f"Checking common metrics for {metrics.name}")

        log.info(
            f"process_cpu_seconds_total: {metrics.query_one('process_cpu_seconds_total').value}")
        log.info(f"process_threads: {int(metrics.query_one('process_threads').value)}")
        log.info(
            f"process_resident_memory_bytes (MB): {metrics.query_one('process_resident_memory_bytes').value / 1024 / 1024}"
        )
        log.info(
            f"process_virtual_memory_bytes (MB): {metrics.query_one('process_virtual_memory_bytes').value / 1024 / 1024}"
        )
        log.info(f"process_open_fds: {int(metrics.query_one('process_open_fds').value)}")
        log.info(f"process_max_fds: {int(metrics.query_one('process_max_fds').value)}")
        # Bug fix: the label says UTC, but a naive fromtimestamp() converts to
        # the local timezone. Pass tz=timezone.utc so the logged time is UTC.
        log.info(
            f"process_start_time_seconds (UTC): {datetime.fromtimestamp(metrics.query_one('process_start_time_seconds').value, tz=timezone.utc)}"
        )
|
||||
@@ -236,14 +236,14 @@ class ZenithBenchmarker:
|
||||
"""
|
||||
Fetch the "cumulative # of bytes written" metric from the pageserver
|
||||
"""
|
||||
metric_name = r'pageserver_disk_io_bytes{io_operation="write"}'
|
||||
metric_name = r'libmetrics_disk_io_bytes{io_operation="write"}'
|
||||
return self.get_int_counter_value(pageserver, metric_name)
|
||||
|
||||
def get_peak_mem(self, pageserver) -> int:
    """
    Fetch the "maxrss" metric from the pageserver
    """
    # Peak resident set size, exported by libmetrics in kilobytes.
    return self.get_int_counter_value(pageserver, r'libmetrics_maxrss_kb')
|
||||
|
||||
def get_int_counter_value(self, pageserver, metric_name) -> int:
|
||||
|
||||
38
test_runner/fixtures/metrics.py
Normal file
38
test_runner/fixtures/metrics.py
Normal file
@@ -0,0 +1,38 @@
|
||||
from dataclasses import dataclass
|
||||
from prometheus_client.parser import text_string_to_metric_families
|
||||
from prometheus_client.samples import Sample
|
||||
from typing import Dict, List
|
||||
from collections import defaultdict
|
||||
|
||||
from fixtures.log_helper import log
|
||||
|
||||
|
||||
class Metrics:
    # All parsed samples, grouped by metric name.
    metrics: Dict[str, List[Sample]]
    # Label identifying which service these metrics came from (for logging).
    name: str

    def __init__(self, name: str = ""):
        self.metrics = defaultdict(list)
        self.name = name

    def query_all(self, name: str, filter: Dict[str, str]) -> List[Sample]:
        """Return every sample of metric *name* whose labels match all of *filter*."""
        return [
            sample for sample in self.metrics[name]
            if all(sample.labels[key] == value for key, value in filter.items())
        ]

    def query_one(self, name: str, filter: Dict[str, str] = {}) -> Sample:
        """Return the single sample matching *name* and *filter*; assert it is unique."""
        matches = self.query_all(name, filter)
        assert len(matches) == 1, f"expected single sample for {name} {filter}, found {matches}"
        return matches[0]
|
||||
|
||||
|
||||
def parse_metrics(text: str, name: str = ""):
    """Parse Prometheus text-format *text* into a Metrics object tagged *name*."""
    parsed = Metrics(name)
    for family in text_string_to_metric_families(text):
        for sample in family.samples:
            parsed.metrics[sample.name].append(sample)
    return parsed
|
||||
@@ -1833,10 +1833,13 @@ class SafekeeperHttpClient(requests.Session):
|
||||
assert isinstance(res_json, dict)
|
||||
return res_json
|
||||
|
||||
def get_metrics(self) -> SafekeeperMetrics:
|
||||
def get_metrics_str(self) -> str:
    """Fetch and return the raw text of the safekeeper's /metrics endpoint."""
    response = self.get(f"http://localhost:{self.port}/metrics")
    response.raise_for_status()
    return response.text
|
||||
|
||||
def get_metrics(self) -> SafekeeperMetrics:
|
||||
all_metrics_text = self.get_metrics_str()
|
||||
|
||||
metrics = SafekeeperMetrics()
|
||||
for match in re.finditer(
|
||||
|
||||
Reference in New Issue
Block a user