Add more common storage metrics (#1722)

- Enabled the process metrics exporter for storage services
- Changed the `zenith_proxy` metrics prefix to just `proxy`
- Removed the old `monitoring` directory
- Removed the per-service prefix for common metrics; common metrics now use the fixed `libmetrics_` prefix, for example `libmetrics_serve_metrics_count` (see the sketch after this list)
- Added `test_metrics_normal_work`
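A minimal sketch of the new convention (assuming the `prometheus` and `lazy_static` crates already used in this repo; the metric name below is hypothetical):

```rust
use lazy_static::lazy_static;
use prometheus::{register_int_counter, IntCounter};

lazy_static! {
    // Hypothetical name for illustration; real names introduced by this
    // commit include libmetrics_serve_metrics_count and libmetrics_maxrss_kb.
    static ref EXAMPLE_COUNT: IntCounter = register_int_counter!(
        "libmetrics_example_count",
        "Example of a common metric registered under the fixed libmetrics_ prefix"
    )
    .expect("failed to define a metric");
}
```

Library-level metrics now share the fixed `libmetrics_` prefix, while service-specific metrics carry their own prefix (e.g. `proxy_io_bytes`), so the runtime `set_common_metrics_prefix()` call is no longer needed.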
Arthur Petukhovsky authored on 2022-05-17 19:29:01 +03:00, committed by GitHub
parent 55ea3f262e
commit 134eeeb096
18 changed files with 198 additions and 102 deletions

View File

@@ -355,7 +355,7 @@ jobs:
when: always
command: |
du -sh /tmp/test_output/*
-find /tmp/test_output -type f ! -name "pg.log" ! -name "pageserver.log" ! -name "safekeeper.log" ! -name "etcd.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" ! -name "*.stdout" ! -name "*.stderr" ! -name "flamegraph.svg" -delete
+find /tmp/test_output -type f ! -name "pg.log" ! -name "pageserver.log" ! -name "safekeeper.log" ! -name "etcd.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" ! -name "*.stdout" ! -name "*.stderr" ! -name "flamegraph.svg" ! -name "*.metrics" -delete
du -sh /tmp/test_output/*
- store_artifacts:
path: /tmp/test_output

Cargo.lock generated
View File

@@ -166,7 +166,7 @@ dependencies = [
"cc",
"cfg-if",
"libc",
"miniz_oxide",
"miniz_oxide 0.4.4",
"object",
"rustc-demangle",
]
@@ -868,6 +868,18 @@ version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "279fb028e20b3c4c320317955b77c5e0c9701f05a1d309905d6fc702cdc5053e"
+[[package]]
+name = "flate2"
+version = "1.0.23"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b39522e96686d38f4bc984b9198e3a0613264abaebaff2c5c918bfa6b6da09af"
+dependencies = [
+"cfg-if",
+"crc32fast",
+"libc",
+"miniz_oxide 0.5.1",
+]
[[package]]
name = "fnv"
version = "1.0.7"
@@ -1527,6 +1539,15 @@ dependencies = [
"autocfg",
]
+[[package]]
+name = "miniz_oxide"
+version = "0.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d2b29bd4bc3f33391105ebee3589c19197c4271e3e5a9ec9bfe8127eeff8f082"
+dependencies = [
+"adler",
+]
[[package]]
name = "mio"
version = "0.8.2"
@@ -2088,6 +2109,20 @@ dependencies = [
"unicode-xid",
]
+[[package]]
+name = "procfs"
+version = "0.10.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "95e344cafeaeefe487300c361654bcfc85db3ac53619eeccced29f5ea18c4c70"
+dependencies = [
+"bitflags",
+"byteorder",
+"flate2",
+"hex",
+"lazy_static",
+"libc",
+]
[[package]]
name = "prometheus"
version = "0.13.0"
@@ -2097,8 +2132,10 @@ dependencies = [
"cfg-if",
"fnv",
"lazy_static",
"libc",
"memchr",
"parking_lot 0.11.2",
"procfs",
"thiserror",
]

View File

@@ -4,7 +4,7 @@ version = "0.1.0"
edition = "2021"
[dependencies]
-prometheus = {version = "0.13", default_features=false} # removes protobuf dependency
+prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
libc = "0.2"
lazy_static = "1.4"
once_cell = "1.8.0"
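The `process` feature is what enables the process exporter: on Linux, the `prometheus` crate then places a process collector in its default registry, so a plain `gather()` also returns `process_*` families. A minimal sketch of the effect (a hedged illustration of the crate's default-registry behavior, not code from this commit):

```rust
use prometheus::{Encoder, TextEncoder};

fn main() {
    // With the "process" feature enabled on Linux, the default registry
    // already contains a process collector, so gather() includes
    // process_cpu_seconds_total, process_resident_memory_bytes, etc.
    let mut buf = Vec::new();
    TextEncoder::new()
        .encode(&prometheus::gather(), &mut buf)
        .expect("failed to encode metrics");
    print!("{}", String::from_utf8(buf).expect("metrics text is UTF-8"));
}
```

These are the same `process_*` series that `test_metrics_normal_work` asserts on below.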

View File

@@ -3,7 +3,6 @@
//! Otherwise, we might not see all metrics registered via
//! a default registry.
use lazy_static::lazy_static;
-use once_cell::race::OnceBox;
pub use prometheus::{exponential_buckets, linear_buckets};
pub use prometheus::{register_gauge, Gauge};
pub use prometheus::{register_gauge_vec, GaugeVec};
@@ -27,48 +26,15 @@ pub fn gather() -> Vec<prometheus::proto::MetricFamily> {
prometheus::gather()
}
-static COMMON_METRICS_PREFIX: OnceBox<&str> = OnceBox::new();
-
-/// Sets a prefix which will be used for all common metrics, typically a service
-/// name like 'pageserver'. Should be executed exactly once in the beginning of
-/// any executable which uses common metrics.
-pub fn set_common_metrics_prefix(prefix: &'static str) {
-    // Not unwrap() because metrics may be initialized after multiple threads have been started.
-    COMMON_METRICS_PREFIX
-        .set(prefix.into())
-        .unwrap_or_else(|_| {
-            eprintln!(
-                "set_common_metrics_prefix() was called second time with '{}', exiting",
-                prefix
-            );
-            std::process::exit(1);
-        });
-}
-
-/// Prepends a prefix to a common metric name so they are distinguished between
-/// different services, see <https://github.com/zenithdb/zenith/pull/681>
-/// A call to set_common_metrics_prefix() is necessary prior to calling this.
-pub fn new_common_metric_name(unprefixed_metric_name: &str) -> String {
-    // Not unwrap() because metrics may be initialized after multiple threads have been started.
-    format!(
-        "{}_{}",
-        COMMON_METRICS_PREFIX.get().unwrap_or_else(|| {
-            eprintln!("set_common_metrics_prefix() was not called, but metrics are used, exiting");
-            std::process::exit(1);
-        }),
-        unprefixed_metric_name
-    )
-}
lazy_static! {
static ref DISK_IO_BYTES: IntGaugeVec = register_int_gauge_vec!(
new_common_metric_name("disk_io_bytes"),
"libmetrics_disk_io_bytes",
"Bytes written and read from disk, grouped by the operation (read|write)",
&["io_operation"]
)
.expect("Failed to register disk i/o bytes int gauge vec");
static ref MAXRSS_KB: IntGauge = register_int_gauge!(
new_common_metric_name("maxrss_kb"),
"libmetrics_maxrss_kb",
"Memory usage (Maximum Resident Set Size)"
)
.expect("Failed to register maxrss_kb int gauge");

View File

@@ -5,7 +5,7 @@ use anyhow::anyhow;
use hyper::header::AUTHORIZATION;
use hyper::{header::CONTENT_TYPE, Body, Request, Response, Server};
use lazy_static::lazy_static;
-use metrics::{new_common_metric_name, register_int_counter, Encoder, IntCounter, TextEncoder};
+use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder};
use routerify::ext::RequestExt;
use routerify::RequestInfo;
use routerify::{Middleware, Router, RouterBuilder, RouterService};
@@ -18,7 +18,7 @@ use super::error::ApiError;
lazy_static! {
static ref SERVE_METRICS_COUNT: IntCounter = register_int_counter!(
new_common_metric_name("serve_metrics_count"),
"libmetrics_serve_metrics_count",
"Number of metric requests made"
)
.expect("failed to define a metric");

View File

@@ -1,25 +0,0 @@
version: "3"
services:
prometheus:
container_name: prometheus
image: prom/prometheus:latest
volumes:
- ./prometheus.yaml:/etc/prometheus/prometheus.yml
# ports:
# - "9090:9090"
# TODO: find a proper portable solution
network_mode: "host"
grafana:
image: grafana/grafana:latest
volumes:
- ./grafana.yaml:/etc/grafana/provisioning/datasources/datasources.yaml
environment:
- GF_AUTH_ANONYMOUS_ENABLED=true
- GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
- GF_AUTH_DISABLE_LOGIN_FORM=true
# ports:
# - "3000:3000"
# TODO: find a proper portable solution
network_mode: "host"

View File

@@ -1,12 +0,0 @@
-apiVersion: 1
-
-datasources:
-  - name: Prometheus
-    type: prometheus
-    access: proxy
-    orgId: 1
-    url: http://localhost:9090
-    basicAuth: false
-    isDefault: false
-    version: 1
-    editable: false

View File

@@ -1,5 +0,0 @@
-scrape_configs:
-  - job_name: 'default'
-    scrape_interval: 10s
-    static_configs:
-      - targets: ['localhost:9898']

View File

@@ -38,7 +38,6 @@ fn version() -> String {
}
fn main() -> anyhow::Result<()> {
-    metrics::set_common_metrics_prefix("pageserver");
let arg_matches = App::new("Zenith page server")
.about("Materializes WAL stream to pages and serves them to the postgres")
.version(&*version())

poetry.lock generated
View File

@@ -822,7 +822,7 @@ python-versions = "*"
[[package]]
name = "moto"
version = "3.1.7"
version = "3.1.9"
description = "A library that allows your python tests to easily mock out the boto library"
category = "main"
optional = false
@@ -868,6 +868,7 @@ ds = ["sshpubkeys (>=3.1.0)"]
dynamodb = ["docker (>=2.5.1)"]
dynamodb2 = ["docker (>=2.5.1)"]
dynamodbstreams = ["docker (>=2.5.1)"]
+ebs = ["sshpubkeys (>=3.1.0)"]
ec2 = ["sshpubkeys (>=3.1.0)"]
efs = ["sshpubkeys (>=3.1.0)"]
glue = ["pyparsing (>=3.0.0)"]
@@ -953,6 +954,17 @@ importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""}
dev = ["pre-commit", "tox"]
testing = ["pytest", "pytest-benchmark"]
+[[package]]
+name = "prometheus-client"
+version = "0.14.1"
+description = "Python client for the Prometheus monitoring system."
+category = "main"
+optional = false
+python-versions = ">=3.6"
+
+[package.extras]
+twisted = ["twisted"]
[[package]]
name = "psycopg2-binary"
version = "2.9.3"
@@ -1003,7 +1015,7 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
[[package]]
name = "pyjwt"
version = "2.3.0"
version = "2.4.0"
description = "JSON Web Token implementation in Python"
category = "main"
optional = false
@@ -1375,7 +1387,7 @@ testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest-
[metadata]
lock-version = "1.1"
python-versions = "^3.7"
content-hash = "dc63b6e02d0ceccdc4b5616e9362c149a27fdcc6c54fda63a3b115a5b980c42e"
content-hash = "d2fcba2af0a32cde3a1d0c8cfdfe5fb26531599b0c8c376bf16e200a74b55553"
[metadata.files]
aiopg = [
@@ -1693,8 +1705,8 @@ mccabe = [
{file = "mccabe-0.6.1.tar.gz", hash = "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"},
]
moto = [
{file = "moto-3.1.7-py3-none-any.whl", hash = "sha256:4ab6fb8dd150343e115d75e3dbdb5a8f850fc7236790819d7cef438c11ee6e89"},
{file = "moto-3.1.7.tar.gz", hash = "sha256:20607a0fd0cf6530e05ffb623ca84d3f45d50bddbcec2a33705a0cf471e71289"},
{file = "moto-3.1.9-py3-none-any.whl", hash = "sha256:8928ec168e5fd88b1127413b2fa570a80d45f25182cdad793edd208d07825269"},
{file = "moto-3.1.9.tar.gz", hash = "sha256:ba683e70950b6579189bc12d74c1477aa036c090c6ad8b151a22f5896c005113"},
]
mypy = [
{file = "mypy-0.910-cp35-cp35m-macosx_10_9_x86_64.whl", hash = "sha256:a155d80ea6cee511a3694b108c4494a39f42de11ee4e61e72bc424c490e46457"},
@@ -1741,6 +1753,10 @@ pluggy = [
{file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"},
{file = "pluggy-1.0.0.tar.gz", hash = "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159"},
]
+prometheus-client = [
+{file = "prometheus_client-0.14.1-py3-none-any.whl", hash = "sha256:522fded625282822a89e2773452f42df14b5a8e84a86433e3f8a189c1d54dc01"},
+{file = "prometheus_client-0.14.1.tar.gz", hash = "sha256:5459c427624961076277fdc6dc50540e2bacb98eebde99886e59ec55ed92093a"},
+]
psycopg2-binary = [
{file = "psycopg2-binary-2.9.3.tar.gz", hash = "sha256:761df5313dc15da1502b21453642d7599d26be88bff659382f8f9747c7ebea4e"},
{file = "psycopg2_binary-2.9.3-cp310-cp310-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:539b28661b71da7c0e428692438efbcd048ca21ea81af618d845e06ebfd29478"},
@@ -1831,8 +1847,8 @@ pyflakes = [
{file = "pyflakes-2.3.1.tar.gz", hash = "sha256:f5bc8ecabc05bb9d291eb5203d6810b49040f6ff446a756326104746cc00c1db"},
]
pyjwt = [
{file = "PyJWT-2.3.0-py3-none-any.whl", hash = "sha256:e0c4bb8d9f0af0c7f5b1ec4c5036309617d03d56932877f2f7a0beeb5318322f"},
{file = "PyJWT-2.3.0.tar.gz", hash = "sha256:b888b4d56f06f6dcd777210c334e69c737be74755d3e5e9ee3fe67dc18a0ee41"},
{file = "PyJWT-2.4.0-py3-none-any.whl", hash = "sha256:72d1d253f32dbd4f5c88eaf1fdc62f3a19f676ccbadb9dbc5d07e951b2b26daf"},
{file = "PyJWT-2.4.0.tar.gz", hash = "sha256:d42908208c699b3b973cbeb01a969ba6a96c821eefb1c5bfe4c390c01d67abba"},
]
pyparsing = [
{file = "pyparsing-3.0.6-py3-none-any.whl", hash = "sha256:04ff808a5b90911829c55c4e26f75fa5ca8a2f5f36aa3a51f68e27033341d3e4"},

View File

@@ -38,7 +38,6 @@ async fn flatten_err(
#[tokio::main]
async fn main() -> anyhow::Result<()> {
-    metrics::set_common_metrics_prefix("zenith_proxy");
let arg_matches = App::new("Neon proxy/router")
.version(GIT_VERSION)
.arg(

View File

@@ -5,7 +5,7 @@ use crate::stream::{MetricsStream, PqStream, Stream};
use anyhow::{bail, Context};
use futures::TryFutureExt;
use lazy_static::lazy_static;
-use metrics::{new_common_metric_name, register_int_counter, IntCounter};
+use metrics::{register_int_counter, IntCounter};
use std::sync::Arc;
use tokio::io::{AsyncRead, AsyncWrite};
use utils::pq_proto::{BeMessage as Be, *};
@@ -15,17 +15,17 @@ const ERR_PROTO_VIOLATION: &str = "protocol violation";
lazy_static! {
static ref NUM_CONNECTIONS_ACCEPTED_COUNTER: IntCounter = register_int_counter!(
new_common_metric_name("num_connections_accepted"),
"proxy_accepted_connections",
"Number of TCP client connections accepted."
)
.unwrap();
static ref NUM_CONNECTIONS_CLOSED_COUNTER: IntCounter = register_int_counter!(
new_common_metric_name("num_connections_closed"),
"proxy_closed_connections",
"Number of TCP client connections closed."
)
.unwrap();
static ref NUM_BYTES_PROXIED_COUNTER: IntCounter = register_int_counter!(
new_common_metric_name("num_bytes_proxied"),
"proxy_io_bytes",
"Number of bytes sent/received between any client and backend."
)
.unwrap();

View File

@@ -23,6 +23,7 @@ boto3-stubs = "^1.20.40"
moto = {version = "^3.0.0", extras = ["server"]}
backoff = "^1.11.1"
pytest-lazy-fixture = "^0.6.3"
prometheus-client = "^0.14.1"
[tool.poetry.dev-dependencies]
yapf = "==0.31.0"

View File

@@ -32,7 +32,6 @@ const ID_FILE_NAME: &str = "safekeeper.id";
project_git_version!(GIT_VERSION);
fn main() -> anyhow::Result<()> {
-    metrics::set_common_metrics_prefix("safekeeper");
let arg_matches = App::new("Zenith safekeeper")
.about("Store WAL stream to local file system and push it to WAL receivers")
.version(GIT_VERSION)

View File

@@ -1,8 +1,12 @@
from contextlib import closing
+from datetime import datetime
+import os
import pytest
from fixtures.zenith_fixtures import ZenithEnvBuilder
from fixtures.log_helper import log
+from fixtures.metrics import parse_metrics
+from fixtures.utils import lsn_to_hex
@pytest.mark.parametrize('with_safekeepers', [False, True])
@@ -38,3 +42,79 @@ def test_tenants_normal_work(zenith_env_builder: ZenithEnvBuilder, with_safekeep
cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'")
cur.execute("SELECT sum(key) FROM t")
assert cur.fetchone() == (5000050000, )
def test_metrics_normal_work(zenith_env_builder: ZenithEnvBuilder):
zenith_env_builder.num_safekeepers = 3
env = zenith_env_builder.init_start()
tenant_1, _ = env.zenith_cli.create_tenant()
tenant_2, _ = env.zenith_cli.create_tenant()
timeline_1 = env.zenith_cli.create_timeline('test_metrics_normal_work', tenant_id=tenant_1)
timeline_2 = env.zenith_cli.create_timeline('test_metrics_normal_work', tenant_id=tenant_2)
pg_tenant1 = env.postgres.create_start('test_metrics_normal_work', tenant_id=tenant_1)
pg_tenant2 = env.postgres.create_start('test_metrics_normal_work', tenant_id=tenant_2)
for pg in [pg_tenant1, pg_tenant2]:
with closing(pg.connect()) as conn:
with conn.cursor() as cur:
cur.execute("CREATE TABLE t(key int primary key, value text)")
cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'")
cur.execute("SELECT sum(key) FROM t")
assert cur.fetchone() == (5000050000, )
collected_metrics = {
"pageserver": env.pageserver.http_client().get_metrics(),
}
for sk in env.safekeepers:
collected_metrics[f'safekeeper{sk.id}'] = sk.http_client().get_metrics_str()
for name in collected_metrics:
basepath = os.path.join(zenith_env_builder.repo_dir, f'{name}.metrics')
with open(basepath, 'w') as stdout_f:
print(collected_metrics[name], file=stdout_f, flush=True)
all_metrics = [parse_metrics(m, name) for name, m in collected_metrics.items()]
ps_metrics = all_metrics[0]
sk_metrics = all_metrics[1:]
ttids = [{
'tenant_id': tenant_1.hex, 'timeline_id': timeline_1.hex
}, {
'tenant_id': tenant_2.hex, 'timeline_id': timeline_2.hex
}]
# Test metrics per timeline
for tt in ttids:
log.info(f"Checking metrics for {tt}")
ps_lsn = int(ps_metrics.query_one("pageserver_last_record_lsn", filter=tt).value)
sk_lsns = [int(sk.query_one("safekeeper_commit_lsn", filter=tt).value) for sk in sk_metrics]
log.info(f"ps_lsn: {lsn_to_hex(ps_lsn)}")
log.info(f"sk_lsns: {list(map(lsn_to_hex, sk_lsns))}")
assert ps_lsn <= max(sk_lsns)
assert ps_lsn > 0
# Test common metrics
for metrics in all_metrics:
log.info(f"Checking common metrics for {metrics.name}")
log.info(
f"process_cpu_seconds_total: {metrics.query_one('process_cpu_seconds_total').value}")
log.info(f"process_threads: {int(metrics.query_one('process_threads').value)}")
log.info(
f"process_resident_memory_bytes (MB): {metrics.query_one('process_resident_memory_bytes').value / 1024 / 1024}"
)
log.info(
f"process_virtual_memory_bytes (MB): {metrics.query_one('process_virtual_memory_bytes').value / 1024 / 1024}"
)
log.info(f"process_open_fds: {int(metrics.query_one('process_open_fds').value)}")
log.info(f"process_max_fds: {int(metrics.query_one('process_max_fds').value)}")
log.info(
f"process_start_time_seconds (UTC): {datetime.fromtimestamp(metrics.query_one('process_start_time_seconds').value)}"
)

View File

@@ -236,14 +236,14 @@ class ZenithBenchmarker:
"""
Fetch the "cumulative # of bytes written" metric from the pageserver
"""
-        metric_name = r'pageserver_disk_io_bytes{io_operation="write"}'
+        metric_name = r'libmetrics_disk_io_bytes{io_operation="write"}'
return self.get_int_counter_value(pageserver, metric_name)
def get_peak_mem(self, pageserver) -> int:
"""
Fetch the "maxrss" metric from the pageserver
"""
-        metric_name = r'pageserver_maxrss_kb'
+        metric_name = r'libmetrics_maxrss_kb'
return self.get_int_counter_value(pageserver, metric_name)
def get_int_counter_value(self, pageserver, metric_name) -> int:

View File

@@ -0,0 +1,38 @@
+from dataclasses import dataclass
+from prometheus_client.parser import text_string_to_metric_families
+from prometheus_client.samples import Sample
+from typing import Dict, List
+from collections import defaultdict
+from fixtures.log_helper import log
+
+
+class Metrics:
+    metrics: Dict[str, List[Sample]]
+    name: str
+
+    def __init__(self, name: str = ""):
+        self.metrics = defaultdict(list)
+        self.name = name
+
+    def query_all(self, name: str, filter: Dict[str, str]) -> List[Sample]:
+        res = []
+        for sample in self.metrics[name]:
+            if all(sample.labels[k] == v for k, v in filter.items()):
+                res.append(sample)
+        return res
+
+    def query_one(self, name: str, filter: Dict[str, str] = {}) -> Sample:
+        res = self.query_all(name, filter)
+        assert len(res) == 1, f"expected single sample for {name} {filter}, found {res}"
+        return res[0]
+
+
+def parse_metrics(text: str, name: str = ""):
+    metrics = Metrics(name)
+    gen = text_string_to_metric_families(text)
+    for family in gen:
+        for sample in family.samples:
+            metrics.metrics[sample.name].append(sample)
+    return metrics

View File

@@ -1833,10 +1833,13 @@ class SafekeeperHttpClient(requests.Session):
assert isinstance(res_json, dict)
return res_json
-    def get_metrics(self) -> SafekeeperMetrics:
+    def get_metrics_str(self) -> str:
request_result = self.get(f"http://localhost:{self.port}/metrics")
request_result.raise_for_status()
-        all_metrics_text = request_result.text
+        return request_result.text
+
+    def get_metrics(self) -> SafekeeperMetrics:
+        all_metrics_text = self.get_metrics_str()
metrics = SafekeeperMetrics()
for match in re.finditer(